@femtomc/mu-server 26.2.70 → 26.2.72
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/api/activities.d.ts +2 -0
- package/dist/api/activities.js +160 -0
- package/dist/api/config.d.ts +2 -0
- package/dist/api/config.js +45 -0
- package/dist/api/control_plane.d.ts +2 -0
- package/dist/api/control_plane.js +28 -0
- package/dist/api/cron.d.ts +2 -0
- package/dist/api/cron.js +182 -0
- package/dist/api/heartbeats.d.ts +2 -0
- package/dist/api/heartbeats.js +211 -0
- package/dist/api/identities.d.ts +2 -0
- package/dist/api/identities.js +103 -0
- package/dist/api/runs.d.ts +2 -0
- package/dist/api/runs.js +207 -0
- package/dist/cli.js +58 -3
- package/dist/config.d.ts +4 -21
- package/dist/config.js +24 -75
- package/dist/control_plane.d.ts +4 -2
- package/dist/control_plane.js +226 -25
- package/dist/control_plane_bootstrap_helpers.d.ts +2 -1
- package/dist/control_plane_bootstrap_helpers.js +11 -1
- package/dist/control_plane_contract.d.ts +57 -0
- package/dist/control_plane_contract.js +1 -1
- package/dist/control_plane_reload.d.ts +63 -0
- package/dist/control_plane_reload.js +525 -0
- package/dist/control_plane_run_queue_coordinator.d.ts +48 -0
- package/dist/control_plane_run_queue_coordinator.js +327 -0
- package/dist/control_plane_telegram_generation.js +0 -1
- package/dist/control_plane_wake_delivery.d.ts +50 -0
- package/dist/control_plane_wake_delivery.js +123 -0
- package/dist/index.d.ts +4 -1
- package/dist/index.js +2 -0
- package/dist/run_queue.d.ts +95 -0
- package/dist/run_queue.js +817 -0
- package/dist/run_supervisor.d.ts +20 -0
- package/dist/run_supervisor.js +25 -1
- package/dist/server.d.ts +5 -10
- package/dist/server.js +337 -528
- package/dist/server_program_orchestration.js +2 -0
- package/dist/server_routing.d.ts +3 -2
- package/dist/server_routing.js +28 -900
- package/package.json +7 -6
package/dist/server.js
CHANGED
|
@@ -1,11 +1,11 @@
|
|
|
1
|
-
import { GenerationTelemetryRecorder
|
|
1
|
+
import { GenerationTelemetryRecorder } from "@femtomc/mu-control-plane";
|
|
2
2
|
import { currentRunId, EventLog, FsJsonlStore, getStorePaths, JsonlEventSink } from "@femtomc/mu-core/node";
|
|
3
3
|
import { ForumStore } from "@femtomc/mu-forum";
|
|
4
4
|
import { IssueStore } from "@femtomc/mu-issue";
|
|
5
5
|
import { ControlPlaneActivitySupervisor } from "./activity_supervisor.js";
|
|
6
6
|
import { DEFAULT_MU_CONFIG, readMuConfigFile, writeMuConfigFile, } from "./config.js";
|
|
7
7
|
import { bootstrapControlPlane } from "./control_plane.js";
|
|
8
|
-
import {
|
|
8
|
+
import { createReloadManager, } from "./control_plane_reload.js";
|
|
9
9
|
import { ActivityHeartbeatScheduler } from "./heartbeat_scheduler.js";
|
|
10
10
|
import { createProcessSessionLifecycle } from "./session_lifecycle.js";
|
|
11
11
|
import { createServerProgramOrchestration } from "./server_program_orchestration.js";
|
|
@@ -19,16 +19,85 @@ function describeError(err) {
|
|
|
19
19
|
return err.message;
|
|
20
20
|
return String(err);
|
|
21
21
|
}
|
|
22
|
-
function
|
|
23
|
-
if (!handle) {
|
|
24
|
-
return { active: false, adapters: [], routes: [] };
|
|
25
|
-
}
|
|
22
|
+
function emptyNotifyOperatorsResult() {
|
|
26
23
|
return {
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
24
|
+
queued: 0,
|
|
25
|
+
duplicate: 0,
|
|
26
|
+
skipped: 0,
|
|
27
|
+
decisions: [],
|
|
30
28
|
};
|
|
31
29
|
}
|
|
30
|
+
function normalizeWakeTurnMode(value) {
|
|
31
|
+
if (typeof value !== "string") {
|
|
32
|
+
return "off";
|
|
33
|
+
}
|
|
34
|
+
const normalized = value.trim().toLowerCase();
|
|
35
|
+
if (normalized === "shadow") {
|
|
36
|
+
return "shadow";
|
|
37
|
+
}
|
|
38
|
+
if (normalized === "active") {
|
|
39
|
+
return "active";
|
|
40
|
+
}
|
|
41
|
+
return "off";
|
|
42
|
+
}
|
|
43
|
+
function stringField(payload, key) {
|
|
44
|
+
const value = payload[key];
|
|
45
|
+
if (typeof value !== "string") {
|
|
46
|
+
return null;
|
|
47
|
+
}
|
|
48
|
+
const trimmed = value.trim();
|
|
49
|
+
return trimmed.length > 0 ? trimmed : null;
|
|
50
|
+
}
|
|
51
|
+
function numberField(payload, key) {
|
|
52
|
+
const value = payload[key];
|
|
53
|
+
if (typeof value !== "number" || !Number.isFinite(value)) {
|
|
54
|
+
return null;
|
|
55
|
+
}
|
|
56
|
+
return Math.trunc(value);
|
|
57
|
+
}
|
|
58
|
+
function computeWakeId(opts) {
|
|
59
|
+
const source = stringField(opts.payload, "wake_source") ?? "unknown";
|
|
60
|
+
const programId = stringField(opts.payload, "program_id") ?? "unknown";
|
|
61
|
+
const sourceTsMs = numberField(opts.payload, "source_ts_ms");
|
|
62
|
+
const target = Object.hasOwn(opts.payload, "target") ? opts.payload.target : null;
|
|
63
|
+
let targetFingerprint = "null";
|
|
64
|
+
try {
|
|
65
|
+
targetFingerprint = JSON.stringify(target) ?? "null";
|
|
66
|
+
}
|
|
67
|
+
catch {
|
|
68
|
+
targetFingerprint = "[unserializable]";
|
|
69
|
+
}
|
|
70
|
+
const hasher = new Bun.CryptoHasher("sha256");
|
|
71
|
+
hasher.update(`${source}|${programId}|${sourceTsMs ?? "na"}|${opts.dedupeKey}|${targetFingerprint}`);
|
|
72
|
+
return hasher.digest("hex").slice(0, 16);
|
|
73
|
+
}
|
|
74
|
+
function buildWakeTurnCommandText(opts) {
|
|
75
|
+
const wakeSource = stringField(opts.payload, "wake_source") ?? "unknown";
|
|
76
|
+
const programId = stringField(opts.payload, "program_id") ?? "unknown";
|
|
77
|
+
const wakeMode = stringField(opts.payload, "wake_mode") ?? "immediate";
|
|
78
|
+
const targetKind = stringField(opts.payload, "target_kind") ?? "unknown";
|
|
79
|
+
const reason = stringField(opts.payload, "reason") ?? "scheduled";
|
|
80
|
+
let target = "null";
|
|
81
|
+
try {
|
|
82
|
+
target = JSON.stringify(Object.hasOwn(opts.payload, "target") ? opts.payload.target : null) ?? "null";
|
|
83
|
+
}
|
|
84
|
+
catch {
|
|
85
|
+
target = "[unserializable]";
|
|
86
|
+
}
|
|
87
|
+
return [
|
|
88
|
+
"Autonomous wake turn triggered by heartbeat/cron scheduler.",
|
|
89
|
+
`wake_id=${opts.wakeId}`,
|
|
90
|
+
`wake_source=${wakeSource}`,
|
|
91
|
+
`program_id=${programId}`,
|
|
92
|
+
`wake_mode=${wakeMode}`,
|
|
93
|
+
`target_kind=${targetKind}`,
|
|
94
|
+
`reason=${reason}`,
|
|
95
|
+
`message=${opts.message}`,
|
|
96
|
+
`target=${target}`,
|
|
97
|
+
"",
|
|
98
|
+
"If an action is needed, produce exactly one `/mu ...` command. If no action is needed, provide a short operator response.",
|
|
99
|
+
].join("\n");
|
|
100
|
+
}
|
|
32
101
|
export function createContext(repoRoot) {
|
|
33
102
|
const paths = getStorePaths(repoRoot);
|
|
34
103
|
const eventsStore = new FsJsonlStore(paths.eventsPath);
|
|
@@ -68,6 +137,12 @@ function createServer(options = {}) {
|
|
|
68
137
|
const autoRunHeartbeatEveryMs = Math.max(1_000, toNonNegativeInt(options.autoRunHeartbeatEveryMs, DEFAULT_AUTO_RUN_HEARTBEAT_EVERY_MS));
|
|
69
138
|
const operatorWakeLastByKey = new Map();
|
|
70
139
|
const sessionLifecycle = options.sessionLifecycle ?? createProcessSessionLifecycle({ repoRoot });
|
|
140
|
+
const emitWakeDeliveryEvent = async (payload) => {
|
|
141
|
+
await context.eventLog.emit("operator.wake.delivery", {
|
|
142
|
+
source: "mu-server.operator-wake",
|
|
143
|
+
payload,
|
|
144
|
+
});
|
|
145
|
+
};
|
|
71
146
|
const emitOperatorWake = async (opts) => {
|
|
72
147
|
const dedupeKey = opts.dedupeKey.trim();
|
|
73
148
|
if (!dedupeKey) {
|
|
@@ -80,6 +155,160 @@ function createServer(options = {}) {
|
|
|
80
155
|
return false;
|
|
81
156
|
}
|
|
82
157
|
operatorWakeLastByKey.set(dedupeKey, nowMs);
|
|
158
|
+
const wakeId = computeWakeId({ dedupeKey, payload: opts.payload });
|
|
159
|
+
const selectedWakeMode = stringField(opts.payload, "wake_mode");
|
|
160
|
+
const wakeSource = stringField(opts.payload, "wake_source");
|
|
161
|
+
const programId = stringField(opts.payload, "program_id");
|
|
162
|
+
const sourceTsMs = numberField(opts.payload, "source_ts_ms");
|
|
163
|
+
let wakeTurnMode = normalizeWakeTurnMode(fallbackConfig.control_plane.operator.wake_turn_mode);
|
|
164
|
+
let configReadError = null;
|
|
165
|
+
try {
|
|
166
|
+
const config = await loadConfigFromDisk();
|
|
167
|
+
wakeTurnMode = normalizeWakeTurnMode(config.control_plane.operator.wake_turn_mode);
|
|
168
|
+
}
|
|
169
|
+
catch (err) {
|
|
170
|
+
configReadError = describeError(err);
|
|
171
|
+
}
|
|
172
|
+
let decision;
|
|
173
|
+
if (wakeTurnMode === "off") {
|
|
174
|
+
decision = {
|
|
175
|
+
outcome: "skipped",
|
|
176
|
+
reason: "feature_disabled",
|
|
177
|
+
wakeTurnMode,
|
|
178
|
+
selectedWakeMode,
|
|
179
|
+
turnRequestId: null,
|
|
180
|
+
turnResultKind: null,
|
|
181
|
+
error: configReadError,
|
|
182
|
+
};
|
|
183
|
+
}
|
|
184
|
+
else if (wakeTurnMode === "shadow") {
|
|
185
|
+
decision = {
|
|
186
|
+
outcome: "skipped",
|
|
187
|
+
reason: "shadow_mode",
|
|
188
|
+
wakeTurnMode,
|
|
189
|
+
selectedWakeMode,
|
|
190
|
+
turnRequestId: null,
|
|
191
|
+
turnResultKind: null,
|
|
192
|
+
error: configReadError,
|
|
193
|
+
};
|
|
194
|
+
}
|
|
195
|
+
else if (typeof controlPlaneProxy.submitTerminalCommand !== "function") {
|
|
196
|
+
decision = {
|
|
197
|
+
outcome: "fallback",
|
|
198
|
+
reason: "control_plane_unavailable",
|
|
199
|
+
wakeTurnMode,
|
|
200
|
+
selectedWakeMode,
|
|
201
|
+
turnRequestId: null,
|
|
202
|
+
turnResultKind: null,
|
|
203
|
+
error: configReadError,
|
|
204
|
+
};
|
|
205
|
+
}
|
|
206
|
+
else {
|
|
207
|
+
const turnRequestId = `wake-turn-${wakeId}`;
|
|
208
|
+
try {
|
|
209
|
+
const turnResult = await controlPlaneProxy.submitTerminalCommand({
|
|
210
|
+
commandText: buildWakeTurnCommandText({
|
|
211
|
+
wakeId,
|
|
212
|
+
message: opts.message,
|
|
213
|
+
payload: opts.payload,
|
|
214
|
+
}),
|
|
215
|
+
repoRoot: context.repoRoot,
|
|
216
|
+
requestId: turnRequestId,
|
|
217
|
+
});
|
|
218
|
+
if (turnResult.kind === "noop" || turnResult.kind === "invalid") {
|
|
219
|
+
decision = {
|
|
220
|
+
outcome: "fallback",
|
|
221
|
+
reason: `turn_result_${turnResult.kind}`,
|
|
222
|
+
wakeTurnMode,
|
|
223
|
+
selectedWakeMode,
|
|
224
|
+
turnRequestId,
|
|
225
|
+
turnResultKind: turnResult.kind,
|
|
226
|
+
error: configReadError,
|
|
227
|
+
};
|
|
228
|
+
}
|
|
229
|
+
else {
|
|
230
|
+
decision = {
|
|
231
|
+
outcome: "triggered",
|
|
232
|
+
reason: "turn_invoked",
|
|
233
|
+
wakeTurnMode,
|
|
234
|
+
selectedWakeMode,
|
|
235
|
+
turnRequestId,
|
|
236
|
+
turnResultKind: turnResult.kind,
|
|
237
|
+
error: configReadError,
|
|
238
|
+
};
|
|
239
|
+
}
|
|
240
|
+
}
|
|
241
|
+
catch (err) {
|
|
242
|
+
const error = describeError(err);
|
|
243
|
+
decision = {
|
|
244
|
+
outcome: "fallback",
|
|
245
|
+
reason: error === "control_plane_unavailable" ? "control_plane_unavailable" : "turn_execution_failed",
|
|
246
|
+
wakeTurnMode,
|
|
247
|
+
selectedWakeMode,
|
|
248
|
+
turnRequestId,
|
|
249
|
+
turnResultKind: null,
|
|
250
|
+
error,
|
|
251
|
+
};
|
|
252
|
+
}
|
|
253
|
+
}
|
|
254
|
+
await context.eventLog.emit("operator.wake.decision", {
|
|
255
|
+
source: "mu-server.operator-wake",
|
|
256
|
+
payload: {
|
|
257
|
+
wake_id: wakeId,
|
|
258
|
+
dedupe_key: dedupeKey,
|
|
259
|
+
wake_source: wakeSource,
|
|
260
|
+
program_id: programId,
|
|
261
|
+
source_ts_ms: sourceTsMs,
|
|
262
|
+
selected_wake_mode: selectedWakeMode,
|
|
263
|
+
wake_turn_mode: decision.wakeTurnMode,
|
|
264
|
+
wake_turn_feature_enabled: decision.wakeTurnMode === "active",
|
|
265
|
+
outcome: decision.outcome,
|
|
266
|
+
reason: decision.reason,
|
|
267
|
+
turn_request_id: decision.turnRequestId,
|
|
268
|
+
turn_result_kind: decision.turnResultKind,
|
|
269
|
+
error: decision.error,
|
|
270
|
+
},
|
|
271
|
+
});
|
|
272
|
+
let notifyResult = emptyNotifyOperatorsResult();
|
|
273
|
+
let notifyError = null;
|
|
274
|
+
if (typeof controlPlaneProxy.notifyOperators === "function") {
|
|
275
|
+
try {
|
|
276
|
+
notifyResult = await controlPlaneProxy.notifyOperators({
|
|
277
|
+
message: opts.message,
|
|
278
|
+
dedupeKey,
|
|
279
|
+
wake: {
|
|
280
|
+
wakeId,
|
|
281
|
+
wakeSource,
|
|
282
|
+
programId,
|
|
283
|
+
sourceTsMs,
|
|
284
|
+
},
|
|
285
|
+
metadata: {
|
|
286
|
+
wake_delivery_reason: "heartbeat_cron_wake",
|
|
287
|
+
wake_turn_outcome: decision.outcome,
|
|
288
|
+
wake_turn_reason: decision.reason,
|
|
289
|
+
},
|
|
290
|
+
});
|
|
291
|
+
}
|
|
292
|
+
catch (err) {
|
|
293
|
+
notifyError = describeError(err);
|
|
294
|
+
}
|
|
295
|
+
}
|
|
296
|
+
for (const deliveryDecision of notifyResult.decisions) {
|
|
297
|
+
await emitWakeDeliveryEvent({
|
|
298
|
+
state: deliveryDecision.state,
|
|
299
|
+
reason_code: deliveryDecision.reason_code,
|
|
300
|
+
wake_id: wakeId,
|
|
301
|
+
dedupe_key: dedupeKey,
|
|
302
|
+
binding_id: deliveryDecision.binding_id,
|
|
303
|
+
channel: deliveryDecision.channel,
|
|
304
|
+
outbox_id: deliveryDecision.outbox_id,
|
|
305
|
+
outbox_dedupe_key: deliveryDecision.dedupe_key,
|
|
306
|
+
attempt_count: null,
|
|
307
|
+
wake_source: wakeSource,
|
|
308
|
+
program_id: programId,
|
|
309
|
+
source_ts_ms: sourceTsMs,
|
|
310
|
+
});
|
|
311
|
+
}
|
|
83
312
|
await context.eventLog.emit("operator.wake", {
|
|
84
313
|
source: "mu-server.operator-wake",
|
|
85
314
|
payload: {
|
|
@@ -87,28 +316,43 @@ function createServer(options = {}) {
|
|
|
87
316
|
dedupe_key: dedupeKey,
|
|
88
317
|
coalesce_ms: coalesceMs,
|
|
89
318
|
...opts.payload,
|
|
319
|
+
wake_id: wakeId,
|
|
320
|
+
decision_outcome: decision.outcome,
|
|
321
|
+
decision_reason: decision.reason,
|
|
322
|
+
wake_turn_mode: decision.wakeTurnMode,
|
|
323
|
+
selected_wake_mode: decision.selectedWakeMode,
|
|
324
|
+
wake_turn_feature_enabled: decision.wakeTurnMode === "active",
|
|
325
|
+
turn_request_id: decision.turnRequestId,
|
|
326
|
+
turn_result_kind: decision.turnResultKind,
|
|
327
|
+
decision_error: decision.error,
|
|
328
|
+
delivery: {
|
|
329
|
+
queued: notifyResult.queued,
|
|
330
|
+
duplicate: notifyResult.duplicate,
|
|
331
|
+
skipped: notifyResult.skipped,
|
|
332
|
+
},
|
|
333
|
+
delivery_summary_v2: {
|
|
334
|
+
queued: notifyResult.queued,
|
|
335
|
+
duplicate: notifyResult.duplicate,
|
|
336
|
+
skipped: notifyResult.skipped,
|
|
337
|
+
total: notifyResult.decisions.length,
|
|
338
|
+
},
|
|
339
|
+
delivery_error: notifyError,
|
|
90
340
|
},
|
|
91
341
|
});
|
|
92
342
|
return true;
|
|
93
343
|
};
|
|
94
|
-
let controlPlaneCurrent = options.controlPlane ?? null;
|
|
95
|
-
let reloadInFlight = null;
|
|
96
344
|
const generationTelemetry = options.generationTelemetry ?? new GenerationTelemetryRecorder();
|
|
97
|
-
const
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
345
|
+
const loadConfigFromDisk = async () => {
|
|
346
|
+
try {
|
|
347
|
+
return await readConfig(context.repoRoot);
|
|
348
|
+
}
|
|
349
|
+
catch (err) {
|
|
350
|
+
if (err?.code === "ENOENT") {
|
|
351
|
+
return fallbackConfig;
|
|
103
352
|
}
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
generation_id: generation.generation_id,
|
|
108
|
-
generation_seq: generation.generation_seq,
|
|
109
|
-
supervisor: "control_plane",
|
|
110
|
-
component,
|
|
111
|
-
});
|
|
353
|
+
throw err;
|
|
354
|
+
}
|
|
355
|
+
};
|
|
112
356
|
const controlPlaneReloader = options.controlPlaneReloader ??
|
|
113
357
|
(async ({ repoRoot, config, generation }) => {
|
|
114
358
|
return await bootstrapControlPlane({
|
|
@@ -118,75 +362,129 @@ function createServer(options = {}) {
|
|
|
118
362
|
generation,
|
|
119
363
|
telemetry: generationTelemetry,
|
|
120
364
|
sessionLifecycle,
|
|
365
|
+
wakeDeliveryObserver: (event) => {
|
|
366
|
+
void emitWakeDeliveryEvent({
|
|
367
|
+
state: event.state,
|
|
368
|
+
reason_code: event.reason_code,
|
|
369
|
+
wake_id: event.wake_id,
|
|
370
|
+
dedupe_key: event.dedupe_key,
|
|
371
|
+
binding_id: event.binding_id,
|
|
372
|
+
channel: event.channel,
|
|
373
|
+
outbox_id: event.outbox_id,
|
|
374
|
+
outbox_dedupe_key: event.outbox_dedupe_key,
|
|
375
|
+
attempt_count: event.attempt_count,
|
|
376
|
+
});
|
|
377
|
+
},
|
|
121
378
|
terminalEnabled: true,
|
|
122
379
|
});
|
|
123
380
|
});
|
|
381
|
+
const reloadManager = createReloadManager({
|
|
382
|
+
repoRoot: context.repoRoot,
|
|
383
|
+
initialControlPlane: options.controlPlane ?? null,
|
|
384
|
+
controlPlaneReloader,
|
|
385
|
+
generationTelemetry,
|
|
386
|
+
loadConfigFromDisk,
|
|
387
|
+
});
|
|
388
|
+
const applyWakeDeliveryObserver = () => {
|
|
389
|
+
const handle = reloadManager.getControlPlaneCurrent();
|
|
390
|
+
handle?.setWakeDeliveryObserver?.((event) => {
|
|
391
|
+
void emitWakeDeliveryEvent({
|
|
392
|
+
state: event.state,
|
|
393
|
+
reason_code: event.reason_code,
|
|
394
|
+
wake_id: event.wake_id,
|
|
395
|
+
dedupe_key: event.dedupe_key,
|
|
396
|
+
binding_id: event.binding_id,
|
|
397
|
+
channel: event.channel,
|
|
398
|
+
outbox_id: event.outbox_id,
|
|
399
|
+
outbox_dedupe_key: event.outbox_dedupe_key,
|
|
400
|
+
attempt_count: event.attempt_count,
|
|
401
|
+
});
|
|
402
|
+
});
|
|
403
|
+
};
|
|
404
|
+
applyWakeDeliveryObserver();
|
|
405
|
+
const reloadControlPlane = async (reason) => {
|
|
406
|
+
const result = await reloadManager.reloadControlPlane(reason);
|
|
407
|
+
applyWakeDeliveryObserver();
|
|
408
|
+
return result;
|
|
409
|
+
};
|
|
124
410
|
const controlPlaneProxy = {
|
|
125
411
|
get activeAdapters() {
|
|
126
|
-
return
|
|
412
|
+
return reloadManager.getControlPlaneCurrent()?.activeAdapters ?? [];
|
|
127
413
|
},
|
|
128
414
|
async handleWebhook(path, req) {
|
|
129
|
-
const handle =
|
|
415
|
+
const handle = reloadManager.getControlPlaneCurrent();
|
|
130
416
|
if (!handle)
|
|
131
417
|
return null;
|
|
132
418
|
return await handle.handleWebhook(path, req);
|
|
133
419
|
},
|
|
420
|
+
async notifyOperators(opts) {
|
|
421
|
+
const handle = reloadManager.getControlPlaneCurrent();
|
|
422
|
+
if (!handle?.notifyOperators) {
|
|
423
|
+
return emptyNotifyOperatorsResult();
|
|
424
|
+
}
|
|
425
|
+
return await handle.notifyOperators(opts);
|
|
426
|
+
},
|
|
427
|
+
setWakeDeliveryObserver(observer) {
|
|
428
|
+
const handle = reloadManager.getControlPlaneCurrent();
|
|
429
|
+
handle?.setWakeDeliveryObserver?.(observer ?? null);
|
|
430
|
+
},
|
|
134
431
|
async listRuns(opts) {
|
|
135
|
-
const handle =
|
|
432
|
+
const handle = reloadManager.getControlPlaneCurrent();
|
|
136
433
|
if (!handle?.listRuns)
|
|
137
434
|
return [];
|
|
138
435
|
return await handle.listRuns(opts);
|
|
139
436
|
},
|
|
140
437
|
async getRun(idOrRoot) {
|
|
141
|
-
const handle =
|
|
438
|
+
const handle = reloadManager.getControlPlaneCurrent();
|
|
142
439
|
if (!handle?.getRun)
|
|
143
440
|
return null;
|
|
144
441
|
return await handle.getRun(idOrRoot);
|
|
145
442
|
},
|
|
146
443
|
async startRun(opts) {
|
|
147
|
-
const handle =
|
|
444
|
+
const handle = reloadManager.getControlPlaneCurrent();
|
|
148
445
|
if (!handle?.startRun) {
|
|
149
446
|
throw new Error("run_supervisor_unavailable");
|
|
150
447
|
}
|
|
151
448
|
return await handle.startRun(opts);
|
|
152
449
|
},
|
|
153
450
|
async resumeRun(opts) {
|
|
154
|
-
const handle =
|
|
451
|
+
const handle = reloadManager.getControlPlaneCurrent();
|
|
155
452
|
if (!handle?.resumeRun) {
|
|
156
453
|
throw new Error("run_supervisor_unavailable");
|
|
157
454
|
}
|
|
158
455
|
return await handle.resumeRun(opts);
|
|
159
456
|
},
|
|
160
457
|
async interruptRun(opts) {
|
|
161
|
-
const handle =
|
|
458
|
+
const handle = reloadManager.getControlPlaneCurrent();
|
|
162
459
|
if (!handle?.interruptRun) {
|
|
163
460
|
return { ok: false, reason: "not_found", run: null };
|
|
164
461
|
}
|
|
165
462
|
return await handle.interruptRun(opts);
|
|
166
463
|
},
|
|
167
464
|
async heartbeatRun(opts) {
|
|
168
|
-
const handle =
|
|
465
|
+
const handle = reloadManager.getControlPlaneCurrent();
|
|
169
466
|
if (!handle?.heartbeatRun) {
|
|
170
467
|
return { ok: false, reason: "not_found", run: null };
|
|
171
468
|
}
|
|
172
469
|
return await handle.heartbeatRun(opts);
|
|
173
470
|
},
|
|
174
471
|
async traceRun(opts) {
|
|
175
|
-
const handle =
|
|
472
|
+
const handle = reloadManager.getControlPlaneCurrent();
|
|
176
473
|
if (!handle?.traceRun)
|
|
177
474
|
return null;
|
|
178
475
|
return await handle.traceRun(opts);
|
|
179
476
|
},
|
|
180
477
|
async submitTerminalCommand(opts) {
|
|
181
|
-
const handle =
|
|
478
|
+
const handle = reloadManager.getControlPlaneCurrent();
|
|
182
479
|
if (!handle?.submitTerminalCommand) {
|
|
183
480
|
throw new Error("control_plane_unavailable");
|
|
184
481
|
}
|
|
185
482
|
return await handle.submitTerminalCommand(opts);
|
|
186
483
|
},
|
|
187
484
|
async stop() {
|
|
188
|
-
const handle =
|
|
189
|
-
|
|
485
|
+
const handle = reloadManager.getControlPlaneCurrent();
|
|
486
|
+
handle?.setWakeDeliveryObserver?.(null);
|
|
487
|
+
reloadManager.setControlPlaneCurrent(null);
|
|
190
488
|
await handle?.stop();
|
|
191
489
|
},
|
|
192
490
|
};
|
|
@@ -199,490 +497,6 @@ function createServer(options = {}) {
|
|
|
199
497
|
autoRunHeartbeatEveryMs,
|
|
200
498
|
emitOperatorWake,
|
|
201
499
|
});
|
|
202
|
-
const loadConfigFromDisk = async () => {
|
|
203
|
-
try {
|
|
204
|
-
return await readConfig(context.repoRoot);
|
|
205
|
-
}
|
|
206
|
-
catch (err) {
|
|
207
|
-
if (err?.code === "ENOENT") {
|
|
208
|
-
return fallbackConfig;
|
|
209
|
-
}
|
|
210
|
-
throw err;
|
|
211
|
-
}
|
|
212
|
-
};
|
|
213
|
-
const performControlPlaneReload = async (reason) => {
|
|
214
|
-
const startedAtMs = Date.now();
|
|
215
|
-
const planned = generationSupervisor.beginReload(reason);
|
|
216
|
-
const attempt = planned.attempt;
|
|
217
|
-
const previous = controlPlaneCurrent;
|
|
218
|
-
const previousSummary = summarizeControlPlane(previous);
|
|
219
|
-
const tags = generationTagsFor(attempt.to_generation, "server.reload");
|
|
220
|
-
const baseFields = {
|
|
221
|
-
reason,
|
|
222
|
-
attempt_id: attempt.attempt_id,
|
|
223
|
-
coalesced: planned.coalesced,
|
|
224
|
-
from_generation_id: attempt.from_generation?.generation_id ?? null,
|
|
225
|
-
};
|
|
226
|
-
const logLifecycle = (opts) => {
|
|
227
|
-
generationTelemetry.log({
|
|
228
|
-
level: opts.level,
|
|
229
|
-
message: `reload transition ${opts.stage}:${opts.state}`,
|
|
230
|
-
fields: {
|
|
231
|
-
...tags,
|
|
232
|
-
...baseFields,
|
|
233
|
-
...(opts.extra ?? {}),
|
|
234
|
-
},
|
|
235
|
-
});
|
|
236
|
-
};
|
|
237
|
-
let swapped = false;
|
|
238
|
-
let failedStage = "warmup";
|
|
239
|
-
let drainDurationMs = 0;
|
|
240
|
-
let drainStartedAtMs = null;
|
|
241
|
-
let nextHandle = null;
|
|
242
|
-
try {
|
|
243
|
-
logLifecycle({ level: "info", stage: "warmup", state: "start" });
|
|
244
|
-
const latestConfig = await loadConfigFromDisk();
|
|
245
|
-
const telegramGeneration = (await previous?.reloadTelegramGeneration?.({
|
|
246
|
-
config: latestConfig.control_plane,
|
|
247
|
-
reason,
|
|
248
|
-
})) ?? null;
|
|
249
|
-
if (telegramGeneration?.handled) {
|
|
250
|
-
if (telegramGeneration.warmup) {
|
|
251
|
-
logLifecycle({
|
|
252
|
-
level: telegramGeneration.warmup.ok ? "info" : "error",
|
|
253
|
-
stage: "warmup",
|
|
254
|
-
state: telegramGeneration.warmup.ok ? "complete" : "failed",
|
|
255
|
-
extra: {
|
|
256
|
-
warmup_elapsed_ms: telegramGeneration.warmup.elapsed_ms,
|
|
257
|
-
error: telegramGeneration.warmup.error,
|
|
258
|
-
telegram_generation_id: telegramGeneration.to_generation?.generation_id ?? null,
|
|
259
|
-
},
|
|
260
|
-
});
|
|
261
|
-
}
|
|
262
|
-
else {
|
|
263
|
-
logLifecycle({
|
|
264
|
-
level: "info",
|
|
265
|
-
stage: "warmup",
|
|
266
|
-
state: "skipped",
|
|
267
|
-
extra: {
|
|
268
|
-
warmup_reason: "telegram_generation_no_warmup",
|
|
269
|
-
telegram_generation_id: telegramGeneration.to_generation?.generation_id ?? null,
|
|
270
|
-
},
|
|
271
|
-
});
|
|
272
|
-
}
|
|
273
|
-
if (telegramGeneration.cutover) {
|
|
274
|
-
logLifecycle({ level: "info", stage: "cutover", state: "start" });
|
|
275
|
-
logLifecycle({
|
|
276
|
-
level: telegramGeneration.cutover.ok ? "info" : "error",
|
|
277
|
-
stage: "cutover",
|
|
278
|
-
state: telegramGeneration.cutover.ok ? "complete" : "failed",
|
|
279
|
-
extra: {
|
|
280
|
-
cutover_elapsed_ms: telegramGeneration.cutover.elapsed_ms,
|
|
281
|
-
error: telegramGeneration.cutover.error,
|
|
282
|
-
active_generation_id: telegramGeneration.active_generation?.generation_id ?? null,
|
|
283
|
-
},
|
|
284
|
-
});
|
|
285
|
-
}
|
|
286
|
-
else {
|
|
287
|
-
logLifecycle({
|
|
288
|
-
level: "info",
|
|
289
|
-
stage: "cutover",
|
|
290
|
-
state: "skipped",
|
|
291
|
-
extra: {
|
|
292
|
-
cutover_reason: "telegram_generation_no_cutover",
|
|
293
|
-
active_generation_id: telegramGeneration.active_generation?.generation_id ?? null,
|
|
294
|
-
},
|
|
295
|
-
});
|
|
296
|
-
}
|
|
297
|
-
if (telegramGeneration.drain) {
|
|
298
|
-
logLifecycle({ level: "info", stage: "drain", state: "start" });
|
|
299
|
-
drainDurationMs = Math.max(0, Math.trunc(telegramGeneration.drain.elapsed_ms));
|
|
300
|
-
generationTelemetry.recordDrainDuration(tags, {
|
|
301
|
-
durationMs: drainDurationMs,
|
|
302
|
-
timedOut: telegramGeneration.drain.timed_out,
|
|
303
|
-
metadata: {
|
|
304
|
-
...baseFields,
|
|
305
|
-
telegram_forced_stop: telegramGeneration.drain.forced_stop,
|
|
306
|
-
telegram_generation_id: telegramGeneration.active_generation?.generation_id ?? null,
|
|
307
|
-
},
|
|
308
|
-
});
|
|
309
|
-
logLifecycle({
|
|
310
|
-
level: telegramGeneration.drain.ok ? "info" : "warn",
|
|
311
|
-
stage: "drain",
|
|
312
|
-
state: telegramGeneration.drain.ok ? "complete" : "failed",
|
|
313
|
-
extra: {
|
|
314
|
-
drain_duration_ms: telegramGeneration.drain.elapsed_ms,
|
|
315
|
-
drain_timed_out: telegramGeneration.drain.timed_out,
|
|
316
|
-
forced_stop: telegramGeneration.drain.forced_stop,
|
|
317
|
-
error: telegramGeneration.drain.error,
|
|
318
|
-
},
|
|
319
|
-
});
|
|
320
|
-
}
|
|
321
|
-
else {
|
|
322
|
-
logLifecycle({
|
|
323
|
-
level: "info",
|
|
324
|
-
stage: "drain",
|
|
325
|
-
state: "skipped",
|
|
326
|
-
extra: {
|
|
327
|
-
drain_reason: "telegram_generation_no_drain",
|
|
328
|
-
telegram_generation_id: telegramGeneration.active_generation?.generation_id ?? null,
|
|
329
|
-
},
|
|
330
|
-
});
|
|
331
|
-
}
|
|
332
|
-
const shouldLogRollbackStart = telegramGeneration.rollback.requested ||
|
|
333
|
-
telegramGeneration.rollback.attempted ||
|
|
334
|
-
telegramGeneration.rollback.trigger != null ||
|
|
335
|
-
!telegramGeneration.ok;
|
|
336
|
-
if (shouldLogRollbackStart) {
|
|
337
|
-
logLifecycle({
|
|
338
|
-
level: telegramGeneration.rollback.ok ? "warn" : "error",
|
|
339
|
-
stage: "rollback",
|
|
340
|
-
state: "start",
|
|
341
|
-
extra: {
|
|
342
|
-
rollback_requested: telegramGeneration.rollback.requested,
|
|
343
|
-
rollback_trigger: telegramGeneration.rollback.trigger,
|
|
344
|
-
rollback_attempted: telegramGeneration.rollback.attempted,
|
|
345
|
-
},
|
|
346
|
-
});
|
|
347
|
-
logLifecycle({
|
|
348
|
-
level: telegramGeneration.rollback.ok ? "info" : "error",
|
|
349
|
-
stage: "rollback",
|
|
350
|
-
state: telegramGeneration.rollback.ok ? "complete" : "failed",
|
|
351
|
-
extra: {
|
|
352
|
-
rollback_requested: telegramGeneration.rollback.requested,
|
|
353
|
-
rollback_trigger: telegramGeneration.rollback.trigger,
|
|
354
|
-
rollback_attempted: telegramGeneration.rollback.attempted,
|
|
355
|
-
error: telegramGeneration.rollback.error,
|
|
356
|
-
},
|
|
357
|
-
});
|
|
358
|
-
}
|
|
359
|
-
else {
|
|
360
|
-
logLifecycle({
|
|
361
|
-
level: "debug",
|
|
362
|
-
stage: "rollback",
|
|
363
|
-
state: "skipped",
|
|
364
|
-
extra: {
|
|
365
|
-
rollback_reason: "not_requested",
|
|
366
|
-
},
|
|
367
|
-
});
|
|
368
|
-
}
|
|
369
|
-
if (telegramGeneration.ok) {
|
|
370
|
-
swapped = generationSupervisor.markSwapInstalled(attempt.attempt_id);
|
|
371
|
-
generationSupervisor.finishReload(attempt.attempt_id, "success");
|
|
372
|
-
const elapsedMs = Math.max(0, Date.now() - startedAtMs);
|
|
373
|
-
generationTelemetry.recordReloadSuccess(tags, {
|
|
374
|
-
...baseFields,
|
|
375
|
-
elapsed_ms: elapsedMs,
|
|
376
|
-
drain_duration_ms: drainDurationMs,
|
|
377
|
-
telegram_generation_id: telegramGeneration.active_generation?.generation_id ?? null,
|
|
378
|
-
telegram_rollback_attempted: telegramGeneration.rollback.attempted,
|
|
379
|
-
telegram_rollback_trigger: telegramGeneration.rollback.trigger,
|
|
380
|
-
});
|
|
381
|
-
generationTelemetry.trace({
|
|
382
|
-
name: "control_plane.reload",
|
|
383
|
-
status: "ok",
|
|
384
|
-
durationMs: elapsedMs,
|
|
385
|
-
fields: {
|
|
386
|
-
...tags,
|
|
387
|
-
...baseFields,
|
|
388
|
-
telegram_generation_id: telegramGeneration.active_generation?.generation_id ?? null,
|
|
389
|
-
},
|
|
390
|
-
});
|
|
391
|
-
return {
|
|
392
|
-
ok: true,
|
|
393
|
-
reason,
|
|
394
|
-
previous_control_plane: previousSummary,
|
|
395
|
-
control_plane: summarizeControlPlane(controlPlaneCurrent),
|
|
396
|
-
generation: {
|
|
397
|
-
attempt_id: attempt.attempt_id,
|
|
398
|
-
coalesced: planned.coalesced,
|
|
399
|
-
from_generation: attempt.from_generation,
|
|
400
|
-
to_generation: attempt.to_generation,
|
|
401
|
-
active_generation: generationSupervisor.activeGeneration(),
|
|
402
|
-
outcome: "success",
|
|
403
|
-
},
|
|
404
|
-
telegram_generation: telegramGeneration,
|
|
405
|
-
};
|
|
406
|
-
}
|
|
407
|
-
generationSupervisor.finishReload(attempt.attempt_id, "failure");
|
|
408
|
-
const error = telegramGeneration.error ?? "telegram_generation_reload_failed";
|
|
409
|
-
const elapsedMs = Math.max(0, Date.now() - startedAtMs);
|
|
410
|
-
generationTelemetry.recordReloadFailure(tags, {
|
|
411
|
-
...baseFields,
|
|
412
|
-
elapsed_ms: elapsedMs,
|
|
413
|
-
drain_duration_ms: drainDurationMs,
|
|
414
|
-
error,
|
|
415
|
-
telegram_generation_id: telegramGeneration.active_generation?.generation_id ?? null,
|
|
416
|
-
telegram_rollback_trigger: telegramGeneration.rollback.trigger,
|
|
417
|
-
});
|
|
418
|
-
generationTelemetry.trace({
|
|
419
|
-
name: "control_plane.reload",
|
|
420
|
-
status: "error",
|
|
421
|
-
durationMs: elapsedMs,
|
|
422
|
-
fields: {
|
|
423
|
-
...tags,
|
|
424
|
-
...baseFields,
|
|
425
|
-
error,
|
|
426
|
-
telegram_generation_id: telegramGeneration.active_generation?.generation_id ?? null,
|
|
427
|
-
telegram_rollback_trigger: telegramGeneration.rollback.trigger,
|
|
428
|
-
},
|
|
429
|
-
});
|
|
430
|
-
return {
|
|
431
|
-
ok: false,
|
|
432
|
-
reason,
|
|
433
|
-
previous_control_plane: previousSummary,
|
|
434
|
-
control_plane: summarizeControlPlane(controlPlaneCurrent),
|
|
435
|
-
generation: {
|
|
436
|
-
attempt_id: attempt.attempt_id,
|
|
437
|
-
coalesced: planned.coalesced,
|
|
438
|
-
from_generation: attempt.from_generation,
|
|
439
|
-
to_generation: attempt.to_generation,
|
|
440
|
-
active_generation: generationSupervisor.activeGeneration(),
|
|
441
|
-
outcome: "failure",
|
|
442
|
-
},
|
|
443
|
-
telegram_generation: telegramGeneration,
|
|
444
|
-
error,
|
|
445
|
-
};
|
|
446
|
-
}
|
|
447
|
-
const next = await controlPlaneReloader({
|
|
448
|
-
repoRoot: context.repoRoot,
|
|
449
|
-
previous,
|
|
450
|
-
config: latestConfig.control_plane,
|
|
451
|
-
generation: attempt.to_generation,
|
|
452
|
-
});
|
|
453
|
-
nextHandle = next;
|
|
454
|
-
logLifecycle({ level: "info", stage: "warmup", state: "complete" });
|
|
455
|
-
failedStage = "cutover";
|
|
456
|
-
logLifecycle({ level: "info", stage: "cutover", state: "start" });
|
|
457
|
-
controlPlaneCurrent = next;
|
|
458
|
-
swapped = generationSupervisor.markSwapInstalled(attempt.attempt_id);
|
|
459
|
-
logLifecycle({
|
|
460
|
-
level: "info",
|
|
461
|
-
stage: "cutover",
|
|
462
|
-
state: "complete",
|
|
463
|
-
extra: {
|
|
464
|
-
active_generation_id: generationSupervisor.activeGeneration()?.generation_id ?? null,
|
|
465
|
-
},
|
|
466
|
-
});
|
|
467
|
-
failedStage = "drain";
|
|
468
|
-
if (previous && previous !== next) {
|
|
469
|
-
logLifecycle({ level: "info", stage: "drain", state: "start" });
|
|
470
|
-
drainStartedAtMs = Date.now();
|
|
471
|
-
await previous.stop();
|
|
472
|
-
drainDurationMs = Math.max(0, Date.now() - drainStartedAtMs);
|
|
473
|
-
generationTelemetry.recordDrainDuration(tags, {
|
|
474
|
-
durationMs: drainDurationMs,
|
|
475
|
-
metadata: {
|
|
476
|
-
...baseFields,
|
|
477
|
-
},
|
|
478
|
-
});
|
|
479
|
-
logLifecycle({
|
|
480
|
-
level: "info",
|
|
481
|
-
stage: "drain",
|
|
482
|
-
state: "complete",
|
|
483
|
-
extra: {
|
|
484
|
-
drain_duration_ms: drainDurationMs,
|
|
485
|
-
},
|
|
486
|
-
});
|
|
487
|
-
}
|
|
488
|
-
else {
|
|
489
|
-
logLifecycle({
|
|
490
|
-
level: "info",
|
|
491
|
-
stage: "drain",
|
|
492
|
-
state: "skipped",
|
|
493
|
-
extra: {
|
|
494
|
-
drain_reason: "no_previous_generation",
|
|
495
|
-
},
|
|
496
|
-
});
|
|
497
|
-
}
|
|
498
|
-
logLifecycle({
|
|
499
|
-
level: "debug",
|
|
500
|
-
stage: "rollback",
|
|
501
|
-
state: "skipped",
|
|
502
|
-
extra: {
|
|
503
|
-
rollback_reason: "not_requested",
|
|
504
|
-
},
|
|
505
|
-
});
|
|
506
|
-
generationSupervisor.finishReload(attempt.attempt_id, "success");
|
|
507
|
-
const elapsedMs = Math.max(0, Date.now() - startedAtMs);
|
|
508
|
-
generationTelemetry.recordReloadSuccess(tags, {
|
|
509
|
-
...baseFields,
|
|
510
|
-
elapsed_ms: elapsedMs,
|
|
511
|
-
drain_duration_ms: drainDurationMs,
|
|
512
|
-
});
|
|
513
|
-
generationTelemetry.trace({
|
|
514
|
-
name: "control_plane.reload",
|
|
515
|
-
status: "ok",
|
|
516
|
-
durationMs: elapsedMs,
|
|
517
|
-
fields: {
|
|
518
|
-
...tags,
|
|
519
|
-
...baseFields,
|
|
520
|
-
},
|
|
521
|
-
});
|
|
522
|
-
return {
|
|
523
|
-
ok: true,
|
|
524
|
-
reason,
|
|
525
|
-
previous_control_plane: previousSummary,
|
|
526
|
-
control_plane: summarizeControlPlane(next),
|
|
527
|
-
generation: {
|
|
528
|
-
attempt_id: attempt.attempt_id,
|
|
529
|
-
coalesced: planned.coalesced,
|
|
530
|
-
from_generation: attempt.from_generation,
|
|
531
|
-
to_generation: attempt.to_generation,
|
|
532
|
-
active_generation: generationSupervisor.activeGeneration(),
|
|
533
|
-
outcome: "success",
|
|
534
|
-
},
|
|
535
|
-
};
|
|
536
|
-
}
|
|
537
|
-
catch (err) {
|
|
538
|
-
const error = describeError(err);
|
|
539
|
-
if (failedStage === "drain" && drainStartedAtMs != null) {
|
|
540
|
-
drainDurationMs = Math.max(0, Date.now() - drainStartedAtMs);
|
|
541
|
-
generationTelemetry.recordDrainDuration(tags, {
|
|
542
|
-
durationMs: drainDurationMs,
|
|
543
|
-
metadata: {
|
|
544
|
-
...baseFields,
|
|
545
|
-
error,
|
|
546
|
-
},
|
|
547
|
-
});
|
|
548
|
-
}
|
|
549
|
-
logLifecycle({
|
|
550
|
-
level: "error",
|
|
551
|
-
stage: failedStage,
|
|
552
|
-
state: "failed",
|
|
553
|
-
extra: {
|
|
554
|
-
error,
|
|
555
|
-
drain_duration_ms: failedStage === "drain" ? drainDurationMs : undefined,
|
|
556
|
-
},
|
|
557
|
-
});
|
|
558
|
-
if (swapped) {
|
|
559
|
-
logLifecycle({
|
|
560
|
-
level: "warn",
|
|
561
|
-
stage: "rollback",
|
|
562
|
-
state: "start",
|
|
563
|
-
extra: {
|
|
564
|
-
rollback_reason: "reload_failed_after_cutover",
|
|
565
|
-
rollback_target_generation_id: attempt.from_generation?.generation_id ?? null,
|
|
566
|
-
rollback_source_generation_id: attempt.to_generation.generation_id,
|
|
567
|
-
},
|
|
568
|
-
});
|
|
569
|
-
if (!previous) {
|
|
570
|
-
logLifecycle({
|
|
571
|
-
level: "error",
|
|
572
|
-
stage: "rollback",
|
|
573
|
-
state: "failed",
|
|
574
|
-
extra: {
|
|
575
|
-
rollback_reason: "no_previous_generation",
|
|
576
|
-
rollback_source_generation_id: attempt.to_generation.generation_id,
|
|
577
|
-
},
|
|
578
|
-
});
|
|
579
|
-
}
|
|
580
|
-
else {
|
|
581
|
-
try {
|
|
582
|
-
const restored = generationSupervisor.rollbackSwapInstalled(attempt.attempt_id);
|
|
583
|
-
if (!restored) {
|
|
584
|
-
throw new Error("generation_rollback_state_mismatch");
|
|
585
|
-
}
|
|
586
|
-
controlPlaneCurrent = previous;
|
|
587
|
-
if (nextHandle && nextHandle !== previous) {
|
|
588
|
-
await nextHandle.stop();
|
|
589
|
-
}
|
|
590
|
-
logLifecycle({
|
|
591
|
-
level: "info",
|
|
592
|
-
stage: "rollback",
|
|
593
|
-
state: "complete",
|
|
594
|
-
extra: {
|
|
595
|
-
active_generation_id: generationSupervisor.activeGeneration()?.generation_id ?? null,
|
|
596
|
-
rollback_target_generation_id: attempt.from_generation?.generation_id ?? null,
|
|
597
|
-
},
|
|
598
|
-
});
|
|
599
|
-
}
|
|
600
|
-
catch (rollbackErr) {
|
|
601
|
-
logLifecycle({
|
|
602
|
-
level: "error",
|
|
603
|
-
stage: "rollback",
|
|
604
|
-
state: "failed",
|
|
605
|
-
extra: {
|
|
606
|
-
error: describeError(rollbackErr),
|
|
607
|
-
active_generation_id: generationSupervisor.activeGeneration()?.generation_id ?? null,
|
|
608
|
-
rollback_target_generation_id: attempt.from_generation?.generation_id ?? null,
|
|
609
|
-
rollback_source_generation_id: attempt.to_generation.generation_id,
|
|
610
|
-
},
|
|
611
|
-
});
|
|
612
|
-
}
|
|
613
|
-
}
|
|
614
|
-
}
|
|
615
|
-
else {
|
|
616
|
-
logLifecycle({
|
|
617
|
-
level: "debug",
|
|
618
|
-
stage: "rollback",
|
|
619
|
-
state: "skipped",
|
|
620
|
-
extra: {
|
|
621
|
-
rollback_reason: "cutover_not_installed",
|
|
622
|
-
},
|
|
623
|
-
});
|
|
624
|
-
}
|
|
625
|
-
generationSupervisor.finishReload(attempt.attempt_id, "failure");
|
|
626
|
-
const elapsedMs = Math.max(0, Date.now() - startedAtMs);
|
|
627
|
-
generationTelemetry.recordReloadFailure(tags, {
|
|
628
|
-
...baseFields,
|
|
629
|
-
elapsed_ms: elapsedMs,
|
|
630
|
-
drain_duration_ms: drainDurationMs,
|
|
631
|
-
error,
|
|
632
|
-
});
|
|
633
|
-
generationTelemetry.trace({
|
|
634
|
-
name: "control_plane.reload",
|
|
635
|
-
status: "error",
|
|
636
|
-
durationMs: elapsedMs,
|
|
637
|
-
fields: {
|
|
638
|
-
...tags,
|
|
639
|
-
...baseFields,
|
|
640
|
-
error,
|
|
641
|
-
},
|
|
642
|
-
});
|
|
643
|
-
return {
|
|
644
|
-
ok: false,
|
|
645
|
-
reason,
|
|
646
|
-
previous_control_plane: previousSummary,
|
|
647
|
-
control_plane: summarizeControlPlane(controlPlaneCurrent),
|
|
648
|
-
generation: {
|
|
649
|
-
attempt_id: attempt.attempt_id,
|
|
650
|
-
coalesced: planned.coalesced,
|
|
651
|
-
from_generation: attempt.from_generation,
|
|
652
|
-
to_generation: attempt.to_generation,
|
|
653
|
-
active_generation: generationSupervisor.activeGeneration(),
|
|
654
|
-
outcome: "failure",
|
|
655
|
-
},
|
|
656
|
-
error,
|
|
657
|
-
};
|
|
658
|
-
}
|
|
659
|
-
};
|
|
660
|
-
const reloadControlPlane = async (reason) => {
|
|
661
|
-
if (reloadInFlight) {
|
|
662
|
-
const pending = generationSupervisor.pendingReload();
|
|
663
|
-
const fallbackGeneration = generationSupervisor.activeGeneration() ??
|
|
664
|
-
generationSupervisor.snapshot().last_reload?.to_generation ??
|
|
665
|
-
null;
|
|
666
|
-
const generation = pending?.to_generation ?? fallbackGeneration;
|
|
667
|
-
if (generation) {
|
|
668
|
-
generationTelemetry.recordDuplicateSignal(generationTagsFor(generation, "server.reload"), {
|
|
669
|
-
source: "server_reload",
|
|
670
|
-
signal: "coalesced_reload_request",
|
|
671
|
-
dedupe_key: pending?.attempt_id ?? "reload_in_flight",
|
|
672
|
-
record_id: pending?.attempt_id ?? "reload_in_flight",
|
|
673
|
-
metadata: {
|
|
674
|
-
reason,
|
|
675
|
-
pending_reason: pending?.reason ?? null,
|
|
676
|
-
},
|
|
677
|
-
});
|
|
678
|
-
}
|
|
679
|
-
return await reloadInFlight;
|
|
680
|
-
}
|
|
681
|
-
reloadInFlight = performControlPlaneReload(reason).finally(() => {
|
|
682
|
-
reloadInFlight = null;
|
|
683
|
-
});
|
|
684
|
-
return await reloadInFlight;
|
|
685
|
-
};
|
|
686
500
|
const handleRequest = createServerRequestHandler({
|
|
687
501
|
context,
|
|
688
502
|
controlPlaneProxy,
|
|
@@ -692,16 +506,11 @@ function createServer(options = {}) {
|
|
|
692
506
|
loadConfigFromDisk,
|
|
693
507
|
writeConfig,
|
|
694
508
|
reloadControlPlane,
|
|
695
|
-
getControlPlaneStatus:
|
|
696
|
-
...summarizeControlPlane(controlPlaneCurrent),
|
|
697
|
-
generation: generationSupervisor.snapshot(),
|
|
698
|
-
observability: {
|
|
699
|
-
counters: generationTelemetry.counters(),
|
|
700
|
-
},
|
|
701
|
-
}),
|
|
509
|
+
getControlPlaneStatus: reloadManager.getControlPlaneStatus,
|
|
702
510
|
registerAutoRunHeartbeatProgram,
|
|
703
511
|
disableAutoRunHeartbeatProgram,
|
|
704
512
|
describeError,
|
|
513
|
+
initiateShutdown: options.initiateShutdown,
|
|
705
514
|
});
|
|
706
515
|
const server = {
|
|
707
516
|
port: options.port || 3000,
|