@femtomc/mu-server 26.2.55 → 26.2.56
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +33 -4
- package/dist/control_plane.d.ts +61 -2
- package/dist/control_plane.js +621 -37
- package/dist/generation_supervisor.d.ts +21 -0
- package/dist/generation_supervisor.js +107 -0
- package/dist/server.d.ts +5 -2
- package/dist/server.js +495 -23
- package/package.json +1 -1
package/dist/server.js
CHANGED
|
@@ -1,16 +1,17 @@
|
|
|
1
1
|
import { extname, join, resolve } from "node:path";
|
|
2
|
+
import { GenerationTelemetryRecorder, getControlPlanePaths, IdentityStore, ROLE_SCOPES, } from "@femtomc/mu-control-plane";
|
|
2
3
|
import { currentRunId, EventLog, FsJsonlStore, getStorePaths, JsonlEventSink } from "@femtomc/mu-core/node";
|
|
3
4
|
import { ForumStore } from "@femtomc/mu-forum";
|
|
4
5
|
import { IssueStore } from "@femtomc/mu-issue";
|
|
5
|
-
import {
|
|
6
|
+
import { ControlPlaneActivitySupervisor } from "./activity_supervisor.js";
|
|
6
7
|
import { eventRoutes } from "./api/events.js";
|
|
7
8
|
import { forumRoutes } from "./api/forum.js";
|
|
8
9
|
import { issueRoutes } from "./api/issues.js";
|
|
9
10
|
import { applyMuConfigPatch, DEFAULT_MU_CONFIG, getMuConfigPath, muConfigPresence, readMuConfigFile, redactMuConfigSecrets, writeMuConfigFile, } from "./config.js";
|
|
10
|
-
import {
|
|
11
|
-
import {
|
|
12
|
-
import { ActivityHeartbeatScheduler } from "./heartbeat_scheduler.js";
|
|
11
|
+
import { bootstrapControlPlane, } from "./control_plane.js";
|
|
12
|
+
import { ControlPlaneGenerationSupervisor } from "./generation_supervisor.js";
|
|
13
13
|
import { HeartbeatProgramRegistry } from "./heartbeat_programs.js";
|
|
14
|
+
import { ActivityHeartbeatScheduler } from "./heartbeat_scheduler.js";
|
|
14
15
|
const MIME_TYPES = {
|
|
15
16
|
".html": "text/html; charset=utf-8",
|
|
16
17
|
".js": "text/javascript; charset=utf-8",
|
|
@@ -77,9 +78,31 @@ export function createServer(options = {}) {
|
|
|
77
78
|
});
|
|
78
79
|
let controlPlaneCurrent = options.controlPlane ?? null;
|
|
79
80
|
let reloadInFlight = null;
|
|
81
|
+
const generationTelemetry = options.generationTelemetry ?? new GenerationTelemetryRecorder();
|
|
82
|
+
const generationSupervisor = new ControlPlaneGenerationSupervisor({
|
|
83
|
+
supervisorId: "control-plane",
|
|
84
|
+
initialGeneration: controlPlaneCurrent
|
|
85
|
+
? {
|
|
86
|
+
generation_id: "control-plane-gen-0",
|
|
87
|
+
generation_seq: 0,
|
|
88
|
+
}
|
|
89
|
+
: null,
|
|
90
|
+
});
|
|
91
|
+
const generationTagsFor = (generation, component) => ({
|
|
92
|
+
generation_id: generation.generation_id,
|
|
93
|
+
generation_seq: generation.generation_seq,
|
|
94
|
+
supervisor: "control_plane",
|
|
95
|
+
component,
|
|
96
|
+
});
|
|
80
97
|
const controlPlaneReloader = options.controlPlaneReloader ??
|
|
81
|
-
(async ({ repoRoot, config }) => {
|
|
82
|
-
return await bootstrapControlPlane({
|
|
98
|
+
(async ({ repoRoot, config, generation }) => {
|
|
99
|
+
return await bootstrapControlPlane({
|
|
100
|
+
repoRoot,
|
|
101
|
+
config,
|
|
102
|
+
heartbeatScheduler,
|
|
103
|
+
generation,
|
|
104
|
+
telemetry: generationTelemetry,
|
|
105
|
+
});
|
|
83
106
|
});
|
|
84
107
|
const controlPlaneProxy = {
|
|
85
108
|
get activeAdapters() {
|
|
@@ -185,38 +208,471 @@ export function createServer(options = {}) {
|
|
|
185
208
|
}
|
|
186
209
|
};
|
|
187
210
|
const performControlPlaneReload = async (reason) => {
|
|
211
|
+
const startedAtMs = Date.now();
|
|
212
|
+
const planned = generationSupervisor.beginReload(reason);
|
|
213
|
+
const attempt = planned.attempt;
|
|
188
214
|
const previous = controlPlaneCurrent;
|
|
189
215
|
const previousSummary = summarizeControlPlane(previous);
|
|
216
|
+
const tags = generationTagsFor(attempt.to_generation, "server.reload");
|
|
217
|
+
const baseFields = {
|
|
218
|
+
reason,
|
|
219
|
+
attempt_id: attempt.attempt_id,
|
|
220
|
+
coalesced: planned.coalesced,
|
|
221
|
+
from_generation_id: attempt.from_generation?.generation_id ?? null,
|
|
222
|
+
};
|
|
223
|
+
const logLifecycle = (opts) => {
|
|
224
|
+
generationTelemetry.log({
|
|
225
|
+
level: opts.level,
|
|
226
|
+
message: `reload transition ${opts.stage}:${opts.state}`,
|
|
227
|
+
fields: {
|
|
228
|
+
...tags,
|
|
229
|
+
...baseFields,
|
|
230
|
+
...(opts.extra ?? {}),
|
|
231
|
+
},
|
|
232
|
+
});
|
|
233
|
+
};
|
|
234
|
+
let swapped = false;
|
|
235
|
+
let failedStage = "warmup";
|
|
236
|
+
let drainDurationMs = 0;
|
|
237
|
+
let drainStartedAtMs = null;
|
|
238
|
+
let nextHandle = null;
|
|
190
239
|
try {
|
|
240
|
+
logLifecycle({ level: "info", stage: "warmup", state: "start" });
|
|
191
241
|
const latestConfig = await loadConfigFromDisk();
|
|
242
|
+
const telegramGeneration = (await previous?.reloadTelegramGeneration?.({
|
|
243
|
+
config: latestConfig.control_plane,
|
|
244
|
+
reason,
|
|
245
|
+
})) ?? null;
|
|
246
|
+
if (telegramGeneration?.handled) {
|
|
247
|
+
if (telegramGeneration.warmup) {
|
|
248
|
+
logLifecycle({
|
|
249
|
+
level: telegramGeneration.warmup.ok ? "info" : "error",
|
|
250
|
+
stage: "warmup",
|
|
251
|
+
state: telegramGeneration.warmup.ok ? "complete" : "failed",
|
|
252
|
+
extra: {
|
|
253
|
+
warmup_elapsed_ms: telegramGeneration.warmup.elapsed_ms,
|
|
254
|
+
error: telegramGeneration.warmup.error,
|
|
255
|
+
telegram_generation_id: telegramGeneration.to_generation?.generation_id ?? null,
|
|
256
|
+
},
|
|
257
|
+
});
|
|
258
|
+
}
|
|
259
|
+
else {
|
|
260
|
+
logLifecycle({
|
|
261
|
+
level: "info",
|
|
262
|
+
stage: "warmup",
|
|
263
|
+
state: "skipped",
|
|
264
|
+
extra: {
|
|
265
|
+
warmup_reason: "telegram_generation_no_warmup",
|
|
266
|
+
telegram_generation_id: telegramGeneration.to_generation?.generation_id ?? null,
|
|
267
|
+
},
|
|
268
|
+
});
|
|
269
|
+
}
|
|
270
|
+
if (telegramGeneration.cutover) {
|
|
271
|
+
logLifecycle({ level: "info", stage: "cutover", state: "start" });
|
|
272
|
+
logLifecycle({
|
|
273
|
+
level: telegramGeneration.cutover.ok ? "info" : "error",
|
|
274
|
+
stage: "cutover",
|
|
275
|
+
state: telegramGeneration.cutover.ok ? "complete" : "failed",
|
|
276
|
+
extra: {
|
|
277
|
+
cutover_elapsed_ms: telegramGeneration.cutover.elapsed_ms,
|
|
278
|
+
error: telegramGeneration.cutover.error,
|
|
279
|
+
active_generation_id: telegramGeneration.active_generation?.generation_id ?? null,
|
|
280
|
+
},
|
|
281
|
+
});
|
|
282
|
+
}
|
|
283
|
+
else {
|
|
284
|
+
logLifecycle({
|
|
285
|
+
level: "info",
|
|
286
|
+
stage: "cutover",
|
|
287
|
+
state: "skipped",
|
|
288
|
+
extra: {
|
|
289
|
+
cutover_reason: "telegram_generation_no_cutover",
|
|
290
|
+
active_generation_id: telegramGeneration.active_generation?.generation_id ?? null,
|
|
291
|
+
},
|
|
292
|
+
});
|
|
293
|
+
}
|
|
294
|
+
if (telegramGeneration.drain) {
|
|
295
|
+
logLifecycle({ level: "info", stage: "drain", state: "start" });
|
|
296
|
+
drainDurationMs = Math.max(0, Math.trunc(telegramGeneration.drain.elapsed_ms));
|
|
297
|
+
generationTelemetry.recordDrainDuration(tags, {
|
|
298
|
+
durationMs: drainDurationMs,
|
|
299
|
+
timedOut: telegramGeneration.drain.timed_out,
|
|
300
|
+
metadata: {
|
|
301
|
+
...baseFields,
|
|
302
|
+
telegram_forced_stop: telegramGeneration.drain.forced_stop,
|
|
303
|
+
telegram_generation_id: telegramGeneration.active_generation?.generation_id ?? null,
|
|
304
|
+
},
|
|
305
|
+
});
|
|
306
|
+
logLifecycle({
|
|
307
|
+
level: telegramGeneration.drain.ok ? "info" : "warn",
|
|
308
|
+
stage: "drain",
|
|
309
|
+
state: telegramGeneration.drain.ok ? "complete" : "failed",
|
|
310
|
+
extra: {
|
|
311
|
+
drain_duration_ms: telegramGeneration.drain.elapsed_ms,
|
|
312
|
+
drain_timed_out: telegramGeneration.drain.timed_out,
|
|
313
|
+
forced_stop: telegramGeneration.drain.forced_stop,
|
|
314
|
+
error: telegramGeneration.drain.error,
|
|
315
|
+
},
|
|
316
|
+
});
|
|
317
|
+
}
|
|
318
|
+
else {
|
|
319
|
+
logLifecycle({
|
|
320
|
+
level: "info",
|
|
321
|
+
stage: "drain",
|
|
322
|
+
state: "skipped",
|
|
323
|
+
extra: {
|
|
324
|
+
drain_reason: "telegram_generation_no_drain",
|
|
325
|
+
telegram_generation_id: telegramGeneration.active_generation?.generation_id ?? null,
|
|
326
|
+
},
|
|
327
|
+
});
|
|
328
|
+
}
|
|
329
|
+
const shouldLogRollbackStart = telegramGeneration.rollback.requested ||
|
|
330
|
+
telegramGeneration.rollback.attempted ||
|
|
331
|
+
telegramGeneration.rollback.trigger != null ||
|
|
332
|
+
!telegramGeneration.ok;
|
|
333
|
+
if (shouldLogRollbackStart) {
|
|
334
|
+
logLifecycle({
|
|
335
|
+
level: telegramGeneration.rollback.ok ? "warn" : "error",
|
|
336
|
+
stage: "rollback",
|
|
337
|
+
state: "start",
|
|
338
|
+
extra: {
|
|
339
|
+
rollback_requested: telegramGeneration.rollback.requested,
|
|
340
|
+
rollback_trigger: telegramGeneration.rollback.trigger,
|
|
341
|
+
rollback_attempted: telegramGeneration.rollback.attempted,
|
|
342
|
+
},
|
|
343
|
+
});
|
|
344
|
+
logLifecycle({
|
|
345
|
+
level: telegramGeneration.rollback.ok ? "info" : "error",
|
|
346
|
+
stage: "rollback",
|
|
347
|
+
state: telegramGeneration.rollback.ok ? "complete" : "failed",
|
|
348
|
+
extra: {
|
|
349
|
+
rollback_requested: telegramGeneration.rollback.requested,
|
|
350
|
+
rollback_trigger: telegramGeneration.rollback.trigger,
|
|
351
|
+
rollback_attempted: telegramGeneration.rollback.attempted,
|
|
352
|
+
error: telegramGeneration.rollback.error,
|
|
353
|
+
},
|
|
354
|
+
});
|
|
355
|
+
}
|
|
356
|
+
else {
|
|
357
|
+
logLifecycle({
|
|
358
|
+
level: "debug",
|
|
359
|
+
stage: "rollback",
|
|
360
|
+
state: "skipped",
|
|
361
|
+
extra: {
|
|
362
|
+
rollback_reason: "not_requested",
|
|
363
|
+
},
|
|
364
|
+
});
|
|
365
|
+
}
|
|
366
|
+
if (telegramGeneration.ok) {
|
|
367
|
+
swapped = generationSupervisor.markSwapInstalled(attempt.attempt_id);
|
|
368
|
+
generationSupervisor.finishReload(attempt.attempt_id, "success");
|
|
369
|
+
const elapsedMs = Math.max(0, Date.now() - startedAtMs);
|
|
370
|
+
generationTelemetry.recordReloadSuccess(tags, {
|
|
371
|
+
...baseFields,
|
|
372
|
+
elapsed_ms: elapsedMs,
|
|
373
|
+
drain_duration_ms: drainDurationMs,
|
|
374
|
+
telegram_generation_id: telegramGeneration.active_generation?.generation_id ?? null,
|
|
375
|
+
telegram_rollback_attempted: telegramGeneration.rollback.attempted,
|
|
376
|
+
telegram_rollback_trigger: telegramGeneration.rollback.trigger,
|
|
377
|
+
});
|
|
378
|
+
generationTelemetry.trace({
|
|
379
|
+
name: "control_plane.reload",
|
|
380
|
+
status: "ok",
|
|
381
|
+
durationMs: elapsedMs,
|
|
382
|
+
fields: {
|
|
383
|
+
...tags,
|
|
384
|
+
...baseFields,
|
|
385
|
+
telegram_generation_id: telegramGeneration.active_generation?.generation_id ?? null,
|
|
386
|
+
},
|
|
387
|
+
});
|
|
388
|
+
return {
|
|
389
|
+
ok: true,
|
|
390
|
+
reason,
|
|
391
|
+
previous_control_plane: previousSummary,
|
|
392
|
+
control_plane: summarizeControlPlane(controlPlaneCurrent),
|
|
393
|
+
generation: {
|
|
394
|
+
attempt_id: attempt.attempt_id,
|
|
395
|
+
coalesced: planned.coalesced,
|
|
396
|
+
from_generation: attempt.from_generation,
|
|
397
|
+
to_generation: attempt.to_generation,
|
|
398
|
+
active_generation: generationSupervisor.activeGeneration(),
|
|
399
|
+
outcome: "success",
|
|
400
|
+
},
|
|
401
|
+
telegram_generation: telegramGeneration,
|
|
402
|
+
};
|
|
403
|
+
}
|
|
404
|
+
generationSupervisor.finishReload(attempt.attempt_id, "failure");
|
|
405
|
+
const error = telegramGeneration.error ?? "telegram_generation_reload_failed";
|
|
406
|
+
const elapsedMs = Math.max(0, Date.now() - startedAtMs);
|
|
407
|
+
generationTelemetry.recordReloadFailure(tags, {
|
|
408
|
+
...baseFields,
|
|
409
|
+
elapsed_ms: elapsedMs,
|
|
410
|
+
drain_duration_ms: drainDurationMs,
|
|
411
|
+
error,
|
|
412
|
+
telegram_generation_id: telegramGeneration.active_generation?.generation_id ?? null,
|
|
413
|
+
telegram_rollback_trigger: telegramGeneration.rollback.trigger,
|
|
414
|
+
});
|
|
415
|
+
generationTelemetry.trace({
|
|
416
|
+
name: "control_plane.reload",
|
|
417
|
+
status: "error",
|
|
418
|
+
durationMs: elapsedMs,
|
|
419
|
+
fields: {
|
|
420
|
+
...tags,
|
|
421
|
+
...baseFields,
|
|
422
|
+
error,
|
|
423
|
+
telegram_generation_id: telegramGeneration.active_generation?.generation_id ?? null,
|
|
424
|
+
telegram_rollback_trigger: telegramGeneration.rollback.trigger,
|
|
425
|
+
},
|
|
426
|
+
});
|
|
427
|
+
return {
|
|
428
|
+
ok: false,
|
|
429
|
+
reason,
|
|
430
|
+
previous_control_plane: previousSummary,
|
|
431
|
+
control_plane: summarizeControlPlane(controlPlaneCurrent),
|
|
432
|
+
generation: {
|
|
433
|
+
attempt_id: attempt.attempt_id,
|
|
434
|
+
coalesced: planned.coalesced,
|
|
435
|
+
from_generation: attempt.from_generation,
|
|
436
|
+
to_generation: attempt.to_generation,
|
|
437
|
+
active_generation: generationSupervisor.activeGeneration(),
|
|
438
|
+
outcome: "failure",
|
|
439
|
+
},
|
|
440
|
+
telegram_generation: telegramGeneration,
|
|
441
|
+
error,
|
|
442
|
+
};
|
|
443
|
+
}
|
|
192
444
|
const next = await controlPlaneReloader({
|
|
193
445
|
repoRoot: context.repoRoot,
|
|
194
446
|
previous,
|
|
195
447
|
config: latestConfig.control_plane,
|
|
448
|
+
generation: attempt.to_generation,
|
|
196
449
|
});
|
|
450
|
+
nextHandle = next;
|
|
451
|
+
logLifecycle({ level: "info", stage: "warmup", state: "complete" });
|
|
452
|
+
failedStage = "cutover";
|
|
453
|
+
logLifecycle({ level: "info", stage: "cutover", state: "start" });
|
|
197
454
|
controlPlaneCurrent = next;
|
|
455
|
+
swapped = generationSupervisor.markSwapInstalled(attempt.attempt_id);
|
|
456
|
+
logLifecycle({
|
|
457
|
+
level: "info",
|
|
458
|
+
stage: "cutover",
|
|
459
|
+
state: "complete",
|
|
460
|
+
extra: {
|
|
461
|
+
active_generation_id: generationSupervisor.activeGeneration()?.generation_id ?? null,
|
|
462
|
+
},
|
|
463
|
+
});
|
|
464
|
+
failedStage = "drain";
|
|
198
465
|
if (previous && previous !== next) {
|
|
466
|
+
logLifecycle({ level: "info", stage: "drain", state: "start" });
|
|
467
|
+
drainStartedAtMs = Date.now();
|
|
199
468
|
await previous.stop();
|
|
469
|
+
drainDurationMs = Math.max(0, Date.now() - drainStartedAtMs);
|
|
470
|
+
generationTelemetry.recordDrainDuration(tags, {
|
|
471
|
+
durationMs: drainDurationMs,
|
|
472
|
+
metadata: {
|
|
473
|
+
...baseFields,
|
|
474
|
+
},
|
|
475
|
+
});
|
|
476
|
+
logLifecycle({
|
|
477
|
+
level: "info",
|
|
478
|
+
stage: "drain",
|
|
479
|
+
state: "complete",
|
|
480
|
+
extra: {
|
|
481
|
+
drain_duration_ms: drainDurationMs,
|
|
482
|
+
},
|
|
483
|
+
});
|
|
484
|
+
}
|
|
485
|
+
else {
|
|
486
|
+
logLifecycle({
|
|
487
|
+
level: "info",
|
|
488
|
+
stage: "drain",
|
|
489
|
+
state: "skipped",
|
|
490
|
+
extra: {
|
|
491
|
+
drain_reason: "no_previous_generation",
|
|
492
|
+
},
|
|
493
|
+
});
|
|
200
494
|
}
|
|
495
|
+
logLifecycle({
|
|
496
|
+
level: "debug",
|
|
497
|
+
stage: "rollback",
|
|
498
|
+
state: "skipped",
|
|
499
|
+
extra: {
|
|
500
|
+
rollback_reason: "not_requested",
|
|
501
|
+
},
|
|
502
|
+
});
|
|
503
|
+
generationSupervisor.finishReload(attempt.attempt_id, "success");
|
|
504
|
+
const elapsedMs = Math.max(0, Date.now() - startedAtMs);
|
|
505
|
+
generationTelemetry.recordReloadSuccess(tags, {
|
|
506
|
+
...baseFields,
|
|
507
|
+
elapsed_ms: elapsedMs,
|
|
508
|
+
drain_duration_ms: drainDurationMs,
|
|
509
|
+
});
|
|
510
|
+
generationTelemetry.trace({
|
|
511
|
+
name: "control_plane.reload",
|
|
512
|
+
status: "ok",
|
|
513
|
+
durationMs: elapsedMs,
|
|
514
|
+
fields: {
|
|
515
|
+
...tags,
|
|
516
|
+
...baseFields,
|
|
517
|
+
},
|
|
518
|
+
});
|
|
201
519
|
return {
|
|
202
520
|
ok: true,
|
|
203
521
|
reason,
|
|
204
522
|
previous_control_plane: previousSummary,
|
|
205
523
|
control_plane: summarizeControlPlane(next),
|
|
524
|
+
generation: {
|
|
525
|
+
attempt_id: attempt.attempt_id,
|
|
526
|
+
coalesced: planned.coalesced,
|
|
527
|
+
from_generation: attempt.from_generation,
|
|
528
|
+
to_generation: attempt.to_generation,
|
|
529
|
+
active_generation: generationSupervisor.activeGeneration(),
|
|
530
|
+
outcome: "success",
|
|
531
|
+
},
|
|
206
532
|
};
|
|
207
533
|
}
|
|
208
534
|
catch (err) {
|
|
535
|
+
const error = describeError(err);
|
|
536
|
+
if (failedStage === "drain" && drainStartedAtMs != null) {
|
|
537
|
+
drainDurationMs = Math.max(0, Date.now() - drainStartedAtMs);
|
|
538
|
+
generationTelemetry.recordDrainDuration(tags, {
|
|
539
|
+
durationMs: drainDurationMs,
|
|
540
|
+
metadata: {
|
|
541
|
+
...baseFields,
|
|
542
|
+
error,
|
|
543
|
+
},
|
|
544
|
+
});
|
|
545
|
+
}
|
|
546
|
+
logLifecycle({
|
|
547
|
+
level: "error",
|
|
548
|
+
stage: failedStage,
|
|
549
|
+
state: "failed",
|
|
550
|
+
extra: {
|
|
551
|
+
error,
|
|
552
|
+
drain_duration_ms: failedStage === "drain" ? drainDurationMs : undefined,
|
|
553
|
+
},
|
|
554
|
+
});
|
|
555
|
+
if (swapped) {
|
|
556
|
+
logLifecycle({
|
|
557
|
+
level: "warn",
|
|
558
|
+
stage: "rollback",
|
|
559
|
+
state: "start",
|
|
560
|
+
extra: {
|
|
561
|
+
rollback_reason: "reload_failed_after_cutover",
|
|
562
|
+
rollback_target_generation_id: attempt.from_generation?.generation_id ?? null,
|
|
563
|
+
rollback_source_generation_id: attempt.to_generation.generation_id,
|
|
564
|
+
},
|
|
565
|
+
});
|
|
566
|
+
if (!previous) {
|
|
567
|
+
logLifecycle({
|
|
568
|
+
level: "error",
|
|
569
|
+
stage: "rollback",
|
|
570
|
+
state: "failed",
|
|
571
|
+
extra: {
|
|
572
|
+
rollback_reason: "no_previous_generation",
|
|
573
|
+
rollback_source_generation_id: attempt.to_generation.generation_id,
|
|
574
|
+
},
|
|
575
|
+
});
|
|
576
|
+
}
|
|
577
|
+
else {
|
|
578
|
+
try {
|
|
579
|
+
const restored = generationSupervisor.rollbackSwapInstalled(attempt.attempt_id);
|
|
580
|
+
if (!restored) {
|
|
581
|
+
throw new Error("generation_rollback_state_mismatch");
|
|
582
|
+
}
|
|
583
|
+
controlPlaneCurrent = previous;
|
|
584
|
+
if (nextHandle && nextHandle !== previous) {
|
|
585
|
+
await nextHandle.stop();
|
|
586
|
+
}
|
|
587
|
+
logLifecycle({
|
|
588
|
+
level: "info",
|
|
589
|
+
stage: "rollback",
|
|
590
|
+
state: "complete",
|
|
591
|
+
extra: {
|
|
592
|
+
active_generation_id: generationSupervisor.activeGeneration()?.generation_id ?? null,
|
|
593
|
+
rollback_target_generation_id: attempt.from_generation?.generation_id ?? null,
|
|
594
|
+
},
|
|
595
|
+
});
|
|
596
|
+
}
|
|
597
|
+
catch (rollbackErr) {
|
|
598
|
+
logLifecycle({
|
|
599
|
+
level: "error",
|
|
600
|
+
stage: "rollback",
|
|
601
|
+
state: "failed",
|
|
602
|
+
extra: {
|
|
603
|
+
error: describeError(rollbackErr),
|
|
604
|
+
active_generation_id: generationSupervisor.activeGeneration()?.generation_id ?? null,
|
|
605
|
+
rollback_target_generation_id: attempt.from_generation?.generation_id ?? null,
|
|
606
|
+
rollback_source_generation_id: attempt.to_generation.generation_id,
|
|
607
|
+
},
|
|
608
|
+
});
|
|
609
|
+
}
|
|
610
|
+
}
|
|
611
|
+
}
|
|
612
|
+
else {
|
|
613
|
+
logLifecycle({
|
|
614
|
+
level: "debug",
|
|
615
|
+
stage: "rollback",
|
|
616
|
+
state: "skipped",
|
|
617
|
+
extra: {
|
|
618
|
+
rollback_reason: "cutover_not_installed",
|
|
619
|
+
},
|
|
620
|
+
});
|
|
621
|
+
}
|
|
622
|
+
generationSupervisor.finishReload(attempt.attempt_id, "failure");
|
|
623
|
+
const elapsedMs = Math.max(0, Date.now() - startedAtMs);
|
|
624
|
+
generationTelemetry.recordReloadFailure(tags, {
|
|
625
|
+
...baseFields,
|
|
626
|
+
elapsed_ms: elapsedMs,
|
|
627
|
+
drain_duration_ms: drainDurationMs,
|
|
628
|
+
error,
|
|
629
|
+
});
|
|
630
|
+
generationTelemetry.trace({
|
|
631
|
+
name: "control_plane.reload",
|
|
632
|
+
status: "error",
|
|
633
|
+
durationMs: elapsedMs,
|
|
634
|
+
fields: {
|
|
635
|
+
...tags,
|
|
636
|
+
...baseFields,
|
|
637
|
+
error,
|
|
638
|
+
},
|
|
639
|
+
});
|
|
209
640
|
return {
|
|
210
641
|
ok: false,
|
|
211
642
|
reason,
|
|
212
643
|
previous_control_plane: previousSummary,
|
|
213
|
-
control_plane: summarizeControlPlane(
|
|
214
|
-
|
|
644
|
+
control_plane: summarizeControlPlane(controlPlaneCurrent),
|
|
645
|
+
generation: {
|
|
646
|
+
attempt_id: attempt.attempt_id,
|
|
647
|
+
coalesced: planned.coalesced,
|
|
648
|
+
from_generation: attempt.from_generation,
|
|
649
|
+
to_generation: attempt.to_generation,
|
|
650
|
+
active_generation: generationSupervisor.activeGeneration(),
|
|
651
|
+
outcome: "failure",
|
|
652
|
+
},
|
|
653
|
+
error,
|
|
215
654
|
};
|
|
216
655
|
}
|
|
217
656
|
};
|
|
218
657
|
const reloadControlPlane = async (reason) => {
|
|
219
658
|
if (reloadInFlight) {
|
|
659
|
+
const pending = generationSupervisor.pendingReload();
|
|
660
|
+
const fallbackGeneration = generationSupervisor.activeGeneration() ??
|
|
661
|
+
generationSupervisor.snapshot().last_reload?.to_generation ??
|
|
662
|
+
null;
|
|
663
|
+
const generation = pending?.to_generation ?? fallbackGeneration;
|
|
664
|
+
if (generation) {
|
|
665
|
+
generationTelemetry.recordDuplicateSignal(generationTagsFor(generation, "server.reload"), {
|
|
666
|
+
source: "server_reload",
|
|
667
|
+
signal: "coalesced_reload_request",
|
|
668
|
+
dedupe_key: pending?.attempt_id ?? "reload_in_flight",
|
|
669
|
+
record_id: pending?.attempt_id ?? "reload_in_flight",
|
|
670
|
+
metadata: {
|
|
671
|
+
reason,
|
|
672
|
+
pending_reason: pending?.reason ?? null,
|
|
673
|
+
},
|
|
674
|
+
});
|
|
675
|
+
}
|
|
220
676
|
return await reloadInFlight;
|
|
221
677
|
}
|
|
222
678
|
reloadInFlight = performControlPlaneReload(reason).finally(() => {
|
|
@@ -299,11 +755,24 @@ export function createServer(options = {}) {
|
|
|
299
755
|
const result = await reloadControlPlane(reason);
|
|
300
756
|
return Response.json(result, { status: result.ok ? 200 : 500, headers });
|
|
301
757
|
}
|
|
758
|
+
if (path === "/api/control-plane/rollback") {
|
|
759
|
+
if (request.method !== "POST") {
|
|
760
|
+
return Response.json({ error: "Method Not Allowed" }, { status: 405, headers });
|
|
761
|
+
}
|
|
762
|
+
const result = await reloadControlPlane("rollback");
|
|
763
|
+
return Response.json(result, { status: result.ok ? 200 : 500, headers });
|
|
764
|
+
}
|
|
302
765
|
if (path === "/api/status") {
|
|
303
766
|
const issues = await context.issueStore.list();
|
|
304
767
|
const openIssues = issues.filter((i) => i.status === "open");
|
|
305
768
|
const readyIssues = await context.issueStore.ready();
|
|
306
|
-
const controlPlane =
|
|
769
|
+
const controlPlane = {
|
|
770
|
+
...summarizeControlPlane(controlPlaneCurrent),
|
|
771
|
+
generation: generationSupervisor.snapshot(),
|
|
772
|
+
observability: {
|
|
773
|
+
counters: generationTelemetry.counters(),
|
|
774
|
+
},
|
|
775
|
+
};
|
|
307
776
|
return Response.json({
|
|
308
777
|
repo_root: context.repoRoot,
|
|
309
778
|
open_count: openIssues.length,
|
|
@@ -473,9 +942,7 @@ export function createServer(options = {}) {
|
|
|
473
942
|
const targetKindRaw = url.searchParams.get("target_kind")?.trim().toLowerCase();
|
|
474
943
|
const targetKind = targetKindRaw === "run" || targetKindRaw === "activity" ? targetKindRaw : undefined;
|
|
475
944
|
const limitRaw = url.searchParams.get("limit");
|
|
476
|
-
const limit = limitRaw && /^\d+$/.test(limitRaw)
|
|
477
|
-
? Math.max(1, Math.min(500, Number.parseInt(limitRaw, 10)))
|
|
478
|
-
: undefined;
|
|
945
|
+
const limit = limitRaw && /^\d+$/.test(limitRaw) ? Math.max(1, Math.min(500, Number.parseInt(limitRaw, 10))) : undefined;
|
|
479
946
|
const programs = await heartbeatPrograms.list({ enabled, targetKind, limit });
|
|
480
947
|
return Response.json({ count: programs.length, programs }, { headers });
|
|
481
948
|
}
|
|
@@ -673,17 +1140,12 @@ export function createServer(options = {}) {
|
|
|
673
1140
|
return Response.json({ error: "Method Not Allowed" }, { status: 405, headers });
|
|
674
1141
|
}
|
|
675
1142
|
const statusRaw = url.searchParams.get("status")?.trim().toLowerCase();
|
|
676
|
-
const status = statusRaw === "running" ||
|
|
677
|
-
statusRaw === "completed" ||
|
|
678
|
-
statusRaw === "failed" ||
|
|
679
|
-
statusRaw === "cancelled"
|
|
1143
|
+
const status = statusRaw === "running" || statusRaw === "completed" || statusRaw === "failed" || statusRaw === "cancelled"
|
|
680
1144
|
? statusRaw
|
|
681
1145
|
: undefined;
|
|
682
1146
|
const kind = url.searchParams.get("kind")?.trim() || undefined;
|
|
683
1147
|
const limitRaw = url.searchParams.get("limit");
|
|
684
|
-
const limit = limitRaw && /^\d+$/.test(limitRaw)
|
|
685
|
-
? Math.max(1, Math.min(500, Number.parseInt(limitRaw, 10)))
|
|
686
|
-
: undefined;
|
|
1148
|
+
const limit = limitRaw && /^\d+$/.test(limitRaw) ? Math.max(1, Math.min(500, Number.parseInt(limitRaw, 10))) : undefined;
|
|
687
1149
|
const activities = activitySupervisor.list({ status, kind, limit });
|
|
688
1150
|
return Response.json({ count: activities.length, activities }, { headers });
|
|
689
1151
|
}
|
|
@@ -706,9 +1168,7 @@ export function createServer(options = {}) {
|
|
|
706
1168
|
const heartbeatEveryMs = typeof body.heartbeat_every_ms === "number" && Number.isFinite(body.heartbeat_every_ms)
|
|
707
1169
|
? Math.max(0, Math.trunc(body.heartbeat_every_ms))
|
|
708
1170
|
: undefined;
|
|
709
|
-
const source = body.source === "api" || body.source === "command" || body.source === "system"
|
|
710
|
-
? body.source
|
|
711
|
-
: "api";
|
|
1171
|
+
const source = body.source === "api" || body.source === "command" || body.source === "system" ? body.source : "api";
|
|
712
1172
|
try {
|
|
713
1173
|
const activity = activitySupervisor.start({
|
|
714
1174
|
title,
|
|
@@ -994,12 +1454,24 @@ export async function createServerAsync(options = {}) {
|
|
|
994
1454
|
const repoRoot = options.repoRoot || process.cwd();
|
|
995
1455
|
const config = options.config ?? (await readMuConfigFile(repoRoot));
|
|
996
1456
|
const heartbeatScheduler = options.heartbeatScheduler ?? new ActivityHeartbeatScheduler();
|
|
1457
|
+
const generationTelemetry = options.generationTelemetry ?? new GenerationTelemetryRecorder();
|
|
997
1458
|
const controlPlane = await bootstrapControlPlane({
|
|
998
1459
|
repoRoot,
|
|
999
1460
|
config: config.control_plane,
|
|
1000
1461
|
heartbeatScheduler,
|
|
1462
|
+
generation: {
|
|
1463
|
+
generation_id: "control-plane-gen-0",
|
|
1464
|
+
generation_seq: 0,
|
|
1465
|
+
},
|
|
1466
|
+
telemetry: generationTelemetry,
|
|
1467
|
+
});
|
|
1468
|
+
const serverConfig = createServer({
|
|
1469
|
+
...options,
|
|
1470
|
+
heartbeatScheduler,
|
|
1471
|
+
controlPlane,
|
|
1472
|
+
config,
|
|
1473
|
+
generationTelemetry,
|
|
1001
1474
|
});
|
|
1002
|
-
const serverConfig = createServer({ ...options, heartbeatScheduler, controlPlane, config });
|
|
1003
1475
|
return {
|
|
1004
1476
|
serverConfig,
|
|
1005
1477
|
controlPlane: serverConfig.controlPlane,
|