@femtomc/mu-server 26.2.70 → 26.2.71

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42):
  1. package/dist/api/activities.d.ts +2 -0
  2. package/dist/api/activities.js +160 -0
  3. package/dist/api/config.d.ts +2 -0
  4. package/dist/api/config.js +45 -0
  5. package/dist/api/control_plane.d.ts +2 -0
  6. package/dist/api/control_plane.js +28 -0
  7. package/dist/api/cron.d.ts +2 -0
  8. package/dist/api/cron.js +182 -0
  9. package/dist/api/heartbeats.d.ts +2 -0
  10. package/dist/api/heartbeats.js +211 -0
  11. package/dist/api/identities.d.ts +2 -0
  12. package/dist/api/identities.js +103 -0
  13. package/dist/api/runs.d.ts +2 -0
  14. package/dist/api/runs.js +207 -0
  15. package/dist/cli.js +58 -3
  16. package/dist/config.d.ts +4 -21
  17. package/dist/config.js +24 -75
  18. package/dist/control_plane.d.ts +4 -2
  19. package/dist/control_plane.js +226 -25
  20. package/dist/control_plane_bootstrap_helpers.d.ts +2 -1
  21. package/dist/control_plane_bootstrap_helpers.js +11 -1
  22. package/dist/control_plane_contract.d.ts +57 -0
  23. package/dist/control_plane_contract.js +1 -1
  24. package/dist/control_plane_reload.d.ts +63 -0
  25. package/dist/control_plane_reload.js +525 -0
  26. package/dist/control_plane_run_queue_coordinator.d.ts +48 -0
  27. package/dist/control_plane_run_queue_coordinator.js +327 -0
  28. package/dist/control_plane_telegram_generation.js +0 -1
  29. package/dist/control_plane_wake_delivery.d.ts +50 -0
  30. package/dist/control_plane_wake_delivery.js +123 -0
  31. package/dist/index.d.ts +4 -1
  32. package/dist/index.js +2 -0
  33. package/dist/run_queue.d.ts +95 -0
  34. package/dist/run_queue.js +817 -0
  35. package/dist/run_supervisor.d.ts +20 -0
  36. package/dist/run_supervisor.js +25 -1
  37. package/dist/server.d.ts +5 -10
  38. package/dist/server.js +337 -528
  39. package/dist/server_program_orchestration.js +2 -0
  40. package/dist/server_routing.d.ts +3 -2
  41. package/dist/server_routing.js +28 -900
  42. package/package.json +7 -6
package/dist/server.js CHANGED
@@ -1,11 +1,11 @@
1
- import { GenerationTelemetryRecorder, } from "@femtomc/mu-control-plane";
1
+ import { GenerationTelemetryRecorder } from "@femtomc/mu-control-plane";
2
2
  import { currentRunId, EventLog, FsJsonlStore, getStorePaths, JsonlEventSink } from "@femtomc/mu-core/node";
3
3
  import { ForumStore } from "@femtomc/mu-forum";
4
4
  import { IssueStore } from "@femtomc/mu-issue";
5
5
  import { ControlPlaneActivitySupervisor } from "./activity_supervisor.js";
6
6
  import { DEFAULT_MU_CONFIG, readMuConfigFile, writeMuConfigFile, } from "./config.js";
7
7
  import { bootstrapControlPlane } from "./control_plane.js";
8
- import { ControlPlaneGenerationSupervisor } from "./generation_supervisor.js";
8
+ import { createReloadManager, } from "./control_plane_reload.js";
9
9
  import { ActivityHeartbeatScheduler } from "./heartbeat_scheduler.js";
10
10
  import { createProcessSessionLifecycle } from "./session_lifecycle.js";
11
11
  import { createServerProgramOrchestration } from "./server_program_orchestration.js";
@@ -19,16 +19,85 @@ function describeError(err) {
19
19
  return err.message;
20
20
  return String(err);
21
21
  }
22
- function summarizeControlPlane(handle) {
23
- if (!handle) {
24
- return { active: false, adapters: [], routes: [] };
25
- }
22
+ function emptyNotifyOperatorsResult() {
26
23
  return {
27
- active: handle.activeAdapters.length > 0,
28
- adapters: handle.activeAdapters.map((adapter) => adapter.name),
29
- routes: handle.activeAdapters.map((adapter) => ({ name: adapter.name, route: adapter.route })),
24
+ queued: 0,
25
+ duplicate: 0,
26
+ skipped: 0,
27
+ decisions: [],
30
28
  };
31
29
  }
30
+ function normalizeWakeTurnMode(value) {
31
+ if (typeof value !== "string") {
32
+ return "off";
33
+ }
34
+ const normalized = value.trim().toLowerCase();
35
+ if (normalized === "shadow") {
36
+ return "shadow";
37
+ }
38
+ if (normalized === "active") {
39
+ return "active";
40
+ }
41
+ return "off";
42
+ }
43
+ function stringField(payload, key) {
44
+ const value = payload[key];
45
+ if (typeof value !== "string") {
46
+ return null;
47
+ }
48
+ const trimmed = value.trim();
49
+ return trimmed.length > 0 ? trimmed : null;
50
+ }
51
+ function numberField(payload, key) {
52
+ const value = payload[key];
53
+ if (typeof value !== "number" || !Number.isFinite(value)) {
54
+ return null;
55
+ }
56
+ return Math.trunc(value);
57
+ }
58
+ function computeWakeId(opts) {
59
+ const source = stringField(opts.payload, "wake_source") ?? "unknown";
60
+ const programId = stringField(opts.payload, "program_id") ?? "unknown";
61
+ const sourceTsMs = numberField(opts.payload, "source_ts_ms");
62
+ const target = Object.hasOwn(opts.payload, "target") ? opts.payload.target : null;
63
+ let targetFingerprint = "null";
64
+ try {
65
+ targetFingerprint = JSON.stringify(target) ?? "null";
66
+ }
67
+ catch {
68
+ targetFingerprint = "[unserializable]";
69
+ }
70
+ const hasher = new Bun.CryptoHasher("sha256");
71
+ hasher.update(`${source}|${programId}|${sourceTsMs ?? "na"}|${opts.dedupeKey}|${targetFingerprint}`);
72
+ return hasher.digest("hex").slice(0, 16);
73
+ }
74
+ function buildWakeTurnCommandText(opts) {
75
+ const wakeSource = stringField(opts.payload, "wake_source") ?? "unknown";
76
+ const programId = stringField(opts.payload, "program_id") ?? "unknown";
77
+ const wakeMode = stringField(opts.payload, "wake_mode") ?? "immediate";
78
+ const targetKind = stringField(opts.payload, "target_kind") ?? "unknown";
79
+ const reason = stringField(opts.payload, "reason") ?? "scheduled";
80
+ let target = "null";
81
+ try {
82
+ target = JSON.stringify(Object.hasOwn(opts.payload, "target") ? opts.payload.target : null) ?? "null";
83
+ }
84
+ catch {
85
+ target = "[unserializable]";
86
+ }
87
+ return [
88
+ "Autonomous wake turn triggered by heartbeat/cron scheduler.",
89
+ `wake_id=${opts.wakeId}`,
90
+ `wake_source=${wakeSource}`,
91
+ `program_id=${programId}`,
92
+ `wake_mode=${wakeMode}`,
93
+ `target_kind=${targetKind}`,
94
+ `reason=${reason}`,
95
+ `message=${opts.message}`,
96
+ `target=${target}`,
97
+ "",
98
+ "If an action is needed, produce exactly one `/mu ...` command. If no action is needed, provide a short operator response.",
99
+ ].join("\n");
100
+ }
32
101
  export function createContext(repoRoot) {
33
102
  const paths = getStorePaths(repoRoot);
34
103
  const eventsStore = new FsJsonlStore(paths.eventsPath);
@@ -68,6 +137,12 @@ function createServer(options = {}) {
68
137
  const autoRunHeartbeatEveryMs = Math.max(1_000, toNonNegativeInt(options.autoRunHeartbeatEveryMs, DEFAULT_AUTO_RUN_HEARTBEAT_EVERY_MS));
69
138
  const operatorWakeLastByKey = new Map();
70
139
  const sessionLifecycle = options.sessionLifecycle ?? createProcessSessionLifecycle({ repoRoot });
140
+ const emitWakeDeliveryEvent = async (payload) => {
141
+ await context.eventLog.emit("operator.wake.delivery", {
142
+ source: "mu-server.operator-wake",
143
+ payload,
144
+ });
145
+ };
71
146
  const emitOperatorWake = async (opts) => {
72
147
  const dedupeKey = opts.dedupeKey.trim();
73
148
  if (!dedupeKey) {
@@ -80,6 +155,160 @@ function createServer(options = {}) {
80
155
  return false;
81
156
  }
82
157
  operatorWakeLastByKey.set(dedupeKey, nowMs);
158
+ const wakeId = computeWakeId({ dedupeKey, payload: opts.payload });
159
+ const selectedWakeMode = stringField(opts.payload, "wake_mode");
160
+ const wakeSource = stringField(opts.payload, "wake_source");
161
+ const programId = stringField(opts.payload, "program_id");
162
+ const sourceTsMs = numberField(opts.payload, "source_ts_ms");
163
+ let wakeTurnMode = normalizeWakeTurnMode(fallbackConfig.control_plane.operator.wake_turn_mode);
164
+ let configReadError = null;
165
+ try {
166
+ const config = await loadConfigFromDisk();
167
+ wakeTurnMode = normalizeWakeTurnMode(config.control_plane.operator.wake_turn_mode);
168
+ }
169
+ catch (err) {
170
+ configReadError = describeError(err);
171
+ }
172
+ let decision;
173
+ if (wakeTurnMode === "off") {
174
+ decision = {
175
+ outcome: "skipped",
176
+ reason: "feature_disabled",
177
+ wakeTurnMode,
178
+ selectedWakeMode,
179
+ turnRequestId: null,
180
+ turnResultKind: null,
181
+ error: configReadError,
182
+ };
183
+ }
184
+ else if (wakeTurnMode === "shadow") {
185
+ decision = {
186
+ outcome: "skipped",
187
+ reason: "shadow_mode",
188
+ wakeTurnMode,
189
+ selectedWakeMode,
190
+ turnRequestId: null,
191
+ turnResultKind: null,
192
+ error: configReadError,
193
+ };
194
+ }
195
+ else if (typeof controlPlaneProxy.submitTerminalCommand !== "function") {
196
+ decision = {
197
+ outcome: "fallback",
198
+ reason: "control_plane_unavailable",
199
+ wakeTurnMode,
200
+ selectedWakeMode,
201
+ turnRequestId: null,
202
+ turnResultKind: null,
203
+ error: configReadError,
204
+ };
205
+ }
206
+ else {
207
+ const turnRequestId = `wake-turn-${wakeId}`;
208
+ try {
209
+ const turnResult = await controlPlaneProxy.submitTerminalCommand({
210
+ commandText: buildWakeTurnCommandText({
211
+ wakeId,
212
+ message: opts.message,
213
+ payload: opts.payload,
214
+ }),
215
+ repoRoot: context.repoRoot,
216
+ requestId: turnRequestId,
217
+ });
218
+ if (turnResult.kind === "noop" || turnResult.kind === "invalid") {
219
+ decision = {
220
+ outcome: "fallback",
221
+ reason: `turn_result_${turnResult.kind}`,
222
+ wakeTurnMode,
223
+ selectedWakeMode,
224
+ turnRequestId,
225
+ turnResultKind: turnResult.kind,
226
+ error: configReadError,
227
+ };
228
+ }
229
+ else {
230
+ decision = {
231
+ outcome: "triggered",
232
+ reason: "turn_invoked",
233
+ wakeTurnMode,
234
+ selectedWakeMode,
235
+ turnRequestId,
236
+ turnResultKind: turnResult.kind,
237
+ error: configReadError,
238
+ };
239
+ }
240
+ }
241
+ catch (err) {
242
+ const error = describeError(err);
243
+ decision = {
244
+ outcome: "fallback",
245
+ reason: error === "control_plane_unavailable" ? "control_plane_unavailable" : "turn_execution_failed",
246
+ wakeTurnMode,
247
+ selectedWakeMode,
248
+ turnRequestId,
249
+ turnResultKind: null,
250
+ error,
251
+ };
252
+ }
253
+ }
254
+ await context.eventLog.emit("operator.wake.decision", {
255
+ source: "mu-server.operator-wake",
256
+ payload: {
257
+ wake_id: wakeId,
258
+ dedupe_key: dedupeKey,
259
+ wake_source: wakeSource,
260
+ program_id: programId,
261
+ source_ts_ms: sourceTsMs,
262
+ selected_wake_mode: selectedWakeMode,
263
+ wake_turn_mode: decision.wakeTurnMode,
264
+ wake_turn_feature_enabled: decision.wakeTurnMode === "active",
265
+ outcome: decision.outcome,
266
+ reason: decision.reason,
267
+ turn_request_id: decision.turnRequestId,
268
+ turn_result_kind: decision.turnResultKind,
269
+ error: decision.error,
270
+ },
271
+ });
272
+ let notifyResult = emptyNotifyOperatorsResult();
273
+ let notifyError = null;
274
+ if (typeof controlPlaneProxy.notifyOperators === "function") {
275
+ try {
276
+ notifyResult = await controlPlaneProxy.notifyOperators({
277
+ message: opts.message,
278
+ dedupeKey,
279
+ wake: {
280
+ wakeId,
281
+ wakeSource,
282
+ programId,
283
+ sourceTsMs,
284
+ },
285
+ metadata: {
286
+ wake_delivery_reason: "heartbeat_cron_wake",
287
+ wake_turn_outcome: decision.outcome,
288
+ wake_turn_reason: decision.reason,
289
+ },
290
+ });
291
+ }
292
+ catch (err) {
293
+ notifyError = describeError(err);
294
+ }
295
+ }
296
+ for (const deliveryDecision of notifyResult.decisions) {
297
+ await emitWakeDeliveryEvent({
298
+ state: deliveryDecision.state,
299
+ reason_code: deliveryDecision.reason_code,
300
+ wake_id: wakeId,
301
+ dedupe_key: dedupeKey,
302
+ binding_id: deliveryDecision.binding_id,
303
+ channel: deliveryDecision.channel,
304
+ outbox_id: deliveryDecision.outbox_id,
305
+ outbox_dedupe_key: deliveryDecision.dedupe_key,
306
+ attempt_count: null,
307
+ wake_source: wakeSource,
308
+ program_id: programId,
309
+ source_ts_ms: sourceTsMs,
310
+ });
311
+ }
83
312
  await context.eventLog.emit("operator.wake", {
84
313
  source: "mu-server.operator-wake",
85
314
  payload: {
@@ -87,28 +316,43 @@ function createServer(options = {}) {
87
316
  dedupe_key: dedupeKey,
88
317
  coalesce_ms: coalesceMs,
89
318
  ...opts.payload,
319
+ wake_id: wakeId,
320
+ decision_outcome: decision.outcome,
321
+ decision_reason: decision.reason,
322
+ wake_turn_mode: decision.wakeTurnMode,
323
+ selected_wake_mode: decision.selectedWakeMode,
324
+ wake_turn_feature_enabled: decision.wakeTurnMode === "active",
325
+ turn_request_id: decision.turnRequestId,
326
+ turn_result_kind: decision.turnResultKind,
327
+ decision_error: decision.error,
328
+ delivery: {
329
+ queued: notifyResult.queued,
330
+ duplicate: notifyResult.duplicate,
331
+ skipped: notifyResult.skipped,
332
+ },
333
+ delivery_summary_v2: {
334
+ queued: notifyResult.queued,
335
+ duplicate: notifyResult.duplicate,
336
+ skipped: notifyResult.skipped,
337
+ total: notifyResult.decisions.length,
338
+ },
339
+ delivery_error: notifyError,
90
340
  },
91
341
  });
92
342
  return true;
93
343
  };
94
- let controlPlaneCurrent = options.controlPlane ?? null;
95
- let reloadInFlight = null;
96
344
  const generationTelemetry = options.generationTelemetry ?? new GenerationTelemetryRecorder();
97
- const generationSupervisor = new ControlPlaneGenerationSupervisor({
98
- supervisorId: "control-plane",
99
- initialGeneration: controlPlaneCurrent
100
- ? {
101
- generation_id: "control-plane-gen-0",
102
- generation_seq: 0,
345
+ const loadConfigFromDisk = async () => {
346
+ try {
347
+ return await readConfig(context.repoRoot);
348
+ }
349
+ catch (err) {
350
+ if (err?.code === "ENOENT") {
351
+ return fallbackConfig;
103
352
  }
104
- : null,
105
- });
106
- const generationTagsFor = (generation, component) => ({
107
- generation_id: generation.generation_id,
108
- generation_seq: generation.generation_seq,
109
- supervisor: "control_plane",
110
- component,
111
- });
353
+ throw err;
354
+ }
355
+ };
112
356
  const controlPlaneReloader = options.controlPlaneReloader ??
113
357
  (async ({ repoRoot, config, generation }) => {
114
358
  return await bootstrapControlPlane({
@@ -118,75 +362,129 @@ function createServer(options = {}) {
118
362
  generation,
119
363
  telemetry: generationTelemetry,
120
364
  sessionLifecycle,
365
+ wakeDeliveryObserver: (event) => {
366
+ void emitWakeDeliveryEvent({
367
+ state: event.state,
368
+ reason_code: event.reason_code,
369
+ wake_id: event.wake_id,
370
+ dedupe_key: event.dedupe_key,
371
+ binding_id: event.binding_id,
372
+ channel: event.channel,
373
+ outbox_id: event.outbox_id,
374
+ outbox_dedupe_key: event.outbox_dedupe_key,
375
+ attempt_count: event.attempt_count,
376
+ });
377
+ },
121
378
  terminalEnabled: true,
122
379
  });
123
380
  });
381
+ const reloadManager = createReloadManager({
382
+ repoRoot: context.repoRoot,
383
+ initialControlPlane: options.controlPlane ?? null,
384
+ controlPlaneReloader,
385
+ generationTelemetry,
386
+ loadConfigFromDisk,
387
+ });
388
+ const applyWakeDeliveryObserver = () => {
389
+ const handle = reloadManager.getControlPlaneCurrent();
390
+ handle?.setWakeDeliveryObserver?.((event) => {
391
+ void emitWakeDeliveryEvent({
392
+ state: event.state,
393
+ reason_code: event.reason_code,
394
+ wake_id: event.wake_id,
395
+ dedupe_key: event.dedupe_key,
396
+ binding_id: event.binding_id,
397
+ channel: event.channel,
398
+ outbox_id: event.outbox_id,
399
+ outbox_dedupe_key: event.outbox_dedupe_key,
400
+ attempt_count: event.attempt_count,
401
+ });
402
+ });
403
+ };
404
+ applyWakeDeliveryObserver();
405
+ const reloadControlPlane = async (reason) => {
406
+ const result = await reloadManager.reloadControlPlane(reason);
407
+ applyWakeDeliveryObserver();
408
+ return result;
409
+ };
124
410
  const controlPlaneProxy = {
125
411
  get activeAdapters() {
126
- return controlPlaneCurrent?.activeAdapters ?? [];
412
+ return reloadManager.getControlPlaneCurrent()?.activeAdapters ?? [];
127
413
  },
128
414
  async handleWebhook(path, req) {
129
- const handle = controlPlaneCurrent;
415
+ const handle = reloadManager.getControlPlaneCurrent();
130
416
  if (!handle)
131
417
  return null;
132
418
  return await handle.handleWebhook(path, req);
133
419
  },
420
+ async notifyOperators(opts) {
421
+ const handle = reloadManager.getControlPlaneCurrent();
422
+ if (!handle?.notifyOperators) {
423
+ return emptyNotifyOperatorsResult();
424
+ }
425
+ return await handle.notifyOperators(opts);
426
+ },
427
+ setWakeDeliveryObserver(observer) {
428
+ const handle = reloadManager.getControlPlaneCurrent();
429
+ handle?.setWakeDeliveryObserver?.(observer ?? null);
430
+ },
134
431
  async listRuns(opts) {
135
- const handle = controlPlaneCurrent;
432
+ const handle = reloadManager.getControlPlaneCurrent();
136
433
  if (!handle?.listRuns)
137
434
  return [];
138
435
  return await handle.listRuns(opts);
139
436
  },
140
437
  async getRun(idOrRoot) {
141
- const handle = controlPlaneCurrent;
438
+ const handle = reloadManager.getControlPlaneCurrent();
142
439
  if (!handle?.getRun)
143
440
  return null;
144
441
  return await handle.getRun(idOrRoot);
145
442
  },
146
443
  async startRun(opts) {
147
- const handle = controlPlaneCurrent;
444
+ const handle = reloadManager.getControlPlaneCurrent();
148
445
  if (!handle?.startRun) {
149
446
  throw new Error("run_supervisor_unavailable");
150
447
  }
151
448
  return await handle.startRun(opts);
152
449
  },
153
450
  async resumeRun(opts) {
154
- const handle = controlPlaneCurrent;
451
+ const handle = reloadManager.getControlPlaneCurrent();
155
452
  if (!handle?.resumeRun) {
156
453
  throw new Error("run_supervisor_unavailable");
157
454
  }
158
455
  return await handle.resumeRun(opts);
159
456
  },
160
457
  async interruptRun(opts) {
161
- const handle = controlPlaneCurrent;
458
+ const handle = reloadManager.getControlPlaneCurrent();
162
459
  if (!handle?.interruptRun) {
163
460
  return { ok: false, reason: "not_found", run: null };
164
461
  }
165
462
  return await handle.interruptRun(opts);
166
463
  },
167
464
  async heartbeatRun(opts) {
168
- const handle = controlPlaneCurrent;
465
+ const handle = reloadManager.getControlPlaneCurrent();
169
466
  if (!handle?.heartbeatRun) {
170
467
  return { ok: false, reason: "not_found", run: null };
171
468
  }
172
469
  return await handle.heartbeatRun(opts);
173
470
  },
174
471
  async traceRun(opts) {
175
- const handle = controlPlaneCurrent;
472
+ const handle = reloadManager.getControlPlaneCurrent();
176
473
  if (!handle?.traceRun)
177
474
  return null;
178
475
  return await handle.traceRun(opts);
179
476
  },
180
477
  async submitTerminalCommand(opts) {
181
- const handle = controlPlaneCurrent;
478
+ const handle = reloadManager.getControlPlaneCurrent();
182
479
  if (!handle?.submitTerminalCommand) {
183
480
  throw new Error("control_plane_unavailable");
184
481
  }
185
482
  return await handle.submitTerminalCommand(opts);
186
483
  },
187
484
  async stop() {
188
- const handle = controlPlaneCurrent;
189
- controlPlaneCurrent = null;
485
+ const handle = reloadManager.getControlPlaneCurrent();
486
+ handle?.setWakeDeliveryObserver?.(null);
487
+ reloadManager.setControlPlaneCurrent(null);
190
488
  await handle?.stop();
191
489
  },
192
490
  };
@@ -199,490 +497,6 @@ function createServer(options = {}) {
199
497
  autoRunHeartbeatEveryMs,
200
498
  emitOperatorWake,
201
499
  });
202
- const loadConfigFromDisk = async () => {
203
- try {
204
- return await readConfig(context.repoRoot);
205
- }
206
- catch (err) {
207
- if (err?.code === "ENOENT") {
208
- return fallbackConfig;
209
- }
210
- throw err;
211
- }
212
- };
213
- const performControlPlaneReload = async (reason) => {
214
- const startedAtMs = Date.now();
215
- const planned = generationSupervisor.beginReload(reason);
216
- const attempt = planned.attempt;
217
- const previous = controlPlaneCurrent;
218
- const previousSummary = summarizeControlPlane(previous);
219
- const tags = generationTagsFor(attempt.to_generation, "server.reload");
220
- const baseFields = {
221
- reason,
222
- attempt_id: attempt.attempt_id,
223
- coalesced: planned.coalesced,
224
- from_generation_id: attempt.from_generation?.generation_id ?? null,
225
- };
226
- const logLifecycle = (opts) => {
227
- generationTelemetry.log({
228
- level: opts.level,
229
- message: `reload transition ${opts.stage}:${opts.state}`,
230
- fields: {
231
- ...tags,
232
- ...baseFields,
233
- ...(opts.extra ?? {}),
234
- },
235
- });
236
- };
237
- let swapped = false;
238
- let failedStage = "warmup";
239
- let drainDurationMs = 0;
240
- let drainStartedAtMs = null;
241
- let nextHandle = null;
242
- try {
243
- logLifecycle({ level: "info", stage: "warmup", state: "start" });
244
- const latestConfig = await loadConfigFromDisk();
245
- const telegramGeneration = (await previous?.reloadTelegramGeneration?.({
246
- config: latestConfig.control_plane,
247
- reason,
248
- })) ?? null;
249
- if (telegramGeneration?.handled) {
250
- if (telegramGeneration.warmup) {
251
- logLifecycle({
252
- level: telegramGeneration.warmup.ok ? "info" : "error",
253
- stage: "warmup",
254
- state: telegramGeneration.warmup.ok ? "complete" : "failed",
255
- extra: {
256
- warmup_elapsed_ms: telegramGeneration.warmup.elapsed_ms,
257
- error: telegramGeneration.warmup.error,
258
- telegram_generation_id: telegramGeneration.to_generation?.generation_id ?? null,
259
- },
260
- });
261
- }
262
- else {
263
- logLifecycle({
264
- level: "info",
265
- stage: "warmup",
266
- state: "skipped",
267
- extra: {
268
- warmup_reason: "telegram_generation_no_warmup",
269
- telegram_generation_id: telegramGeneration.to_generation?.generation_id ?? null,
270
- },
271
- });
272
- }
273
- if (telegramGeneration.cutover) {
274
- logLifecycle({ level: "info", stage: "cutover", state: "start" });
275
- logLifecycle({
276
- level: telegramGeneration.cutover.ok ? "info" : "error",
277
- stage: "cutover",
278
- state: telegramGeneration.cutover.ok ? "complete" : "failed",
279
- extra: {
280
- cutover_elapsed_ms: telegramGeneration.cutover.elapsed_ms,
281
- error: telegramGeneration.cutover.error,
282
- active_generation_id: telegramGeneration.active_generation?.generation_id ?? null,
283
- },
284
- });
285
- }
286
- else {
287
- logLifecycle({
288
- level: "info",
289
- stage: "cutover",
290
- state: "skipped",
291
- extra: {
292
- cutover_reason: "telegram_generation_no_cutover",
293
- active_generation_id: telegramGeneration.active_generation?.generation_id ?? null,
294
- },
295
- });
296
- }
297
- if (telegramGeneration.drain) {
298
- logLifecycle({ level: "info", stage: "drain", state: "start" });
299
- drainDurationMs = Math.max(0, Math.trunc(telegramGeneration.drain.elapsed_ms));
300
- generationTelemetry.recordDrainDuration(tags, {
301
- durationMs: drainDurationMs,
302
- timedOut: telegramGeneration.drain.timed_out,
303
- metadata: {
304
- ...baseFields,
305
- telegram_forced_stop: telegramGeneration.drain.forced_stop,
306
- telegram_generation_id: telegramGeneration.active_generation?.generation_id ?? null,
307
- },
308
- });
309
- logLifecycle({
310
- level: telegramGeneration.drain.ok ? "info" : "warn",
311
- stage: "drain",
312
- state: telegramGeneration.drain.ok ? "complete" : "failed",
313
- extra: {
314
- drain_duration_ms: telegramGeneration.drain.elapsed_ms,
315
- drain_timed_out: telegramGeneration.drain.timed_out,
316
- forced_stop: telegramGeneration.drain.forced_stop,
317
- error: telegramGeneration.drain.error,
318
- },
319
- });
320
- }
321
- else {
322
- logLifecycle({
323
- level: "info",
324
- stage: "drain",
325
- state: "skipped",
326
- extra: {
327
- drain_reason: "telegram_generation_no_drain",
328
- telegram_generation_id: telegramGeneration.active_generation?.generation_id ?? null,
329
- },
330
- });
331
- }
332
- const shouldLogRollbackStart = telegramGeneration.rollback.requested ||
333
- telegramGeneration.rollback.attempted ||
334
- telegramGeneration.rollback.trigger != null ||
335
- !telegramGeneration.ok;
336
- if (shouldLogRollbackStart) {
337
- logLifecycle({
338
- level: telegramGeneration.rollback.ok ? "warn" : "error",
339
- stage: "rollback",
340
- state: "start",
341
- extra: {
342
- rollback_requested: telegramGeneration.rollback.requested,
343
- rollback_trigger: telegramGeneration.rollback.trigger,
344
- rollback_attempted: telegramGeneration.rollback.attempted,
345
- },
346
- });
347
- logLifecycle({
348
- level: telegramGeneration.rollback.ok ? "info" : "error",
349
- stage: "rollback",
350
- state: telegramGeneration.rollback.ok ? "complete" : "failed",
351
- extra: {
352
- rollback_requested: telegramGeneration.rollback.requested,
353
- rollback_trigger: telegramGeneration.rollback.trigger,
354
- rollback_attempted: telegramGeneration.rollback.attempted,
355
- error: telegramGeneration.rollback.error,
356
- },
357
- });
358
- }
359
- else {
360
- logLifecycle({
361
- level: "debug",
362
- stage: "rollback",
363
- state: "skipped",
364
- extra: {
365
- rollback_reason: "not_requested",
366
- },
367
- });
368
- }
369
- if (telegramGeneration.ok) {
370
- swapped = generationSupervisor.markSwapInstalled(attempt.attempt_id);
371
- generationSupervisor.finishReload(attempt.attempt_id, "success");
372
- const elapsedMs = Math.max(0, Date.now() - startedAtMs);
373
- generationTelemetry.recordReloadSuccess(tags, {
374
- ...baseFields,
375
- elapsed_ms: elapsedMs,
376
- drain_duration_ms: drainDurationMs,
377
- telegram_generation_id: telegramGeneration.active_generation?.generation_id ?? null,
378
- telegram_rollback_attempted: telegramGeneration.rollback.attempted,
379
- telegram_rollback_trigger: telegramGeneration.rollback.trigger,
380
- });
381
- generationTelemetry.trace({
382
- name: "control_plane.reload",
383
- status: "ok",
384
- durationMs: elapsedMs,
385
- fields: {
386
- ...tags,
387
- ...baseFields,
388
- telegram_generation_id: telegramGeneration.active_generation?.generation_id ?? null,
389
- },
390
- });
391
- return {
392
- ok: true,
393
- reason,
394
- previous_control_plane: previousSummary,
395
- control_plane: summarizeControlPlane(controlPlaneCurrent),
396
- generation: {
397
- attempt_id: attempt.attempt_id,
398
- coalesced: planned.coalesced,
399
- from_generation: attempt.from_generation,
400
- to_generation: attempt.to_generation,
401
- active_generation: generationSupervisor.activeGeneration(),
402
- outcome: "success",
403
- },
404
- telegram_generation: telegramGeneration,
405
- };
406
- }
407
- generationSupervisor.finishReload(attempt.attempt_id, "failure");
408
- const error = telegramGeneration.error ?? "telegram_generation_reload_failed";
409
- const elapsedMs = Math.max(0, Date.now() - startedAtMs);
410
- generationTelemetry.recordReloadFailure(tags, {
411
- ...baseFields,
412
- elapsed_ms: elapsedMs,
413
- drain_duration_ms: drainDurationMs,
414
- error,
415
- telegram_generation_id: telegramGeneration.active_generation?.generation_id ?? null,
416
- telegram_rollback_trigger: telegramGeneration.rollback.trigger,
417
- });
418
- generationTelemetry.trace({
419
- name: "control_plane.reload",
420
- status: "error",
421
- durationMs: elapsedMs,
422
- fields: {
423
- ...tags,
424
- ...baseFields,
425
- error,
426
- telegram_generation_id: telegramGeneration.active_generation?.generation_id ?? null,
427
- telegram_rollback_trigger: telegramGeneration.rollback.trigger,
428
- },
429
- });
430
- return {
431
- ok: false,
432
- reason,
433
- previous_control_plane: previousSummary,
434
- control_plane: summarizeControlPlane(controlPlaneCurrent),
435
- generation: {
436
- attempt_id: attempt.attempt_id,
437
- coalesced: planned.coalesced,
438
- from_generation: attempt.from_generation,
439
- to_generation: attempt.to_generation,
440
- active_generation: generationSupervisor.activeGeneration(),
441
- outcome: "failure",
442
- },
443
- telegram_generation: telegramGeneration,
444
- error,
445
- };
446
- }
447
- const next = await controlPlaneReloader({
448
- repoRoot: context.repoRoot,
449
- previous,
450
- config: latestConfig.control_plane,
451
- generation: attempt.to_generation,
452
- });
453
- nextHandle = next;
454
- logLifecycle({ level: "info", stage: "warmup", state: "complete" });
455
- failedStage = "cutover";
456
- logLifecycle({ level: "info", stage: "cutover", state: "start" });
457
- controlPlaneCurrent = next;
458
- swapped = generationSupervisor.markSwapInstalled(attempt.attempt_id);
459
- logLifecycle({
460
- level: "info",
461
- stage: "cutover",
462
- state: "complete",
463
- extra: {
464
- active_generation_id: generationSupervisor.activeGeneration()?.generation_id ?? null,
465
- },
466
- });
467
- failedStage = "drain";
468
- if (previous && previous !== next) {
469
- logLifecycle({ level: "info", stage: "drain", state: "start" });
470
- drainStartedAtMs = Date.now();
471
- await previous.stop();
472
- drainDurationMs = Math.max(0, Date.now() - drainStartedAtMs);
473
- generationTelemetry.recordDrainDuration(tags, {
474
- durationMs: drainDurationMs,
475
- metadata: {
476
- ...baseFields,
477
- },
478
- });
479
- logLifecycle({
480
- level: "info",
481
- stage: "drain",
482
- state: "complete",
483
- extra: {
484
- drain_duration_ms: drainDurationMs,
485
- },
486
- });
487
- }
488
- else {
489
- logLifecycle({
490
- level: "info",
491
- stage: "drain",
492
- state: "skipped",
493
- extra: {
494
- drain_reason: "no_previous_generation",
495
- },
496
- });
497
- }
498
- logLifecycle({
499
- level: "debug",
500
- stage: "rollback",
501
- state: "skipped",
502
- extra: {
503
- rollback_reason: "not_requested",
504
- },
505
- });
506
- generationSupervisor.finishReload(attempt.attempt_id, "success");
507
- const elapsedMs = Math.max(0, Date.now() - startedAtMs);
508
- generationTelemetry.recordReloadSuccess(tags, {
509
- ...baseFields,
510
- elapsed_ms: elapsedMs,
511
- drain_duration_ms: drainDurationMs,
512
- });
513
- generationTelemetry.trace({
514
- name: "control_plane.reload",
515
- status: "ok",
516
- durationMs: elapsedMs,
517
- fields: {
518
- ...tags,
519
- ...baseFields,
520
- },
521
- });
522
- return {
523
- ok: true,
524
- reason,
525
- previous_control_plane: previousSummary,
526
- control_plane: summarizeControlPlane(next),
527
- generation: {
528
- attempt_id: attempt.attempt_id,
529
- coalesced: planned.coalesced,
530
- from_generation: attempt.from_generation,
531
- to_generation: attempt.to_generation,
532
- active_generation: generationSupervisor.activeGeneration(),
533
- outcome: "success",
534
- },
535
- };
536
- }
537
- catch (err) {
538
- const error = describeError(err);
539
- if (failedStage === "drain" && drainStartedAtMs != null) {
540
- drainDurationMs = Math.max(0, Date.now() - drainStartedAtMs);
541
- generationTelemetry.recordDrainDuration(tags, {
542
- durationMs: drainDurationMs,
543
- metadata: {
544
- ...baseFields,
545
- error,
546
- },
547
- });
548
- }
549
- logLifecycle({
550
- level: "error",
551
- stage: failedStage,
552
- state: "failed",
553
- extra: {
554
- error,
555
- drain_duration_ms: failedStage === "drain" ? drainDurationMs : undefined,
556
- },
557
- });
558
- if (swapped) {
559
- logLifecycle({
560
- level: "warn",
561
- stage: "rollback",
562
- state: "start",
563
- extra: {
564
- rollback_reason: "reload_failed_after_cutover",
565
- rollback_target_generation_id: attempt.from_generation?.generation_id ?? null,
566
- rollback_source_generation_id: attempt.to_generation.generation_id,
567
- },
568
- });
569
- if (!previous) {
570
- logLifecycle({
571
- level: "error",
572
- stage: "rollback",
573
- state: "failed",
574
- extra: {
575
- rollback_reason: "no_previous_generation",
576
- rollback_source_generation_id: attempt.to_generation.generation_id,
577
- },
578
- });
579
- }
580
- else {
581
- try {
582
- const restored = generationSupervisor.rollbackSwapInstalled(attempt.attempt_id);
583
- if (!restored) {
584
- throw new Error("generation_rollback_state_mismatch");
585
- }
586
- controlPlaneCurrent = previous;
587
- if (nextHandle && nextHandle !== previous) {
588
- await nextHandle.stop();
589
- }
590
- logLifecycle({
591
- level: "info",
592
- stage: "rollback",
593
- state: "complete",
594
- extra: {
595
- active_generation_id: generationSupervisor.activeGeneration()?.generation_id ?? null,
596
- rollback_target_generation_id: attempt.from_generation?.generation_id ?? null,
597
- },
598
- });
599
- }
600
- catch (rollbackErr) {
601
- logLifecycle({
602
- level: "error",
603
- stage: "rollback",
604
- state: "failed",
605
- extra: {
606
- error: describeError(rollbackErr),
607
- active_generation_id: generationSupervisor.activeGeneration()?.generation_id ?? null,
608
- rollback_target_generation_id: attempt.from_generation?.generation_id ?? null,
609
- rollback_source_generation_id: attempt.to_generation.generation_id,
610
- },
611
- });
612
- }
613
- }
614
- }
615
- else {
616
- logLifecycle({
617
- level: "debug",
618
- stage: "rollback",
619
- state: "skipped",
620
- extra: {
621
- rollback_reason: "cutover_not_installed",
622
- },
623
- });
624
- }
625
- generationSupervisor.finishReload(attempt.attempt_id, "failure");
626
- const elapsedMs = Math.max(0, Date.now() - startedAtMs);
627
- generationTelemetry.recordReloadFailure(tags, {
628
- ...baseFields,
629
- elapsed_ms: elapsedMs,
630
- drain_duration_ms: drainDurationMs,
631
- error,
632
- });
633
- generationTelemetry.trace({
634
- name: "control_plane.reload",
635
- status: "error",
636
- durationMs: elapsedMs,
637
- fields: {
638
- ...tags,
639
- ...baseFields,
640
- error,
641
- },
642
- });
643
- return {
644
- ok: false,
645
- reason,
646
- previous_control_plane: previousSummary,
647
- control_plane: summarizeControlPlane(controlPlaneCurrent),
648
- generation: {
649
- attempt_id: attempt.attempt_id,
650
- coalesced: planned.coalesced,
651
- from_generation: attempt.from_generation,
652
- to_generation: attempt.to_generation,
653
- active_generation: generationSupervisor.activeGeneration(),
654
- outcome: "failure",
655
- },
656
- error,
657
- };
658
- }
659
- };
660
- const reloadControlPlane = async (reason) => {
661
- if (reloadInFlight) {
662
- const pending = generationSupervisor.pendingReload();
663
- const fallbackGeneration = generationSupervisor.activeGeneration() ??
664
- generationSupervisor.snapshot().last_reload?.to_generation ??
665
- null;
666
- const generation = pending?.to_generation ?? fallbackGeneration;
667
- if (generation) {
668
- generationTelemetry.recordDuplicateSignal(generationTagsFor(generation, "server.reload"), {
669
- source: "server_reload",
670
- signal: "coalesced_reload_request",
671
- dedupe_key: pending?.attempt_id ?? "reload_in_flight",
672
- record_id: pending?.attempt_id ?? "reload_in_flight",
673
- metadata: {
674
- reason,
675
- pending_reason: pending?.reason ?? null,
676
- },
677
- });
678
- }
679
- return await reloadInFlight;
680
- }
681
- reloadInFlight = performControlPlaneReload(reason).finally(() => {
682
- reloadInFlight = null;
683
- });
684
- return await reloadInFlight;
685
- };
686
500
  const handleRequest = createServerRequestHandler({
687
501
  context,
688
502
  controlPlaneProxy,
@@ -692,16 +506,11 @@ function createServer(options = {}) {
692
506
  loadConfigFromDisk,
693
507
  writeConfig,
694
508
  reloadControlPlane,
695
- getControlPlaneStatus: () => ({
696
- ...summarizeControlPlane(controlPlaneCurrent),
697
- generation: generationSupervisor.snapshot(),
698
- observability: {
699
- counters: generationTelemetry.counters(),
700
- },
701
- }),
509
+ getControlPlaneStatus: reloadManager.getControlPlaneStatus,
702
510
  registerAutoRunHeartbeatProgram,
703
511
  disableAutoRunHeartbeatProgram,
704
512
  describeError,
513
+ initiateShutdown: options.initiateShutdown,
705
514
  });
706
515
  const server = {
707
516
  port: options.port || 3000,