@agent-vm/agent-vm 0.0.92 → 0.0.93

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (129) hide show
  1. package/dist/build/managed-image-dockerfile.d.ts +2 -1
  2. package/dist/build/managed-image-dockerfile.d.ts.map +1 -1
  3. package/dist/build/managed-image-dockerfile.js +51 -27
  4. package/dist/build/managed-image-dockerfile.js.map +1 -1
  5. package/dist/cli/commands/controller-definition.d.ts +42 -42
  6. package/dist/cli/commands/create-app.d.ts +42 -42
  7. package/dist/cli/manual-templates.d.ts.map +1 -1
  8. package/dist/cli/manual-templates.js +11 -2
  9. package/dist/cli/manual-templates.js.map +1 -1
  10. package/dist/config/system-config.d.ts +7 -0
  11. package/dist/config/system-config.d.ts.map +1 -1
  12. package/dist/config/system-config.js +35 -0
  13. package/dist/config/system-config.js.map +1 -1
  14. package/dist/controller/controller-runtime-operations.d.ts +1 -0
  15. package/dist/controller/controller-runtime-operations.d.ts.map +1 -1
  16. package/dist/controller/controller-runtime-operations.js +2 -0
  17. package/dist/controller/controller-runtime-operations.js.map +1 -1
  18. package/dist/controller/controller-runtime-types.d.ts +3 -0
  19. package/dist/controller/controller-runtime-types.d.ts.map +1 -1
  20. package/dist/controller/controller-runtime.d.ts +1 -1
  21. package/dist/controller/controller-runtime.d.ts.map +1 -1
  22. package/dist/controller/controller-runtime.js +207 -116
  23. package/dist/controller/controller-runtime.js.map +1 -1
  24. package/dist/controller/health/channel-provider-recovery-observation.d.ts +23 -0
  25. package/dist/controller/health/channel-provider-recovery-observation.d.ts.map +1 -0
  26. package/dist/controller/health/channel-provider-recovery-observation.js +69 -0
  27. package/dist/controller/health/channel-provider-recovery-observation.js.map +1 -0
  28. package/dist/controller/health/durable-health-event-log.d.ts +24 -0
  29. package/dist/controller/health/durable-health-event-log.d.ts.map +1 -0
  30. package/dist/controller/health/durable-health-event-log.js +89 -0
  31. package/dist/controller/health/durable-health-event-log.js.map +1 -0
  32. package/dist/controller/health/gateway-recovery-actions.d.ts +27 -0
  33. package/dist/controller/health/gateway-recovery-actions.d.ts.map +1 -0
  34. package/dist/controller/health/gateway-recovery-actions.js +71 -0
  35. package/dist/controller/health/gateway-recovery-actions.js.map +1 -0
  36. package/dist/controller/health/gateway-service-health-monitor.d.ts +41 -3
  37. package/dist/controller/health/gateway-service-health-monitor.d.ts.map +1 -1
  38. package/dist/controller/health/gateway-service-health-monitor.js +231 -57
  39. package/dist/controller/health/gateway-service-health-monitor.js.map +1 -1
  40. package/dist/controller/health/gateway-vm-recovery-policy.d.ts +20 -0
  41. package/dist/controller/health/gateway-vm-recovery-policy.d.ts.map +1 -1
  42. package/dist/controller/health/gateway-vm-recovery-policy.js +85 -21
  43. package/dist/controller/health/gateway-vm-recovery-policy.js.map +1 -1
  44. package/dist/controller/health/gateway-vm-recovery-runner.d.ts +39 -0
  45. package/dist/controller/health/gateway-vm-recovery-runner.d.ts.map +1 -0
  46. package/dist/controller/health/gateway-vm-recovery-runner.js +251 -0
  47. package/dist/controller/health/gateway-vm-recovery-runner.js.map +1 -0
  48. package/dist/controller/health/health-event-store.d.ts +4 -0
  49. package/dist/controller/health/health-event-store.d.ts.map +1 -1
  50. package/dist/controller/health/health-event-store.js +19 -0
  51. package/dist/controller/health/health-event-store.js.map +1 -1
  52. package/dist/controller/http/controller-health-event-routes.d.ts +6 -0
  53. package/dist/controller/http/controller-health-event-routes.d.ts.map +1 -1
  54. package/dist/controller/http/controller-health-event-routes.js +49 -0
  55. package/dist/controller/http/controller-health-event-routes.js.map +1 -1
  56. package/dist/controller/http/controller-http-routes.d.ts.map +1 -1
  57. package/dist/controller/http/controller-http-routes.js +6 -0
  58. package/dist/controller/http/controller-http-routes.js.map +1 -1
  59. package/dist/controller/leases/lease-manager.d.ts.map +1 -1
  60. package/dist/controller/leases/lease-manager.js +37 -16
  61. package/dist/controller/leases/lease-manager.js.map +1 -1
  62. package/dist/controller/leases/tool-vm-lease-lifecycle.d.ts +44 -0
  63. package/dist/controller/leases/tool-vm-lease-lifecycle.d.ts.map +1 -0
  64. package/dist/controller/leases/tool-vm-lease-lifecycle.js +28 -0
  65. package/dist/controller/leases/tool-vm-lease-lifecycle.js.map +1 -0
  66. package/dist/controller/zone-runtimes/gateway-lifecycle-operation-record.d.ts +37 -0
  67. package/dist/controller/zone-runtimes/gateway-lifecycle-operation-record.d.ts.map +1 -0
  68. package/dist/controller/zone-runtimes/gateway-lifecycle-operation-record.js +133 -0
  69. package/dist/controller/zone-runtimes/gateway-lifecycle-operation-record.js.map +1 -0
  70. package/dist/controller/zone-runtimes/gateway-zone-state-machine.d.ts +101 -0
  71. package/dist/controller/zone-runtimes/gateway-zone-state-machine.d.ts.map +1 -0
  72. package/dist/controller/zone-runtimes/gateway-zone-state-machine.js +143 -0
  73. package/dist/controller/zone-runtimes/gateway-zone-state-machine.js.map +1 -0
  74. package/dist/controller/zone-runtimes/openclaw-zone-runtime.d.ts +8 -1
  75. package/dist/controller/zone-runtimes/openclaw-zone-runtime.d.ts.map +1 -1
  76. package/dist/controller/zone-runtimes/openclaw-zone-runtime.js +621 -65
  77. package/dist/controller/zone-runtimes/openclaw-zone-runtime.js.map +1 -1
  78. package/dist/controller/zone-runtimes/zone-runtime-errors.d.ts +7 -1
  79. package/dist/controller/zone-runtimes/zone-runtime-errors.d.ts.map +1 -1
  80. package/dist/controller/zone-runtimes/zone-runtime-errors.js +5 -1
  81. package/dist/controller/zone-runtimes/zone-runtime-errors.js.map +1 -1
  82. package/dist/controller/zone-runtimes/zone-runtime-registry.d.ts +2 -0
  83. package/dist/controller/zone-runtimes/zone-runtime-registry.d.ts.map +1 -1
  84. package/dist/controller/zone-runtimes/zone-runtime-registry.js +23 -0
  85. package/dist/controller/zone-runtimes/zone-runtime-registry.js.map +1 -1
  86. package/dist/controller/zone-runtimes/zone-runtime-types.d.ts +7 -0
  87. package/dist/controller/zone-runtimes/zone-runtime-types.d.ts.map +1 -1
  88. package/dist/gateway/gateway-ownership-evidence.d.ts +35 -0
  89. package/dist/gateway/gateway-ownership-evidence.d.ts.map +1 -0
  90. package/dist/gateway/gateway-ownership-evidence.js +10 -0
  91. package/dist/gateway/gateway-ownership-evidence.js.map +1 -0
  92. package/dist/gateway/gateway-recovery.d.ts +16 -0
  93. package/dist/gateway/gateway-recovery.d.ts.map +1 -1
  94. package/dist/gateway/gateway-recovery.js +105 -9
  95. package/dist/gateway/gateway-recovery.js.map +1 -1
  96. package/dist/gateway/gateway-zone-orchestrator.d.ts.map +1 -1
  97. package/dist/gateway/gateway-zone-orchestrator.js +50 -39
  98. package/dist/gateway/gateway-zone-orchestrator.js.map +1 -1
  99. package/dist/integration-tests/{smoke-harness.d.ts → e2e-harness.d.ts} +45 -37
  100. package/dist/integration-tests/e2e-harness.d.ts.map +1 -0
  101. package/dist/integration-tests/{smoke-harness.js → e2e-harness.js} +112 -94
  102. package/dist/integration-tests/e2e-harness.js.map +1 -0
  103. package/dist/integration-tests/e2e-workspace-build-global-setup.d.ts +16 -0
  104. package/dist/integration-tests/e2e-workspace-build-global-setup.d.ts.map +1 -0
  105. package/dist/integration-tests/e2e-workspace-build-global-setup.js +27 -0
  106. package/dist/integration-tests/e2e-workspace-build-global-setup.js.map +1 -0
  107. package/dist/integration-tests/live-agent-model-roundtrip-deployment.d.ts +11 -0
  108. package/dist/integration-tests/live-agent-model-roundtrip-deployment.d.ts.map +1 -0
  109. package/dist/integration-tests/live-agent-model-roundtrip-deployment.js +48 -0
  110. package/dist/integration-tests/live-agent-model-roundtrip-deployment.js.map +1 -0
  111. package/dist/integration-tests/live-agent-model-roundtrip-gates.d.ts +11 -0
  112. package/dist/integration-tests/live-agent-model-roundtrip-gates.d.ts.map +1 -0
  113. package/dist/integration-tests/live-agent-model-roundtrip-gates.js +21 -0
  114. package/dist/integration-tests/live-agent-model-roundtrip-gates.js.map +1 -0
  115. package/dist/integration-tests/live-vm-e2e-gates.d.ts +2 -0
  116. package/dist/integration-tests/live-vm-e2e-gates.d.ts.map +1 -0
  117. package/dist/integration-tests/live-vm-e2e-gates.js +4 -0
  118. package/dist/integration-tests/live-vm-e2e-gates.js.map +1 -0
  119. package/dist/operations/controller-status.d.ts +5 -0
  120. package/dist/operations/controller-status.d.ts.map +1 -1
  121. package/dist/operations/controller-status.js +42 -0
  122. package/dist/operations/controller-status.js.map +1 -1
  123. package/package.json +11 -11
  124. package/dist/integration-tests/live-integration-gates.d.ts +0 -2
  125. package/dist/integration-tests/live-integration-gates.d.ts.map +0 -1
  126. package/dist/integration-tests/live-integration-gates.js +0 -4
  127. package/dist/integration-tests/live-integration-gates.js.map +0 -1
  128. package/dist/integration-tests/smoke-harness.d.ts.map +0 -1
  129. package/dist/integration-tests/smoke-harness.js.map +0 -1
@@ -1,13 +1,26 @@
1
+ import { randomUUID } from 'node:crypto';
1
2
  import { resolveZoneSecrets } from '../../gateway/credential-manager.js';
2
3
  import { runGatewayHealthCheck } from '../../gateway/gateway-health-check.js';
4
+ import { GatewayOwnershipUnsafeError } from '../../gateway/gateway-ownership-evidence.js';
3
5
  import { deleteGatewayRuntimeRecord as deleteGatewayRuntimeRecordDefault } from '../../gateway/gateway-runtime-record.js';
4
6
  import { startGatewayZone } from '../../gateway/gateway-zone-orchestrator.js';
5
7
  import { runControllerCredentialsRefresh as runControllerCredentialsRefreshDefault } from '../../operations/credentials-refresh.js';
6
8
  import { runControllerDestroy as runControllerDestroyDefault } from '../../operations/destroy-zone.js';
7
9
  import { runControllerUpgrade as runControllerUpgradeDefault } from '../../operations/upgrade-zone.js';
8
10
  import { runControllerLogs as runControllerLogsDefault } from '../../operations/zone-logs.js';
11
+ import { isProcessAlive as defaultIsProcessAlive } from '../../shared/managed-vm-process.js';
12
+ import { appendGatewayLifecycleOperationRecord as appendGatewayLifecycleOperationRecordDefault, } from './gateway-lifecycle-operation-record.js';
13
+ import { classifyGatewayStartError, deriveGatewayDiagnosisSnapshot, } from './gateway-zone-state-machine.js';
9
14
  import { ControllerZoneRuntimeStartError, ControllerZoneRuntimeUnavailableError, } from './zone-runtime-errors.js';
10
15
  const defaultGatewayCloseTimeoutMs = 60_000;
16
+ function isLifecycleOperationExecutionWithLock(execution) {
17
+ return typeof execution === 'object' && execution !== null && 'lock' in execution;
18
+ }
19
+ function isRecoverySecretResolutionFailure(record) {
20
+ return (record.errorCode === 'secret-resolution-failed' &&
21
+ (record.operationTrigger === 'auto-recovery' ||
22
+ record.operationTrigger === 'credentials-refresh'));
23
+ }
11
24
  class OpenClawZoneRestartTimeoutError extends Error {
12
25
  code = 'OPENCLAW_GATEWAY_RESTART_TIMEOUT';
13
26
  constructor(zoneId, timeoutMs) {
@@ -33,32 +46,280 @@ function buildOpenClawCombinedLogsCommand(logPath) {
33
46
  function writeOpenClawZoneRuntimeLog(message) {
34
47
  process.stderr.write(`[openclaw-zone-runtime] ${message}\n`);
35
48
  }
49
+ function unavailableReasonForState(state) {
50
+ switch (state.kind) {
51
+ case 'failed':
52
+ return state.error.message;
53
+ case 'owner-unsafe':
54
+ return `Gateway runtime ownership is unsafe: ${state.evidence.kind}.`;
55
+ case 'restarting':
56
+ case 'starting':
57
+ case 'stopping':
58
+ return `Gateway runtime is ${state.kind}.`;
59
+ case 'running':
60
+ case 'running-degraded':
61
+ case 'stopped':
62
+ return undefined;
63
+ }
64
+ return assertNeverGatewayZoneLifecycleState(state);
65
+ }
66
+ function assertNeverGatewayZoneLifecycleState(state) {
67
+ throw new Error(`Unhandled gateway zone lifecycle state: ${JSON.stringify(state)}`);
68
+ }
69
+ function assertNeverGatewayLifecycleOperationRecordKind(kind) {
70
+ throw new Error(`Unhandled gateway lifecycle operation record kind: ${String(kind)}`);
71
+ }
72
+ function gatewayIdentityFor(runtimeGateway) {
73
+ if (!runtimeGateway) {
74
+ return undefined;
75
+ }
76
+ const hostPid = runtimeGateway.vm.getHostPid();
77
+ return {
78
+ ...(typeof hostPid === 'number' && hostPid > 0 ? { hostPid } : {}),
79
+ vmId: runtimeGateway.vm.id,
80
+ };
81
+ }
82
+ async function executeGatewayCommand(runtimeGateway, command) {
83
+ const result = await runtimeGateway.vm.exec(command);
84
+ return {
85
+ exitCode: result.exitCode,
86
+ stderr: result.stderr,
87
+ stdout: result.stdout,
88
+ };
89
+ }
36
90
  export function createOpenClawZoneRuntime(options) {
37
91
  const clearTimeoutImpl = options.clearTimeoutImpl ?? clearTimeout;
38
92
  const closeGatewayTimeoutMs = options.closeGatewayTimeoutMs ?? defaultGatewayCloseTimeoutMs;
93
+ const isProcessAlive = options.isProcessAlive ?? defaultIsProcessAlive;
39
94
  const setTimeoutImpl = options.setTimeoutImpl ?? setTimeout;
95
+ const appendGatewayLifecycleOperationRecord = options.appendGatewayLifecycleOperationRecord;
40
96
  let gateway;
41
97
  let bootedAt;
42
98
  let lastError;
99
+ let lastOperation = 'none';
100
+ let originalOutageCause = { kind: 'unknown' };
101
+ let lifecycleState = { kind: 'stopped' };
43
102
  let lifecycleOperation = Promise.resolve();
44
103
  let lifecycleGeneration = 0;
45
- const startGateway = async () => options.restartGatewayZone
46
- ? await options.restartGatewayZone(options.zone.id)
104
+ let staleGatewayPendingClose;
105
+ const startGateway = async (startOptions = {}) => options.restartGatewayZone
106
+ ? await options.restartGatewayZone(options.zone.id, startOptions)
47
107
  : await startGatewayZone({
48
- secretResolver: options.secretResolver,
108
+ secretResolver: startOptions.secretResolver ?? options.secretResolver,
49
109
  systemConfig: options.systemConfig,
50
110
  zoneId: options.zone.id,
51
111
  });
52
112
  const requireGateway = () => {
53
- if (!gateway) {
54
- throw new ControllerZoneRuntimeUnavailableError(options.zone.id, lastError);
113
+ const currentState = getLifecycleState();
114
+ if (currentState.kind !== 'running' && currentState.kind !== 'running-degraded') {
115
+ throw new ControllerZoneRuntimeUnavailableError(options.zone.id, lastError ?? unavailableReasonForState(currentState));
116
+ }
117
+ return currentState.gateway;
118
+ };
119
+ const createOperationId = (operationName) => `${options.zone.id}-${operationName}-${randomUUID()}`;
120
+ const operationForRecordKind = (kind) => {
121
+ switch (kind) {
122
+ case 'cold-start-requested':
123
+ return 'cold-start';
124
+ case 'credentials-refresh-requested':
125
+ return 'credentials-refresh';
126
+ case 'restart-requested':
127
+ return 'restart';
128
+ case 'start-requested':
129
+ return 'start';
130
+ case 'stop-requested':
131
+ return 'stop';
132
+ case 'operation-failed':
133
+ case 'operation-finished':
134
+ case 'runtime-record-deleted':
135
+ case 'runtime-record-written':
136
+ case 'vm-close-finished':
137
+ case 'vm-close-started':
138
+ return undefined;
139
+ }
140
+ return assertNeverGatewayLifecycleOperationRecordKind(kind);
141
+ };
142
+ const setOriginalOutageCauseIfUnknown = (errorCode) => {
143
+ if (originalOutageCause.kind !== 'unknown') {
144
+ return;
145
+ }
146
+ originalOutageCause = {
147
+ ...(errorCode === undefined ? {} : { errorCode }),
148
+ eventKind: 'gateway-lifecycle-operation',
149
+ kind: 'proven',
150
+ };
151
+ };
152
+ const recordLifecycleOperation = async (record) => {
153
+ const operation = operationForRecordKind(record.kind);
154
+ if (operation !== undefined) {
155
+ lastOperation = operation;
156
+ }
157
+ if (record.kind === 'operation-failed' && !isRecoverySecretResolutionFailure(record)) {
158
+ setOriginalOutageCauseIfUnknown(record.errorCode);
159
+ }
160
+ const operationRecord = {
161
+ controllerPid: process.pid,
162
+ gatewayType: 'openclaw',
163
+ observedAtMs: options.now(),
164
+ zoneId: options.zone.id,
165
+ ...record,
166
+ };
167
+ try {
168
+ if (appendGatewayLifecycleOperationRecord) {
169
+ await appendGatewayLifecycleOperationRecord(operationRecord);
170
+ return;
171
+ }
172
+ await appendGatewayLifecycleOperationRecordDefault({
173
+ record: operationRecord,
174
+ runtimeDir: options.systemConfig.runtimeDir,
175
+ zoneId: options.zone.id,
176
+ });
177
+ }
178
+ catch (error) {
179
+ writeOpenClawZoneRuntimeLog(`failed to append gateway lifecycle operation record for zone '${options.zone.id}': ${formatUnknownError(error)}`);
180
+ }
181
+ };
182
+ const markGatewayHostPidMissing = (message) => {
183
+ if (staleGatewayPendingClose === undefined &&
184
+ (lifecycleState.kind === 'running' || lifecycleState.kind === 'running-degraded')) {
185
+ staleGatewayPendingClose = lifecycleState.gateway;
186
+ }
187
+ const errorMessage = `vm-process-missing: ${message}`;
188
+ setOriginalOutageCauseIfUnknown('vm-process-missing');
189
+ gateway = undefined;
190
+ bootedAt = undefined;
191
+ lastError = errorMessage;
192
+ lifecycleState = {
193
+ coldStartEligible: true,
194
+ error: { code: 'vm-process-missing', message: errorMessage },
195
+ kind: 'failed',
196
+ };
197
+ return lifecycleState;
198
+ };
199
+ const closeStaleGatewayBeforeColdStart = async (operationContext) => {
200
+ const staleGateway = staleGatewayPendingClose;
201
+ if (!staleGateway) {
202
+ return;
203
+ }
204
+ staleGatewayPendingClose = undefined;
205
+ try {
206
+ await recordLifecycleOperation({
207
+ kind: 'vm-close-started',
208
+ operationId: operationContext.operationId,
209
+ operationTrigger: operationContext.operationTrigger,
210
+ previousGateway: gatewayIdentityFor(staleGateway),
211
+ });
212
+ await closeGatewayWithDeadline(staleGateway);
213
+ await recordLifecycleOperation({
214
+ kind: 'vm-close-finished',
215
+ operationId: operationContext.operationId,
216
+ operationTrigger: operationContext.operationTrigger,
217
+ previousGateway: gatewayIdentityFor(staleGateway),
218
+ });
219
+ }
220
+ catch (error) {
221
+ staleGatewayPendingClose = staleGateway;
222
+ lastError = formatUnknownError(error);
223
+ lifecycleState = {
224
+ coldStartEligible: false,
225
+ error: {
226
+ code: 'owner-unsafe',
227
+ message: lastError,
228
+ },
229
+ kind: 'failed',
230
+ };
231
+ await recordLifecycleOperation({
232
+ errorCode: 'owner-unsafe',
233
+ errorMessage: lastError,
234
+ kind: 'operation-failed',
235
+ operationId: operationContext.operationId,
236
+ operationTrigger: operationContext.operationTrigger,
237
+ previousGateway: gatewayIdentityFor(staleGateway),
238
+ });
239
+ throw error;
240
+ }
241
+ };
242
+ const classifyLastError = (message) => {
243
+ if (message.startsWith('vm-process-missing:')) {
244
+ return {
245
+ coldStartEligible: true,
246
+ error: { code: 'vm-process-missing', message },
247
+ kind: 'failed',
248
+ };
55
249
  }
56
- return gateway;
250
+ const error = classifyGatewayStartError(new Error(message));
251
+ return {
252
+ coldStartEligible: error.code !== 'owner-unsafe',
253
+ error,
254
+ kind: 'failed',
255
+ };
256
+ };
257
+ const getLifecycleState = () => {
258
+ if (lifecycleState.kind === 'running' || lifecycleState.kind === 'running-degraded') {
259
+ const hostPid = lifecycleState.gateway.vm.getHostPid();
260
+ if (hostPid === undefined || hostPid === null) {
261
+ return markGatewayHostPidMissing(`Gateway VM host pid is unavailable for zone '${options.zone.id}'.`);
262
+ }
263
+ if (!isProcessAlive(hostPid)) {
264
+ return markGatewayHostPidMissing(`Gateway VM host pid ${String(hostPid)} is not alive for zone '${options.zone.id}'.`);
265
+ }
266
+ return lifecycleState;
267
+ }
268
+ if (lifecycleState.kind === 'failed' || lifecycleState.kind === 'owner-unsafe') {
269
+ return lifecycleState;
270
+ }
271
+ if (lifecycleState.kind === 'starting' ||
272
+ lifecycleState.kind === 'stopping' ||
273
+ lifecycleState.kind === 'restarting') {
274
+ return lifecycleState;
275
+ }
276
+ if (lastError) {
277
+ lifecycleState = classifyLastError(lastError);
278
+ return lifecycleState;
279
+ }
280
+ return lifecycleState;
57
281
  };
58
282
  const runLifecycleOperation = async (operation) => {
59
- const operationPromise = lifecycleOperation.then(operation, operation);
60
- lifecycleOperation = operationPromise.then(() => undefined, () => undefined);
61
- return await operationPromise;
283
+ const runAfterPrevious = async () => {
284
+ await lifecycleOperation.catch(() => undefined);
285
+ return await operation();
286
+ };
287
+ const executionPromise = runAfterPrevious();
288
+ const operationResultPromise = executionPromise.then(async (execution) => {
289
+ if (isLifecycleOperationExecutionWithLock(execution)) {
290
+ return await execution.publicResult;
291
+ }
292
+ return await execution;
293
+ });
294
+ lifecycleOperation = executionPromise
295
+ .then(async (execution) => {
296
+ if (isLifecycleOperationExecutionWithLock(execution)) {
297
+ await execution.lock;
298
+ return;
299
+ }
300
+ await execution;
301
+ })
302
+ .then(() => undefined, () => undefined);
303
+ return await operationResultPromise;
304
+ };
305
+ const withLifecycleTimeout = (props) => {
306
+ let timeout;
307
+ const timeoutPromise = new Promise((_resolve, reject) => {
308
+ timeout = setTimeoutImpl(() => {
309
+ lifecycleGeneration += 1;
310
+ reject(new OpenClawZoneRestartTimeoutError(options.zone.id, props.timeoutMs));
311
+ }, props.timeoutMs);
312
+ timeout.unref?.();
313
+ });
314
+ const publicResult = Promise.race([props.operation, timeoutPromise]).finally(() => {
315
+ if (timeout) {
316
+ clearTimeoutImpl(timeout);
317
+ }
318
+ });
319
+ return {
320
+ lock: props.operation.then(() => undefined, () => undefined),
321
+ publicResult,
322
+ };
62
323
  };
63
324
  const releaseZoneLeases = async (zoneId) => {
64
325
  const leases = options.leaseManager
@@ -95,24 +356,125 @@ export function createOpenClawZoneRuntime(options) {
95
356
  }
96
357
  }
97
358
  };
98
- const stopNow = async () => {
359
+ const stopNow = async (next = 'stopped', operationContext) => {
99
360
  const activeGateway = gateway;
100
- gateway = undefined;
101
- bootedAt = undefined;
102
- lastError = undefined;
103
- if (activeGateway) {
104
- await closeGatewayWithDeadline(activeGateway);
361
+ const operationId = operationContext?.operationId ?? createOperationId('stop');
362
+ const operationTrigger = operationContext?.operationTrigger ?? 'operator-stop';
363
+ const previousGateway = operationContext?.previousGateway ?? activeGateway;
364
+ lifecycleState = {
365
+ kind: 'stopping',
366
+ next,
367
+ operationId,
368
+ previousGateway,
369
+ };
370
+ try {
371
+ await recordLifecycleOperation({
372
+ kind: 'stop-requested',
373
+ operationId,
374
+ operationTrigger,
375
+ previousGateway: gatewayIdentityFor(previousGateway),
376
+ });
377
+ if (activeGateway) {
378
+ await recordLifecycleOperation({
379
+ kind: 'vm-close-started',
380
+ operationId,
381
+ operationTrigger,
382
+ previousGateway: gatewayIdentityFor(previousGateway),
383
+ });
384
+ await closeGatewayWithDeadline(activeGateway);
385
+ await recordLifecycleOperation({
386
+ kind: 'vm-close-finished',
387
+ operationId,
388
+ operationTrigger,
389
+ previousGateway: gatewayIdentityFor(previousGateway),
390
+ });
391
+ }
392
+ await (options.deleteGatewayRuntimeRecord ?? deleteGatewayRuntimeRecordDefault)(options.zone.gateway.stateDir);
393
+ await recordLifecycleOperation({
394
+ kind: 'runtime-record-deleted',
395
+ operationId,
396
+ operationTrigger,
397
+ previousGateway: gatewayIdentityFor(previousGateway),
398
+ });
399
+ gateway = undefined;
400
+ bootedAt = undefined;
401
+ lastError = undefined;
402
+ lifecycleState = { kind: 'stopped' };
403
+ }
404
+ catch (error) {
405
+ lastError = formatUnknownError(error);
406
+ lifecycleState = {
407
+ coldStartEligible: false,
408
+ error: {
409
+ code: 'owner-unsafe',
410
+ message: lastError,
411
+ },
412
+ kind: 'failed',
413
+ };
414
+ await recordLifecycleOperation({
415
+ errorCode: 'owner-unsafe',
416
+ errorMessage: lastError,
417
+ kind: 'operation-failed',
418
+ operationId,
419
+ operationTrigger,
420
+ previousGateway: gatewayIdentityFor(previousGateway),
421
+ });
422
+ throw error;
105
423
  }
106
- await (options.deleteGatewayRuntimeRecord ?? deleteGatewayRuntimeRecordDefault)(options.zone.gateway.stateDir);
107
424
  };
108
- const startNow = async (expectedGeneration) => {
425
+ const startNow = async (expectedGeneration, startOptions = {}, operationContext) => {
426
+ const operationId = operationContext?.operationId ?? createOperationId('start');
427
+ const operationTrigger = operationContext?.operationTrigger ?? 'operator-start';
428
+ lifecycleState = {
429
+ kind: 'starting',
430
+ operationId,
431
+ startedAtMs: options.now(),
432
+ };
109
433
  try {
110
- const startedGateway = await startGateway();
434
+ await recordLifecycleOperation({
435
+ kind: 'start-requested',
436
+ operationId,
437
+ operationTrigger,
438
+ previousGateway: gatewayIdentityFor(operationContext?.previousGateway),
439
+ });
440
+ const startedGateway = await startGateway(startOptions);
111
441
  if (expectedGeneration !== undefined && expectedGeneration !== lifecycleGeneration) {
112
442
  try {
113
443
  await closeGatewayWithDeadline(startedGateway);
444
+ if (lifecycleGeneration === expectedGeneration + 1) {
445
+ await (options.deleteGatewayRuntimeRecord ?? deleteGatewayRuntimeRecordDefault)(options.zone.gateway.stateDir);
446
+ await recordLifecycleOperation({
447
+ currentGateway: gatewayIdentityFor(startedGateway),
448
+ kind: 'runtime-record-deleted',
449
+ operationId,
450
+ operationTrigger,
451
+ previousGateway: gatewayIdentityFor(operationContext?.previousGateway),
452
+ });
453
+ }
454
+ lastError = `stale-generation-closed: Closed stale gateway start for zone '${options.zone.id}'.`;
455
+ lifecycleState = classifyLastError(lastError);
456
+ await recordLifecycleOperation({
457
+ currentGateway: gatewayIdentityFor(startedGateway),
458
+ errorCode: 'stale-generation-closed',
459
+ errorMessage: lastError,
460
+ kind: 'operation-failed',
461
+ operationId,
462
+ operationTrigger,
463
+ previousGateway: gatewayIdentityFor(operationContext?.previousGateway),
464
+ });
114
465
  }
115
466
  catch (error) {
467
+ lastError = `stale-generation-closed: Failed to close stale gateway start for zone '${options.zone.id}': ${formatUnknownError(error)}`;
468
+ lifecycleState = classifyLastError(lastError);
469
+ await recordLifecycleOperation({
470
+ currentGateway: gatewayIdentityFor(startedGateway),
471
+ errorCode: 'stale-generation-closed',
472
+ errorMessage: lastError,
473
+ kind: 'operation-failed',
474
+ operationId,
475
+ operationTrigger,
476
+ previousGateway: gatewayIdentityFor(operationContext?.previousGateway),
477
+ });
116
478
  writeOpenClawZoneRuntimeLog(`stale gateway start cleanup failed for zone '${options.zone.id}': ${formatUnknownError(error)}`);
117
479
  }
118
480
  return;
@@ -120,54 +482,171 @@ export function createOpenClawZoneRuntime(options) {
120
482
  gateway = startedGateway;
121
483
  bootedAt = new Date(options.now()).toISOString();
122
484
  lastError = undefined;
485
+ lifecycleState = { gateway: startedGateway, kind: 'running' };
486
+ await recordLifecycleOperation({
487
+ currentGateway: gatewayIdentityFor(startedGateway),
488
+ kind: 'operation-finished',
489
+ operationId,
490
+ operationTrigger,
491
+ previousGateway: gatewayIdentityFor(operationContext?.previousGateway),
492
+ });
123
493
  }
124
494
  catch (error) {
495
+ if (error instanceof GatewayOwnershipUnsafeError) {
496
+ gateway = undefined;
497
+ bootedAt = undefined;
498
+ lastError = error.message;
499
+ lifecycleState = {
500
+ evidence: error.evidence,
501
+ kind: 'owner-unsafe',
502
+ };
503
+ await recordLifecycleOperation({
504
+ errorCode: 'owner-unsafe',
505
+ errorMessage: error.message,
506
+ kind: 'operation-failed',
507
+ operationId,
508
+ operationTrigger,
509
+ previousGateway: gatewayIdentityFor(operationContext?.previousGateway),
510
+ });
511
+ throw new ControllerZoneRuntimeStartError(options.zone.id, error, {
512
+ gatewayLifecycleErrorCode: 'owner-unsafe',
513
+ operationId,
514
+ });
515
+ }
516
+ const classifiedError = classifyGatewayStartError(error);
125
517
  gateway = undefined;
126
518
  bootedAt = undefined;
127
519
  lastError = formatUnknownError(error);
128
- throw new ControllerZoneRuntimeStartError(options.zone.id, error);
520
+ lifecycleState = {
521
+ coldStartEligible: true,
522
+ error: classifiedError,
523
+ kind: 'failed',
524
+ };
525
+ await recordLifecycleOperation({
526
+ errorCode: classifiedError.code,
527
+ errorMessage: classifiedError.message,
528
+ kind: 'operation-failed',
529
+ operationId,
530
+ operationTrigger,
531
+ previousGateway: gatewayIdentityFor(operationContext?.previousGateway),
532
+ });
533
+ throw new ControllerZoneRuntimeStartError(options.zone.id, error, {
534
+ gatewayLifecycleErrorCode: classifiedError.code,
535
+ operationId,
536
+ });
129
537
  }
130
538
  };
131
539
  const stop = async () => await runLifecycleOperation(async () => await stopNow());
132
- const start = async () => await runLifecycleOperation(async () => await startNow());
133
- const restart = async (restartOptions = {}) => {
540
+ const start = async () => await runLifecycleOperation(async () => await startNow(undefined, {}, {
541
+ operationId: createOperationId('start'),
542
+ operationTrigger: 'controller-start',
543
+ }));
544
+ const restartWithStartOptions = async (restartOptions = {}, startOptions = {}, operationMetadata = {}) => {
134
545
  return await runLifecycleOperation(async () => {
135
546
  lifecycleGeneration += 1;
136
547
  const operationGeneration = lifecycleGeneration;
137
- const restartOperation = (async () => {
548
+ const currentState = getLifecycleState();
549
+ const operationId = operationMetadata.operationId ?? createOperationId('restart');
550
+ const operationContext = {
551
+ operationId,
552
+ operationTrigger: operationMetadata.operationTrigger ??
553
+ restartOptions.operationTrigger ??
554
+ 'operator-restart',
555
+ previousGateway: currentState.kind === 'running' || currentState.kind === 'running-degraded'
556
+ ? currentState.gateway
557
+ : undefined,
558
+ };
559
+ if (currentState.kind === 'running' || currentState.kind === 'running-degraded') {
560
+ lifecycleState = {
561
+ kind: 'restarting',
562
+ operationId,
563
+ previousGateway: currentState.gateway,
564
+ };
565
+ }
566
+ const restartOperation = currentState.kind === 'running' || currentState.kind === 'running-degraded'
567
+ ? (async () => {
568
+ await recordLifecycleOperation({
569
+ kind: 'restart-requested',
570
+ operationId,
571
+ operationTrigger: operationContext.operationTrigger,
572
+ previousGateway: gatewayIdentityFor(operationContext.previousGateway),
573
+ });
574
+ const leaseReleaseResult = await releaseZoneLeases(options.zone.id);
575
+ await stopNow('starting', operationContext);
576
+ await startNow(operationGeneration, startOptions, operationContext);
577
+ if (operationGeneration !== lifecycleGeneration) {
578
+ throw new OpenClawZoneRestartTimeoutError(options.zone.id, restartOptions.timeoutMs ?? 0);
579
+ }
580
+ return {
581
+ leaseReleaseFailureCount: leaseReleaseResult.failedLeaseIds.length,
582
+ operationId,
583
+ };
584
+ })()
585
+ : (async () => {
586
+ await recordLifecycleOperation({
587
+ kind: 'cold-start-requested',
588
+ operationId,
589
+ operationTrigger: operationContext.operationTrigger,
590
+ });
591
+ const leaseReleaseResult = await releaseZoneLeases(options.zone.id);
592
+ await closeStaleGatewayBeforeColdStart(operationContext);
593
+ await startNow(operationGeneration, startOptions, operationContext);
594
+ if (operationGeneration !== lifecycleGeneration) {
595
+ throw new OpenClawZoneRestartTimeoutError(options.zone.id, restartOptions.timeoutMs ?? 0);
596
+ }
597
+ return {
598
+ leaseReleaseFailureCount: leaseReleaseResult.failedLeaseIds.length,
599
+ operationId,
600
+ };
601
+ })();
602
+ if (restartOptions.timeoutMs === undefined) {
603
+ return await restartOperation;
604
+ }
605
+ return withLifecycleTimeout({
606
+ operation: restartOperation,
607
+ timeoutMs: restartOptions.timeoutMs,
608
+ });
609
+ });
610
+ };
611
+ const restart = async (restartOptions = {}) => await restartWithStartOptions(restartOptions);
612
+ const coldStartWithStartOptions = async (restartOptions = {}, startOptions = {}, operationMetadata = {}) => {
613
+ return await runLifecycleOperation(async () => {
614
+ lifecycleGeneration += 1;
615
+ const operationGeneration = lifecycleGeneration;
616
+ const operationContext = {
617
+ operationId: operationMetadata.operationId ?? createOperationId('cold-start'),
618
+ operationTrigger: operationMetadata.operationTrigger ?? restartOptions.operationTrigger ?? 'auto-recovery',
619
+ };
620
+ const coldStartOperation = (async () => {
621
+ getLifecycleState();
622
+ await recordLifecycleOperation({
623
+ kind: 'cold-start-requested',
624
+ operationId: operationContext.operationId,
625
+ operationTrigger: operationContext.operationTrigger,
626
+ });
138
627
  const leaseReleaseResult = await releaseZoneLeases(options.zone.id);
139
- await stopNow();
140
- await startNow(operationGeneration);
628
+ await closeStaleGatewayBeforeColdStart(operationContext);
629
+ await startNow(operationGeneration, startOptions, operationContext);
141
630
  if (operationGeneration !== lifecycleGeneration) {
142
631
  throw new OpenClawZoneRestartTimeoutError(options.zone.id, restartOptions.timeoutMs ?? 0);
143
632
  }
144
- return { leaseReleaseFailureCount: leaseReleaseResult.failedLeaseIds.length };
633
+ return {
634
+ leaseReleaseFailureCount: leaseReleaseResult.failedLeaseIds.length,
635
+ operationId: operationContext.operationId,
636
+ };
145
637
  })();
146
638
  if (restartOptions.timeoutMs === undefined) {
147
- return await restartOperation;
148
- }
149
- const restartTimeoutMs = restartOptions.timeoutMs;
150
- let timeout;
151
- try {
152
- return await Promise.race([
153
- restartOperation,
154
- new Promise((_resolve, reject) => {
155
- timeout = setTimeoutImpl(() => {
156
- lifecycleGeneration += 1;
157
- reject(new OpenClawZoneRestartTimeoutError(options.zone.id, restartTimeoutMs));
158
- }, restartTimeoutMs);
159
- timeout.unref?.();
160
- }),
161
- ]);
162
- }
163
- finally {
164
- if (timeout) {
165
- clearTimeoutImpl(timeout);
166
- }
639
+ return await coldStartOperation;
167
640
  }
641
+ return withLifecycleTimeout({
642
+ operation: coldStartOperation,
643
+ timeoutMs: restartOptions.timeoutMs,
644
+ });
168
645
  });
169
646
  };
647
+ const coldStart = async (restartOptions = {}) => await coldStartWithStartOptions(restartOptions);
170
648
  return {
649
+ coldStart,
171
650
  destroy: async (purge) => await (options.runControllerDestroy ?? runControllerDestroyDefault)({ purge, systemConfig: options.systemConfig, zoneId: options.zone.id }, {
172
651
  releaseZoneLeases: async (zoneId) => {
173
652
  await releaseZoneLeases(zoneId);
@@ -175,12 +654,13 @@ export function createOpenClawZoneRuntime(options) {
175
654
  stopGatewayZone: async () => await stop(),
176
655
  }),
177
656
  enableSsh: async () => await requireGateway().vm.enableSsh(),
178
- exec: async (command) => await requireGateway().vm.exec(command),
657
+ exec: async (command) => await executeGatewayCommand(requireGateway(), command),
179
658
  gatewayType: 'openclaw',
180
659
  getHealth: async () => {
660
+ getLifecycleState();
181
661
  const activeGateway = requireGateway();
182
662
  const result = await runGatewayHealthCheck({
183
- exec: async (command) => await activeGateway.vm.exec(command),
663
+ exec: async (command) => await executeGatewayCommand(activeGateway, command),
184
664
  healthCheck: activeGateway.processSpec.healthCheck,
185
665
  });
186
666
  return {
@@ -192,42 +672,118 @@ export function createOpenClawZoneRuntime(options) {
192
672
  zoneId: options.zone.id,
193
673
  };
194
674
  },
675
+ getDiagnosis: () => deriveGatewayDiagnosisSnapshot({
676
+ channelProviderPlane: 'unknown',
677
+ controllerLiveness: 'ok',
678
+ lastOperation,
679
+ originalOutageCause,
680
+ state: getLifecycleState(),
681
+ toolVmPlane: 'unknown',
682
+ }),
195
683
  getLogs: async () => {
196
684
  const activeGateway = requireGateway();
197
685
  return await (options.runControllerLogs ?? runControllerLogsDefault)({ zoneId: options.zone.id }, {
198
686
  readGatewayLogs: async () => (await activeGateway.vm.exec(buildOpenClawCombinedLogsCommand(activeGateway.processSpec.logPath))).stdout,
199
687
  });
200
688
  },
689
+ getLifecycleState,
201
690
  getSnapshot: () => {
202
- if (gateway) {
203
- const hostPid = gateway.vm.getHostPid();
691
+ const currentLifecycleState = getLifecycleState();
692
+ if (currentLifecycleState.kind === 'running') {
693
+ const hostPid = currentLifecycleState.gateway.vm.getHostPid();
694
+ if (hostPid === undefined || hostPid === null) {
695
+ const missingHostPidState = markGatewayHostPidMissing(`Gateway VM host pid is unavailable for zone '${options.zone.id}'.`);
696
+ return {
697
+ lastError: missingHostPidState.error.message,
698
+ lifecycleState: 'failed',
699
+ };
700
+ }
204
701
  return {
205
702
  ...(bootedAt ? { bootedAt } : {}),
206
703
  gateway: {
207
- ingress: gateway.ingress,
704
+ ingress: currentLifecycleState.gateway.ingress,
208
705
  vm: {
209
- ...(hostPid === undefined || hostPid === null ? {} : { hostPid }),
210
- id: gateway.vm.id,
706
+ hostPid,
707
+ id: currentLifecycleState.gateway.vm.id,
211
708
  },
212
709
  },
710
+ ...(lastError ? { lastError } : {}),
213
711
  lifecycleState: 'running',
214
712
  };
215
713
  }
216
714
  return lastError ? { lastError, lifecycleState: 'failed' } : { lifecycleState: 'stopped' };
217
715
  },
218
- refreshCredentials: async () => await (options.runControllerCredentialsRefresh ?? runControllerCredentialsRefreshDefault)({ zoneId: options.zone.id }, {
219
- refreshZoneSecrets: async (zoneId) => {
220
- await resolveZoneSecrets({
221
- audience: 'gateway',
222
- secretResolver: options.secretResolver,
223
- systemConfig: options.systemConfig,
224
- zoneId,
716
+ refreshCredentials: async () => await (async () => {
717
+ const operationId = createOperationId('credentials-refresh');
718
+ const operationTrigger = 'credentials-refresh';
719
+ await recordLifecycleOperation({
720
+ kind: 'credentials-refresh-requested',
721
+ operationId,
722
+ operationTrigger,
723
+ previousGateway: gatewayIdentityFor(gateway),
724
+ });
725
+ const failCredentialsRefreshSecretResolution = async (error) => {
726
+ const classifiedError = {
727
+ code: 'secret-resolution-failed',
728
+ message: formatUnknownError(error),
729
+ };
730
+ const currentLifecycleState = getLifecycleState();
731
+ lastError = classifiedError.message;
732
+ if (currentLifecycleState.kind !== 'running' &&
733
+ currentLifecycleState.kind !== 'running-degraded') {
734
+ lifecycleState = {
735
+ coldStartEligible: true,
736
+ error: classifiedError,
737
+ kind: 'failed',
738
+ };
739
+ }
740
+ await recordLifecycleOperation({
741
+ errorCode: classifiedError.code,
742
+ errorMessage: classifiedError.message,
743
+ kind: 'operation-failed',
744
+ operationId,
745
+ operationTrigger,
746
+ previousGateway: gatewayIdentityFor(gateway),
225
747
  });
226
- },
227
- restartGatewayZone: async () => {
228
- await restart();
229
- },
230
- }),
748
+ throw new ControllerZoneRuntimeStartError(options.zone.id, error, {
749
+ gatewayLifecycleErrorCode: classifiedError.code,
750
+ operationId,
751
+ });
752
+ };
753
+ let refreshedSecretResolver;
754
+ try {
755
+ refreshedSecretResolver = options.createFreshSecretResolver
756
+ ? await options.createFreshSecretResolver()
757
+ : options.secretResolver;
758
+ }
759
+ catch (error) {
760
+ await failCredentialsRefreshSecretResolution(error);
761
+ }
762
+ return await (options.runControllerCredentialsRefresh ?? runControllerCredentialsRefreshDefault)({ zoneId: options.zone.id }, {
763
+ refreshZoneSecrets: async (zoneId) => {
764
+ try {
765
+ await resolveZoneSecrets({
766
+ audience: 'gateway',
767
+ secretResolver: refreshedSecretResolver,
768
+ systemConfig: options.systemConfig,
769
+ zoneId,
770
+ });
771
+ }
772
+ catch (error) {
773
+ await failCredentialsRefreshSecretResolution(error);
774
+ }
775
+ },
776
+ restartGatewayZone: async () => {
777
+ const currentLifecycleState = getLifecycleState();
778
+ if (currentLifecycleState.kind === 'running' ||
779
+ currentLifecycleState.kind === 'running-degraded') {
780
+ await restartWithStartOptions({}, { secretResolver: refreshedSecretResolver }, { operationId, operationTrigger });
781
+ return;
782
+ }
783
+ await coldStartWithStartOptions({}, { secretResolver: refreshedSecretResolver }, { operationId, operationTrigger });
784
+ },
785
+ });
786
+ })(),
231
787
  restart,
232
788
  shutdown: stop,
233
789
  start,
@@ -235,7 +791,7 @@ export function createOpenClawZoneRuntime(options) {
235
791
  upgrade: async () => await (options.runControllerUpgrade ?? runControllerUpgradeDefault)({ systemConfig: options.systemConfig, zoneId: options.zone.id }, {
236
792
  rebuildGatewayImage: async () => { },
237
793
  restartGatewayZone: async () => {
238
- await restart();
794
+ await restart({ operationTrigger: 'upgrade' });
239
795
  },
240
796
  }),
241
797
  zoneId: options.zone.id,