@agent-vm/agent-vm 0.0.91 → 0.0.93

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (133)
  1. package/dist/build/gondolin-image-builder.d.ts +1 -0
  2. package/dist/build/gondolin-image-builder.d.ts.map +1 -1
  3. package/dist/build/gondolin-image-builder.js +11 -1
  4. package/dist/build/gondolin-image-builder.js.map +1 -1
  5. package/dist/build/managed-image-dockerfile.d.ts +2 -1
  6. package/dist/build/managed-image-dockerfile.d.ts.map +1 -1
  7. package/dist/build/managed-image-dockerfile.js +51 -27
  8. package/dist/build/managed-image-dockerfile.js.map +1 -1
  9. package/dist/cli/commands/controller-definition.d.ts +42 -42
  10. package/dist/cli/commands/create-app.d.ts +60 -60
  11. package/dist/cli/manual-templates.d.ts.map +1 -1
  12. package/dist/cli/manual-templates.js +14 -1
  13. package/dist/cli/manual-templates.js.map +1 -1
  14. package/dist/config/system-config.d.ts +15 -0
  15. package/dist/config/system-config.d.ts.map +1 -1
  16. package/dist/config/system-config.js +74 -0
  17. package/dist/config/system-config.js.map +1 -1
  18. package/dist/controller/controller-runtime-operations.d.ts +1 -0
  19. package/dist/controller/controller-runtime-operations.d.ts.map +1 -1
  20. package/dist/controller/controller-runtime-operations.js +2 -0
  21. package/dist/controller/controller-runtime-operations.js.map +1 -1
  22. package/dist/controller/controller-runtime-types.d.ts +5 -0
  23. package/dist/controller/controller-runtime-types.d.ts.map +1 -1
  24. package/dist/controller/controller-runtime.d.ts +1 -0
  25. package/dist/controller/controller-runtime.d.ts.map +1 -1
  26. package/dist/controller/controller-runtime.js +220 -3
  27. package/dist/controller/controller-runtime.js.map +1 -1
  28. package/dist/controller/health/channel-provider-recovery-observation.d.ts +23 -0
  29. package/dist/controller/health/channel-provider-recovery-observation.d.ts.map +1 -0
  30. package/dist/controller/health/channel-provider-recovery-observation.js +69 -0
  31. package/dist/controller/health/channel-provider-recovery-observation.js.map +1 -0
  32. package/dist/controller/health/durable-health-event-log.d.ts +24 -0
  33. package/dist/controller/health/durable-health-event-log.d.ts.map +1 -0
  34. package/dist/controller/health/durable-health-event-log.js +89 -0
  35. package/dist/controller/health/durable-health-event-log.js.map +1 -0
  36. package/dist/controller/health/gateway-recovery-actions.d.ts +27 -0
  37. package/dist/controller/health/gateway-recovery-actions.d.ts.map +1 -0
  38. package/dist/controller/health/gateway-recovery-actions.js +71 -0
  39. package/dist/controller/health/gateway-recovery-actions.js.map +1 -0
  40. package/dist/controller/health/gateway-service-health-monitor.d.ts +71 -3
  41. package/dist/controller/health/gateway-service-health-monitor.d.ts.map +1 -1
  42. package/dist/controller/health/gateway-service-health-monitor.js +383 -10
  43. package/dist/controller/health/gateway-service-health-monitor.js.map +1 -1
  44. package/dist/controller/health/gateway-vm-recovery-policy.d.ts +68 -0
  45. package/dist/controller/health/gateway-vm-recovery-policy.d.ts.map +1 -0
  46. package/dist/controller/health/gateway-vm-recovery-policy.js +199 -0
  47. package/dist/controller/health/gateway-vm-recovery-policy.js.map +1 -0
  48. package/dist/controller/health/gateway-vm-recovery-runner.d.ts +39 -0
  49. package/dist/controller/health/gateway-vm-recovery-runner.d.ts.map +1 -0
  50. package/dist/controller/health/gateway-vm-recovery-runner.js +251 -0
  51. package/dist/controller/health/gateway-vm-recovery-runner.js.map +1 -0
  52. package/dist/controller/health/health-event-store.d.ts +4 -0
  53. package/dist/controller/health/health-event-store.d.ts.map +1 -1
  54. package/dist/controller/health/health-event-store.js +19 -0
  55. package/dist/controller/health/health-event-store.js.map +1 -1
  56. package/dist/controller/http/controller-health-event-routes.d.ts +6 -0
  57. package/dist/controller/http/controller-health-event-routes.d.ts.map +1 -1
  58. package/dist/controller/http/controller-health-event-routes.js +49 -0
  59. package/dist/controller/http/controller-health-event-routes.js.map +1 -1
  60. package/dist/controller/http/controller-http-routes.d.ts.map +1 -1
  61. package/dist/controller/http/controller-http-routes.js +6 -0
  62. package/dist/controller/http/controller-http-routes.js.map +1 -1
  63. package/dist/controller/leases/lease-manager.d.ts.map +1 -1
  64. package/dist/controller/leases/lease-manager.js +37 -16
  65. package/dist/controller/leases/lease-manager.js.map +1 -1
  66. package/dist/controller/leases/tool-vm-lease-lifecycle.d.ts +44 -0
  67. package/dist/controller/leases/tool-vm-lease-lifecycle.d.ts.map +1 -0
  68. package/dist/controller/leases/tool-vm-lease-lifecycle.js +28 -0
  69. package/dist/controller/leases/tool-vm-lease-lifecycle.js.map +1 -0
  70. package/dist/controller/zone-runtimes/gateway-lifecycle-operation-record.d.ts +37 -0
  71. package/dist/controller/zone-runtimes/gateway-lifecycle-operation-record.d.ts.map +1 -0
  72. package/dist/controller/zone-runtimes/gateway-lifecycle-operation-record.js +133 -0
  73. package/dist/controller/zone-runtimes/gateway-lifecycle-operation-record.js.map +1 -0
  74. package/dist/controller/zone-runtimes/gateway-zone-state-machine.d.ts +101 -0
  75. package/dist/controller/zone-runtimes/gateway-zone-state-machine.d.ts.map +1 -0
  76. package/dist/controller/zone-runtimes/gateway-zone-state-machine.js +143 -0
  77. package/dist/controller/zone-runtimes/gateway-zone-state-machine.js.map +1 -0
  78. package/dist/controller/zone-runtimes/openclaw-zone-runtime.d.ts +16 -1
  79. package/dist/controller/zone-runtimes/openclaw-zone-runtime.d.ts.map +1 -1
  80. package/dist/controller/zone-runtimes/openclaw-zone-runtime.js +700 -40
  81. package/dist/controller/zone-runtimes/openclaw-zone-runtime.js.map +1 -1
  82. package/dist/controller/zone-runtimes/zone-runtime-errors.d.ts +7 -1
  83. package/dist/controller/zone-runtimes/zone-runtime-errors.d.ts.map +1 -1
  84. package/dist/controller/zone-runtimes/zone-runtime-errors.js +5 -1
  85. package/dist/controller/zone-runtimes/zone-runtime-errors.js.map +1 -1
  86. package/dist/controller/zone-runtimes/zone-runtime-registry.d.ts +2 -0
  87. package/dist/controller/zone-runtimes/zone-runtime-registry.d.ts.map +1 -1
  88. package/dist/controller/zone-runtimes/zone-runtime-registry.js +23 -0
  89. package/dist/controller/zone-runtimes/zone-runtime-registry.js.map +1 -1
  90. package/dist/controller/zone-runtimes/zone-runtime-types.d.ts +14 -1
  91. package/dist/controller/zone-runtimes/zone-runtime-types.d.ts.map +1 -1
  92. package/dist/gateway/gateway-ownership-evidence.d.ts +35 -0
  93. package/dist/gateway/gateway-ownership-evidence.d.ts.map +1 -0
  94. package/dist/gateway/gateway-ownership-evidence.js +10 -0
  95. package/dist/gateway/gateway-ownership-evidence.js.map +1 -0
  96. package/dist/gateway/gateway-recovery.d.ts +16 -0
  97. package/dist/gateway/gateway-recovery.d.ts.map +1 -1
  98. package/dist/gateway/gateway-recovery.js +105 -9
  99. package/dist/gateway/gateway-recovery.js.map +1 -1
  100. package/dist/gateway/gateway-zone-orchestrator.d.ts.map +1 -1
  101. package/dist/gateway/gateway-zone-orchestrator.js +50 -39
  102. package/dist/gateway/gateway-zone-orchestrator.js.map +1 -1
  103. package/dist/integration-tests/{smoke-harness.d.ts → e2e-harness.d.ts} +45 -37
  104. package/dist/integration-tests/e2e-harness.d.ts.map +1 -0
  105. package/dist/integration-tests/{smoke-harness.js → e2e-harness.js} +134 -108
  106. package/dist/integration-tests/e2e-harness.js.map +1 -0
  107. package/dist/integration-tests/e2e-workspace-build-global-setup.d.ts +16 -0
  108. package/dist/integration-tests/e2e-workspace-build-global-setup.d.ts.map +1 -0
  109. package/dist/integration-tests/e2e-workspace-build-global-setup.js +27 -0
  110. package/dist/integration-tests/e2e-workspace-build-global-setup.js.map +1 -0
  111. package/dist/integration-tests/live-agent-model-roundtrip-deployment.d.ts +11 -0
  112. package/dist/integration-tests/live-agent-model-roundtrip-deployment.d.ts.map +1 -0
  113. package/dist/integration-tests/live-agent-model-roundtrip-deployment.js +48 -0
  114. package/dist/integration-tests/live-agent-model-roundtrip-deployment.js.map +1 -0
  115. package/dist/integration-tests/live-agent-model-roundtrip-gates.d.ts +11 -0
  116. package/dist/integration-tests/live-agent-model-roundtrip-gates.d.ts.map +1 -0
  117. package/dist/integration-tests/live-agent-model-roundtrip-gates.js +21 -0
  118. package/dist/integration-tests/live-agent-model-roundtrip-gates.js.map +1 -0
  119. package/dist/integration-tests/live-vm-e2e-gates.d.ts +2 -0
  120. package/dist/integration-tests/live-vm-e2e-gates.d.ts.map +1 -0
  121. package/dist/integration-tests/live-vm-e2e-gates.js +4 -0
  122. package/dist/integration-tests/live-vm-e2e-gates.js.map +1 -0
  123. package/dist/operations/controller-status.d.ts +5 -0
  124. package/dist/operations/controller-status.d.ts.map +1 -1
  125. package/dist/operations/controller-status.js +42 -0
  126. package/dist/operations/controller-status.js.map +1 -1
  127. package/package.json +11 -11
  128. package/dist/integration-tests/live-integration-gates.d.ts +0 -2
  129. package/dist/integration-tests/live-integration-gates.d.ts.map +0 -1
  130. package/dist/integration-tests/live-integration-gates.js +0 -4
  131. package/dist/integration-tests/live-integration-gates.js.map +0 -1
  132. package/dist/integration-tests/smoke-harness.d.ts.map +0 -1
  133. package/dist/integration-tests/smoke-harness.js.map +0 -1
@@ -1,12 +1,36 @@
1
+ import { randomUUID } from 'node:crypto';
1
2
  import { resolveZoneSecrets } from '../../gateway/credential-manager.js';
2
3
  import { runGatewayHealthCheck } from '../../gateway/gateway-health-check.js';
4
+ import { GatewayOwnershipUnsafeError } from '../../gateway/gateway-ownership-evidence.js';
3
5
  import { deleteGatewayRuntimeRecord as deleteGatewayRuntimeRecordDefault } from '../../gateway/gateway-runtime-record.js';
4
6
  import { startGatewayZone } from '../../gateway/gateway-zone-orchestrator.js';
5
7
  import { runControllerCredentialsRefresh as runControllerCredentialsRefreshDefault } from '../../operations/credentials-refresh.js';
6
8
  import { runControllerDestroy as runControllerDestroyDefault } from '../../operations/destroy-zone.js';
7
9
  import { runControllerUpgrade as runControllerUpgradeDefault } from '../../operations/upgrade-zone.js';
8
10
  import { runControllerLogs as runControllerLogsDefault } from '../../operations/zone-logs.js';
11
+ import { isProcessAlive as defaultIsProcessAlive } from '../../shared/managed-vm-process.js';
12
+ import { appendGatewayLifecycleOperationRecord as appendGatewayLifecycleOperationRecordDefault, } from './gateway-lifecycle-operation-record.js';
13
+ import { classifyGatewayStartError, deriveGatewayDiagnosisSnapshot, } from './gateway-zone-state-machine.js';
9
14
  import { ControllerZoneRuntimeStartError, ControllerZoneRuntimeUnavailableError, } from './zone-runtime-errors.js';
15
// Fallback deadline (milliseconds) for closing a gateway VM; applied when the
// runtime options do not provide `closeGatewayTimeoutMs`.
const defaultGatewayCloseTimeoutMs = 60 * 1_000;
16
/**
 * Tells whether a lifecycle operation result is an object that has a `lock`
 * property (own or inherited).
 *
 * @param {unknown} execution - Value produced by a lifecycle operation.
 * @returns {boolean} True only for non-null objects that have a `lock` key.
 */
function isLifecycleOperationExecutionWithLock(execution) {
    if (execution === null || typeof execution !== 'object') {
        return false;
    }
    return 'lock' in execution;
}
19
/**
 * Checks whether a lifecycle record describes a secret-resolution failure
 * raised by an auto-recovery or credentials-refresh operation.
 *
 * @param {{ errorCode?: string, operationTrigger?: string }} record - Lifecycle operation record.
 * @returns {boolean} True when the error code is `secret-resolution-failed`
 *   and the trigger is `auto-recovery` or `credentials-refresh`.
 */
function isRecoverySecretResolutionFailure(record) {
    if (record.errorCode !== 'secret-resolution-failed') {
        return false;
    }
    const trigger = record.operationTrigger;
    return trigger === 'auto-recovery' || trigger === 'credentials-refresh';
}
24
/**
 * Error raised when an OpenClaw gateway restart exceeds its deadline.
 * Instances carry the stable code `OPENCLAW_GATEWAY_RESTART_TIMEOUT`, which
 * callers can match on instead of relying on the class identity.
 */
class OpenClawZoneRestartTimeoutError extends Error {
    /**
     * @param {string} zoneId - Zone whose gateway restart timed out.
     * @param {number} timeoutMs - Deadline that elapsed, in milliseconds.
     */
    constructor(zoneId, timeoutMs) {
        const message = `OpenClaw gateway restart timed out for zone '${zoneId}' after ${timeoutMs}ms`;
        super(message);
        this.code = 'OPENCLAW_GATEWAY_RESTART_TIMEOUT';
        this.name = 'OpenClawZoneRestartTimeoutError';
    }
}
31
/**
 * Recognises gateway restart timeout errors by their `code` rather than by
 * class identity.
 *
 * @param {unknown} error - Value caught from a restart attempt.
 * @returns {boolean} True when `error` is an Error whose `code` is
 *   `OPENCLAW_GATEWAY_RESTART_TIMEOUT`.
 */
export function isOpenClawZoneRestartTimeoutError(error) {
    if (!(error instanceof Error)) {
        return false;
    }
    if (!('code' in error)) {
        return false;
    }
    return error.code === 'OPENCLAW_GATEWAY_RESTART_TIMEOUT';
}
10
34
  function formatUnknownError(error) {
11
35
  return error instanceof Error ? error.message : String(error);
12
36
  }
@@ -19,68 +43,624 @@ function buildOpenClawCombinedLogsCommand(logPath) {
19
43
  'latest_openclaw_log=$(ls -1t /agent-vm/logs/*.log 2>/dev/null | grep -v "/gateway-boot-latest\\.log$" | head -n 1); if [ -n "$latest_openclaw_log" ]; then tail -n 400 "$latest_openclaw_log"; fi',
20
44
  ].join('; ');
21
45
  }
46
/**
 * Writes one diagnostic line to stderr, prefixed with the
 * `[openclaw-zone-runtime]` tag and terminated by a newline.
 *
 * @param {string} message - Text to log.
 * @returns {void}
 */
function writeOpenClawZoneRuntimeLog(message) {
    const line = `[openclaw-zone-runtime] ${message}\n`;
    process.stderr.write(line);
}
49
/**
 * Produces the human-readable reason a gateway is unavailable in the given
 * lifecycle state.
 *
 * @param {{ kind: string }} state - Gateway zone lifecycle state.
 * @returns {string | undefined} The failure message for `failed`, an ownership
 *   note for `owner-unsafe`, a transition note for `restarting` / `starting` /
 *   `stopping`, and `undefined` for `running`, `running-degraded` and `stopped`.
 * @throws {Error} For any other `kind`, via the exhaustiveness helper.
 */
function unavailableReasonForState(state) {
    const { kind } = state;
    if (kind === 'failed') {
        return state.error.message;
    }
    if (kind === 'owner-unsafe') {
        return `Gateway runtime ownership is unsafe: ${state.evidence.kind}.`;
    }
    const isTransitioning = kind === 'restarting' || kind === 'starting' || kind === 'stopping';
    if (isTransitioning) {
        return `Gateway runtime is ${kind}.`;
    }
    const hasNoReason = kind === 'running' || kind === 'running-degraded' || kind === 'stopped';
    if (hasNoReason) {
        return undefined;
    }
    // Every known kind is handled above; anything else is a programming error.
    return assertNeverGatewayZoneLifecycleState(state);
}
66
/**
 * Exhaustiveness guard for gateway zone lifecycle states: always throws,
 * describing the state that no branch handled.
 *
 * The state is rendered with `JSON.stringify`. That call throws a TypeError for
 * cyclic structures and BigInt values, which would hide the intended
 * diagnostic, so a fallback description is used when serialization fails.
 *
 * @param {unknown} state - The lifecycle state that fell through every case.
 * @returns {never}
 * @throws {Error} Always, with the message
 *   `Unhandled gateway zone lifecycle state: <description>`.
 */
function assertNeverGatewayZoneLifecycleState(state) {
    let description;
    try {
        description = JSON.stringify(state);
    }
    catch {
        // Not serializable: report the discriminant when there is one,
        // otherwise the value's built-in type tag.
        const kind = state?.kind;
        description =
            typeof kind === 'string'
                ? `[unserializable state, kind: ${kind}]`
                : Object.prototype.toString.call(state);
    }
    throw new Error(`Unhandled gateway zone lifecycle state: ${description}`);
}
69
/**
 * Exhaustiveness guard for lifecycle operation record kinds: always throws,
 * naming the kind that no branch handled.
 *
 * @param {unknown} kind - The record kind that fell through every case.
 * @returns {never}
 * @throws {Error} Always.
 */
function assertNeverGatewayLifecycleOperationRecordKind(kind) {
    const renderedKind = String(kind);
    const message = `Unhandled gateway lifecycle operation record kind: ${renderedKind}`;
    throw new Error(message);
}
72
/**
 * Builds the identity summary of a gateway for lifecycle operation records.
 *
 * @param {{ vm: { id: unknown, getHostPid: () => unknown } } | undefined | null} runtimeGateway
 *   Gateway handle, or a falsy value when there is none.
 * @returns {{ hostPid?: number, vmId: unknown } | undefined} `undefined` for a
 *   falsy gateway; otherwise the VM id, plus `hostPid` only when the VM reports
 *   a positive numeric pid.
 */
function gatewayIdentityFor(runtimeGateway) {
    if (!runtimeGateway) {
        return undefined;
    }
    const { vm } = runtimeGateway;
    const hostPid = vm.getHostPid();
    const identity = {};
    const hasUsablePid = typeof hostPid === 'number' && hostPid > 0;
    if (hasUsablePid) {
        identity.hostPid = hostPid;
    }
    identity.vmId = vm.id;
    return identity;
}
82
/**
 * Runs a command through the gateway VM's `exec` and returns only the three
 * result fields callers rely on.
 *
 * @param {{ vm: { exec: (command: unknown) => Promise<object> } }} runtimeGateway - Gateway handle.
 * @param {unknown} command - Command passed unchanged to `vm.exec`.
 * @returns {Promise<{ exitCode: unknown, stderr: unknown, stdout: unknown }>}
 *   A fresh object holding `exitCode`, `stderr` and `stdout` from the exec result.
 */
async function executeGatewayCommand(runtimeGateway, command) {
    const { exitCode, stderr, stdout } = await runtimeGateway.vm.exec(command);
    return { exitCode, stderr, stdout };
}
22
90
  export function createOpenClawZoneRuntime(options) {
91
+ const clearTimeoutImpl = options.clearTimeoutImpl ?? clearTimeout;
92
+ const closeGatewayTimeoutMs = options.closeGatewayTimeoutMs ?? defaultGatewayCloseTimeoutMs;
93
+ const isProcessAlive = options.isProcessAlive ?? defaultIsProcessAlive;
94
+ const setTimeoutImpl = options.setTimeoutImpl ?? setTimeout;
95
+ const appendGatewayLifecycleOperationRecord = options.appendGatewayLifecycleOperationRecord;
23
96
  let gateway;
24
97
  let bootedAt;
25
98
  let lastError;
26
- const startGateway = async () => options.restartGatewayZone
27
- ? await options.restartGatewayZone(options.zone.id)
99
+ let lastOperation = 'none';
100
+ let originalOutageCause = { kind: 'unknown' };
101
+ let lifecycleState = { kind: 'stopped' };
102
+ let lifecycleOperation = Promise.resolve();
103
+ let lifecycleGeneration = 0;
104
+ let staleGatewayPendingClose;
105
+ const startGateway = async (startOptions = {}) => options.restartGatewayZone
106
+ ? await options.restartGatewayZone(options.zone.id, startOptions)
28
107
  : await startGatewayZone({
29
- secretResolver: options.secretResolver,
108
+ secretResolver: startOptions.secretResolver ?? options.secretResolver,
30
109
  systemConfig: options.systemConfig,
31
110
  zoneId: options.zone.id,
32
111
  });
33
112
  const requireGateway = () => {
34
- if (!gateway) {
35
- throw new ControllerZoneRuntimeUnavailableError(options.zone.id, lastError);
113
+ const currentState = getLifecycleState();
114
+ if (currentState.kind !== 'running' && currentState.kind !== 'running-degraded') {
115
+ throw new ControllerZoneRuntimeUnavailableError(options.zone.id, lastError ?? unavailableReasonForState(currentState));
36
116
  }
37
- return gateway;
117
+ return currentState.gateway;
38
118
  };
39
- const stop = async () => {
40
- const activeGateway = gateway;
119
+ const createOperationId = (operationName) => `${options.zone.id}-${operationName}-${randomUUID()}`;
120
+ const operationForRecordKind = (kind) => {
121
+ switch (kind) {
122
+ case 'cold-start-requested':
123
+ return 'cold-start';
124
+ case 'credentials-refresh-requested':
125
+ return 'credentials-refresh';
126
+ case 'restart-requested':
127
+ return 'restart';
128
+ case 'start-requested':
129
+ return 'start';
130
+ case 'stop-requested':
131
+ return 'stop';
132
+ case 'operation-failed':
133
+ case 'operation-finished':
134
+ case 'runtime-record-deleted':
135
+ case 'runtime-record-written':
136
+ case 'vm-close-finished':
137
+ case 'vm-close-started':
138
+ return undefined;
139
+ }
140
+ return assertNeverGatewayLifecycleOperationRecordKind(kind);
141
+ };
142
+ const setOriginalOutageCauseIfUnknown = (errorCode) => {
143
+ if (originalOutageCause.kind !== 'unknown') {
144
+ return;
145
+ }
146
+ originalOutageCause = {
147
+ ...(errorCode === undefined ? {} : { errorCode }),
148
+ eventKind: 'gateway-lifecycle-operation',
149
+ kind: 'proven',
150
+ };
151
+ };
152
+ const recordLifecycleOperation = async (record) => {
153
+ const operation = operationForRecordKind(record.kind);
154
+ if (operation !== undefined) {
155
+ lastOperation = operation;
156
+ }
157
+ if (record.kind === 'operation-failed' && !isRecoverySecretResolutionFailure(record)) {
158
+ setOriginalOutageCauseIfUnknown(record.errorCode);
159
+ }
160
+ const operationRecord = {
161
+ controllerPid: process.pid,
162
+ gatewayType: 'openclaw',
163
+ observedAtMs: options.now(),
164
+ zoneId: options.zone.id,
165
+ ...record,
166
+ };
167
+ try {
168
+ if (appendGatewayLifecycleOperationRecord) {
169
+ await appendGatewayLifecycleOperationRecord(operationRecord);
170
+ return;
171
+ }
172
+ await appendGatewayLifecycleOperationRecordDefault({
173
+ record: operationRecord,
174
+ runtimeDir: options.systemConfig.runtimeDir,
175
+ zoneId: options.zone.id,
176
+ });
177
+ }
178
+ catch (error) {
179
+ writeOpenClawZoneRuntimeLog(`failed to append gateway lifecycle operation record for zone '${options.zone.id}': ${formatUnknownError(error)}`);
180
+ }
181
+ };
182
+ const markGatewayHostPidMissing = (message) => {
183
+ if (staleGatewayPendingClose === undefined &&
184
+ (lifecycleState.kind === 'running' || lifecycleState.kind === 'running-degraded')) {
185
+ staleGatewayPendingClose = lifecycleState.gateway;
186
+ }
187
+ const errorMessage = `vm-process-missing: ${message}`;
188
+ setOriginalOutageCauseIfUnknown('vm-process-missing');
41
189
  gateway = undefined;
42
190
  bootedAt = undefined;
43
- lastError = undefined;
44
- if (activeGateway) {
45
- await activeGateway.vm.close();
191
+ lastError = errorMessage;
192
+ lifecycleState = {
193
+ coldStartEligible: true,
194
+ error: { code: 'vm-process-missing', message: errorMessage },
195
+ kind: 'failed',
196
+ };
197
+ return lifecycleState;
198
+ };
199
+ const closeStaleGatewayBeforeColdStart = async (operationContext) => {
200
+ const staleGateway = staleGatewayPendingClose;
201
+ if (!staleGateway) {
202
+ return;
203
+ }
204
+ staleGatewayPendingClose = undefined;
205
+ try {
206
+ await recordLifecycleOperation({
207
+ kind: 'vm-close-started',
208
+ operationId: operationContext.operationId,
209
+ operationTrigger: operationContext.operationTrigger,
210
+ previousGateway: gatewayIdentityFor(staleGateway),
211
+ });
212
+ await closeGatewayWithDeadline(staleGateway);
213
+ await recordLifecycleOperation({
214
+ kind: 'vm-close-finished',
215
+ operationId: operationContext.operationId,
216
+ operationTrigger: operationContext.operationTrigger,
217
+ previousGateway: gatewayIdentityFor(staleGateway),
218
+ });
219
+ }
220
+ catch (error) {
221
+ staleGatewayPendingClose = staleGateway;
222
+ lastError = formatUnknownError(error);
223
+ lifecycleState = {
224
+ coldStartEligible: false,
225
+ error: {
226
+ code: 'owner-unsafe',
227
+ message: lastError,
228
+ },
229
+ kind: 'failed',
230
+ };
231
+ await recordLifecycleOperation({
232
+ errorCode: 'owner-unsafe',
233
+ errorMessage: lastError,
234
+ kind: 'operation-failed',
235
+ operationId: operationContext.operationId,
236
+ operationTrigger: operationContext.operationTrigger,
237
+ previousGateway: gatewayIdentityFor(staleGateway),
238
+ });
239
+ throw error;
240
+ }
241
+ };
242
+ const classifyLastError = (message) => {
243
+ if (message.startsWith('vm-process-missing:')) {
244
+ return {
245
+ coldStartEligible: true,
246
+ error: { code: 'vm-process-missing', message },
247
+ kind: 'failed',
248
+ };
249
+ }
250
+ const error = classifyGatewayStartError(new Error(message));
251
+ return {
252
+ coldStartEligible: error.code !== 'owner-unsafe',
253
+ error,
254
+ kind: 'failed',
255
+ };
256
+ };
257
+ const getLifecycleState = () => {
258
+ if (lifecycleState.kind === 'running' || lifecycleState.kind === 'running-degraded') {
259
+ const hostPid = lifecycleState.gateway.vm.getHostPid();
260
+ if (hostPid === undefined || hostPid === null) {
261
+ return markGatewayHostPidMissing(`Gateway VM host pid is unavailable for zone '${options.zone.id}'.`);
262
+ }
263
+ if (!isProcessAlive(hostPid)) {
264
+ return markGatewayHostPidMissing(`Gateway VM host pid ${String(hostPid)} is not alive for zone '${options.zone.id}'.`);
265
+ }
266
+ return lifecycleState;
267
+ }
268
+ if (lifecycleState.kind === 'failed' || lifecycleState.kind === 'owner-unsafe') {
269
+ return lifecycleState;
270
+ }
271
+ if (lifecycleState.kind === 'starting' ||
272
+ lifecycleState.kind === 'stopping' ||
273
+ lifecycleState.kind === 'restarting') {
274
+ return lifecycleState;
275
+ }
276
+ if (lastError) {
277
+ lifecycleState = classifyLastError(lastError);
278
+ return lifecycleState;
279
+ }
280
+ return lifecycleState;
281
+ };
282
+ const runLifecycleOperation = async (operation) => {
283
+ const runAfterPrevious = async () => {
284
+ await lifecycleOperation.catch(() => undefined);
285
+ return await operation();
286
+ };
287
+ const executionPromise = runAfterPrevious();
288
+ const operationResultPromise = executionPromise.then(async (execution) => {
289
+ if (isLifecycleOperationExecutionWithLock(execution)) {
290
+ return await execution.publicResult;
291
+ }
292
+ return await execution;
293
+ });
294
+ lifecycleOperation = executionPromise
295
+ .then(async (execution) => {
296
+ if (isLifecycleOperationExecutionWithLock(execution)) {
297
+ await execution.lock;
298
+ return;
299
+ }
300
+ await execution;
301
+ })
302
+ .then(() => undefined, () => undefined);
303
+ return await operationResultPromise;
304
+ };
305
+ const withLifecycleTimeout = (props) => {
306
+ let timeout;
307
+ const timeoutPromise = new Promise((_resolve, reject) => {
308
+ timeout = setTimeoutImpl(() => {
309
+ lifecycleGeneration += 1;
310
+ reject(new OpenClawZoneRestartTimeoutError(options.zone.id, props.timeoutMs));
311
+ }, props.timeoutMs);
312
+ timeout.unref?.();
313
+ });
314
+ const publicResult = Promise.race([props.operation, timeoutPromise]).finally(() => {
315
+ if (timeout) {
316
+ clearTimeoutImpl(timeout);
317
+ }
318
+ });
319
+ return {
320
+ lock: props.operation.then(() => undefined, () => undefined),
321
+ publicResult,
322
+ };
323
+ };
324
+ const releaseZoneLeases = async (zoneId) => {
325
+ const leases = options.leaseManager
326
+ .listLeases()
327
+ .filter((activeLease) => activeLease.zoneId === zoneId);
328
+ const releaseResults = await Promise.allSettled(leases.map(async (lease) => await options.leaseManager.releaseLease(lease.id, { force: true })));
329
+ const failedLeaseIds = [];
330
+ for (const [index, releaseResult] of releaseResults.entries()) {
331
+ if (releaseResult.status === 'fulfilled') {
332
+ continue;
333
+ }
334
+ const leaseId = leases[index]?.id ?? `(unknown lease at index ${index})`;
335
+ failedLeaseIds.push(leaseId);
336
+ writeOpenClawZoneRuntimeLog(`lease '${leaseId}' release failed while restarting zone '${zoneId}': ${formatUnknownError(releaseResult.reason)}`);
337
+ }
338
+ return { failedLeaseIds };
339
+ };
340
+ const closeGatewayWithDeadline = async (activeGateway) => {
341
+ let timeout;
342
+ try {
343
+ await Promise.race([
344
+ activeGateway.vm.close(),
345
+ new Promise((_resolve, reject) => {
346
+ timeout = setTimeoutImpl(() => {
347
+ reject(new Error(`Gateway VM close timed out for zone '${options.zone.id}' after ${closeGatewayTimeoutMs}ms`));
348
+ }, closeGatewayTimeoutMs);
349
+ timeout.unref?.();
350
+ }),
351
+ ]);
352
+ }
353
+ finally {
354
+ if (timeout) {
355
+ clearTimeoutImpl(timeout);
356
+ }
357
+ }
358
+ };
359
+ const stopNow = async (next = 'stopped', operationContext) => {
360
+ const activeGateway = gateway;
361
+ const operationId = operationContext?.operationId ?? createOperationId('stop');
362
+ const operationTrigger = operationContext?.operationTrigger ?? 'operator-stop';
363
+ const previousGateway = operationContext?.previousGateway ?? activeGateway;
364
+ lifecycleState = {
365
+ kind: 'stopping',
366
+ next,
367
+ operationId,
368
+ previousGateway,
369
+ };
370
+ try {
371
+ await recordLifecycleOperation({
372
+ kind: 'stop-requested',
373
+ operationId,
374
+ operationTrigger,
375
+ previousGateway: gatewayIdentityFor(previousGateway),
376
+ });
377
+ if (activeGateway) {
378
+ await recordLifecycleOperation({
379
+ kind: 'vm-close-started',
380
+ operationId,
381
+ operationTrigger,
382
+ previousGateway: gatewayIdentityFor(previousGateway),
383
+ });
384
+ await closeGatewayWithDeadline(activeGateway);
385
+ await recordLifecycleOperation({
386
+ kind: 'vm-close-finished',
387
+ operationId,
388
+ operationTrigger,
389
+ previousGateway: gatewayIdentityFor(previousGateway),
390
+ });
391
+ }
392
+ await (options.deleteGatewayRuntimeRecord ?? deleteGatewayRuntimeRecordDefault)(options.zone.gateway.stateDir);
393
+ await recordLifecycleOperation({
394
+ kind: 'runtime-record-deleted',
395
+ operationId,
396
+ operationTrigger,
397
+ previousGateway: gatewayIdentityFor(previousGateway),
398
+ });
399
+ gateway = undefined;
400
+ bootedAt = undefined;
401
+ lastError = undefined;
402
+ lifecycleState = { kind: 'stopped' };
403
+ }
404
+ catch (error) {
405
+ lastError = formatUnknownError(error);
406
+ lifecycleState = {
407
+ coldStartEligible: false,
408
+ error: {
409
+ code: 'owner-unsafe',
410
+ message: lastError,
411
+ },
412
+ kind: 'failed',
413
+ };
414
+ await recordLifecycleOperation({
415
+ errorCode: 'owner-unsafe',
416
+ errorMessage: lastError,
417
+ kind: 'operation-failed',
418
+ operationId,
419
+ operationTrigger,
420
+ previousGateway: gatewayIdentityFor(previousGateway),
421
+ });
422
+ throw error;
46
423
  }
47
- await (options.deleteGatewayRuntimeRecord ?? deleteGatewayRuntimeRecordDefault)(options.zone.gateway.stateDir);
48
424
  };
49
- const start = async () => {
425
+ const startNow = async (expectedGeneration, startOptions = {}, operationContext) => {
426
+ const operationId = operationContext?.operationId ?? createOperationId('start');
427
+ const operationTrigger = operationContext?.operationTrigger ?? 'operator-start';
428
+ lifecycleState = {
429
+ kind: 'starting',
430
+ operationId,
431
+ startedAtMs: options.now(),
432
+ };
50
433
  try {
51
- const startedGateway = await startGateway();
434
+ await recordLifecycleOperation({
435
+ kind: 'start-requested',
436
+ operationId,
437
+ operationTrigger,
438
+ previousGateway: gatewayIdentityFor(operationContext?.previousGateway),
439
+ });
440
+ const startedGateway = await startGateway(startOptions);
441
+ if (expectedGeneration !== undefined && expectedGeneration !== lifecycleGeneration) {
442
+ try {
443
+ await closeGatewayWithDeadline(startedGateway);
444
+ if (lifecycleGeneration === expectedGeneration + 1) {
445
+ await (options.deleteGatewayRuntimeRecord ?? deleteGatewayRuntimeRecordDefault)(options.zone.gateway.stateDir);
446
+ await recordLifecycleOperation({
447
+ currentGateway: gatewayIdentityFor(startedGateway),
448
+ kind: 'runtime-record-deleted',
449
+ operationId,
450
+ operationTrigger,
451
+ previousGateway: gatewayIdentityFor(operationContext?.previousGateway),
452
+ });
453
+ }
454
+ lastError = `stale-generation-closed: Closed stale gateway start for zone '${options.zone.id}'.`;
455
+ lifecycleState = classifyLastError(lastError);
456
+ await recordLifecycleOperation({
457
+ currentGateway: gatewayIdentityFor(startedGateway),
458
+ errorCode: 'stale-generation-closed',
459
+ errorMessage: lastError,
460
+ kind: 'operation-failed',
461
+ operationId,
462
+ operationTrigger,
463
+ previousGateway: gatewayIdentityFor(operationContext?.previousGateway),
464
+ });
465
+ }
466
+ catch (error) {
467
+ lastError = `stale-generation-closed: Failed to close stale gateway start for zone '${options.zone.id}': ${formatUnknownError(error)}`;
468
+ lifecycleState = classifyLastError(lastError);
469
+ await recordLifecycleOperation({
470
+ currentGateway: gatewayIdentityFor(startedGateway),
471
+ errorCode: 'stale-generation-closed',
472
+ errorMessage: lastError,
473
+ kind: 'operation-failed',
474
+ operationId,
475
+ operationTrigger,
476
+ previousGateway: gatewayIdentityFor(operationContext?.previousGateway),
477
+ });
478
+ writeOpenClawZoneRuntimeLog(`stale gateway start cleanup failed for zone '${options.zone.id}': ${formatUnknownError(error)}`);
479
+ }
480
+ return;
481
+ }
52
482
  gateway = startedGateway;
53
483
  bootedAt = new Date(options.now()).toISOString();
54
484
  lastError = undefined;
485
+ lifecycleState = { gateway: startedGateway, kind: 'running' };
486
+ await recordLifecycleOperation({
487
+ currentGateway: gatewayIdentityFor(startedGateway),
488
+ kind: 'operation-finished',
489
+ operationId,
490
+ operationTrigger,
491
+ previousGateway: gatewayIdentityFor(operationContext?.previousGateway),
492
+ });
55
493
  }
56
494
  catch (error) {
495
+ if (error instanceof GatewayOwnershipUnsafeError) {
496
+ gateway = undefined;
497
+ bootedAt = undefined;
498
+ lastError = error.message;
499
+ lifecycleState = {
500
+ evidence: error.evidence,
501
+ kind: 'owner-unsafe',
502
+ };
503
+ await recordLifecycleOperation({
504
+ errorCode: 'owner-unsafe',
505
+ errorMessage: error.message,
506
+ kind: 'operation-failed',
507
+ operationId,
508
+ operationTrigger,
509
+ previousGateway: gatewayIdentityFor(operationContext?.previousGateway),
510
+ });
511
+ throw new ControllerZoneRuntimeStartError(options.zone.id, error, {
512
+ gatewayLifecycleErrorCode: 'owner-unsafe',
513
+ operationId,
514
+ });
515
+ }
516
+ const classifiedError = classifyGatewayStartError(error);
57
517
  gateway = undefined;
58
518
  bootedAt = undefined;
59
519
  lastError = formatUnknownError(error);
60
- throw new ControllerZoneRuntimeStartError(options.zone.id, error);
520
+ lifecycleState = {
521
+ coldStartEligible: true,
522
+ error: classifiedError,
523
+ kind: 'failed',
524
+ };
525
+ await recordLifecycleOperation({
526
+ errorCode: classifiedError.code,
527
+ errorMessage: classifiedError.message,
528
+ kind: 'operation-failed',
529
+ operationId,
530
+ operationTrigger,
531
+ previousGateway: gatewayIdentityFor(operationContext?.previousGateway),
532
+ });
533
+ throw new ControllerZoneRuntimeStartError(options.zone.id, error, {
534
+ gatewayLifecycleErrorCode: classifiedError.code,
535
+ operationId,
536
+ });
61
537
  }
62
538
  };
63
- const restart = async () => {
64
- await stop();
65
- await start();
539
+ const stop = async () => await runLifecycleOperation(async () => await stopNow());
540
+ const start = async () => await runLifecycleOperation(async () => await startNow(undefined, {}, {
541
+ operationId: createOperationId('start'),
542
+ operationTrigger: 'controller-start',
543
+ }));
544
+ const restartWithStartOptions = async (restartOptions = {}, startOptions = {}, operationMetadata = {}) => {
545
+ return await runLifecycleOperation(async () => {
546
+ lifecycleGeneration += 1;
547
+ const operationGeneration = lifecycleGeneration;
548
+ const currentState = getLifecycleState();
549
+ const operationId = operationMetadata.operationId ?? createOperationId('restart');
550
+ const operationContext = {
551
+ operationId,
552
+ operationTrigger: operationMetadata.operationTrigger ??
553
+ restartOptions.operationTrigger ??
554
+ 'operator-restart',
555
+ previousGateway: currentState.kind === 'running' || currentState.kind === 'running-degraded'
556
+ ? currentState.gateway
557
+ : undefined,
558
+ };
559
+ if (currentState.kind === 'running' || currentState.kind === 'running-degraded') {
560
+ lifecycleState = {
561
+ kind: 'restarting',
562
+ operationId,
563
+ previousGateway: currentState.gateway,
564
+ };
565
+ }
566
+ const restartOperation = currentState.kind === 'running' || currentState.kind === 'running-degraded'
567
+ ? (async () => {
568
+ await recordLifecycleOperation({
569
+ kind: 'restart-requested',
570
+ operationId,
571
+ operationTrigger: operationContext.operationTrigger,
572
+ previousGateway: gatewayIdentityFor(operationContext.previousGateway),
573
+ });
574
+ const leaseReleaseResult = await releaseZoneLeases(options.zone.id);
575
+ await stopNow('starting', operationContext);
576
+ await startNow(operationGeneration, startOptions, operationContext);
577
+ if (operationGeneration !== lifecycleGeneration) {
578
+ throw new OpenClawZoneRestartTimeoutError(options.zone.id, restartOptions.timeoutMs ?? 0);
579
+ }
580
+ return {
581
+ leaseReleaseFailureCount: leaseReleaseResult.failedLeaseIds.length,
582
+ operationId,
583
+ };
584
+ })()
585
+ : (async () => {
586
+ await recordLifecycleOperation({
587
+ kind: 'cold-start-requested',
588
+ operationId,
589
+ operationTrigger: operationContext.operationTrigger,
590
+ });
591
+ const leaseReleaseResult = await releaseZoneLeases(options.zone.id);
592
+ await closeStaleGatewayBeforeColdStart(operationContext);
593
+ await startNow(operationGeneration, startOptions, operationContext);
594
+ if (operationGeneration !== lifecycleGeneration) {
595
+ throw new OpenClawZoneRestartTimeoutError(options.zone.id, restartOptions.timeoutMs ?? 0);
596
+ }
597
+ return {
598
+ leaseReleaseFailureCount: leaseReleaseResult.failedLeaseIds.length,
599
+ operationId,
600
+ };
601
+ })();
602
+ if (restartOptions.timeoutMs === undefined) {
603
+ return await restartOperation;
604
+ }
605
+ return withLifecycleTimeout({
606
+ operation: restartOperation,
607
+ timeoutMs: restartOptions.timeoutMs,
608
+ });
609
+ });
610
+ };
611
+ const restart = async (restartOptions = {}) => await restartWithStartOptions(restartOptions);
612
+ const coldStartWithStartOptions = async (restartOptions = {}, startOptions = {}, operationMetadata = {}) => {
613
+ return await runLifecycleOperation(async () => {
614
+ lifecycleGeneration += 1;
615
+ const operationGeneration = lifecycleGeneration;
616
+ const operationContext = {
617
+ operationId: operationMetadata.operationId ?? createOperationId('cold-start'),
618
+ operationTrigger: operationMetadata.operationTrigger ?? restartOptions.operationTrigger ?? 'auto-recovery',
619
+ };
620
+ const coldStartOperation = (async () => {
621
+ getLifecycleState();
622
+ await recordLifecycleOperation({
623
+ kind: 'cold-start-requested',
624
+ operationId: operationContext.operationId,
625
+ operationTrigger: operationContext.operationTrigger,
626
+ });
627
+ const leaseReleaseResult = await releaseZoneLeases(options.zone.id);
628
+ await closeStaleGatewayBeforeColdStart(operationContext);
629
+ await startNow(operationGeneration, startOptions, operationContext);
630
+ if (operationGeneration !== lifecycleGeneration) {
631
+ throw new OpenClawZoneRestartTimeoutError(options.zone.id, restartOptions.timeoutMs ?? 0);
632
+ }
633
+ return {
634
+ leaseReleaseFailureCount: leaseReleaseResult.failedLeaseIds.length,
635
+ operationId: operationContext.operationId,
636
+ };
637
+ })();
638
+ if (restartOptions.timeoutMs === undefined) {
639
+ return await coldStartOperation;
640
+ }
641
+ return withLifecycleTimeout({
642
+ operation: coldStartOperation,
643
+ timeoutMs: restartOptions.timeoutMs,
644
+ });
645
+ });
66
646
  };
647
+ const coldStart = async (restartOptions = {}) => await coldStartWithStartOptions(restartOptions);
67
648
  return {
649
+ coldStart,
68
650
  destroy: async (purge) => await (options.runControllerDestroy ?? runControllerDestroyDefault)({ purge, systemConfig: options.systemConfig, zoneId: options.zone.id }, {
69
651
  releaseZoneLeases: async (zoneId) => {
70
- await Promise.all(options.leaseManager
71
- .listLeases()
72
- .filter((activeLease) => activeLease.zoneId === zoneId)
73
- .map(async (lease) => await options.leaseManager.releaseLease(lease.id, { force: true })));
652
+ await releaseZoneLeases(zoneId);
74
653
  },
75
654
  stopGatewayZone: async () => await stop(),
76
655
  }),
77
656
  enableSsh: async () => await requireGateway().vm.enableSsh(),
78
- exec: async (command) => await requireGateway().vm.exec(command),
657
+ exec: async (command) => await executeGatewayCommand(requireGateway(), command),
79
658
  gatewayType: 'openclaw',
80
659
  getHealth: async () => {
660
+ getLifecycleState();
81
661
  const activeGateway = requireGateway();
82
662
  const result = await runGatewayHealthCheck({
83
- exec: async (command) => await activeGateway.vm.exec(command),
663
+ exec: async (command) => await executeGatewayCommand(activeGateway, command),
84
664
  healthCheck: activeGateway.processSpec.healthCheck,
85
665
  });
86
666
  return {
@@ -92,47 +672,127 @@ export function createOpenClawZoneRuntime(options) {
92
672
  zoneId: options.zone.id,
93
673
  };
94
674
  },
675
+ getDiagnosis: () => deriveGatewayDiagnosisSnapshot({
676
+ channelProviderPlane: 'unknown',
677
+ controllerLiveness: 'ok',
678
+ lastOperation,
679
+ originalOutageCause,
680
+ state: getLifecycleState(),
681
+ toolVmPlane: 'unknown',
682
+ }),
95
683
  getLogs: async () => {
96
684
  const activeGateway = requireGateway();
97
685
  return await (options.runControllerLogs ?? runControllerLogsDefault)({ zoneId: options.zone.id }, {
98
686
  readGatewayLogs: async () => (await activeGateway.vm.exec(buildOpenClawCombinedLogsCommand(activeGateway.processSpec.logPath))).stdout,
99
687
  });
100
688
  },
689
+ getLifecycleState,
101
690
  getSnapshot: () => {
102
- if (gateway) {
103
- const hostPid = gateway.vm.getHostPid();
691
+ const currentLifecycleState = getLifecycleState();
692
+ if (currentLifecycleState.kind === 'running') {
693
+ const hostPid = currentLifecycleState.gateway.vm.getHostPid();
694
+ if (hostPid === undefined || hostPid === null) {
695
+ const missingHostPidState = markGatewayHostPidMissing(`Gateway VM host pid is unavailable for zone '${options.zone.id}'.`);
696
+ return {
697
+ lastError: missingHostPidState.error.message,
698
+ lifecycleState: 'failed',
699
+ };
700
+ }
104
701
  return {
105
702
  ...(bootedAt ? { bootedAt } : {}),
106
703
  gateway: {
107
- ingress: gateway.ingress,
704
+ ingress: currentLifecycleState.gateway.ingress,
108
705
  vm: {
109
- ...(hostPid === undefined || hostPid === null ? {} : { hostPid }),
110
- id: gateway.vm.id,
706
+ hostPid,
707
+ id: currentLifecycleState.gateway.vm.id,
111
708
  },
112
709
  },
710
+ ...(lastError ? { lastError } : {}),
113
711
  lifecycleState: 'running',
114
712
  };
115
713
  }
116
714
  return lastError ? { lastError, lifecycleState: 'failed' } : { lifecycleState: 'stopped' };
117
715
  },
118
- refreshCredentials: async () => await (options.runControllerCredentialsRefresh ?? runControllerCredentialsRefreshDefault)({ zoneId: options.zone.id }, {
119
- refreshZoneSecrets: async (zoneId) => {
120
- await resolveZoneSecrets({
121
- audience: 'gateway',
122
- secretResolver: options.secretResolver,
123
- systemConfig: options.systemConfig,
124
- zoneId,
716
+ refreshCredentials: async () => await (async () => {
717
+ const operationId = createOperationId('credentials-refresh');
718
+ const operationTrigger = 'credentials-refresh';
719
+ await recordLifecycleOperation({
720
+ kind: 'credentials-refresh-requested',
721
+ operationId,
722
+ operationTrigger,
723
+ previousGateway: gatewayIdentityFor(gateway),
724
+ });
725
+ const failCredentialsRefreshSecretResolution = async (error) => {
726
+ const classifiedError = {
727
+ code: 'secret-resolution-failed',
728
+ message: formatUnknownError(error),
729
+ };
730
+ const currentLifecycleState = getLifecycleState();
731
+ lastError = classifiedError.message;
732
+ if (currentLifecycleState.kind !== 'running' &&
733
+ currentLifecycleState.kind !== 'running-degraded') {
734
+ lifecycleState = {
735
+ coldStartEligible: true,
736
+ error: classifiedError,
737
+ kind: 'failed',
738
+ };
739
+ }
740
+ await recordLifecycleOperation({
741
+ errorCode: classifiedError.code,
742
+ errorMessage: classifiedError.message,
743
+ kind: 'operation-failed',
744
+ operationId,
745
+ operationTrigger,
746
+ previousGateway: gatewayIdentityFor(gateway),
125
747
  });
126
- },
127
- restartGatewayZone: async () => await restart(),
128
- }),
748
+ throw new ControllerZoneRuntimeStartError(options.zone.id, error, {
749
+ gatewayLifecycleErrorCode: classifiedError.code,
750
+ operationId,
751
+ });
752
+ };
753
+ let refreshedSecretResolver;
754
+ try {
755
+ refreshedSecretResolver = options.createFreshSecretResolver
756
+ ? await options.createFreshSecretResolver()
757
+ : options.secretResolver;
758
+ }
759
+ catch (error) {
760
+ await failCredentialsRefreshSecretResolution(error);
761
+ }
762
+ return await (options.runControllerCredentialsRefresh ?? runControllerCredentialsRefreshDefault)({ zoneId: options.zone.id }, {
763
+ refreshZoneSecrets: async (zoneId) => {
764
+ try {
765
+ await resolveZoneSecrets({
766
+ audience: 'gateway',
767
+ secretResolver: refreshedSecretResolver,
768
+ systemConfig: options.systemConfig,
769
+ zoneId,
770
+ });
771
+ }
772
+ catch (error) {
773
+ await failCredentialsRefreshSecretResolution(error);
774
+ }
775
+ },
776
+ restartGatewayZone: async () => {
777
+ const currentLifecycleState = getLifecycleState();
778
+ if (currentLifecycleState.kind === 'running' ||
779
+ currentLifecycleState.kind === 'running-degraded') {
780
+ await restartWithStartOptions({}, { secretResolver: refreshedSecretResolver }, { operationId, operationTrigger });
781
+ return;
782
+ }
783
+ await coldStartWithStartOptions({}, { secretResolver: refreshedSecretResolver }, { operationId, operationTrigger });
784
+ },
785
+ });
786
+ })(),
129
787
  restart,
130
788
  shutdown: stop,
131
789
  start,
132
790
  stop,
133
791
  upgrade: async () => await (options.runControllerUpgrade ?? runControllerUpgradeDefault)({ systemConfig: options.systemConfig, zoneId: options.zone.id }, {
134
792
  rebuildGatewayImage: async () => { },
135
- restartGatewayZone: async () => await restart(),
793
+ restartGatewayZone: async () => {
794
+ await restart({ operationTrigger: 'upgrade' });
795
+ },
136
796
  }),
137
797
  zoneId: options.zone.id,
138
798
  };