@agent-vm/agent-vm 0.0.92 → 0.0.93
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/build/managed-image-dockerfile.d.ts +2 -1
- package/dist/build/managed-image-dockerfile.d.ts.map +1 -1
- package/dist/build/managed-image-dockerfile.js +51 -27
- package/dist/build/managed-image-dockerfile.js.map +1 -1
- package/dist/cli/commands/controller-definition.d.ts +42 -42
- package/dist/cli/commands/create-app.d.ts +42 -42
- package/dist/cli/manual-templates.d.ts.map +1 -1
- package/dist/cli/manual-templates.js +11 -2
- package/dist/cli/manual-templates.js.map +1 -1
- package/dist/config/system-config.d.ts +7 -0
- package/dist/config/system-config.d.ts.map +1 -1
- package/dist/config/system-config.js +35 -0
- package/dist/config/system-config.js.map +1 -1
- package/dist/controller/controller-runtime-operations.d.ts +1 -0
- package/dist/controller/controller-runtime-operations.d.ts.map +1 -1
- package/dist/controller/controller-runtime-operations.js +2 -0
- package/dist/controller/controller-runtime-operations.js.map +1 -1
- package/dist/controller/controller-runtime-types.d.ts +3 -0
- package/dist/controller/controller-runtime-types.d.ts.map +1 -1
- package/dist/controller/controller-runtime.d.ts +1 -1
- package/dist/controller/controller-runtime.d.ts.map +1 -1
- package/dist/controller/controller-runtime.js +207 -116
- package/dist/controller/controller-runtime.js.map +1 -1
- package/dist/controller/health/channel-provider-recovery-observation.d.ts +23 -0
- package/dist/controller/health/channel-provider-recovery-observation.d.ts.map +1 -0
- package/dist/controller/health/channel-provider-recovery-observation.js +69 -0
- package/dist/controller/health/channel-provider-recovery-observation.js.map +1 -0
- package/dist/controller/health/durable-health-event-log.d.ts +24 -0
- package/dist/controller/health/durable-health-event-log.d.ts.map +1 -0
- package/dist/controller/health/durable-health-event-log.js +89 -0
- package/dist/controller/health/durable-health-event-log.js.map +1 -0
- package/dist/controller/health/gateway-recovery-actions.d.ts +27 -0
- package/dist/controller/health/gateway-recovery-actions.d.ts.map +1 -0
- package/dist/controller/health/gateway-recovery-actions.js +71 -0
- package/dist/controller/health/gateway-recovery-actions.js.map +1 -0
- package/dist/controller/health/gateway-service-health-monitor.d.ts +41 -3
- package/dist/controller/health/gateway-service-health-monitor.d.ts.map +1 -1
- package/dist/controller/health/gateway-service-health-monitor.js +231 -57
- package/dist/controller/health/gateway-service-health-monitor.js.map +1 -1
- package/dist/controller/health/gateway-vm-recovery-policy.d.ts +20 -0
- package/dist/controller/health/gateway-vm-recovery-policy.d.ts.map +1 -1
- package/dist/controller/health/gateway-vm-recovery-policy.js +85 -21
- package/dist/controller/health/gateway-vm-recovery-policy.js.map +1 -1
- package/dist/controller/health/gateway-vm-recovery-runner.d.ts +39 -0
- package/dist/controller/health/gateway-vm-recovery-runner.d.ts.map +1 -0
- package/dist/controller/health/gateway-vm-recovery-runner.js +251 -0
- package/dist/controller/health/gateway-vm-recovery-runner.js.map +1 -0
- package/dist/controller/health/health-event-store.d.ts +4 -0
- package/dist/controller/health/health-event-store.d.ts.map +1 -1
- package/dist/controller/health/health-event-store.js +19 -0
- package/dist/controller/health/health-event-store.js.map +1 -1
- package/dist/controller/http/controller-health-event-routes.d.ts +6 -0
- package/dist/controller/http/controller-health-event-routes.d.ts.map +1 -1
- package/dist/controller/http/controller-health-event-routes.js +49 -0
- package/dist/controller/http/controller-health-event-routes.js.map +1 -1
- package/dist/controller/http/controller-http-routes.d.ts.map +1 -1
- package/dist/controller/http/controller-http-routes.js +6 -0
- package/dist/controller/http/controller-http-routes.js.map +1 -1
- package/dist/controller/leases/lease-manager.d.ts.map +1 -1
- package/dist/controller/leases/lease-manager.js +37 -16
- package/dist/controller/leases/lease-manager.js.map +1 -1
- package/dist/controller/leases/tool-vm-lease-lifecycle.d.ts +44 -0
- package/dist/controller/leases/tool-vm-lease-lifecycle.d.ts.map +1 -0
- package/dist/controller/leases/tool-vm-lease-lifecycle.js +28 -0
- package/dist/controller/leases/tool-vm-lease-lifecycle.js.map +1 -0
- package/dist/controller/zone-runtimes/gateway-lifecycle-operation-record.d.ts +37 -0
- package/dist/controller/zone-runtimes/gateway-lifecycle-operation-record.d.ts.map +1 -0
- package/dist/controller/zone-runtimes/gateway-lifecycle-operation-record.js +133 -0
- package/dist/controller/zone-runtimes/gateway-lifecycle-operation-record.js.map +1 -0
- package/dist/controller/zone-runtimes/gateway-zone-state-machine.d.ts +101 -0
- package/dist/controller/zone-runtimes/gateway-zone-state-machine.d.ts.map +1 -0
- package/dist/controller/zone-runtimes/gateway-zone-state-machine.js +143 -0
- package/dist/controller/zone-runtimes/gateway-zone-state-machine.js.map +1 -0
- package/dist/controller/zone-runtimes/openclaw-zone-runtime.d.ts +8 -1
- package/dist/controller/zone-runtimes/openclaw-zone-runtime.d.ts.map +1 -1
- package/dist/controller/zone-runtimes/openclaw-zone-runtime.js +621 -65
- package/dist/controller/zone-runtimes/openclaw-zone-runtime.js.map +1 -1
- package/dist/controller/zone-runtimes/zone-runtime-errors.d.ts +7 -1
- package/dist/controller/zone-runtimes/zone-runtime-errors.d.ts.map +1 -1
- package/dist/controller/zone-runtimes/zone-runtime-errors.js +5 -1
- package/dist/controller/zone-runtimes/zone-runtime-errors.js.map +1 -1
- package/dist/controller/zone-runtimes/zone-runtime-registry.d.ts +2 -0
- package/dist/controller/zone-runtimes/zone-runtime-registry.d.ts.map +1 -1
- package/dist/controller/zone-runtimes/zone-runtime-registry.js +23 -0
- package/dist/controller/zone-runtimes/zone-runtime-registry.js.map +1 -1
- package/dist/controller/zone-runtimes/zone-runtime-types.d.ts +7 -0
- package/dist/controller/zone-runtimes/zone-runtime-types.d.ts.map +1 -1
- package/dist/gateway/gateway-ownership-evidence.d.ts +35 -0
- package/dist/gateway/gateway-ownership-evidence.d.ts.map +1 -0
- package/dist/gateway/gateway-ownership-evidence.js +10 -0
- package/dist/gateway/gateway-ownership-evidence.js.map +1 -0
- package/dist/gateway/gateway-recovery.d.ts +16 -0
- package/dist/gateway/gateway-recovery.d.ts.map +1 -1
- package/dist/gateway/gateway-recovery.js +105 -9
- package/dist/gateway/gateway-recovery.js.map +1 -1
- package/dist/gateway/gateway-zone-orchestrator.d.ts.map +1 -1
- package/dist/gateway/gateway-zone-orchestrator.js +50 -39
- package/dist/gateway/gateway-zone-orchestrator.js.map +1 -1
- package/dist/integration-tests/{smoke-harness.d.ts → e2e-harness.d.ts} +45 -37
- package/dist/integration-tests/e2e-harness.d.ts.map +1 -0
- package/dist/integration-tests/{smoke-harness.js → e2e-harness.js} +112 -94
- package/dist/integration-tests/e2e-harness.js.map +1 -0
- package/dist/integration-tests/e2e-workspace-build-global-setup.d.ts +16 -0
- package/dist/integration-tests/e2e-workspace-build-global-setup.d.ts.map +1 -0
- package/dist/integration-tests/e2e-workspace-build-global-setup.js +27 -0
- package/dist/integration-tests/e2e-workspace-build-global-setup.js.map +1 -0
- package/dist/integration-tests/live-agent-model-roundtrip-deployment.d.ts +11 -0
- package/dist/integration-tests/live-agent-model-roundtrip-deployment.d.ts.map +1 -0
- package/dist/integration-tests/live-agent-model-roundtrip-deployment.js +48 -0
- package/dist/integration-tests/live-agent-model-roundtrip-deployment.js.map +1 -0
- package/dist/integration-tests/live-agent-model-roundtrip-gates.d.ts +11 -0
- package/dist/integration-tests/live-agent-model-roundtrip-gates.d.ts.map +1 -0
- package/dist/integration-tests/live-agent-model-roundtrip-gates.js +21 -0
- package/dist/integration-tests/live-agent-model-roundtrip-gates.js.map +1 -0
- package/dist/integration-tests/live-vm-e2e-gates.d.ts +2 -0
- package/dist/integration-tests/live-vm-e2e-gates.d.ts.map +1 -0
- package/dist/integration-tests/live-vm-e2e-gates.js +4 -0
- package/dist/integration-tests/live-vm-e2e-gates.js.map +1 -0
- package/dist/operations/controller-status.d.ts +5 -0
- package/dist/operations/controller-status.d.ts.map +1 -1
- package/dist/operations/controller-status.js +42 -0
- package/dist/operations/controller-status.js.map +1 -1
- package/package.json +11 -11
- package/dist/integration-tests/live-integration-gates.d.ts +0 -2
- package/dist/integration-tests/live-integration-gates.d.ts.map +0 -1
- package/dist/integration-tests/live-integration-gates.js +0 -4
- package/dist/integration-tests/live-integration-gates.js.map +0 -1
- package/dist/integration-tests/smoke-harness.d.ts.map +0 -1
- package/dist/integration-tests/smoke-harness.js.map +0 -1
|
@@ -1,13 +1,26 @@
|
|
|
1
|
+
import { randomUUID } from 'node:crypto';
|
|
1
2
|
import { resolveZoneSecrets } from '../../gateway/credential-manager.js';
|
|
2
3
|
import { runGatewayHealthCheck } from '../../gateway/gateway-health-check.js';
|
|
4
|
+
import { GatewayOwnershipUnsafeError } from '../../gateway/gateway-ownership-evidence.js';
|
|
3
5
|
import { deleteGatewayRuntimeRecord as deleteGatewayRuntimeRecordDefault } from '../../gateway/gateway-runtime-record.js';
|
|
4
6
|
import { startGatewayZone } from '../../gateway/gateway-zone-orchestrator.js';
|
|
5
7
|
import { runControllerCredentialsRefresh as runControllerCredentialsRefreshDefault } from '../../operations/credentials-refresh.js';
|
|
6
8
|
import { runControllerDestroy as runControllerDestroyDefault } from '../../operations/destroy-zone.js';
|
|
7
9
|
import { runControllerUpgrade as runControllerUpgradeDefault } from '../../operations/upgrade-zone.js';
|
|
8
10
|
import { runControllerLogs as runControllerLogsDefault } from '../../operations/zone-logs.js';
|
|
11
|
+
import { isProcessAlive as defaultIsProcessAlive } from '../../shared/managed-vm-process.js';
|
|
12
|
+
import { appendGatewayLifecycleOperationRecord as appendGatewayLifecycleOperationRecordDefault, } from './gateway-lifecycle-operation-record.js';
|
|
13
|
+
import { classifyGatewayStartError, deriveGatewayDiagnosisSnapshot, } from './gateway-zone-state-machine.js';
|
|
9
14
|
import { ControllerZoneRuntimeStartError, ControllerZoneRuntimeUnavailableError, } from './zone-runtime-errors.js';
|
|
10
15
|
const defaultGatewayCloseTimeoutMs = 60_000;
|
|
16
|
+
function isLifecycleOperationExecutionWithLock(execution) {
|
|
17
|
+
return typeof execution === 'object' && execution !== null && 'lock' in execution;
|
|
18
|
+
}
|
|
19
|
+
function isRecoverySecretResolutionFailure(record) {
|
|
20
|
+
return (record.errorCode === 'secret-resolution-failed' &&
|
|
21
|
+
(record.operationTrigger === 'auto-recovery' ||
|
|
22
|
+
record.operationTrigger === 'credentials-refresh'));
|
|
23
|
+
}
|
|
11
24
|
class OpenClawZoneRestartTimeoutError extends Error {
|
|
12
25
|
code = 'OPENCLAW_GATEWAY_RESTART_TIMEOUT';
|
|
13
26
|
constructor(zoneId, timeoutMs) {
|
|
@@ -33,32 +46,280 @@ function buildOpenClawCombinedLogsCommand(logPath) {
|
|
|
33
46
|
function writeOpenClawZoneRuntimeLog(message) {
|
|
34
47
|
process.stderr.write(`[openclaw-zone-runtime] ${message}\n`);
|
|
35
48
|
}
|
|
49
|
+
function unavailableReasonForState(state) {
|
|
50
|
+
switch (state.kind) {
|
|
51
|
+
case 'failed':
|
|
52
|
+
return state.error.message;
|
|
53
|
+
case 'owner-unsafe':
|
|
54
|
+
return `Gateway runtime ownership is unsafe: ${state.evidence.kind}.`;
|
|
55
|
+
case 'restarting':
|
|
56
|
+
case 'starting':
|
|
57
|
+
case 'stopping':
|
|
58
|
+
return `Gateway runtime is ${state.kind}.`;
|
|
59
|
+
case 'running':
|
|
60
|
+
case 'running-degraded':
|
|
61
|
+
case 'stopped':
|
|
62
|
+
return undefined;
|
|
63
|
+
}
|
|
64
|
+
return assertNeverGatewayZoneLifecycleState(state);
|
|
65
|
+
}
|
|
66
|
+
function assertNeverGatewayZoneLifecycleState(state) {
|
|
67
|
+
throw new Error(`Unhandled gateway zone lifecycle state: ${JSON.stringify(state)}`);
|
|
68
|
+
}
|
|
69
|
+
function assertNeverGatewayLifecycleOperationRecordKind(kind) {
|
|
70
|
+
throw new Error(`Unhandled gateway lifecycle operation record kind: ${String(kind)}`);
|
|
71
|
+
}
|
|
72
|
+
function gatewayIdentityFor(runtimeGateway) {
|
|
73
|
+
if (!runtimeGateway) {
|
|
74
|
+
return undefined;
|
|
75
|
+
}
|
|
76
|
+
const hostPid = runtimeGateway.vm.getHostPid();
|
|
77
|
+
return {
|
|
78
|
+
...(typeof hostPid === 'number' && hostPid > 0 ? { hostPid } : {}),
|
|
79
|
+
vmId: runtimeGateway.vm.id,
|
|
80
|
+
};
|
|
81
|
+
}
|
|
82
|
+
async function executeGatewayCommand(runtimeGateway, command) {
|
|
83
|
+
const result = await runtimeGateway.vm.exec(command);
|
|
84
|
+
return {
|
|
85
|
+
exitCode: result.exitCode,
|
|
86
|
+
stderr: result.stderr,
|
|
87
|
+
stdout: result.stdout,
|
|
88
|
+
};
|
|
89
|
+
}
|
|
36
90
|
export function createOpenClawZoneRuntime(options) {
|
|
37
91
|
const clearTimeoutImpl = options.clearTimeoutImpl ?? clearTimeout;
|
|
38
92
|
const closeGatewayTimeoutMs = options.closeGatewayTimeoutMs ?? defaultGatewayCloseTimeoutMs;
|
|
93
|
+
const isProcessAlive = options.isProcessAlive ?? defaultIsProcessAlive;
|
|
39
94
|
const setTimeoutImpl = options.setTimeoutImpl ?? setTimeout;
|
|
95
|
+
const appendGatewayLifecycleOperationRecord = options.appendGatewayLifecycleOperationRecord;
|
|
40
96
|
let gateway;
|
|
41
97
|
let bootedAt;
|
|
42
98
|
let lastError;
|
|
99
|
+
let lastOperation = 'none';
|
|
100
|
+
let originalOutageCause = { kind: 'unknown' };
|
|
101
|
+
let lifecycleState = { kind: 'stopped' };
|
|
43
102
|
let lifecycleOperation = Promise.resolve();
|
|
44
103
|
let lifecycleGeneration = 0;
|
|
45
|
-
|
|
46
|
-
|
|
104
|
+
let staleGatewayPendingClose;
|
|
105
|
+
const startGateway = async (startOptions = {}) => options.restartGatewayZone
|
|
106
|
+
? await options.restartGatewayZone(options.zone.id, startOptions)
|
|
47
107
|
: await startGatewayZone({
|
|
48
|
-
secretResolver: options.secretResolver,
|
|
108
|
+
secretResolver: startOptions.secretResolver ?? options.secretResolver,
|
|
49
109
|
systemConfig: options.systemConfig,
|
|
50
110
|
zoneId: options.zone.id,
|
|
51
111
|
});
|
|
52
112
|
const requireGateway = () => {
|
|
53
|
-
|
|
54
|
-
|
|
113
|
+
const currentState = getLifecycleState();
|
|
114
|
+
if (currentState.kind !== 'running' && currentState.kind !== 'running-degraded') {
|
|
115
|
+
throw new ControllerZoneRuntimeUnavailableError(options.zone.id, lastError ?? unavailableReasonForState(currentState));
|
|
116
|
+
}
|
|
117
|
+
return currentState.gateway;
|
|
118
|
+
};
|
|
119
|
+
const createOperationId = (operationName) => `${options.zone.id}-${operationName}-${randomUUID()}`;
|
|
120
|
+
const operationForRecordKind = (kind) => {
|
|
121
|
+
switch (kind) {
|
|
122
|
+
case 'cold-start-requested':
|
|
123
|
+
return 'cold-start';
|
|
124
|
+
case 'credentials-refresh-requested':
|
|
125
|
+
return 'credentials-refresh';
|
|
126
|
+
case 'restart-requested':
|
|
127
|
+
return 'restart';
|
|
128
|
+
case 'start-requested':
|
|
129
|
+
return 'start';
|
|
130
|
+
case 'stop-requested':
|
|
131
|
+
return 'stop';
|
|
132
|
+
case 'operation-failed':
|
|
133
|
+
case 'operation-finished':
|
|
134
|
+
case 'runtime-record-deleted':
|
|
135
|
+
case 'runtime-record-written':
|
|
136
|
+
case 'vm-close-finished':
|
|
137
|
+
case 'vm-close-started':
|
|
138
|
+
return undefined;
|
|
139
|
+
}
|
|
140
|
+
return assertNeverGatewayLifecycleOperationRecordKind(kind);
|
|
141
|
+
};
|
|
142
|
+
const setOriginalOutageCauseIfUnknown = (errorCode) => {
|
|
143
|
+
if (originalOutageCause.kind !== 'unknown') {
|
|
144
|
+
return;
|
|
145
|
+
}
|
|
146
|
+
originalOutageCause = {
|
|
147
|
+
...(errorCode === undefined ? {} : { errorCode }),
|
|
148
|
+
eventKind: 'gateway-lifecycle-operation',
|
|
149
|
+
kind: 'proven',
|
|
150
|
+
};
|
|
151
|
+
};
|
|
152
|
+
const recordLifecycleOperation = async (record) => {
|
|
153
|
+
const operation = operationForRecordKind(record.kind);
|
|
154
|
+
if (operation !== undefined) {
|
|
155
|
+
lastOperation = operation;
|
|
156
|
+
}
|
|
157
|
+
if (record.kind === 'operation-failed' && !isRecoverySecretResolutionFailure(record)) {
|
|
158
|
+
setOriginalOutageCauseIfUnknown(record.errorCode);
|
|
159
|
+
}
|
|
160
|
+
const operationRecord = {
|
|
161
|
+
controllerPid: process.pid,
|
|
162
|
+
gatewayType: 'openclaw',
|
|
163
|
+
observedAtMs: options.now(),
|
|
164
|
+
zoneId: options.zone.id,
|
|
165
|
+
...record,
|
|
166
|
+
};
|
|
167
|
+
try {
|
|
168
|
+
if (appendGatewayLifecycleOperationRecord) {
|
|
169
|
+
await appendGatewayLifecycleOperationRecord(operationRecord);
|
|
170
|
+
return;
|
|
171
|
+
}
|
|
172
|
+
await appendGatewayLifecycleOperationRecordDefault({
|
|
173
|
+
record: operationRecord,
|
|
174
|
+
runtimeDir: options.systemConfig.runtimeDir,
|
|
175
|
+
zoneId: options.zone.id,
|
|
176
|
+
});
|
|
177
|
+
}
|
|
178
|
+
catch (error) {
|
|
179
|
+
writeOpenClawZoneRuntimeLog(`failed to append gateway lifecycle operation record for zone '${options.zone.id}': ${formatUnknownError(error)}`);
|
|
180
|
+
}
|
|
181
|
+
};
|
|
182
|
+
const markGatewayHostPidMissing = (message) => {
|
|
183
|
+
if (staleGatewayPendingClose === undefined &&
|
|
184
|
+
(lifecycleState.kind === 'running' || lifecycleState.kind === 'running-degraded')) {
|
|
185
|
+
staleGatewayPendingClose = lifecycleState.gateway;
|
|
186
|
+
}
|
|
187
|
+
const errorMessage = `vm-process-missing: ${message}`;
|
|
188
|
+
setOriginalOutageCauseIfUnknown('vm-process-missing');
|
|
189
|
+
gateway = undefined;
|
|
190
|
+
bootedAt = undefined;
|
|
191
|
+
lastError = errorMessage;
|
|
192
|
+
lifecycleState = {
|
|
193
|
+
coldStartEligible: true,
|
|
194
|
+
error: { code: 'vm-process-missing', message: errorMessage },
|
|
195
|
+
kind: 'failed',
|
|
196
|
+
};
|
|
197
|
+
return lifecycleState;
|
|
198
|
+
};
|
|
199
|
+
const closeStaleGatewayBeforeColdStart = async (operationContext) => {
|
|
200
|
+
const staleGateway = staleGatewayPendingClose;
|
|
201
|
+
if (!staleGateway) {
|
|
202
|
+
return;
|
|
203
|
+
}
|
|
204
|
+
staleGatewayPendingClose = undefined;
|
|
205
|
+
try {
|
|
206
|
+
await recordLifecycleOperation({
|
|
207
|
+
kind: 'vm-close-started',
|
|
208
|
+
operationId: operationContext.operationId,
|
|
209
|
+
operationTrigger: operationContext.operationTrigger,
|
|
210
|
+
previousGateway: gatewayIdentityFor(staleGateway),
|
|
211
|
+
});
|
|
212
|
+
await closeGatewayWithDeadline(staleGateway);
|
|
213
|
+
await recordLifecycleOperation({
|
|
214
|
+
kind: 'vm-close-finished',
|
|
215
|
+
operationId: operationContext.operationId,
|
|
216
|
+
operationTrigger: operationContext.operationTrigger,
|
|
217
|
+
previousGateway: gatewayIdentityFor(staleGateway),
|
|
218
|
+
});
|
|
219
|
+
}
|
|
220
|
+
catch (error) {
|
|
221
|
+
staleGatewayPendingClose = staleGateway;
|
|
222
|
+
lastError = formatUnknownError(error);
|
|
223
|
+
lifecycleState = {
|
|
224
|
+
coldStartEligible: false,
|
|
225
|
+
error: {
|
|
226
|
+
code: 'owner-unsafe',
|
|
227
|
+
message: lastError,
|
|
228
|
+
},
|
|
229
|
+
kind: 'failed',
|
|
230
|
+
};
|
|
231
|
+
await recordLifecycleOperation({
|
|
232
|
+
errorCode: 'owner-unsafe',
|
|
233
|
+
errorMessage: lastError,
|
|
234
|
+
kind: 'operation-failed',
|
|
235
|
+
operationId: operationContext.operationId,
|
|
236
|
+
operationTrigger: operationContext.operationTrigger,
|
|
237
|
+
previousGateway: gatewayIdentityFor(staleGateway),
|
|
238
|
+
});
|
|
239
|
+
throw error;
|
|
240
|
+
}
|
|
241
|
+
};
|
|
242
|
+
const classifyLastError = (message) => {
|
|
243
|
+
if (message.startsWith('vm-process-missing:')) {
|
|
244
|
+
return {
|
|
245
|
+
coldStartEligible: true,
|
|
246
|
+
error: { code: 'vm-process-missing', message },
|
|
247
|
+
kind: 'failed',
|
|
248
|
+
};
|
|
55
249
|
}
|
|
56
|
-
|
|
250
|
+
const error = classifyGatewayStartError(new Error(message));
|
|
251
|
+
return {
|
|
252
|
+
coldStartEligible: error.code !== 'owner-unsafe',
|
|
253
|
+
error,
|
|
254
|
+
kind: 'failed',
|
|
255
|
+
};
|
|
256
|
+
};
|
|
257
|
+
const getLifecycleState = () => {
|
|
258
|
+
if (lifecycleState.kind === 'running' || lifecycleState.kind === 'running-degraded') {
|
|
259
|
+
const hostPid = lifecycleState.gateway.vm.getHostPid();
|
|
260
|
+
if (hostPid === undefined || hostPid === null) {
|
|
261
|
+
return markGatewayHostPidMissing(`Gateway VM host pid is unavailable for zone '${options.zone.id}'.`);
|
|
262
|
+
}
|
|
263
|
+
if (!isProcessAlive(hostPid)) {
|
|
264
|
+
return markGatewayHostPidMissing(`Gateway VM host pid ${String(hostPid)} is not alive for zone '${options.zone.id}'.`);
|
|
265
|
+
}
|
|
266
|
+
return lifecycleState;
|
|
267
|
+
}
|
|
268
|
+
if (lifecycleState.kind === 'failed' || lifecycleState.kind === 'owner-unsafe') {
|
|
269
|
+
return lifecycleState;
|
|
270
|
+
}
|
|
271
|
+
if (lifecycleState.kind === 'starting' ||
|
|
272
|
+
lifecycleState.kind === 'stopping' ||
|
|
273
|
+
lifecycleState.kind === 'restarting') {
|
|
274
|
+
return lifecycleState;
|
|
275
|
+
}
|
|
276
|
+
if (lastError) {
|
|
277
|
+
lifecycleState = classifyLastError(lastError);
|
|
278
|
+
return lifecycleState;
|
|
279
|
+
}
|
|
280
|
+
return lifecycleState;
|
|
57
281
|
};
|
|
58
282
|
const runLifecycleOperation = async (operation) => {
|
|
59
|
-
const
|
|
60
|
-
|
|
61
|
-
|
|
283
|
+
const runAfterPrevious = async () => {
|
|
284
|
+
await lifecycleOperation.catch(() => undefined);
|
|
285
|
+
return await operation();
|
|
286
|
+
};
|
|
287
|
+
const executionPromise = runAfterPrevious();
|
|
288
|
+
const operationResultPromise = executionPromise.then(async (execution) => {
|
|
289
|
+
if (isLifecycleOperationExecutionWithLock(execution)) {
|
|
290
|
+
return await execution.publicResult;
|
|
291
|
+
}
|
|
292
|
+
return await execution;
|
|
293
|
+
});
|
|
294
|
+
lifecycleOperation = executionPromise
|
|
295
|
+
.then(async (execution) => {
|
|
296
|
+
if (isLifecycleOperationExecutionWithLock(execution)) {
|
|
297
|
+
await execution.lock;
|
|
298
|
+
return;
|
|
299
|
+
}
|
|
300
|
+
await execution;
|
|
301
|
+
})
|
|
302
|
+
.then(() => undefined, () => undefined);
|
|
303
|
+
return await operationResultPromise;
|
|
304
|
+
};
|
|
305
|
+
const withLifecycleTimeout = (props) => {
|
|
306
|
+
let timeout;
|
|
307
|
+
const timeoutPromise = new Promise((_resolve, reject) => {
|
|
308
|
+
timeout = setTimeoutImpl(() => {
|
|
309
|
+
lifecycleGeneration += 1;
|
|
310
|
+
reject(new OpenClawZoneRestartTimeoutError(options.zone.id, props.timeoutMs));
|
|
311
|
+
}, props.timeoutMs);
|
|
312
|
+
timeout.unref?.();
|
|
313
|
+
});
|
|
314
|
+
const publicResult = Promise.race([props.operation, timeoutPromise]).finally(() => {
|
|
315
|
+
if (timeout) {
|
|
316
|
+
clearTimeoutImpl(timeout);
|
|
317
|
+
}
|
|
318
|
+
});
|
|
319
|
+
return {
|
|
320
|
+
lock: props.operation.then(() => undefined, () => undefined),
|
|
321
|
+
publicResult,
|
|
322
|
+
};
|
|
62
323
|
};
|
|
63
324
|
const releaseZoneLeases = async (zoneId) => {
|
|
64
325
|
const leases = options.leaseManager
|
|
@@ -95,24 +356,125 @@ export function createOpenClawZoneRuntime(options) {
|
|
|
95
356
|
}
|
|
96
357
|
}
|
|
97
358
|
};
|
|
98
|
-
const stopNow = async () => {
|
|
359
|
+
const stopNow = async (next = 'stopped', operationContext) => {
|
|
99
360
|
const activeGateway = gateway;
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
361
|
+
const operationId = operationContext?.operationId ?? createOperationId('stop');
|
|
362
|
+
const operationTrigger = operationContext?.operationTrigger ?? 'operator-stop';
|
|
363
|
+
const previousGateway = operationContext?.previousGateway ?? activeGateway;
|
|
364
|
+
lifecycleState = {
|
|
365
|
+
kind: 'stopping',
|
|
366
|
+
next,
|
|
367
|
+
operationId,
|
|
368
|
+
previousGateway,
|
|
369
|
+
};
|
|
370
|
+
try {
|
|
371
|
+
await recordLifecycleOperation({
|
|
372
|
+
kind: 'stop-requested',
|
|
373
|
+
operationId,
|
|
374
|
+
operationTrigger,
|
|
375
|
+
previousGateway: gatewayIdentityFor(previousGateway),
|
|
376
|
+
});
|
|
377
|
+
if (activeGateway) {
|
|
378
|
+
await recordLifecycleOperation({
|
|
379
|
+
kind: 'vm-close-started',
|
|
380
|
+
operationId,
|
|
381
|
+
operationTrigger,
|
|
382
|
+
previousGateway: gatewayIdentityFor(previousGateway),
|
|
383
|
+
});
|
|
384
|
+
await closeGatewayWithDeadline(activeGateway);
|
|
385
|
+
await recordLifecycleOperation({
|
|
386
|
+
kind: 'vm-close-finished',
|
|
387
|
+
operationId,
|
|
388
|
+
operationTrigger,
|
|
389
|
+
previousGateway: gatewayIdentityFor(previousGateway),
|
|
390
|
+
});
|
|
391
|
+
}
|
|
392
|
+
await (options.deleteGatewayRuntimeRecord ?? deleteGatewayRuntimeRecordDefault)(options.zone.gateway.stateDir);
|
|
393
|
+
await recordLifecycleOperation({
|
|
394
|
+
kind: 'runtime-record-deleted',
|
|
395
|
+
operationId,
|
|
396
|
+
operationTrigger,
|
|
397
|
+
previousGateway: gatewayIdentityFor(previousGateway),
|
|
398
|
+
});
|
|
399
|
+
gateway = undefined;
|
|
400
|
+
bootedAt = undefined;
|
|
401
|
+
lastError = undefined;
|
|
402
|
+
lifecycleState = { kind: 'stopped' };
|
|
403
|
+
}
|
|
404
|
+
catch (error) {
|
|
405
|
+
lastError = formatUnknownError(error);
|
|
406
|
+
lifecycleState = {
|
|
407
|
+
coldStartEligible: false,
|
|
408
|
+
error: {
|
|
409
|
+
code: 'owner-unsafe',
|
|
410
|
+
message: lastError,
|
|
411
|
+
},
|
|
412
|
+
kind: 'failed',
|
|
413
|
+
};
|
|
414
|
+
await recordLifecycleOperation({
|
|
415
|
+
errorCode: 'owner-unsafe',
|
|
416
|
+
errorMessage: lastError,
|
|
417
|
+
kind: 'operation-failed',
|
|
418
|
+
operationId,
|
|
419
|
+
operationTrigger,
|
|
420
|
+
previousGateway: gatewayIdentityFor(previousGateway),
|
|
421
|
+
});
|
|
422
|
+
throw error;
|
|
105
423
|
}
|
|
106
|
-
await (options.deleteGatewayRuntimeRecord ?? deleteGatewayRuntimeRecordDefault)(options.zone.gateway.stateDir);
|
|
107
424
|
};
|
|
108
|
-
const startNow = async (expectedGeneration) => {
|
|
425
|
+
const startNow = async (expectedGeneration, startOptions = {}, operationContext) => {
|
|
426
|
+
const operationId = operationContext?.operationId ?? createOperationId('start');
|
|
427
|
+
const operationTrigger = operationContext?.operationTrigger ?? 'operator-start';
|
|
428
|
+
lifecycleState = {
|
|
429
|
+
kind: 'starting',
|
|
430
|
+
operationId,
|
|
431
|
+
startedAtMs: options.now(),
|
|
432
|
+
};
|
|
109
433
|
try {
|
|
110
|
-
|
|
434
|
+
await recordLifecycleOperation({
|
|
435
|
+
kind: 'start-requested',
|
|
436
|
+
operationId,
|
|
437
|
+
operationTrigger,
|
|
438
|
+
previousGateway: gatewayIdentityFor(operationContext?.previousGateway),
|
|
439
|
+
});
|
|
440
|
+
const startedGateway = await startGateway(startOptions);
|
|
111
441
|
if (expectedGeneration !== undefined && expectedGeneration !== lifecycleGeneration) {
|
|
112
442
|
try {
|
|
113
443
|
await closeGatewayWithDeadline(startedGateway);
|
|
444
|
+
if (lifecycleGeneration === expectedGeneration + 1) {
|
|
445
|
+
await (options.deleteGatewayRuntimeRecord ?? deleteGatewayRuntimeRecordDefault)(options.zone.gateway.stateDir);
|
|
446
|
+
await recordLifecycleOperation({
|
|
447
|
+
currentGateway: gatewayIdentityFor(startedGateway),
|
|
448
|
+
kind: 'runtime-record-deleted',
|
|
449
|
+
operationId,
|
|
450
|
+
operationTrigger,
|
|
451
|
+
previousGateway: gatewayIdentityFor(operationContext?.previousGateway),
|
|
452
|
+
});
|
|
453
|
+
}
|
|
454
|
+
lastError = `stale-generation-closed: Closed stale gateway start for zone '${options.zone.id}'.`;
|
|
455
|
+
lifecycleState = classifyLastError(lastError);
|
|
456
|
+
await recordLifecycleOperation({
|
|
457
|
+
currentGateway: gatewayIdentityFor(startedGateway),
|
|
458
|
+
errorCode: 'stale-generation-closed',
|
|
459
|
+
errorMessage: lastError,
|
|
460
|
+
kind: 'operation-failed',
|
|
461
|
+
operationId,
|
|
462
|
+
operationTrigger,
|
|
463
|
+
previousGateway: gatewayIdentityFor(operationContext?.previousGateway),
|
|
464
|
+
});
|
|
114
465
|
}
|
|
115
466
|
catch (error) {
|
|
467
|
+
lastError = `stale-generation-closed: Failed to close stale gateway start for zone '${options.zone.id}': ${formatUnknownError(error)}`;
|
|
468
|
+
lifecycleState = classifyLastError(lastError);
|
|
469
|
+
await recordLifecycleOperation({
|
|
470
|
+
currentGateway: gatewayIdentityFor(startedGateway),
|
|
471
|
+
errorCode: 'stale-generation-closed',
|
|
472
|
+
errorMessage: lastError,
|
|
473
|
+
kind: 'operation-failed',
|
|
474
|
+
operationId,
|
|
475
|
+
operationTrigger,
|
|
476
|
+
previousGateway: gatewayIdentityFor(operationContext?.previousGateway),
|
|
477
|
+
});
|
|
116
478
|
writeOpenClawZoneRuntimeLog(`stale gateway start cleanup failed for zone '${options.zone.id}': ${formatUnknownError(error)}`);
|
|
117
479
|
}
|
|
118
480
|
return;
|
|
@@ -120,54 +482,171 @@ export function createOpenClawZoneRuntime(options) {
|
|
|
120
482
|
gateway = startedGateway;
|
|
121
483
|
bootedAt = new Date(options.now()).toISOString();
|
|
122
484
|
lastError = undefined;
|
|
485
|
+
lifecycleState = { gateway: startedGateway, kind: 'running' };
|
|
486
|
+
await recordLifecycleOperation({
|
|
487
|
+
currentGateway: gatewayIdentityFor(startedGateway),
|
|
488
|
+
kind: 'operation-finished',
|
|
489
|
+
operationId,
|
|
490
|
+
operationTrigger,
|
|
491
|
+
previousGateway: gatewayIdentityFor(operationContext?.previousGateway),
|
|
492
|
+
});
|
|
123
493
|
}
|
|
124
494
|
catch (error) {
|
|
495
|
+
if (error instanceof GatewayOwnershipUnsafeError) {
|
|
496
|
+
gateway = undefined;
|
|
497
|
+
bootedAt = undefined;
|
|
498
|
+
lastError = error.message;
|
|
499
|
+
lifecycleState = {
|
|
500
|
+
evidence: error.evidence,
|
|
501
|
+
kind: 'owner-unsafe',
|
|
502
|
+
};
|
|
503
|
+
await recordLifecycleOperation({
|
|
504
|
+
errorCode: 'owner-unsafe',
|
|
505
|
+
errorMessage: error.message,
|
|
506
|
+
kind: 'operation-failed',
|
|
507
|
+
operationId,
|
|
508
|
+
operationTrigger,
|
|
509
|
+
previousGateway: gatewayIdentityFor(operationContext?.previousGateway),
|
|
510
|
+
});
|
|
511
|
+
throw new ControllerZoneRuntimeStartError(options.zone.id, error, {
|
|
512
|
+
gatewayLifecycleErrorCode: 'owner-unsafe',
|
|
513
|
+
operationId,
|
|
514
|
+
});
|
|
515
|
+
}
|
|
516
|
+
const classifiedError = classifyGatewayStartError(error);
|
|
125
517
|
gateway = undefined;
|
|
126
518
|
bootedAt = undefined;
|
|
127
519
|
lastError = formatUnknownError(error);
|
|
128
|
-
|
|
520
|
+
lifecycleState = {
|
|
521
|
+
coldStartEligible: true,
|
|
522
|
+
error: classifiedError,
|
|
523
|
+
kind: 'failed',
|
|
524
|
+
};
|
|
525
|
+
await recordLifecycleOperation({
|
|
526
|
+
errorCode: classifiedError.code,
|
|
527
|
+
errorMessage: classifiedError.message,
|
|
528
|
+
kind: 'operation-failed',
|
|
529
|
+
operationId,
|
|
530
|
+
operationTrigger,
|
|
531
|
+
previousGateway: gatewayIdentityFor(operationContext?.previousGateway),
|
|
532
|
+
});
|
|
533
|
+
throw new ControllerZoneRuntimeStartError(options.zone.id, error, {
|
|
534
|
+
gatewayLifecycleErrorCode: classifiedError.code,
|
|
535
|
+
operationId,
|
|
536
|
+
});
|
|
129
537
|
}
|
|
130
538
|
};
|
|
131
539
|
const stop = async () => await runLifecycleOperation(async () => await stopNow());
|
|
132
|
-
const start = async () => await runLifecycleOperation(async () => await startNow(
|
|
133
|
-
|
|
540
|
+
const start = async () => await runLifecycleOperation(async () => await startNow(undefined, {}, {
|
|
541
|
+
operationId: createOperationId('start'),
|
|
542
|
+
operationTrigger: 'controller-start',
|
|
543
|
+
}));
|
|
544
|
+
const restartWithStartOptions = async (restartOptions = {}, startOptions = {}, operationMetadata = {}) => {
|
|
134
545
|
return await runLifecycleOperation(async () => {
|
|
135
546
|
lifecycleGeneration += 1;
|
|
136
547
|
const operationGeneration = lifecycleGeneration;
|
|
137
|
-
const
|
|
548
|
+
const currentState = getLifecycleState();
|
|
549
|
+
const operationId = operationMetadata.operationId ?? createOperationId('restart');
|
|
550
|
+
const operationContext = {
|
|
551
|
+
operationId,
|
|
552
|
+
operationTrigger: operationMetadata.operationTrigger ??
|
|
553
|
+
restartOptions.operationTrigger ??
|
|
554
|
+
'operator-restart',
|
|
555
|
+
previousGateway: currentState.kind === 'running' || currentState.kind === 'running-degraded'
|
|
556
|
+
? currentState.gateway
|
|
557
|
+
: undefined,
|
|
558
|
+
};
|
|
559
|
+
if (currentState.kind === 'running' || currentState.kind === 'running-degraded') {
|
|
560
|
+
lifecycleState = {
|
|
561
|
+
kind: 'restarting',
|
|
562
|
+
operationId,
|
|
563
|
+
previousGateway: currentState.gateway,
|
|
564
|
+
};
|
|
565
|
+
}
|
|
566
|
+
const restartOperation = currentState.kind === 'running' || currentState.kind === 'running-degraded'
|
|
567
|
+
? (async () => {
|
|
568
|
+
await recordLifecycleOperation({
|
|
569
|
+
kind: 'restart-requested',
|
|
570
|
+
operationId,
|
|
571
|
+
operationTrigger: operationContext.operationTrigger,
|
|
572
|
+
previousGateway: gatewayIdentityFor(operationContext.previousGateway),
|
|
573
|
+
});
|
|
574
|
+
const leaseReleaseResult = await releaseZoneLeases(options.zone.id);
|
|
575
|
+
await stopNow('starting', operationContext);
|
|
576
|
+
await startNow(operationGeneration, startOptions, operationContext);
|
|
577
|
+
if (operationGeneration !== lifecycleGeneration) {
|
|
578
|
+
throw new OpenClawZoneRestartTimeoutError(options.zone.id, restartOptions.timeoutMs ?? 0);
|
|
579
|
+
}
|
|
580
|
+
return {
|
|
581
|
+
leaseReleaseFailureCount: leaseReleaseResult.failedLeaseIds.length,
|
|
582
|
+
operationId,
|
|
583
|
+
};
|
|
584
|
+
})()
|
|
585
|
+
: (async () => {
|
|
586
|
+
await recordLifecycleOperation({
|
|
587
|
+
kind: 'cold-start-requested',
|
|
588
|
+
operationId,
|
|
589
|
+
operationTrigger: operationContext.operationTrigger,
|
|
590
|
+
});
|
|
591
|
+
const leaseReleaseResult = await releaseZoneLeases(options.zone.id);
|
|
592
|
+
await closeStaleGatewayBeforeColdStart(operationContext);
|
|
593
|
+
await startNow(operationGeneration, startOptions, operationContext);
|
|
594
|
+
if (operationGeneration !== lifecycleGeneration) {
|
|
595
|
+
throw new OpenClawZoneRestartTimeoutError(options.zone.id, restartOptions.timeoutMs ?? 0);
|
|
596
|
+
}
|
|
597
|
+
return {
|
|
598
|
+
leaseReleaseFailureCount: leaseReleaseResult.failedLeaseIds.length,
|
|
599
|
+
operationId,
|
|
600
|
+
};
|
|
601
|
+
})();
|
|
602
|
+
if (restartOptions.timeoutMs === undefined) {
|
|
603
|
+
return await restartOperation;
|
|
604
|
+
}
|
|
605
|
+
return withLifecycleTimeout({
|
|
606
|
+
operation: restartOperation,
|
|
607
|
+
timeoutMs: restartOptions.timeoutMs,
|
|
608
|
+
});
|
|
609
|
+
});
|
|
610
|
+
};
|
|
611
|
+
const restart = async (restartOptions = {}) => await restartWithStartOptions(restartOptions);
|
|
612
|
+
const coldStartWithStartOptions = async (restartOptions = {}, startOptions = {}, operationMetadata = {}) => {
|
|
613
|
+
return await runLifecycleOperation(async () => {
|
|
614
|
+
lifecycleGeneration += 1;
|
|
615
|
+
const operationGeneration = lifecycleGeneration;
|
|
616
|
+
const operationContext = {
|
|
617
|
+
operationId: operationMetadata.operationId ?? createOperationId('cold-start'),
|
|
618
|
+
operationTrigger: operationMetadata.operationTrigger ?? restartOptions.operationTrigger ?? 'auto-recovery',
|
|
619
|
+
};
|
|
620
|
+
const coldStartOperation = (async () => {
|
|
621
|
+
getLifecycleState();
|
|
622
|
+
await recordLifecycleOperation({
|
|
623
|
+
kind: 'cold-start-requested',
|
|
624
|
+
operationId: operationContext.operationId,
|
|
625
|
+
operationTrigger: operationContext.operationTrigger,
|
|
626
|
+
});
|
|
138
627
|
const leaseReleaseResult = await releaseZoneLeases(options.zone.id);
|
|
139
|
-
await
|
|
140
|
-
await startNow(operationGeneration);
|
|
628
|
+
await closeStaleGatewayBeforeColdStart(operationContext);
|
|
629
|
+
await startNow(operationGeneration, startOptions, operationContext);
|
|
141
630
|
if (operationGeneration !== lifecycleGeneration) {
|
|
142
631
|
throw new OpenClawZoneRestartTimeoutError(options.zone.id, restartOptions.timeoutMs ?? 0);
|
|
143
632
|
}
|
|
144
|
-
return {
|
|
633
|
+
return {
|
|
634
|
+
leaseReleaseFailureCount: leaseReleaseResult.failedLeaseIds.length,
|
|
635
|
+
operationId: operationContext.operationId,
|
|
636
|
+
};
|
|
145
637
|
})();
|
|
146
638
|
if (restartOptions.timeoutMs === undefined) {
|
|
147
|
-
return await
|
|
148
|
-
}
|
|
149
|
-
const restartTimeoutMs = restartOptions.timeoutMs;
|
|
150
|
-
let timeout;
|
|
151
|
-
try {
|
|
152
|
-
return await Promise.race([
|
|
153
|
-
restartOperation,
|
|
154
|
-
new Promise((_resolve, reject) => {
|
|
155
|
-
timeout = setTimeoutImpl(() => {
|
|
156
|
-
lifecycleGeneration += 1;
|
|
157
|
-
reject(new OpenClawZoneRestartTimeoutError(options.zone.id, restartTimeoutMs));
|
|
158
|
-
}, restartTimeoutMs);
|
|
159
|
-
timeout.unref?.();
|
|
160
|
-
}),
|
|
161
|
-
]);
|
|
162
|
-
}
|
|
163
|
-
finally {
|
|
164
|
-
if (timeout) {
|
|
165
|
-
clearTimeoutImpl(timeout);
|
|
166
|
-
}
|
|
639
|
+
return await coldStartOperation;
|
|
167
640
|
}
|
|
641
|
+
return withLifecycleTimeout({
|
|
642
|
+
operation: coldStartOperation,
|
|
643
|
+
timeoutMs: restartOptions.timeoutMs,
|
|
644
|
+
});
|
|
168
645
|
});
|
|
169
646
|
};
|
|
647
|
+
const coldStart = async (restartOptions = {}) => await coldStartWithStartOptions(restartOptions);
|
|
170
648
|
return {
|
|
649
|
+
coldStart,
|
|
171
650
|
destroy: async (purge) => await (options.runControllerDestroy ?? runControllerDestroyDefault)({ purge, systemConfig: options.systemConfig, zoneId: options.zone.id }, {
|
|
172
651
|
releaseZoneLeases: async (zoneId) => {
|
|
173
652
|
await releaseZoneLeases(zoneId);
|
|
@@ -175,12 +654,13 @@ export function createOpenClawZoneRuntime(options) {
|
|
|
175
654
|
stopGatewayZone: async () => await stop(),
|
|
176
655
|
}),
|
|
177
656
|
enableSsh: async () => await requireGateway().vm.enableSsh(),
|
|
178
|
-
exec: async (command) => await requireGateway()
|
|
657
|
+
exec: async (command) => await executeGatewayCommand(requireGateway(), command),
|
|
179
658
|
gatewayType: 'openclaw',
|
|
180
659
|
getHealth: async () => {
|
|
660
|
+
getLifecycleState();
|
|
181
661
|
const activeGateway = requireGateway();
|
|
182
662
|
const result = await runGatewayHealthCheck({
|
|
183
|
-
exec: async (command) => await activeGateway
|
|
663
|
+
exec: async (command) => await executeGatewayCommand(activeGateway, command),
|
|
184
664
|
healthCheck: activeGateway.processSpec.healthCheck,
|
|
185
665
|
});
|
|
186
666
|
return {
|
|
@@ -192,42 +672,118 @@ export function createOpenClawZoneRuntime(options) {
|
|
|
192
672
|
zoneId: options.zone.id,
|
|
193
673
|
};
|
|
194
674
|
},
|
|
675
|
+
getDiagnosis: () => deriveGatewayDiagnosisSnapshot({
|
|
676
|
+
channelProviderPlane: 'unknown',
|
|
677
|
+
controllerLiveness: 'ok',
|
|
678
|
+
lastOperation,
|
|
679
|
+
originalOutageCause,
|
|
680
|
+
state: getLifecycleState(),
|
|
681
|
+
toolVmPlane: 'unknown',
|
|
682
|
+
}),
|
|
195
683
|
getLogs: async () => {
|
|
196
684
|
const activeGateway = requireGateway();
|
|
197
685
|
return await (options.runControllerLogs ?? runControllerLogsDefault)({ zoneId: options.zone.id }, {
|
|
198
686
|
readGatewayLogs: async () => (await activeGateway.vm.exec(buildOpenClawCombinedLogsCommand(activeGateway.processSpec.logPath))).stdout,
|
|
199
687
|
});
|
|
200
688
|
},
|
|
689
|
+
getLifecycleState,
|
|
201
690
|
getSnapshot: () => {
|
|
202
|
-
|
|
203
|
-
|
|
691
|
+
const currentLifecycleState = getLifecycleState();
|
|
692
|
+
if (currentLifecycleState.kind === 'running') {
|
|
693
|
+
const hostPid = currentLifecycleState.gateway.vm.getHostPid();
|
|
694
|
+
if (hostPid === undefined || hostPid === null) {
|
|
695
|
+
const missingHostPidState = markGatewayHostPidMissing(`Gateway VM host pid is unavailable for zone '${options.zone.id}'.`);
|
|
696
|
+
return {
|
|
697
|
+
lastError: missingHostPidState.error.message,
|
|
698
|
+
lifecycleState: 'failed',
|
|
699
|
+
};
|
|
700
|
+
}
|
|
204
701
|
return {
|
|
205
702
|
...(bootedAt ? { bootedAt } : {}),
|
|
206
703
|
gateway: {
|
|
207
|
-
ingress: gateway.ingress,
|
|
704
|
+
ingress: currentLifecycleState.gateway.ingress,
|
|
208
705
|
vm: {
|
|
209
|
-
|
|
210
|
-
id: gateway.vm.id,
|
|
706
|
+
hostPid,
|
|
707
|
+
id: currentLifecycleState.gateway.vm.id,
|
|
211
708
|
},
|
|
212
709
|
},
|
|
710
|
+
...(lastError ? { lastError } : {}),
|
|
213
711
|
lifecycleState: 'running',
|
|
214
712
|
};
|
|
215
713
|
}
|
|
216
714
|
return lastError ? { lastError, lifecycleState: 'failed' } : { lifecycleState: 'stopped' };
|
|
217
715
|
},
|
|
218
|
-
refreshCredentials: async () => await (
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
716
|
+
refreshCredentials: async () => await (async () => {
|
|
717
|
+
const operationId = createOperationId('credentials-refresh');
|
|
718
|
+
const operationTrigger = 'credentials-refresh';
|
|
719
|
+
await recordLifecycleOperation({
|
|
720
|
+
kind: 'credentials-refresh-requested',
|
|
721
|
+
operationId,
|
|
722
|
+
operationTrigger,
|
|
723
|
+
previousGateway: gatewayIdentityFor(gateway),
|
|
724
|
+
});
|
|
725
|
+
const failCredentialsRefreshSecretResolution = async (error) => {
|
|
726
|
+
const classifiedError = {
|
|
727
|
+
code: 'secret-resolution-failed',
|
|
728
|
+
message: formatUnknownError(error),
|
|
729
|
+
};
|
|
730
|
+
const currentLifecycleState = getLifecycleState();
|
|
731
|
+
lastError = classifiedError.message;
|
|
732
|
+
if (currentLifecycleState.kind !== 'running' &&
|
|
733
|
+
currentLifecycleState.kind !== 'running-degraded') {
|
|
734
|
+
lifecycleState = {
|
|
735
|
+
coldStartEligible: true,
|
|
736
|
+
error: classifiedError,
|
|
737
|
+
kind: 'failed',
|
|
738
|
+
};
|
|
739
|
+
}
|
|
740
|
+
await recordLifecycleOperation({
|
|
741
|
+
errorCode: classifiedError.code,
|
|
742
|
+
errorMessage: classifiedError.message,
|
|
743
|
+
kind: 'operation-failed',
|
|
744
|
+
operationId,
|
|
745
|
+
operationTrigger,
|
|
746
|
+
previousGateway: gatewayIdentityFor(gateway),
|
|
225
747
|
});
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
748
|
+
throw new ControllerZoneRuntimeStartError(options.zone.id, error, {
|
|
749
|
+
gatewayLifecycleErrorCode: classifiedError.code,
|
|
750
|
+
operationId,
|
|
751
|
+
});
|
|
752
|
+
};
|
|
753
|
+
let refreshedSecretResolver;
|
|
754
|
+
try {
|
|
755
|
+
refreshedSecretResolver = options.createFreshSecretResolver
|
|
756
|
+
? await options.createFreshSecretResolver()
|
|
757
|
+
: options.secretResolver;
|
|
758
|
+
}
|
|
759
|
+
catch (error) {
|
|
760
|
+
await failCredentialsRefreshSecretResolution(error);
|
|
761
|
+
}
|
|
762
|
+
return await (options.runControllerCredentialsRefresh ?? runControllerCredentialsRefreshDefault)({ zoneId: options.zone.id }, {
|
|
763
|
+
refreshZoneSecrets: async (zoneId) => {
|
|
764
|
+
try {
|
|
765
|
+
await resolveZoneSecrets({
|
|
766
|
+
audience: 'gateway',
|
|
767
|
+
secretResolver: refreshedSecretResolver,
|
|
768
|
+
systemConfig: options.systemConfig,
|
|
769
|
+
zoneId,
|
|
770
|
+
});
|
|
771
|
+
}
|
|
772
|
+
catch (error) {
|
|
773
|
+
await failCredentialsRefreshSecretResolution(error);
|
|
774
|
+
}
|
|
775
|
+
},
|
|
776
|
+
restartGatewayZone: async () => {
|
|
777
|
+
const currentLifecycleState = getLifecycleState();
|
|
778
|
+
if (currentLifecycleState.kind === 'running' ||
|
|
779
|
+
currentLifecycleState.kind === 'running-degraded') {
|
|
780
|
+
await restartWithStartOptions({}, { secretResolver: refreshedSecretResolver }, { operationId, operationTrigger });
|
|
781
|
+
return;
|
|
782
|
+
}
|
|
783
|
+
await coldStartWithStartOptions({}, { secretResolver: refreshedSecretResolver }, { operationId, operationTrigger });
|
|
784
|
+
},
|
|
785
|
+
});
|
|
786
|
+
})(),
|
|
231
787
|
restart,
|
|
232
788
|
shutdown: stop,
|
|
233
789
|
start,
|
|
@@ -235,7 +791,7 @@ export function createOpenClawZoneRuntime(options) {
|
|
|
235
791
|
upgrade: async () => await (options.runControllerUpgrade ?? runControllerUpgradeDefault)({ systemConfig: options.systemConfig, zoneId: options.zone.id }, {
|
|
236
792
|
rebuildGatewayImage: async () => { },
|
|
237
793
|
restartGatewayZone: async () => {
|
|
238
|
-
await restart();
|
|
794
|
+
await restart({ operationTrigger: 'upgrade' });
|
|
239
795
|
},
|
|
240
796
|
}),
|
|
241
797
|
zoneId: options.zone.id,
|