@agent-vm/agent-vm 0.0.91 → 0.0.93
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/build/gondolin-image-builder.d.ts +1 -0
- package/dist/build/gondolin-image-builder.d.ts.map +1 -1
- package/dist/build/gondolin-image-builder.js +11 -1
- package/dist/build/gondolin-image-builder.js.map +1 -1
- package/dist/build/managed-image-dockerfile.d.ts +2 -1
- package/dist/build/managed-image-dockerfile.d.ts.map +1 -1
- package/dist/build/managed-image-dockerfile.js +51 -27
- package/dist/build/managed-image-dockerfile.js.map +1 -1
- package/dist/cli/commands/controller-definition.d.ts +42 -42
- package/dist/cli/commands/create-app.d.ts +60 -60
- package/dist/cli/manual-templates.d.ts.map +1 -1
- package/dist/cli/manual-templates.js +14 -1
- package/dist/cli/manual-templates.js.map +1 -1
- package/dist/config/system-config.d.ts +15 -0
- package/dist/config/system-config.d.ts.map +1 -1
- package/dist/config/system-config.js +74 -0
- package/dist/config/system-config.js.map +1 -1
- package/dist/controller/controller-runtime-operations.d.ts +1 -0
- package/dist/controller/controller-runtime-operations.d.ts.map +1 -1
- package/dist/controller/controller-runtime-operations.js +2 -0
- package/dist/controller/controller-runtime-operations.js.map +1 -1
- package/dist/controller/controller-runtime-types.d.ts +5 -0
- package/dist/controller/controller-runtime-types.d.ts.map +1 -1
- package/dist/controller/controller-runtime.d.ts +1 -0
- package/dist/controller/controller-runtime.d.ts.map +1 -1
- package/dist/controller/controller-runtime.js +220 -3
- package/dist/controller/controller-runtime.js.map +1 -1
- package/dist/controller/health/channel-provider-recovery-observation.d.ts +23 -0
- package/dist/controller/health/channel-provider-recovery-observation.d.ts.map +1 -0
- package/dist/controller/health/channel-provider-recovery-observation.js +69 -0
- package/dist/controller/health/channel-provider-recovery-observation.js.map +1 -0
- package/dist/controller/health/durable-health-event-log.d.ts +24 -0
- package/dist/controller/health/durable-health-event-log.d.ts.map +1 -0
- package/dist/controller/health/durable-health-event-log.js +89 -0
- package/dist/controller/health/durable-health-event-log.js.map +1 -0
- package/dist/controller/health/gateway-recovery-actions.d.ts +27 -0
- package/dist/controller/health/gateway-recovery-actions.d.ts.map +1 -0
- package/dist/controller/health/gateway-recovery-actions.js +71 -0
- package/dist/controller/health/gateway-recovery-actions.js.map +1 -0
- package/dist/controller/health/gateway-service-health-monitor.d.ts +71 -3
- package/dist/controller/health/gateway-service-health-monitor.d.ts.map +1 -1
- package/dist/controller/health/gateway-service-health-monitor.js +383 -10
- package/dist/controller/health/gateway-service-health-monitor.js.map +1 -1
- package/dist/controller/health/gateway-vm-recovery-policy.d.ts +68 -0
- package/dist/controller/health/gateway-vm-recovery-policy.d.ts.map +1 -0
- package/dist/controller/health/gateway-vm-recovery-policy.js +199 -0
- package/dist/controller/health/gateway-vm-recovery-policy.js.map +1 -0
- package/dist/controller/health/gateway-vm-recovery-runner.d.ts +39 -0
- package/dist/controller/health/gateway-vm-recovery-runner.d.ts.map +1 -0
- package/dist/controller/health/gateway-vm-recovery-runner.js +251 -0
- package/dist/controller/health/gateway-vm-recovery-runner.js.map +1 -0
- package/dist/controller/health/health-event-store.d.ts +4 -0
- package/dist/controller/health/health-event-store.d.ts.map +1 -1
- package/dist/controller/health/health-event-store.js +19 -0
- package/dist/controller/health/health-event-store.js.map +1 -1
- package/dist/controller/http/controller-health-event-routes.d.ts +6 -0
- package/dist/controller/http/controller-health-event-routes.d.ts.map +1 -1
- package/dist/controller/http/controller-health-event-routes.js +49 -0
- package/dist/controller/http/controller-health-event-routes.js.map +1 -1
- package/dist/controller/http/controller-http-routes.d.ts.map +1 -1
- package/dist/controller/http/controller-http-routes.js +6 -0
- package/dist/controller/http/controller-http-routes.js.map +1 -1
- package/dist/controller/leases/lease-manager.d.ts.map +1 -1
- package/dist/controller/leases/lease-manager.js +37 -16
- package/dist/controller/leases/lease-manager.js.map +1 -1
- package/dist/controller/leases/tool-vm-lease-lifecycle.d.ts +44 -0
- package/dist/controller/leases/tool-vm-lease-lifecycle.d.ts.map +1 -0
- package/dist/controller/leases/tool-vm-lease-lifecycle.js +28 -0
- package/dist/controller/leases/tool-vm-lease-lifecycle.js.map +1 -0
- package/dist/controller/zone-runtimes/gateway-lifecycle-operation-record.d.ts +37 -0
- package/dist/controller/zone-runtimes/gateway-lifecycle-operation-record.d.ts.map +1 -0
- package/dist/controller/zone-runtimes/gateway-lifecycle-operation-record.js +133 -0
- package/dist/controller/zone-runtimes/gateway-lifecycle-operation-record.js.map +1 -0
- package/dist/controller/zone-runtimes/gateway-zone-state-machine.d.ts +101 -0
- package/dist/controller/zone-runtimes/gateway-zone-state-machine.d.ts.map +1 -0
- package/dist/controller/zone-runtimes/gateway-zone-state-machine.js +143 -0
- package/dist/controller/zone-runtimes/gateway-zone-state-machine.js.map +1 -0
- package/dist/controller/zone-runtimes/openclaw-zone-runtime.d.ts +16 -1
- package/dist/controller/zone-runtimes/openclaw-zone-runtime.d.ts.map +1 -1
- package/dist/controller/zone-runtimes/openclaw-zone-runtime.js +700 -40
- package/dist/controller/zone-runtimes/openclaw-zone-runtime.js.map +1 -1
- package/dist/controller/zone-runtimes/zone-runtime-errors.d.ts +7 -1
- package/dist/controller/zone-runtimes/zone-runtime-errors.d.ts.map +1 -1
- package/dist/controller/zone-runtimes/zone-runtime-errors.js +5 -1
- package/dist/controller/zone-runtimes/zone-runtime-errors.js.map +1 -1
- package/dist/controller/zone-runtimes/zone-runtime-registry.d.ts +2 -0
- package/dist/controller/zone-runtimes/zone-runtime-registry.d.ts.map +1 -1
- package/dist/controller/zone-runtimes/zone-runtime-registry.js +23 -0
- package/dist/controller/zone-runtimes/zone-runtime-registry.js.map +1 -1
- package/dist/controller/zone-runtimes/zone-runtime-types.d.ts +14 -1
- package/dist/controller/zone-runtimes/zone-runtime-types.d.ts.map +1 -1
- package/dist/gateway/gateway-ownership-evidence.d.ts +35 -0
- package/dist/gateway/gateway-ownership-evidence.d.ts.map +1 -0
- package/dist/gateway/gateway-ownership-evidence.js +10 -0
- package/dist/gateway/gateway-ownership-evidence.js.map +1 -0
- package/dist/gateway/gateway-recovery.d.ts +16 -0
- package/dist/gateway/gateway-recovery.d.ts.map +1 -1
- package/dist/gateway/gateway-recovery.js +105 -9
- package/dist/gateway/gateway-recovery.js.map +1 -1
- package/dist/gateway/gateway-zone-orchestrator.d.ts.map +1 -1
- package/dist/gateway/gateway-zone-orchestrator.js +50 -39
- package/dist/gateway/gateway-zone-orchestrator.js.map +1 -1
- package/dist/integration-tests/{smoke-harness.d.ts → e2e-harness.d.ts} +45 -37
- package/dist/integration-tests/e2e-harness.d.ts.map +1 -0
- package/dist/integration-tests/{smoke-harness.js → e2e-harness.js} +134 -108
- package/dist/integration-tests/e2e-harness.js.map +1 -0
- package/dist/integration-tests/e2e-workspace-build-global-setup.d.ts +16 -0
- package/dist/integration-tests/e2e-workspace-build-global-setup.d.ts.map +1 -0
- package/dist/integration-tests/e2e-workspace-build-global-setup.js +27 -0
- package/dist/integration-tests/e2e-workspace-build-global-setup.js.map +1 -0
- package/dist/integration-tests/live-agent-model-roundtrip-deployment.d.ts +11 -0
- package/dist/integration-tests/live-agent-model-roundtrip-deployment.d.ts.map +1 -0
- package/dist/integration-tests/live-agent-model-roundtrip-deployment.js +48 -0
- package/dist/integration-tests/live-agent-model-roundtrip-deployment.js.map +1 -0
- package/dist/integration-tests/live-agent-model-roundtrip-gates.d.ts +11 -0
- package/dist/integration-tests/live-agent-model-roundtrip-gates.d.ts.map +1 -0
- package/dist/integration-tests/live-agent-model-roundtrip-gates.js +21 -0
- package/dist/integration-tests/live-agent-model-roundtrip-gates.js.map +1 -0
- package/dist/integration-tests/live-vm-e2e-gates.d.ts +2 -0
- package/dist/integration-tests/live-vm-e2e-gates.d.ts.map +1 -0
- package/dist/integration-tests/live-vm-e2e-gates.js +4 -0
- package/dist/integration-tests/live-vm-e2e-gates.js.map +1 -0
- package/dist/operations/controller-status.d.ts +5 -0
- package/dist/operations/controller-status.d.ts.map +1 -1
- package/dist/operations/controller-status.js +42 -0
- package/dist/operations/controller-status.js.map +1 -1
- package/package.json +11 -11
- package/dist/integration-tests/live-integration-gates.d.ts +0 -2
- package/dist/integration-tests/live-integration-gates.d.ts.map +0 -1
- package/dist/integration-tests/live-integration-gates.js +0 -4
- package/dist/integration-tests/live-integration-gates.js.map +0 -1
- package/dist/integration-tests/smoke-harness.d.ts.map +0 -1
- package/dist/integration-tests/smoke-harness.js.map +0 -1
|
@@ -1,12 +1,36 @@
|
|
|
1
|
+
import { randomUUID } from 'node:crypto';
|
|
1
2
|
import { resolveZoneSecrets } from '../../gateway/credential-manager.js';
|
|
2
3
|
import { runGatewayHealthCheck } from '../../gateway/gateway-health-check.js';
|
|
4
|
+
import { GatewayOwnershipUnsafeError } from '../../gateway/gateway-ownership-evidence.js';
|
|
3
5
|
import { deleteGatewayRuntimeRecord as deleteGatewayRuntimeRecordDefault } from '../../gateway/gateway-runtime-record.js';
|
|
4
6
|
import { startGatewayZone } from '../../gateway/gateway-zone-orchestrator.js';
|
|
5
7
|
import { runControllerCredentialsRefresh as runControllerCredentialsRefreshDefault } from '../../operations/credentials-refresh.js';
|
|
6
8
|
import { runControllerDestroy as runControllerDestroyDefault } from '../../operations/destroy-zone.js';
|
|
7
9
|
import { runControllerUpgrade as runControllerUpgradeDefault } from '../../operations/upgrade-zone.js';
|
|
8
10
|
import { runControllerLogs as runControllerLogsDefault } from '../../operations/zone-logs.js';
|
|
11
|
+
import { isProcessAlive as defaultIsProcessAlive } from '../../shared/managed-vm-process.js';
|
|
12
|
+
import { appendGatewayLifecycleOperationRecord as appendGatewayLifecycleOperationRecordDefault, } from './gateway-lifecycle-operation-record.js';
|
|
13
|
+
import { classifyGatewayStartError, deriveGatewayDiagnosisSnapshot, } from './gateway-zone-state-machine.js';
|
|
9
14
|
import { ControllerZoneRuntimeStartError, ControllerZoneRuntimeUnavailableError, } from './zone-runtime-errors.js';
|
|
15
|
+
const defaultGatewayCloseTimeoutMs = 60_000;
|
|
16
|
+
/**
 * Reports whether a lifecycle operation result exposes a `lock` property.
 *
 * @param {unknown} execution - Value returned by a lifecycle operation.
 * @returns {boolean} `true` when `execution` is a non-null object on which
 *   `'lock' in execution` holds (own or inherited key); `false` otherwise.
 */
function isLifecycleOperationExecutionWithLock(execution) {
    // `typeof null === 'object'`, so null is excluded explicitly before `in`.
    return typeof execution === 'object' && execution !== null && 'lock' in execution;
}
|
|
19
|
+
/**
 * Reports whether a lifecycle record describes a secret-resolution failure
 * that was triggered by auto-recovery or by a credentials refresh.
 *
 * @param {{ errorCode?: string, operationTrigger?: string }} record - Record to inspect.
 * @returns {boolean} `true` only when `errorCode` is `'secret-resolution-failed'`
 *   and `operationTrigger` is `'auto-recovery'` or `'credentials-refresh'`.
 */
function isRecoverySecretResolutionFailure(record) {
    return (record.errorCode === 'secret-resolution-failed' &&
        (record.operationTrigger === 'auto-recovery' ||
            record.operationTrigger === 'credentials-refresh'));
}
|
|
24
|
+
/**
 * Error raised when an OpenClaw gateway restart exceeds its time budget.
 *
 * Instances carry the stable `code` value `'OPENCLAW_GATEWAY_RESTART_TIMEOUT'`.
 */
class OpenClawZoneRestartTimeoutError extends Error {
    code = 'OPENCLAW_GATEWAY_RESTART_TIMEOUT';
    /**
     * @param {string} zoneId - Zone identifier interpolated into the message.
     * @param {number} timeoutMs - Timeout, in milliseconds, interpolated into the message.
     */
    constructor(zoneId, timeoutMs) {
        super(`OpenClaw gateway restart timed out for zone '${zoneId}' after ${timeoutMs}ms`);
        this.name = 'OpenClawZoneRestartTimeoutError';
    }
}
|
|
31
|
+
/**
 * Reports whether a value is an `Error` tagged with the gateway restart
 * timeout code.
 *
 * The check is made on the `code` property, not on the class identity.
 *
 * @param {unknown} error - Value to inspect.
 * @returns {boolean} `true` when `error` is an `Error` instance whose `code`
 *   equals `'OPENCLAW_GATEWAY_RESTART_TIMEOUT'`.
 */
export function isOpenClawZoneRestartTimeoutError(error) {
    return (error instanceof Error && 'code' in error && error.code === 'OPENCLAW_GATEWAY_RESTART_TIMEOUT');
}
|
|
10
34
|
/**
 * Turns an arbitrary thrown value into a message string.
 *
 * @param {unknown} error - Thrown value.
 * @returns {string} `error.message` for `Error` instances, otherwise `String(error)`.
 */
function formatUnknownError(error) {
    return error instanceof Error ? error.message : String(error);
}
|
|
@@ -19,68 +43,624 @@ function buildOpenClawCombinedLogsCommand(logPath) {
|
|
|
19
43
|
'latest_openclaw_log=$(ls -1t /agent-vm/logs/*.log 2>/dev/null | grep -v "/gateway-boot-latest\\.log$" | head -n 1); if [ -n "$latest_openclaw_log" ]; then tail -n 400 "$latest_openclaw_log"; fi',
|
|
20
44
|
].join('; ');
|
|
21
45
|
}
|
|
46
|
+
/**
 * Writes one line to `process.stderr`, prefixed with `[openclaw-zone-runtime] `
 * and terminated with a newline.
 *
 * @param {string} message - Text placed after the prefix.
 * @returns {void}
 */
function writeOpenClawZoneRuntimeLog(message) {
    process.stderr.write(`[openclaw-zone-runtime] ${message}\n`);
}
|
|
49
|
+
/**
 * Produces a human-readable reason why the gateway is unavailable for a
 * given lifecycle state.
 *
 * @param {{ kind: string, error?: { message: string }, evidence?: { kind: string } }} state
 *   Lifecycle state; `error` is read for `'failed'`, `evidence` for `'owner-unsafe'`.
 * @returns {string | undefined} A reason string for `'failed'`, `'owner-unsafe'`,
 *   `'restarting'`, `'starting'` and `'stopping'`; `undefined` for `'running'`,
 *   `'running-degraded'` and `'stopped'`.
 * @throws {Error} Via `assertNeverGatewayZoneLifecycleState` when `state.kind`
 *   matches none of the handled cases.
 */
function unavailableReasonForState(state) {
    switch (state.kind) {
        case 'failed':
            return state.error.message;
        case 'owner-unsafe':
            return `Gateway runtime ownership is unsafe: ${state.evidence.kind}.`;
        case 'restarting':
        case 'starting':
        case 'stopping':
            return `Gateway runtime is ${state.kind}.`;
        case 'running':
        case 'running-degraded':
        case 'stopped':
            // These states do not yield a reason here.
            return undefined;
    }
    // Only reached for a kind that no case above handles.
    return assertNeverGatewayZoneLifecycleState(state);
}
|
|
66
|
+
/**
 * Always throws; used as the fall-through for lifecycle states that no
 * `switch` case handled.
 *
 * @param {unknown} state - The unhandled state, JSON-serialized into the message.
 * @returns {never}
 * @throws {Error} Always, with the serialized state in the message.
 */
function assertNeverGatewayZoneLifecycleState(state) {
    throw new Error(`Unhandled gateway zone lifecycle state: ${JSON.stringify(state)}`);
}
|
|
69
|
+
/**
 * Always throws; used as the fall-through for lifecycle operation record
 * kinds that no `switch` case handled.
 *
 * @param {unknown} kind - The unhandled kind, stringified into the message.
 * @returns {never}
 * @throws {Error} Always, with `String(kind)` in the message.
 */
function assertNeverGatewayLifecycleOperationRecordKind(kind) {
    throw new Error(`Unhandled gateway lifecycle operation record kind: ${String(kind)}`);
}
|
|
72
|
+
/**
 * Builds a small identity object for a gateway from its `vm`.
 *
 * @param {{ vm: { id: unknown, getHostPid: () => unknown } } | undefined | null} runtimeGateway
 *   Gateway whose `vm.getHostPid()` and `vm.id` are read.
 * @returns {{ hostPid?: number, vmId: unknown } | undefined} `undefined` when
 *   `runtimeGateway` is falsy; otherwise an object with `vmId`, plus `hostPid`
 *   only when `getHostPid()` returned a number greater than zero.
 */
function gatewayIdentityFor(runtimeGateway) {
    if (!runtimeGateway) {
        return undefined;
    }
    const hostPid = runtimeGateway.vm.getHostPid();
    return {
        // The key is left out entirely unless the pid is a positive number.
        ...(typeof hostPid === 'number' && hostPid > 0 ? { hostPid } : {}),
        vmId: runtimeGateway.vm.id,
    };
}
|
|
82
|
+
/**
 * Runs a command through the gateway's `vm.exec` and returns a trimmed result.
 *
 * @param {{ vm: { exec: (command: unknown) => unknown } }} runtimeGateway
 *   Gateway whose `vm.exec(command)` is awaited.
 * @param {unknown} command - Passed to `vm.exec` unchanged.
 * @returns {Promise<{ exitCode: unknown, stderr: unknown, stdout: unknown }>}
 *   Only the `exitCode`, `stderr` and `stdout` fields of the exec result; any
 *   other fields on that result are dropped.
 */
async function executeGatewayCommand(runtimeGateway, command) {
    const result = await runtimeGateway.vm.exec(command);
    return {
        exitCode: result.exitCode,
        stderr: result.stderr,
        stdout: result.stdout,
    };
}
|
|
22
90
|
export function createOpenClawZoneRuntime(options) {
|
|
91
|
+
const clearTimeoutImpl = options.clearTimeoutImpl ?? clearTimeout;
|
|
92
|
+
const closeGatewayTimeoutMs = options.closeGatewayTimeoutMs ?? defaultGatewayCloseTimeoutMs;
|
|
93
|
+
const isProcessAlive = options.isProcessAlive ?? defaultIsProcessAlive;
|
|
94
|
+
const setTimeoutImpl = options.setTimeoutImpl ?? setTimeout;
|
|
95
|
+
const appendGatewayLifecycleOperationRecord = options.appendGatewayLifecycleOperationRecord;
|
|
23
96
|
let gateway;
|
|
24
97
|
let bootedAt;
|
|
25
98
|
let lastError;
|
|
26
|
-
|
|
27
|
-
|
|
99
|
+
let lastOperation = 'none';
|
|
100
|
+
let originalOutageCause = { kind: 'unknown' };
|
|
101
|
+
let lifecycleState = { kind: 'stopped' };
|
|
102
|
+
let lifecycleOperation = Promise.resolve();
|
|
103
|
+
let lifecycleGeneration = 0;
|
|
104
|
+
let staleGatewayPendingClose;
|
|
105
|
+
const startGateway = async (startOptions = {}) => options.restartGatewayZone
|
|
106
|
+
? await options.restartGatewayZone(options.zone.id, startOptions)
|
|
28
107
|
: await startGatewayZone({
|
|
29
|
-
secretResolver: options.secretResolver,
|
|
108
|
+
secretResolver: startOptions.secretResolver ?? options.secretResolver,
|
|
30
109
|
systemConfig: options.systemConfig,
|
|
31
110
|
zoneId: options.zone.id,
|
|
32
111
|
});
|
|
33
112
|
const requireGateway = () => {
|
|
34
|
-
|
|
35
|
-
|
|
113
|
+
const currentState = getLifecycleState();
|
|
114
|
+
if (currentState.kind !== 'running' && currentState.kind !== 'running-degraded') {
|
|
115
|
+
throw new ControllerZoneRuntimeUnavailableError(options.zone.id, lastError ?? unavailableReasonForState(currentState));
|
|
36
116
|
}
|
|
37
|
-
return gateway;
|
|
117
|
+
return currentState.gateway;
|
|
38
118
|
};
|
|
39
|
-
const
|
|
40
|
-
|
|
119
|
+
const createOperationId = (operationName) => `${options.zone.id}-${operationName}-${randomUUID()}`;
|
|
120
|
+
const operationForRecordKind = (kind) => {
|
|
121
|
+
switch (kind) {
|
|
122
|
+
case 'cold-start-requested':
|
|
123
|
+
return 'cold-start';
|
|
124
|
+
case 'credentials-refresh-requested':
|
|
125
|
+
return 'credentials-refresh';
|
|
126
|
+
case 'restart-requested':
|
|
127
|
+
return 'restart';
|
|
128
|
+
case 'start-requested':
|
|
129
|
+
return 'start';
|
|
130
|
+
case 'stop-requested':
|
|
131
|
+
return 'stop';
|
|
132
|
+
case 'operation-failed':
|
|
133
|
+
case 'operation-finished':
|
|
134
|
+
case 'runtime-record-deleted':
|
|
135
|
+
case 'runtime-record-written':
|
|
136
|
+
case 'vm-close-finished':
|
|
137
|
+
case 'vm-close-started':
|
|
138
|
+
return undefined;
|
|
139
|
+
}
|
|
140
|
+
return assertNeverGatewayLifecycleOperationRecordKind(kind);
|
|
141
|
+
};
|
|
142
|
+
const setOriginalOutageCauseIfUnknown = (errorCode) => {
|
|
143
|
+
if (originalOutageCause.kind !== 'unknown') {
|
|
144
|
+
return;
|
|
145
|
+
}
|
|
146
|
+
originalOutageCause = {
|
|
147
|
+
...(errorCode === undefined ? {} : { errorCode }),
|
|
148
|
+
eventKind: 'gateway-lifecycle-operation',
|
|
149
|
+
kind: 'proven',
|
|
150
|
+
};
|
|
151
|
+
};
|
|
152
|
+
const recordLifecycleOperation = async (record) => {
|
|
153
|
+
const operation = operationForRecordKind(record.kind);
|
|
154
|
+
if (operation !== undefined) {
|
|
155
|
+
lastOperation = operation;
|
|
156
|
+
}
|
|
157
|
+
if (record.kind === 'operation-failed' && !isRecoverySecretResolutionFailure(record)) {
|
|
158
|
+
setOriginalOutageCauseIfUnknown(record.errorCode);
|
|
159
|
+
}
|
|
160
|
+
const operationRecord = {
|
|
161
|
+
controllerPid: process.pid,
|
|
162
|
+
gatewayType: 'openclaw',
|
|
163
|
+
observedAtMs: options.now(),
|
|
164
|
+
zoneId: options.zone.id,
|
|
165
|
+
...record,
|
|
166
|
+
};
|
|
167
|
+
try {
|
|
168
|
+
if (appendGatewayLifecycleOperationRecord) {
|
|
169
|
+
await appendGatewayLifecycleOperationRecord(operationRecord);
|
|
170
|
+
return;
|
|
171
|
+
}
|
|
172
|
+
await appendGatewayLifecycleOperationRecordDefault({
|
|
173
|
+
record: operationRecord,
|
|
174
|
+
runtimeDir: options.systemConfig.runtimeDir,
|
|
175
|
+
zoneId: options.zone.id,
|
|
176
|
+
});
|
|
177
|
+
}
|
|
178
|
+
catch (error) {
|
|
179
|
+
writeOpenClawZoneRuntimeLog(`failed to append gateway lifecycle operation record for zone '${options.zone.id}': ${formatUnknownError(error)}`);
|
|
180
|
+
}
|
|
181
|
+
};
|
|
182
|
+
const markGatewayHostPidMissing = (message) => {
|
|
183
|
+
if (staleGatewayPendingClose === undefined &&
|
|
184
|
+
(lifecycleState.kind === 'running' || lifecycleState.kind === 'running-degraded')) {
|
|
185
|
+
staleGatewayPendingClose = lifecycleState.gateway;
|
|
186
|
+
}
|
|
187
|
+
const errorMessage = `vm-process-missing: ${message}`;
|
|
188
|
+
setOriginalOutageCauseIfUnknown('vm-process-missing');
|
|
41
189
|
gateway = undefined;
|
|
42
190
|
bootedAt = undefined;
|
|
43
|
-
lastError =
|
|
44
|
-
|
|
45
|
-
|
|
191
|
+
lastError = errorMessage;
|
|
192
|
+
lifecycleState = {
|
|
193
|
+
coldStartEligible: true,
|
|
194
|
+
error: { code: 'vm-process-missing', message: errorMessage },
|
|
195
|
+
kind: 'failed',
|
|
196
|
+
};
|
|
197
|
+
return lifecycleState;
|
|
198
|
+
};
|
|
199
|
+
const closeStaleGatewayBeforeColdStart = async (operationContext) => {
|
|
200
|
+
const staleGateway = staleGatewayPendingClose;
|
|
201
|
+
if (!staleGateway) {
|
|
202
|
+
return;
|
|
203
|
+
}
|
|
204
|
+
staleGatewayPendingClose = undefined;
|
|
205
|
+
try {
|
|
206
|
+
await recordLifecycleOperation({
|
|
207
|
+
kind: 'vm-close-started',
|
|
208
|
+
operationId: operationContext.operationId,
|
|
209
|
+
operationTrigger: operationContext.operationTrigger,
|
|
210
|
+
previousGateway: gatewayIdentityFor(staleGateway),
|
|
211
|
+
});
|
|
212
|
+
await closeGatewayWithDeadline(staleGateway);
|
|
213
|
+
await recordLifecycleOperation({
|
|
214
|
+
kind: 'vm-close-finished',
|
|
215
|
+
operationId: operationContext.operationId,
|
|
216
|
+
operationTrigger: operationContext.operationTrigger,
|
|
217
|
+
previousGateway: gatewayIdentityFor(staleGateway),
|
|
218
|
+
});
|
|
219
|
+
}
|
|
220
|
+
catch (error) {
|
|
221
|
+
staleGatewayPendingClose = staleGateway;
|
|
222
|
+
lastError = formatUnknownError(error);
|
|
223
|
+
lifecycleState = {
|
|
224
|
+
coldStartEligible: false,
|
|
225
|
+
error: {
|
|
226
|
+
code: 'owner-unsafe',
|
|
227
|
+
message: lastError,
|
|
228
|
+
},
|
|
229
|
+
kind: 'failed',
|
|
230
|
+
};
|
|
231
|
+
await recordLifecycleOperation({
|
|
232
|
+
errorCode: 'owner-unsafe',
|
|
233
|
+
errorMessage: lastError,
|
|
234
|
+
kind: 'operation-failed',
|
|
235
|
+
operationId: operationContext.operationId,
|
|
236
|
+
operationTrigger: operationContext.operationTrigger,
|
|
237
|
+
previousGateway: gatewayIdentityFor(staleGateway),
|
|
238
|
+
});
|
|
239
|
+
throw error;
|
|
240
|
+
}
|
|
241
|
+
};
|
|
242
|
+
const classifyLastError = (message) => {
|
|
243
|
+
if (message.startsWith('vm-process-missing:')) {
|
|
244
|
+
return {
|
|
245
|
+
coldStartEligible: true,
|
|
246
|
+
error: { code: 'vm-process-missing', message },
|
|
247
|
+
kind: 'failed',
|
|
248
|
+
};
|
|
249
|
+
}
|
|
250
|
+
const error = classifyGatewayStartError(new Error(message));
|
|
251
|
+
return {
|
|
252
|
+
coldStartEligible: error.code !== 'owner-unsafe',
|
|
253
|
+
error,
|
|
254
|
+
kind: 'failed',
|
|
255
|
+
};
|
|
256
|
+
};
|
|
257
|
+
const getLifecycleState = () => {
|
|
258
|
+
if (lifecycleState.kind === 'running' || lifecycleState.kind === 'running-degraded') {
|
|
259
|
+
const hostPid = lifecycleState.gateway.vm.getHostPid();
|
|
260
|
+
if (hostPid === undefined || hostPid === null) {
|
|
261
|
+
return markGatewayHostPidMissing(`Gateway VM host pid is unavailable for zone '${options.zone.id}'.`);
|
|
262
|
+
}
|
|
263
|
+
if (!isProcessAlive(hostPid)) {
|
|
264
|
+
return markGatewayHostPidMissing(`Gateway VM host pid ${String(hostPid)} is not alive for zone '${options.zone.id}'.`);
|
|
265
|
+
}
|
|
266
|
+
return lifecycleState;
|
|
267
|
+
}
|
|
268
|
+
if (lifecycleState.kind === 'failed' || lifecycleState.kind === 'owner-unsafe') {
|
|
269
|
+
return lifecycleState;
|
|
270
|
+
}
|
|
271
|
+
if (lifecycleState.kind === 'starting' ||
|
|
272
|
+
lifecycleState.kind === 'stopping' ||
|
|
273
|
+
lifecycleState.kind === 'restarting') {
|
|
274
|
+
return lifecycleState;
|
|
275
|
+
}
|
|
276
|
+
if (lastError) {
|
|
277
|
+
lifecycleState = classifyLastError(lastError);
|
|
278
|
+
return lifecycleState;
|
|
279
|
+
}
|
|
280
|
+
return lifecycleState;
|
|
281
|
+
};
|
|
282
|
+
const runLifecycleOperation = async (operation) => {
|
|
283
|
+
const runAfterPrevious = async () => {
|
|
284
|
+
await lifecycleOperation.catch(() => undefined);
|
|
285
|
+
return await operation();
|
|
286
|
+
};
|
|
287
|
+
const executionPromise = runAfterPrevious();
|
|
288
|
+
const operationResultPromise = executionPromise.then(async (execution) => {
|
|
289
|
+
if (isLifecycleOperationExecutionWithLock(execution)) {
|
|
290
|
+
return await execution.publicResult;
|
|
291
|
+
}
|
|
292
|
+
return await execution;
|
|
293
|
+
});
|
|
294
|
+
lifecycleOperation = executionPromise
|
|
295
|
+
.then(async (execution) => {
|
|
296
|
+
if (isLifecycleOperationExecutionWithLock(execution)) {
|
|
297
|
+
await execution.lock;
|
|
298
|
+
return;
|
|
299
|
+
}
|
|
300
|
+
await execution;
|
|
301
|
+
})
|
|
302
|
+
.then(() => undefined, () => undefined);
|
|
303
|
+
return await operationResultPromise;
|
|
304
|
+
};
|
|
305
|
+
const withLifecycleTimeout = (props) => {
|
|
306
|
+
let timeout;
|
|
307
|
+
const timeoutPromise = new Promise((_resolve, reject) => {
|
|
308
|
+
timeout = setTimeoutImpl(() => {
|
|
309
|
+
lifecycleGeneration += 1;
|
|
310
|
+
reject(new OpenClawZoneRestartTimeoutError(options.zone.id, props.timeoutMs));
|
|
311
|
+
}, props.timeoutMs);
|
|
312
|
+
timeout.unref?.();
|
|
313
|
+
});
|
|
314
|
+
const publicResult = Promise.race([props.operation, timeoutPromise]).finally(() => {
|
|
315
|
+
if (timeout) {
|
|
316
|
+
clearTimeoutImpl(timeout);
|
|
317
|
+
}
|
|
318
|
+
});
|
|
319
|
+
return {
|
|
320
|
+
lock: props.operation.then(() => undefined, () => undefined),
|
|
321
|
+
publicResult,
|
|
322
|
+
};
|
|
323
|
+
};
|
|
324
|
+
const releaseZoneLeases = async (zoneId) => {
|
|
325
|
+
const leases = options.leaseManager
|
|
326
|
+
.listLeases()
|
|
327
|
+
.filter((activeLease) => activeLease.zoneId === zoneId);
|
|
328
|
+
const releaseResults = await Promise.allSettled(leases.map(async (lease) => await options.leaseManager.releaseLease(lease.id, { force: true })));
|
|
329
|
+
const failedLeaseIds = [];
|
|
330
|
+
for (const [index, releaseResult] of releaseResults.entries()) {
|
|
331
|
+
if (releaseResult.status === 'fulfilled') {
|
|
332
|
+
continue;
|
|
333
|
+
}
|
|
334
|
+
const leaseId = leases[index]?.id ?? `(unknown lease at index ${index})`;
|
|
335
|
+
failedLeaseIds.push(leaseId);
|
|
336
|
+
writeOpenClawZoneRuntimeLog(`lease '${leaseId}' release failed while restarting zone '${zoneId}': ${formatUnknownError(releaseResult.reason)}`);
|
|
337
|
+
}
|
|
338
|
+
return { failedLeaseIds };
|
|
339
|
+
};
|
|
340
|
+
const closeGatewayWithDeadline = async (activeGateway) => {
|
|
341
|
+
let timeout;
|
|
342
|
+
try {
|
|
343
|
+
await Promise.race([
|
|
344
|
+
activeGateway.vm.close(),
|
|
345
|
+
new Promise((_resolve, reject) => {
|
|
346
|
+
timeout = setTimeoutImpl(() => {
|
|
347
|
+
reject(new Error(`Gateway VM close timed out for zone '${options.zone.id}' after ${closeGatewayTimeoutMs}ms`));
|
|
348
|
+
}, closeGatewayTimeoutMs);
|
|
349
|
+
timeout.unref?.();
|
|
350
|
+
}),
|
|
351
|
+
]);
|
|
352
|
+
}
|
|
353
|
+
finally {
|
|
354
|
+
if (timeout) {
|
|
355
|
+
clearTimeoutImpl(timeout);
|
|
356
|
+
}
|
|
357
|
+
}
|
|
358
|
+
};
|
|
359
|
+
const stopNow = async (next = 'stopped', operationContext) => {
|
|
360
|
+
const activeGateway = gateway;
|
|
361
|
+
const operationId = operationContext?.operationId ?? createOperationId('stop');
|
|
362
|
+
const operationTrigger = operationContext?.operationTrigger ?? 'operator-stop';
|
|
363
|
+
const previousGateway = operationContext?.previousGateway ?? activeGateway;
|
|
364
|
+
lifecycleState = {
|
|
365
|
+
kind: 'stopping',
|
|
366
|
+
next,
|
|
367
|
+
operationId,
|
|
368
|
+
previousGateway,
|
|
369
|
+
};
|
|
370
|
+
try {
|
|
371
|
+
await recordLifecycleOperation({
|
|
372
|
+
kind: 'stop-requested',
|
|
373
|
+
operationId,
|
|
374
|
+
operationTrigger,
|
|
375
|
+
previousGateway: gatewayIdentityFor(previousGateway),
|
|
376
|
+
});
|
|
377
|
+
if (activeGateway) {
|
|
378
|
+
await recordLifecycleOperation({
|
|
379
|
+
kind: 'vm-close-started',
|
|
380
|
+
operationId,
|
|
381
|
+
operationTrigger,
|
|
382
|
+
previousGateway: gatewayIdentityFor(previousGateway),
|
|
383
|
+
});
|
|
384
|
+
await closeGatewayWithDeadline(activeGateway);
|
|
385
|
+
await recordLifecycleOperation({
|
|
386
|
+
kind: 'vm-close-finished',
|
|
387
|
+
operationId,
|
|
388
|
+
operationTrigger,
|
|
389
|
+
previousGateway: gatewayIdentityFor(previousGateway),
|
|
390
|
+
});
|
|
391
|
+
}
|
|
392
|
+
await (options.deleteGatewayRuntimeRecord ?? deleteGatewayRuntimeRecordDefault)(options.zone.gateway.stateDir);
|
|
393
|
+
await recordLifecycleOperation({
|
|
394
|
+
kind: 'runtime-record-deleted',
|
|
395
|
+
operationId,
|
|
396
|
+
operationTrigger,
|
|
397
|
+
previousGateway: gatewayIdentityFor(previousGateway),
|
|
398
|
+
});
|
|
399
|
+
gateway = undefined;
|
|
400
|
+
bootedAt = undefined;
|
|
401
|
+
lastError = undefined;
|
|
402
|
+
lifecycleState = { kind: 'stopped' };
|
|
403
|
+
}
|
|
404
|
+
catch (error) {
|
|
405
|
+
lastError = formatUnknownError(error);
|
|
406
|
+
lifecycleState = {
|
|
407
|
+
coldStartEligible: false,
|
|
408
|
+
error: {
|
|
409
|
+
code: 'owner-unsafe',
|
|
410
|
+
message: lastError,
|
|
411
|
+
},
|
|
412
|
+
kind: 'failed',
|
|
413
|
+
};
|
|
414
|
+
await recordLifecycleOperation({
|
|
415
|
+
errorCode: 'owner-unsafe',
|
|
416
|
+
errorMessage: lastError,
|
|
417
|
+
kind: 'operation-failed',
|
|
418
|
+
operationId,
|
|
419
|
+
operationTrigger,
|
|
420
|
+
previousGateway: gatewayIdentityFor(previousGateway),
|
|
421
|
+
});
|
|
422
|
+
throw error;
|
|
46
423
|
}
|
|
47
|
-
await (options.deleteGatewayRuntimeRecord ?? deleteGatewayRuntimeRecordDefault)(options.zone.gateway.stateDir);
|
|
48
424
|
};
|
|
49
|
-
const
|
|
425
|
+
const startNow = async (expectedGeneration, startOptions = {}, operationContext) => {
|
|
426
|
+
const operationId = operationContext?.operationId ?? createOperationId('start');
|
|
427
|
+
const operationTrigger = operationContext?.operationTrigger ?? 'operator-start';
|
|
428
|
+
lifecycleState = {
|
|
429
|
+
kind: 'starting',
|
|
430
|
+
operationId,
|
|
431
|
+
startedAtMs: options.now(),
|
|
432
|
+
};
|
|
50
433
|
try {
|
|
51
|
-
|
|
434
|
+
await recordLifecycleOperation({
|
|
435
|
+
kind: 'start-requested',
|
|
436
|
+
operationId,
|
|
437
|
+
operationTrigger,
|
|
438
|
+
previousGateway: gatewayIdentityFor(operationContext?.previousGateway),
|
|
439
|
+
});
|
|
440
|
+
const startedGateway = await startGateway(startOptions);
|
|
441
|
+
if (expectedGeneration !== undefined && expectedGeneration !== lifecycleGeneration) {
|
|
442
|
+
try {
|
|
443
|
+
await closeGatewayWithDeadline(startedGateway);
|
|
444
|
+
if (lifecycleGeneration === expectedGeneration + 1) {
|
|
445
|
+
await (options.deleteGatewayRuntimeRecord ?? deleteGatewayRuntimeRecordDefault)(options.zone.gateway.stateDir);
|
|
446
|
+
await recordLifecycleOperation({
|
|
447
|
+
currentGateway: gatewayIdentityFor(startedGateway),
|
|
448
|
+
kind: 'runtime-record-deleted',
|
|
449
|
+
operationId,
|
|
450
|
+
operationTrigger,
|
|
451
|
+
previousGateway: gatewayIdentityFor(operationContext?.previousGateway),
|
|
452
|
+
});
|
|
453
|
+
}
|
|
454
|
+
lastError = `stale-generation-closed: Closed stale gateway start for zone '${options.zone.id}'.`;
|
|
455
|
+
lifecycleState = classifyLastError(lastError);
|
|
456
|
+
await recordLifecycleOperation({
|
|
457
|
+
currentGateway: gatewayIdentityFor(startedGateway),
|
|
458
|
+
errorCode: 'stale-generation-closed',
|
|
459
|
+
errorMessage: lastError,
|
|
460
|
+
kind: 'operation-failed',
|
|
461
|
+
operationId,
|
|
462
|
+
operationTrigger,
|
|
463
|
+
previousGateway: gatewayIdentityFor(operationContext?.previousGateway),
|
|
464
|
+
});
|
|
465
|
+
}
|
|
466
|
+
catch (error) {
|
|
467
|
+
lastError = `stale-generation-closed: Failed to close stale gateway start for zone '${options.zone.id}': ${formatUnknownError(error)}`;
|
|
468
|
+
lifecycleState = classifyLastError(lastError);
|
|
469
|
+
await recordLifecycleOperation({
|
|
470
|
+
currentGateway: gatewayIdentityFor(startedGateway),
|
|
471
|
+
errorCode: 'stale-generation-closed',
|
|
472
|
+
errorMessage: lastError,
|
|
473
|
+
kind: 'operation-failed',
|
|
474
|
+
operationId,
|
|
475
|
+
operationTrigger,
|
|
476
|
+
previousGateway: gatewayIdentityFor(operationContext?.previousGateway),
|
|
477
|
+
});
|
|
478
|
+
writeOpenClawZoneRuntimeLog(`stale gateway start cleanup failed for zone '${options.zone.id}': ${formatUnknownError(error)}`);
|
|
479
|
+
}
|
|
480
|
+
return;
|
|
481
|
+
}
|
|
52
482
|
gateway = startedGateway;
|
|
53
483
|
bootedAt = new Date(options.now()).toISOString();
|
|
54
484
|
lastError = undefined;
|
|
485
|
+
lifecycleState = { gateway: startedGateway, kind: 'running' };
|
|
486
|
+
await recordLifecycleOperation({
|
|
487
|
+
currentGateway: gatewayIdentityFor(startedGateway),
|
|
488
|
+
kind: 'operation-finished',
|
|
489
|
+
operationId,
|
|
490
|
+
operationTrigger,
|
|
491
|
+
previousGateway: gatewayIdentityFor(operationContext?.previousGateway),
|
|
492
|
+
});
|
|
55
493
|
}
|
|
56
494
|
catch (error) {
|
|
495
|
+
if (error instanceof GatewayOwnershipUnsafeError) {
|
|
496
|
+
gateway = undefined;
|
|
497
|
+
bootedAt = undefined;
|
|
498
|
+
lastError = error.message;
|
|
499
|
+
lifecycleState = {
|
|
500
|
+
evidence: error.evidence,
|
|
501
|
+
kind: 'owner-unsafe',
|
|
502
|
+
};
|
|
503
|
+
await recordLifecycleOperation({
|
|
504
|
+
errorCode: 'owner-unsafe',
|
|
505
|
+
errorMessage: error.message,
|
|
506
|
+
kind: 'operation-failed',
|
|
507
|
+
operationId,
|
|
508
|
+
operationTrigger,
|
|
509
|
+
previousGateway: gatewayIdentityFor(operationContext?.previousGateway),
|
|
510
|
+
});
|
|
511
|
+
throw new ControllerZoneRuntimeStartError(options.zone.id, error, {
|
|
512
|
+
gatewayLifecycleErrorCode: 'owner-unsafe',
|
|
513
|
+
operationId,
|
|
514
|
+
});
|
|
515
|
+
}
|
|
516
|
+
const classifiedError = classifyGatewayStartError(error);
|
|
57
517
|
gateway = undefined;
|
|
58
518
|
bootedAt = undefined;
|
|
59
519
|
lastError = formatUnknownError(error);
|
|
60
|
-
|
|
520
|
+
lifecycleState = {
|
|
521
|
+
coldStartEligible: true,
|
|
522
|
+
error: classifiedError,
|
|
523
|
+
kind: 'failed',
|
|
524
|
+
};
|
|
525
|
+
await recordLifecycleOperation({
|
|
526
|
+
errorCode: classifiedError.code,
|
|
527
|
+
errorMessage: classifiedError.message,
|
|
528
|
+
kind: 'operation-failed',
|
|
529
|
+
operationId,
|
|
530
|
+
operationTrigger,
|
|
531
|
+
previousGateway: gatewayIdentityFor(operationContext?.previousGateway),
|
|
532
|
+
});
|
|
533
|
+
throw new ControllerZoneRuntimeStartError(options.zone.id, error, {
|
|
534
|
+
gatewayLifecycleErrorCode: classifiedError.code,
|
|
535
|
+
operationId,
|
|
536
|
+
});
|
|
61
537
|
}
|
|
62
538
|
};
|
|
63
|
-
const
|
|
64
|
-
|
|
65
|
-
|
|
539
|
+
/**
 * Stops the gateway by running `stopNow` (with no arguments) through
 * `runLifecycleOperation`.
 *
 * @returns {Promise<*>} Whatever `runLifecycleOperation` resolves to for the wrapped `stopNow()` call.
 */
const stop = async () => {
    return await runLifecycleOperation(async () => {
        return await stopNow();
    });
};
|
|
540
|
+
/**
 * Starts the gateway through `runLifecycleOperation`, tagging the attempt
 * with a fresh 'start' operation id and the 'controller-start' trigger.
 *
 * `startNow` receives `undefined` as its first argument (the restart paths
 * pass their lifecycle generation there) and an empty start-options object.
 *
 * @returns {Promise<*>} Whatever `runLifecycleOperation` resolves to for the wrapped `startNow` call.
 */
const start = async () => {
    return await runLifecycleOperation(async () => {
        const operationContext = {
            operationId: createOperationId('start'),
            operationTrigger: 'controller-start',
        };
        return await startNow(undefined, {}, operationContext);
    });
};
|
|
544
|
+
/**
 * Restarts the zone gateway, or cold-starts it when the current lifecycle
 * state is neither 'running' nor 'running-degraded'. The whole sequence runs
 * inside `runLifecycleOperation`.
 *
 * @param {object} [restartOptions] - `timeoutMs` (optional) routes the work through
 *   `withLifecycleTimeout`; `operationTrigger` is the second-choice trigger label.
 * @param {object} [startOptions] - Passed through unchanged to `startNow`.
 * @param {object} [operationMetadata] - Optional `operationId` / `operationTrigger`
 *   that take precedence over generated/default values.
 * @returns {Promise<*>} Without a timeout: `{ leaseReleaseFailureCount, operationId }`.
 *   With a timeout: whatever `withLifecycleTimeout` yields for that same operation.
 * @throws {OpenClawZoneRestartTimeoutError} When `lifecycleGeneration` moved on while
 *   this operation was still starting the gateway.
 */
const restartWithStartOptions = async (restartOptions = {}, startOptions = {}, operationMetadata = {}) => {
    return await runLifecycleOperation(async () => {
        // Claim a new generation; a later operation bumping it marks this one stale.
        lifecycleGeneration += 1;
        const operationGeneration = lifecycleGeneration;
        const stateAtRequest = getLifecycleState();
        const hadLiveGateway = stateAtRequest.kind === 'running' || stateAtRequest.kind === 'running-degraded';
        const operationId = operationMetadata.operationId ?? createOperationId('restart');
        const operationTrigger = operationMetadata.operationTrigger ??
            restartOptions.operationTrigger ??
            'operator-restart';
        const operationContext = {
            operationId,
            operationTrigger,
            previousGateway: hadLiveGateway ? stateAtRequest.gateway : undefined,
        };
        if (hadLiveGateway) {
            // Only a live gateway transitions through the 'restarting' state.
            lifecycleState = {
                kind: 'restarting',
                operationId,
                previousGateway: stateAtRequest.gateway,
            };
        }
        const runRestart = async () => {
            if (hadLiveGateway) {
                await recordLifecycleOperation({
                    kind: 'restart-requested',
                    operationId,
                    operationTrigger,
                    previousGateway: gatewayIdentityFor(operationContext.previousGateway),
                });
            }
            else {
                // No previous gateway to report, so the record omits that key entirely.
                await recordLifecycleOperation({
                    kind: 'cold-start-requested',
                    operationId,
                    operationTrigger,
                });
            }
            const leaseReleaseResult = await releaseZoneLeases(options.zone.id);
            if (hadLiveGateway) {
                await stopNow('starting', operationContext);
            }
            else {
                await closeStaleGatewayBeforeColdStart(operationContext);
            }
            await startNow(operationGeneration, startOptions, operationContext);
            if (operationGeneration !== lifecycleGeneration) {
                throw new OpenClawZoneRestartTimeoutError(options.zone.id, restartOptions.timeoutMs ?? 0);
            }
            return {
                leaseReleaseFailureCount: leaseReleaseResult.failedLeaseIds.length,
                operationId,
            };
        };
        // Kick the work off before deciding how to wait for it.
        const restartOperation = runRestart();
        if (restartOptions.timeoutMs === undefined) {
            return await restartOperation;
        }
        return withLifecycleTimeout({
            operation: restartOperation,
            timeoutMs: restartOptions.timeoutMs,
        });
    });
};
|
|
611
|
+
/**
 * Public restart entry point: forwards to `restartWithStartOptions` with only
 * the restart options, leaving start options and operation metadata at their defaults.
 *
 * @param {object} [restartOptions] - Forwarded as-is; defaults to an empty object.
 * @returns {Promise<*>} The result of `restartWithStartOptions`.
 */
const restart = async (restartOptions = {}) => {
    return await restartWithStartOptions(restartOptions);
};
|
|
612
|
+
/**
 * Cold-starts the zone gateway inside `runLifecycleOperation`: records the
 * request, releases the zone's leases, closes any stale gateway, then starts.
 *
 * @param {object} [restartOptions] - `timeoutMs` (optional) routes the work through
 *   `withLifecycleTimeout`; `operationTrigger` is the second-choice trigger label.
 * @param {object} [startOptions] - Passed through unchanged to `startNow`.
 * @param {object} [operationMetadata] - Optional `operationId` / `operationTrigger`
 *   that take precedence over generated/default values ('auto-recovery' trigger).
 * @returns {Promise<*>} Without a timeout: `{ leaseReleaseFailureCount, operationId }`.
 *   With a timeout: whatever `withLifecycleTimeout` yields for that same operation.
 * @throws {OpenClawZoneRestartTimeoutError} When `lifecycleGeneration` moved on while
 *   this operation was still starting the gateway.
 */
const coldStartWithStartOptions = async (restartOptions = {}, startOptions = {}, operationMetadata = {}) => {
    return await runLifecycleOperation(async () => {
        // Claim a new generation; a later operation bumping it marks this one stale.
        lifecycleGeneration += 1;
        const operationGeneration = lifecycleGeneration;
        const operationContext = {
            operationId: operationMetadata.operationId ?? createOperationId('cold-start'),
            operationTrigger: operationMetadata.operationTrigger ?? restartOptions.operationTrigger ?? 'auto-recovery',
        };
        const runColdStart = async () => {
            // Return value is deliberately unused; presumably called for a
            // state-refreshing side effect — TODO confirm against getLifecycleState.
            getLifecycleState();
            await recordLifecycleOperation({
                kind: 'cold-start-requested',
                operationId: operationContext.operationId,
                operationTrigger: operationContext.operationTrigger,
            });
            const leaseReleaseResult = await releaseZoneLeases(options.zone.id);
            await closeStaleGatewayBeforeColdStart(operationContext);
            await startNow(operationGeneration, startOptions, operationContext);
            if (operationGeneration !== lifecycleGeneration) {
                throw new OpenClawZoneRestartTimeoutError(options.zone.id, restartOptions.timeoutMs ?? 0);
            }
            return {
                leaseReleaseFailureCount: leaseReleaseResult.failedLeaseIds.length,
                operationId: operationContext.operationId,
            };
        };
        // Kick the work off before deciding how to wait for it.
        const coldStartOperation = runColdStart();
        if (restartOptions.timeoutMs !== undefined) {
            return withLifecycleTimeout({
                operation: coldStartOperation,
                timeoutMs: restartOptions.timeoutMs,
            });
        }
        return await coldStartOperation;
    });
};
|
|
647
|
+
/**
 * Public cold-start entry point: forwards to `coldStartWithStartOptions` with
 * only the restart options, leaving start options and operation metadata at their defaults.
 *
 * @param {object} [restartOptions] - Forwarded as-is; defaults to an empty object.
 * @returns {Promise<*>} The result of `coldStartWithStartOptions`.
 */
const coldStart = async (restartOptions = {}) => {
    return await coldStartWithStartOptions(restartOptions);
};
|
|
67
648
|
return {
|
|
649
|
+
coldStart,
|
|
68
650
|
destroy: async (purge) => await (options.runControllerDestroy ?? runControllerDestroyDefault)({ purge, systemConfig: options.systemConfig, zoneId: options.zone.id }, {
|
|
69
651
|
releaseZoneLeases: async (zoneId) => {
|
|
70
|
-
await
|
|
71
|
-
.listLeases()
|
|
72
|
-
.filter((activeLease) => activeLease.zoneId === zoneId)
|
|
73
|
-
.map(async (lease) => await options.leaseManager.releaseLease(lease.id, { force: true })));
|
|
652
|
+
await releaseZoneLeases(zoneId);
|
|
74
653
|
},
|
|
75
654
|
stopGatewayZone: async () => await stop(),
|
|
76
655
|
}),
|
|
77
656
|
enableSsh: async () => await requireGateway().vm.enableSsh(),
|
|
78
|
-
exec: async (command) => await requireGateway()
|
|
657
|
+
exec: async (command) => await executeGatewayCommand(requireGateway(), command),
|
|
79
658
|
gatewayType: 'openclaw',
|
|
80
659
|
getHealth: async () => {
|
|
660
|
+
getLifecycleState();
|
|
81
661
|
const activeGateway = requireGateway();
|
|
82
662
|
const result = await runGatewayHealthCheck({
|
|
83
|
-
exec: async (command) => await activeGateway
|
|
663
|
+
exec: async (command) => await executeGatewayCommand(activeGateway, command),
|
|
84
664
|
healthCheck: activeGateway.processSpec.healthCheck,
|
|
85
665
|
});
|
|
86
666
|
return {
|
|
@@ -92,47 +672,127 @@ export function createOpenClawZoneRuntime(options) {
|
|
|
92
672
|
zoneId: options.zone.id,
|
|
93
673
|
};
|
|
94
674
|
},
|
|
675
|
+
getDiagnosis: () => deriveGatewayDiagnosisSnapshot({
|
|
676
|
+
channelProviderPlane: 'unknown',
|
|
677
|
+
controllerLiveness: 'ok',
|
|
678
|
+
lastOperation,
|
|
679
|
+
originalOutageCause,
|
|
680
|
+
state: getLifecycleState(),
|
|
681
|
+
toolVmPlane: 'unknown',
|
|
682
|
+
}),
|
|
95
683
|
getLogs: async () => {
|
|
96
684
|
const activeGateway = requireGateway();
|
|
97
685
|
return await (options.runControllerLogs ?? runControllerLogsDefault)({ zoneId: options.zone.id }, {
|
|
98
686
|
readGatewayLogs: async () => (await activeGateway.vm.exec(buildOpenClawCombinedLogsCommand(activeGateway.processSpec.logPath))).stdout,
|
|
99
687
|
});
|
|
100
688
|
},
|
|
689
|
+
getLifecycleState,
|
|
101
690
|
getSnapshot: () => {
|
|
102
|
-
|
|
103
|
-
|
|
691
|
+
const currentLifecycleState = getLifecycleState();
|
|
692
|
+
if (currentLifecycleState.kind === 'running') {
|
|
693
|
+
const hostPid = currentLifecycleState.gateway.vm.getHostPid();
|
|
694
|
+
if (hostPid === undefined || hostPid === null) {
|
|
695
|
+
const missingHostPidState = markGatewayHostPidMissing(`Gateway VM host pid is unavailable for zone '${options.zone.id}'.`);
|
|
696
|
+
return {
|
|
697
|
+
lastError: missingHostPidState.error.message,
|
|
698
|
+
lifecycleState: 'failed',
|
|
699
|
+
};
|
|
700
|
+
}
|
|
104
701
|
return {
|
|
105
702
|
...(bootedAt ? { bootedAt } : {}),
|
|
106
703
|
gateway: {
|
|
107
|
-
ingress: gateway.ingress,
|
|
704
|
+
ingress: currentLifecycleState.gateway.ingress,
|
|
108
705
|
vm: {
|
|
109
|
-
|
|
110
|
-
id: gateway.vm.id,
|
|
706
|
+
hostPid,
|
|
707
|
+
id: currentLifecycleState.gateway.vm.id,
|
|
111
708
|
},
|
|
112
709
|
},
|
|
710
|
+
...(lastError ? { lastError } : {}),
|
|
113
711
|
lifecycleState: 'running',
|
|
114
712
|
};
|
|
115
713
|
}
|
|
116
714
|
return lastError ? { lastError, lifecycleState: 'failed' } : { lifecycleState: 'stopped' };
|
|
117
715
|
},
|
|
118
|
-
refreshCredentials: async () => await (
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
716
|
+
refreshCredentials: async () => await (async () => {
|
|
717
|
+
const operationId = createOperationId('credentials-refresh');
|
|
718
|
+
const operationTrigger = 'credentials-refresh';
|
|
719
|
+
await recordLifecycleOperation({
|
|
720
|
+
kind: 'credentials-refresh-requested',
|
|
721
|
+
operationId,
|
|
722
|
+
operationTrigger,
|
|
723
|
+
previousGateway: gatewayIdentityFor(gateway),
|
|
724
|
+
});
|
|
725
|
+
const failCredentialsRefreshSecretResolution = async (error) => {
|
|
726
|
+
const classifiedError = {
|
|
727
|
+
code: 'secret-resolution-failed',
|
|
728
|
+
message: formatUnknownError(error),
|
|
729
|
+
};
|
|
730
|
+
const currentLifecycleState = getLifecycleState();
|
|
731
|
+
lastError = classifiedError.message;
|
|
732
|
+
if (currentLifecycleState.kind !== 'running' &&
|
|
733
|
+
currentLifecycleState.kind !== 'running-degraded') {
|
|
734
|
+
lifecycleState = {
|
|
735
|
+
coldStartEligible: true,
|
|
736
|
+
error: classifiedError,
|
|
737
|
+
kind: 'failed',
|
|
738
|
+
};
|
|
739
|
+
}
|
|
740
|
+
await recordLifecycleOperation({
|
|
741
|
+
errorCode: classifiedError.code,
|
|
742
|
+
errorMessage: classifiedError.message,
|
|
743
|
+
kind: 'operation-failed',
|
|
744
|
+
operationId,
|
|
745
|
+
operationTrigger,
|
|
746
|
+
previousGateway: gatewayIdentityFor(gateway),
|
|
125
747
|
});
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
748
|
+
throw new ControllerZoneRuntimeStartError(options.zone.id, error, {
|
|
749
|
+
gatewayLifecycleErrorCode: classifiedError.code,
|
|
750
|
+
operationId,
|
|
751
|
+
});
|
|
752
|
+
};
|
|
753
|
+
let refreshedSecretResolver;
|
|
754
|
+
try {
|
|
755
|
+
refreshedSecretResolver = options.createFreshSecretResolver
|
|
756
|
+
? await options.createFreshSecretResolver()
|
|
757
|
+
: options.secretResolver;
|
|
758
|
+
}
|
|
759
|
+
catch (error) {
|
|
760
|
+
await failCredentialsRefreshSecretResolution(error);
|
|
761
|
+
}
|
|
762
|
+
return await (options.runControllerCredentialsRefresh ?? runControllerCredentialsRefreshDefault)({ zoneId: options.zone.id }, {
|
|
763
|
+
refreshZoneSecrets: async (zoneId) => {
|
|
764
|
+
try {
|
|
765
|
+
await resolveZoneSecrets({
|
|
766
|
+
audience: 'gateway',
|
|
767
|
+
secretResolver: refreshedSecretResolver,
|
|
768
|
+
systemConfig: options.systemConfig,
|
|
769
|
+
zoneId,
|
|
770
|
+
});
|
|
771
|
+
}
|
|
772
|
+
catch (error) {
|
|
773
|
+
await failCredentialsRefreshSecretResolution(error);
|
|
774
|
+
}
|
|
775
|
+
},
|
|
776
|
+
restartGatewayZone: async () => {
|
|
777
|
+
const currentLifecycleState = getLifecycleState();
|
|
778
|
+
if (currentLifecycleState.kind === 'running' ||
|
|
779
|
+
currentLifecycleState.kind === 'running-degraded') {
|
|
780
|
+
await restartWithStartOptions({}, { secretResolver: refreshedSecretResolver }, { operationId, operationTrigger });
|
|
781
|
+
return;
|
|
782
|
+
}
|
|
783
|
+
await coldStartWithStartOptions({}, { secretResolver: refreshedSecretResolver }, { operationId, operationTrigger });
|
|
784
|
+
},
|
|
785
|
+
});
|
|
786
|
+
})(),
|
|
129
787
|
restart,
|
|
130
788
|
shutdown: stop,
|
|
131
789
|
start,
|
|
132
790
|
stop,
|
|
133
791
|
upgrade: async () => await (options.runControllerUpgrade ?? runControllerUpgradeDefault)({ systemConfig: options.systemConfig, zoneId: options.zone.id }, {
|
|
134
792
|
rebuildGatewayImage: async () => { },
|
|
135
|
-
restartGatewayZone: async () =>
|
|
793
|
+
restartGatewayZone: async () => {
|
|
794
|
+
await restart({ operationTrigger: 'upgrade' });
|
|
795
|
+
},
|
|
136
796
|
}),
|
|
137
797
|
zoneId: options.zone.id,
|
|
138
798
|
};
|