@pleri/olam-cli 0.1.169 → 0.1.173
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +38 -0
- package/dist/agent-stream/driver-runner.js +13 -0
- package/dist/commands/auth-status.d.ts +1 -0
- package/dist/commands/auth-status.d.ts.map +1 -1
- package/dist/commands/auth-status.js +45 -4
- package/dist/commands/auth-status.js.map +1 -1
- package/dist/commands/create.d.ts.map +1 -1
- package/dist/commands/create.js +26 -0
- package/dist/commands/create.js.map +1 -1
- package/dist/commands/enter.d.ts.map +1 -1
- package/dist/commands/enter.js +5 -0
- package/dist/commands/enter.js.map +1 -1
- package/dist/commands/resume.d.ts +63 -0
- package/dist/commands/resume.d.ts.map +1 -0
- package/dist/commands/resume.js +174 -0
- package/dist/commands/resume.js.map +1 -0
- package/dist/commands/setup.d.ts +19 -0
- package/dist/commands/setup.d.ts.map +1 -1
- package/dist/commands/setup.js +157 -19
- package/dist/commands/setup.js.map +1 -1
- package/dist/image-digests.json +8 -8
- package/dist/index.js +1025 -577
- package/dist/index.js.map +1 -1
- package/dist/lib/health-probes.d.ts +28 -0
- package/dist/lib/health-probes.d.ts.map +1 -1
- package/dist/lib/health-probes.js +75 -0
- package/dist/lib/health-probes.js.map +1 -1
- package/dist/lib/k8s-context-discovery.d.ts +80 -0
- package/dist/lib/k8s-context-discovery.d.ts.map +1 -0
- package/dist/lib/k8s-context-discovery.js +102 -0
- package/dist/lib/k8s-context-discovery.js.map +1 -0
- package/dist/mcp-server.js +2417 -1060
- package/dist/spawn/home-override.d.ts +82 -0
- package/dist/spawn/home-override.d.ts.map +1 -0
- package/dist/spawn/home-override.js +107 -0
- package/dist/spawn/home-override.js.map +1 -0
- package/hermes-bundle/version.json +1 -1
- package/host-cp/k8s/manifests/30-configmap.yaml +5 -0
- package/host-cp/k8s/manifests/50-deployment.yaml +9 -2
- package/host-cp/k8s/manifests/auth-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/kg-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/mcp-auth-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/memory-service/50-deployment.yaml +1 -1
- package/host-cp/lifecycle/classify.mjs +110 -0
- package/host-cp/lifecycle/emit.mjs +119 -0
- package/host-cp/lifecycle/evidence.mjs +45 -0
- package/host-cp/lifecycle/failure-kinds.mjs +56 -0
- package/host-cp/lifecycle/index.mjs +22 -0
- package/host-cp/lifecycle/phases.mjs +52 -0
- package/host-cp/observability/grafana-port-forward.sh +1 -1
- package/host-cp/observability/kyverno-cardinality-mutate.sh +2 -2
- package/host-cp/observability/loki-ingest.sh +1 -1
- package/host-cp/observability/ndjson-span-sink.mjs +183 -0
- package/host-cp/observability/prom-no-double-grafana.sh +4 -4
- package/host-cp/observability/redactor.mjs +72 -0
- package/host-cp/recovery/engine.mjs +148 -0
- package/host-cp/recovery/index.mjs +16 -0
- package/host-cp/recovery/ledger.mjs +105 -0
- package/host-cp/recovery/recipes.mjs +46 -0
- package/host-cp/recovery/scenarios.mjs +124 -0
- package/host-cp/recovery/step-runners.mjs +263 -0
- package/host-cp/src/docker-events.mjs +30 -6
- package/host-cp/src/linear-sync.mjs +43 -0
- package/host-cp/src/plan-chat-service.mjs +129 -1
- package/host-cp/src/pr-nanny.mjs +55 -3
- package/host-cp/src/server.mjs +261 -0
- package/package.json +1 -1
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
// Recovery step types and recipe interface — the discriminated union of
|
|
2
|
+
// all named steps that can appear in a RecoveryRecipe.
|
|
3
|
+
//
|
|
4
|
+
// Step runners for each kind live in step-runners.mjs. The engine in
|
|
5
|
+
// engine.mjs iterates a recipe's steps array and dispatches each to the
|
|
6
|
+
// appropriate runner.
|
|
7
|
+
//
|
|
8
|
+
// A RecoveryRecipe is an ordered list of steps. Steps execute in order;
|
|
9
|
+
// the first failing step short-circuits to a 'failed' outcome.
|
|
10
|
+
|
|
11
|
+
/**
|
|
12
|
+
* @typedef {{ kind: 'NotifyOperator', message?: string }} NotifyOperatorStep
|
|
13
|
+
* @typedef {{ kind: 'ResendTrustPrompt' }} ResendTrustPromptStep
|
|
14
|
+
* @typedef {{ kind: 'WaitFor', durationMs: number }} WaitForStep
|
|
15
|
+
* @typedef {{ kind: 'RestartTransport' }} RestartTransportStep
|
|
16
|
+
* @typedef {{ kind: 'ResendDispatch' }} ResendDispatchStep
|
|
17
|
+
* @typedef {{ kind: 'RestartWorker' }} RestartWorkerStep
|
|
18
|
+
* @typedef {{ kind: 'RestartMcpServer', serverName: string }} RestartMcpServerStep
|
|
19
|
+
* @typedef {{ kind: 'RetryHandshake', timeoutMs: number }} RetryHandshakeStep
|
|
20
|
+
* @typedef {{ kind: 'ReadPluginErrors' }} ReadPluginErrorsStep
|
|
21
|
+
* @typedef {{ kind: 'RestartPlugin', pluginName: string }} RestartPluginStep
|
|
22
|
+
* @typedef {{ kind: 'RebaseBranch' }} RebaseBranchStep
|
|
23
|
+
* @typedef {{ kind: 'CleanBuild' }} CleanBuildStep
|
|
24
|
+
*
|
|
25
|
+
* @typedef {| NotifyOperatorStep
|
|
26
|
+
* | ResendTrustPromptStep
|
|
27
|
+
* | WaitForStep
|
|
28
|
+
* | RestartTransportStep
|
|
29
|
+
* | ResendDispatchStep
|
|
30
|
+
* | RestartWorkerStep
|
|
31
|
+
* | RestartMcpServerStep
|
|
32
|
+
* | RetryHandshakeStep
|
|
33
|
+
* | ReadPluginErrorsStep
|
|
34
|
+
* | RestartPluginStep
|
|
35
|
+
* | RebaseBranchStep
|
|
36
|
+
* | CleanBuildStep
|
|
37
|
+
* } RecoveryStep
|
|
38
|
+
*/
|
|
39
|
+
|
|
40
|
+
/**
|
|
41
|
+
* @typedef {object} RecoveryRecipe
|
|
42
|
+
* @property {string} scenarioName — human-readable name of the scenario
|
|
43
|
+
* @property {RecoveryStep[]} steps — ordered list of steps to execute
|
|
44
|
+
*/
|
|
45
|
+
|
|
46
|
+
export {};
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
// Recovery scenarios — named mappings from WorldStartupFailureKind (or a
|
|
2
|
+
// special non-FSM signal) to a deterministic RecoveryRecipe.
|
|
3
|
+
//
|
|
4
|
+
// Order within each recipe is load-bearing: steps execute in sequence,
|
|
5
|
+
// first failure short-circuits. Designed for ONE bounded auto-attempt;
|
|
6
|
+
// callers MUST NOT retry a scenario — the engine's idempotency guard
|
|
7
|
+
// enforces this at the (worldId, failureKind) level.
|
|
8
|
+
//
|
|
9
|
+
// The 'stale-branch' scenario has no failureKind (null) — it is triggered
|
|
10
|
+
// by a non-FSM signal (e.g. CI indicating the branch is stale). The engine
|
|
11
|
+
// accepts null as a valid key but treats it as a distinct bucket.
|
|
12
|
+
|
|
13
|
+
/**
|
|
14
|
+
* @typedef {import('./recipes.mjs').RecoveryStep} RecoveryStep
|
|
15
|
+
* @typedef {import('./recipes.mjs').RecoveryRecipe} RecoveryRecipe
|
|
16
|
+
* @typedef {import('../lifecycle/failure-kinds.mjs').WorldStartupFailureKind | null} FailureKindOrNull
|
|
17
|
+
*/
|
|
18
|
+
|
|
19
|
+
/**
|
|
20
|
+
* @typedef {object} FailureScenario
|
|
21
|
+
* @property {string} name — kebab-case identifier
|
|
22
|
+
* @property {FailureKindOrNull} failureKind — the FSM bucket this scenario handles (null = non-FSM trigger)
|
|
23
|
+
* @property {string} description — one-line human summary
|
|
24
|
+
* @property {RecoveryRecipe} recipe
|
|
25
|
+
*/
|
|
26
|
+
|
|
27
|
+
/**
 * Recursively freeze a plain object/array graph so that no nested level can
 * be mutated after module load. `Object.freeze` alone is shallow: it would
 * leave every `recipe` object and `steps` array writable.
 *
 * @template T
 * @param {T} value
 * @returns {T} the same reference, frozen at every level
 */
function deepFreeze(value) {
  if (value !== null && typeof value === 'object') {
    // Freeze children first, then the container itself.
    for (const child of Object.values(value)) deepFreeze(child);
    Object.freeze(value);
  }
  return value;
}

/**
 * The table of known failure scenarios, one entry per failureKind bucket
 * (plus the null-keyed 'stale-branch' entry for non-FSM triggers).
 *
 * The table is shared module state and step order inside each recipe is
 * load-bearing, so the whole structure — array, scenarios, recipes, step
 * lists and individual steps — is deeply frozen. Consumers that need a
 * variation must copy, not mutate.
 *
 * @type {readonly FailureScenario[]}
 */
export const FAILURE_SCENARIOS = deepFreeze([
  {
    name: 'trust-gate-stuck',
    failureKind: 'TrustGateUnanswered',
    description: 'Agent reached TrustRequired but no trust approval arrived within the timeout.',
    recipe: {
      scenarioName: 'trust-gate-stuck',
      steps: [
        { kind: 'NotifyOperator', message: 'Trust gate unanswered — re-sending trust prompt.' },
        { kind: 'ResendTrustPrompt' },
        { kind: 'WaitFor', durationMs: 30_000 },
      ],
    },
  },
  {
    name: 'prompt-misdelivery',
    failureKind: 'PromptMisdelivery',
    description: 'Dispatch was sent but the agent never received it (transport mismatch).',
    recipe: {
      scenarioName: 'prompt-misdelivery',
      steps: [
        { kind: 'RestartTransport' },
        { kind: 'ResendDispatch' },
      ],
    },
  },
  {
    name: 'transport-dead',
    failureKind: 'TransportDead',
    description: 'stdin/stdout/IPC channel never opened.',
    recipe: {
      scenarioName: 'transport-dead',
      steps: [
        { kind: 'RestartTransport' },
        { kind: 'RestartWorker' },
      ],
    },
  },
  {
    name: 'mcp-handshake-stall',
    failureKind: 'McpHandshakeStall',
    description: 'MCP server connection initialized but never completed handshake.',
    recipe: {
      scenarioName: 'mcp-handshake-stall',
      steps: [
        { kind: 'RestartMcpServer', serverName: 'default' },
        { kind: 'RetryHandshake', timeoutMs: 15_000 },
      ],
    },
  },
  {
    name: 'plugin-startup-failed',
    failureKind: 'PluginStartupFailed',
    description: 'Plugin or skill source failed to load on boot.',
    recipe: {
      scenarioName: 'plugin-startup-failed',
      steps: [
        { kind: 'ReadPluginErrors' },
        { kind: 'RestartPlugin', pluginName: 'default' },
        { kind: 'ResendDispatch' },
      ],
    },
  },
  {
    name: 'provider-process-gone',
    failureKind: 'ProviderProcessGone',
    description: 'Agent (Claude Code) process exited before responding.',
    recipe: {
      scenarioName: 'provider-process-gone',
      steps: [
        { kind: 'RestartWorker' },
      ],
    },
  },
  {
    name: 'stale-branch',
    failureKind: null,
    description: 'Branch is stale relative to base — rebase + clean build required.',
    recipe: {
      scenarioName: 'stale-branch',
      steps: [
        { kind: 'RebaseBranch' },
        { kind: 'CleanBuild' },
      ],
    },
  },
]);
|
|
115
|
+
|
|
116
|
+
/**
|
|
117
|
+
* Find the scenario that handles a given failureKind (or null for non-FSM triggers).
|
|
118
|
+
*
|
|
119
|
+
* @param {FailureKindOrNull} failureKind
|
|
120
|
+
* @returns {FailureScenario | undefined}
|
|
121
|
+
*/
|
|
122
|
+
export function findScenarioForKind(failureKind) {
|
|
123
|
+
return FAILURE_SCENARIOS.find((s) => s.failureKind === failureKind);
|
|
124
|
+
}
|
|
@@ -0,0 +1,263 @@
|
|
|
1
|
+
// Step runners — one async function per RecoveryStep kind.
|
|
2
|
+
//
|
|
3
|
+
// IMPLEMENTED (control flow only — the default I/O seams at the bottom of this file are still optimistic stubs):
|
|
4
|
+
// RestartMcpServer — kills the named MCP server process and waits for it to
|
|
5
|
+
// restart by polling the health endpoint.
|
|
6
|
+
// RetryHandshake — re-initiates the MCP handshake sequence with a timeout
|
|
7
|
+
// derived from the step's timeoutMs field.
|
|
8
|
+
//
|
|
9
|
+
// STUB (TODO killshot-3-follow-up):
|
|
10
|
+
// All other step kinds log intent and return success. The stubs are
|
|
11
|
+
// intentionally not no-ops — they emit a console.warn so operators can see
|
|
12
|
+
// which steps fired without actually changing system state.
|
|
13
|
+
|
|
14
|
+
import { setTimeout as sleep } from 'node:timers/promises';
|
|
15
|
+
|
|
16
|
+
/**
|
|
17
|
+
* @typedef {import('./recipes.mjs').RecoveryStep} RecoveryStep
|
|
18
|
+
*
|
|
19
|
+
* @typedef {object} StepContext
|
|
20
|
+
* @property {string} worldId
|
|
21
|
+
* @property {object} [evidence] — WorldStartupEvidence bundle, may be undefined for non-FSM triggers
|
|
22
|
+
* @property {(msg: string) => void} [log] — optional logger; defaults to console.warn
|
|
23
|
+
*/
|
|
24
|
+
|
|
25
|
+
/**
|
|
26
|
+
* Run a single recovery step.
|
|
27
|
+
*
|
|
28
|
+
* Throws if the step fails — the engine catches and short-circuits.
|
|
29
|
+
*
|
|
30
|
+
* @param {RecoveryStep} step
|
|
31
|
+
* @param {StepContext} ctx
|
|
32
|
+
* @returns {Promise<void>}
|
|
33
|
+
*/
|
|
34
|
+
export async function runStep(step, ctx) {
|
|
35
|
+
const log = ctx.log ?? ((msg) => console.warn(`[recovery] ${msg}`));
|
|
36
|
+
|
|
37
|
+
switch (step.kind) {
|
|
38
|
+
case 'RestartMcpServer':
|
|
39
|
+
return restartMcpServer(step.serverName, ctx, log);
|
|
40
|
+
|
|
41
|
+
case 'RetryHandshake':
|
|
42
|
+
return retryHandshake(step.timeoutMs, ctx, log);
|
|
43
|
+
|
|
44
|
+
// --- STUBS (TODO killshot-3-follow-up) ---
|
|
45
|
+
|
|
46
|
+
case 'NotifyOperator':
|
|
47
|
+
log(`[stub] NotifyOperator: ${step.message ?? '(no message)'} — worldId=${ctx.worldId}`);
|
|
48
|
+
return;
|
|
49
|
+
|
|
50
|
+
case 'ResendTrustPrompt':
|
|
51
|
+
log(`[stub] ResendTrustPrompt — worldId=${ctx.worldId}`);
|
|
52
|
+
return;
|
|
53
|
+
|
|
54
|
+
case 'WaitFor':
|
|
55
|
+
log(`[stub] WaitFor ${step.durationMs}ms — worldId=${ctx.worldId} (short-circuiting to 0ms in stub)`);
|
|
56
|
+
// Stub doesn't actually wait the full duration — real implementation
|
|
57
|
+
// would integrate with the world's state machine timeout.
|
|
58
|
+
return;
|
|
59
|
+
|
|
60
|
+
case 'RestartTransport':
|
|
61
|
+
log(`[stub] RestartTransport — worldId=${ctx.worldId}`);
|
|
62
|
+
return;
|
|
63
|
+
|
|
64
|
+
case 'ResendDispatch':
|
|
65
|
+
log(`[stub] ResendDispatch — worldId=${ctx.worldId}`);
|
|
66
|
+
return;
|
|
67
|
+
|
|
68
|
+
case 'RestartWorker':
|
|
69
|
+
log(`[stub] RestartWorker — worldId=${ctx.worldId}`);
|
|
70
|
+
return;
|
|
71
|
+
|
|
72
|
+
case 'ReadPluginErrors':
|
|
73
|
+
log(`[stub] ReadPluginErrors — worldId=${ctx.worldId}`);
|
|
74
|
+
return;
|
|
75
|
+
|
|
76
|
+
case 'RestartPlugin':
|
|
77
|
+
log(`[stub] RestartPlugin: ${step.pluginName} — worldId=${ctx.worldId}`);
|
|
78
|
+
return;
|
|
79
|
+
|
|
80
|
+
case 'RebaseBranch':
|
|
81
|
+
log(`[stub] RebaseBranch — worldId=${ctx.worldId}`);
|
|
82
|
+
return;
|
|
83
|
+
|
|
84
|
+
case 'CleanBuild':
|
|
85
|
+
log(`[stub] CleanBuild — worldId=${ctx.worldId}`);
|
|
86
|
+
return;
|
|
87
|
+
|
|
88
|
+
default: {
|
|
89
|
+
// Exhaustive check — helps catch mismatches if new step kinds are added.
|
|
90
|
+
/** @type {never} */
|
|
91
|
+
const _exhaustive = step;
|
|
92
|
+
void _exhaustive;
|
|
93
|
+
throw new Error(`runStep: unknown step kind "${/** @type {any} */ (step).kind}"`);
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
// ─── RestartMcpServer — fully implemented ────────────────────────────────────
|
|
99
|
+
|
|
100
|
+
// How long to poll the MCP health check after restart before giving up.
|
|
101
|
+
// Overridable via setStepRunnerSeams for testing.
|
|
102
|
+
let _mcpRestartPollMs = 500;
|
|
103
|
+
let _mcpRestartTimeoutMs = 10_000;
|
|
104
|
+
|
|
105
|
+
/**
|
|
106
|
+
* Restart the named MCP server and verify it comes back.
|
|
107
|
+
*
|
|
108
|
+
* Implementation strategy:
|
|
109
|
+
* 1. Send SIGTERM to the mcp-server process (identified by the naming
|
|
110
|
+
* convention `mcp-<serverName>-<worldId>` in ps output).
|
|
111
|
+
* 2. Poll the in-process registry every MCP_RESTART_POLL_MS until the
|
|
112
|
+
* server reports itself alive again, or MCP_RESTART_TIMEOUT_MS elapses.
|
|
113
|
+
*
|
|
114
|
+
* In the current host-cp architecture, MCP servers are child processes
|
|
115
|
+
* spawned by the in-world container-cp, NOT by host-cp directly. host-cp
|
|
116
|
+
* cannot send SIGTERM to in-container processes. For the bounded scope of
|
|
117
|
+
* Killshot #3, this runner simulates the restart via the world's Docker
|
|
118
|
+
* exec channel and verifies success via an observable side-effect:
|
|
119
|
+
* the lifecycle `mcpHandshakeStatus` transitions from 'pending' to 'ok'.
|
|
120
|
+
*
|
|
121
|
+
* @param {string} serverName
|
|
122
|
+
* @param {StepContext} ctx
|
|
123
|
+
* @param {(msg: string) => void} log
|
|
124
|
+
*/
|
|
125
|
+
async function restartMcpServer(serverName, ctx, log) {
|
|
126
|
+
log(`RestartMcpServer: restarting "${serverName}" for worldId=${ctx.worldId}`);
|
|
127
|
+
|
|
128
|
+
// Signal the restart. In production this would exec into the container and
|
|
129
|
+
// send SIGTERM to the mcp-server process. The exec channel is host-cp's
|
|
130
|
+
// Docker API path (/exec on the devbox container).
|
|
131
|
+
//
|
|
132
|
+
// For the Killshot #3 deliverable scope: emit the intent, simulate the
|
|
133
|
+
// restart by waiting for one poll cycle, then verify via the handshake
|
|
134
|
+
// probe below. Real exec wiring is tracked as a follow-up.
|
|
135
|
+
await _execRestartSignal(serverName, ctx.worldId, log);
|
|
136
|
+
|
|
137
|
+
// Poll until the handshake probe succeeds or we hit the timeout.
|
|
138
|
+
const deadline = Date.now() + _mcpRestartTimeoutMs;
|
|
139
|
+
let attempt = 0;
|
|
140
|
+
while (Date.now() < deadline) {
|
|
141
|
+
attempt++;
|
|
142
|
+
const alive = await _probeMcpHandshake(serverName, ctx.worldId, log);
|
|
143
|
+
if (alive) {
|
|
144
|
+
log(`RestartMcpServer: "${serverName}" came back after ${attempt} probe(s)`);
|
|
145
|
+
return;
|
|
146
|
+
}
|
|
147
|
+
await sleep(_mcpRestartPollMs);
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
throw new Error(
|
|
151
|
+
`RestartMcpServer: "${serverName}" did not come back within ${_mcpRestartTimeoutMs}ms`,
|
|
152
|
+
);
|
|
153
|
+
}
|
|
154
|
+
|
|
155
|
+
// ─── RetryHandshake — fully implemented ──────────────────────────────────────
|
|
156
|
+
|
|
157
|
+
/**
|
|
158
|
+
* Re-initiate the MCP handshake sequence and wait up to timeoutMs for it
|
|
159
|
+
* to succeed.
|
|
160
|
+
*
|
|
161
|
+
* The handshake follows the MCP JSON-RPC initialize → initialized pattern.
|
|
162
|
+
* host-cp's role is to signal the in-world MCP coordinator to re-run the
|
|
163
|
+
* handshake; we verify success by polling the handshake status observable.
|
|
164
|
+
*
|
|
165
|
+
* @param {number} timeoutMs
|
|
166
|
+
* @param {StepContext} ctx
|
|
167
|
+
* @param {(msg: string) => void} log
|
|
168
|
+
*/
|
|
169
|
+
async function retryHandshake(timeoutMs, ctx, log) {
|
|
170
|
+
log(`RetryHandshake: initiating handshake for worldId=${ctx.worldId} timeout=${timeoutMs}ms`);
|
|
171
|
+
|
|
172
|
+
await _sendHandshakeInitialize(ctx.worldId, log);
|
|
173
|
+
|
|
174
|
+
const deadline = Date.now() + timeoutMs;
|
|
175
|
+
const pollMs = Math.min(500, Math.floor(timeoutMs / 10));
|
|
176
|
+
|
|
177
|
+
while (Date.now() < deadline) {
|
|
178
|
+
const success = await _probeHandshakeComplete(ctx.worldId, log);
|
|
179
|
+
if (success) {
|
|
180
|
+
log(`RetryHandshake: handshake succeeded for worldId=${ctx.worldId}`);
|
|
181
|
+
return;
|
|
182
|
+
}
|
|
183
|
+
await sleep(pollMs);
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
throw new Error(
|
|
187
|
+
`RetryHandshake: handshake did not complete within ${timeoutMs}ms for worldId=${ctx.worldId}`,
|
|
188
|
+
);
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
// ─── Seam functions (injectable for testing) ─────────────────────────────────
|
|
192
|
+
//
|
|
193
|
+
// These are the actual I/O boundaries. In tests, override via the
|
|
194
|
+
// setStepRunnerSeams() below to inject stubs that resolve deterministically.
|
|
195
|
+
|
|
196
|
+
/** @type {(serverName: string, worldId: string, log: (m: string) => void) => Promise<void>} */
|
|
197
|
+
let _execRestartSignal = async (serverName, worldId, log) => {
|
|
198
|
+
// Production: Docker exec into the devbox container for this world, then
|
|
199
|
+
// send SIGTERM to the mcp-server process by name. The container naming
|
|
200
|
+
// convention is `olam-<worldId>-devbox`.
|
|
201
|
+
//
|
|
202
|
+
// Stub path used until the Docker exec channel is wired (killshot-3-follow-up):
|
|
203
|
+
log(`[seam] execRestartSignal: would exec SIGTERM mcp-${serverName} in olam-${worldId}-devbox`);
|
|
204
|
+
};
|
|
205
|
+
|
|
206
|
+
/** @type {(serverName: string, worldId: string, log: (m: string) => void) => Promise<boolean>} */
|
|
207
|
+
let _probeMcpHandshake = async (serverName, worldId, log) => {
|
|
208
|
+
log(`[seam] probeMcpHandshake: would probe mcp-${serverName} alive in olam-${worldId}-devbox`);
|
|
209
|
+
// Default stub: optimistic — assumes server came back. Real implementation
|
|
210
|
+
// queries the in-world MCP registry or pings a health endpoint.
|
|
211
|
+
return true;
|
|
212
|
+
};
|
|
213
|
+
|
|
214
|
+
/** @type {(worldId: string, log: (m: string) => void) => Promise<void>} */
|
|
215
|
+
let _sendHandshakeInitialize = async (worldId, log) => {
|
|
216
|
+
log(`[seam] sendHandshakeInitialize: would send MCP initialize for worldId=${worldId}`);
|
|
217
|
+
};
|
|
218
|
+
|
|
219
|
+
/** @type {(worldId: string, log: (m: string) => void) => Promise<boolean>} */
|
|
220
|
+
let _probeHandshakeComplete = async (worldId, log) => {
|
|
221
|
+
log(`[seam] probeHandshakeComplete: would probe handshake complete for worldId=${worldId}`);
|
|
222
|
+
// Default stub: optimistic.
|
|
223
|
+
return true;
|
|
224
|
+
};
|
|
225
|
+
|
|
226
|
+
/**
|
|
227
|
+
* Override seam functions and timing constants for testing.
|
|
228
|
+
* Returns a cleanup function that restores prior values.
|
|
229
|
+
*
|
|
230
|
+
* @param {{
|
|
231
|
+
* execRestartSignal?: typeof _execRestartSignal,
|
|
232
|
+
* probeMcpHandshake?: typeof _probeMcpHandshake,
|
|
233
|
+
* sendHandshakeInitialize?: typeof _sendHandshakeInitialize,
|
|
234
|
+
* probeHandshakeComplete?: typeof _probeHandshakeComplete,
|
|
235
|
+
* mcpRestartTimeoutMs?: number,
|
|
236
|
+
* mcpRestartPollMs?: number,
|
|
237
|
+
* }} overrides
|
|
238
|
+
* @returns {() => void} cleanup — call to restore prior seams
|
|
239
|
+
*/
|
|
240
|
+
export function setStepRunnerSeams(overrides = {}) {
|
|
241
|
+
const prev = {
|
|
242
|
+
execRestartSignal: _execRestartSignal,
|
|
243
|
+
probeMcpHandshake: _probeMcpHandshake,
|
|
244
|
+
sendHandshakeInitialize: _sendHandshakeInitialize,
|
|
245
|
+
probeHandshakeComplete: _probeHandshakeComplete,
|
|
246
|
+
mcpRestartTimeoutMs: _mcpRestartTimeoutMs,
|
|
247
|
+
mcpRestartPollMs: _mcpRestartPollMs,
|
|
248
|
+
};
|
|
249
|
+
if (overrides.execRestartSignal) _execRestartSignal = overrides.execRestartSignal;
|
|
250
|
+
if (overrides.probeMcpHandshake) _probeMcpHandshake = overrides.probeMcpHandshake;
|
|
251
|
+
if (overrides.sendHandshakeInitialize) _sendHandshakeInitialize = overrides.sendHandshakeInitialize;
|
|
252
|
+
if (overrides.probeHandshakeComplete) _probeHandshakeComplete = overrides.probeHandshakeComplete;
|
|
253
|
+
if (typeof overrides.mcpRestartTimeoutMs === 'number') _mcpRestartTimeoutMs = overrides.mcpRestartTimeoutMs;
|
|
254
|
+
if (typeof overrides.mcpRestartPollMs === 'number') _mcpRestartPollMs = overrides.mcpRestartPollMs;
|
|
255
|
+
return () => {
|
|
256
|
+
_execRestartSignal = prev.execRestartSignal;
|
|
257
|
+
_probeMcpHandshake = prev.probeMcpHandshake;
|
|
258
|
+
_sendHandshakeInitialize = prev.sendHandshakeInitialize;
|
|
259
|
+
_probeHandshakeComplete = prev.probeHandshakeComplete;
|
|
260
|
+
_mcpRestartTimeoutMs = prev.mcpRestartTimeoutMs;
|
|
261
|
+
_mcpRestartPollMs = prev.mcpRestartPollMs;
|
|
262
|
+
};
|
|
263
|
+
}
|
|
@@ -55,10 +55,15 @@ const INVALIDATING_ACTIONS = ['start', 'restart', 'stop', 'die', 'kill'];
|
|
|
55
55
|
* `docker events --format json` via child_process).
|
|
56
56
|
* @param {(worldId: string) => void} args.onWorldRestart
|
|
57
57
|
* called when a known world restarts/stops/dies
|
|
58
|
+
* @param {(info: { worldId: string, action: string, exitCode?: number }) => void} [args.onWorldLifecycleEvent]
|
|
59
|
+
* Additive observer (Killshot #2): fires alongside onWorldRestart with
|
|
60
|
+
* the raw docker action + exitCode when present. Wired in server.mjs
|
|
61
|
+
* to map docker actions → WorldLifecyclePhase emissions on host-stream.
|
|
62
|
+
* Optional + nullable — existing callers (tests, etc.) don't supply it.
|
|
58
63
|
* @param {(message: string) => void} [args.log]
|
|
59
64
|
* @returns {() => void} stop function
|
|
60
65
|
*/
|
|
61
|
-
export function subscribeDockerEvents({ dockerHost, onWorldRestart, log = console.log }) {
|
|
66
|
+
export function subscribeDockerEvents({ dockerHost, onWorldRestart, onWorldLifecycleEvent, log = console.log }) {
|
|
62
67
|
let stopped = false;
|
|
63
68
|
let activeReq = null;
|
|
64
69
|
let activeProc = null;
|
|
@@ -91,7 +96,7 @@ export function subscribeDockerEvents({ dockerHost, onWorldRestart, log = consol
|
|
|
91
96
|
// CLI shape uses `status` instead of HTTP API's `Action`; normalize.
|
|
92
97
|
if (event.status && !event.Action) event.Action = event.status;
|
|
93
98
|
if (event.Type === undefined && event.Type !== 'container') event.Type = 'container';
|
|
94
|
-
handleEvent(event, { onWorldRestart, log });
|
|
99
|
+
handleEvent(event, { onWorldRestart, onWorldLifecycleEvent, log });
|
|
95
100
|
} catch (err) {
|
|
96
101
|
log(`docker-events: parse error on line: ${line.slice(0, 120)} (${err.message})`);
|
|
97
102
|
}
|
|
@@ -159,7 +164,7 @@ export function subscribeDockerEvents({ dockerHost, onWorldRestart, log = consol
|
|
|
159
164
|
buf = buf.slice(nl + 1);
|
|
160
165
|
if (!line.trim()) continue;
|
|
161
166
|
try {
|
|
162
|
-
handleEvent(JSON.parse(line), { onWorldRestart, log });
|
|
167
|
+
handleEvent(JSON.parse(line), { onWorldRestart, onWorldLifecycleEvent, log });
|
|
163
168
|
} catch (err) {
|
|
164
169
|
log(`docker-events: parse error on line: ${line.slice(0, 120)} (${err.message})`);
|
|
165
170
|
}
|
|
@@ -209,10 +214,10 @@ export function subscribeDockerEvents({ dockerHost, onWorldRestart, log = consol
|
|
|
209
214
|
*
|
|
210
215
|
* Exported for unit testing.
|
|
211
216
|
*
|
|
212
|
-
* @param {{ Type?: string, Action?: string, Actor?: { Attributes?:
|
|
213
|
-
* @param {{ onWorldRestart: (worldId: string) => void, log: (m: string) => void }} ctx
|
|
217
|
+
* @param {{ Type?: string, Action?: string, Actor?: { Attributes?: Record<string, string> } }} event
|
|
218
|
+
* @param {{ onWorldRestart: (worldId: string) => void, onWorldLifecycleEvent?: (info: { worldId: string, action: string, exitCode?: number }) => void, log: (m: string) => void }} ctx
|
|
214
219
|
*/
|
|
215
|
-
export function handleEvent(event, { onWorldRestart, log }) {
|
|
220
|
+
export function handleEvent(event, { onWorldRestart, onWorldLifecycleEvent, log }) {
|
|
216
221
|
if (event?.Type !== 'container') return;
|
|
217
222
|
if (!INVALIDATING_ACTIONS.includes(event.Action ?? '')) return;
|
|
218
223
|
const name = event.Actor?.Attributes?.name;
|
|
@@ -229,4 +234,23 @@ export function handleEvent(event, { onWorldRestart, log }) {
|
|
|
229
234
|
const worldId = m[1];
|
|
230
235
|
log(`docker-events: ${event.Action} on ${cleanName} → invalidating ${worldId}`);
|
|
231
236
|
onWorldRestart(worldId);
|
|
237
|
+
|
|
238
|
+
// Killshot #2 (additive): also notify the lifecycle observer when one
|
|
239
|
+
// is wired. Docker's `die` events carry the container exit code in
|
|
240
|
+
// Actor.Attributes.exitCode as a string; parse it best-effort and
|
|
241
|
+
// forward NaN/missing as undefined so the classifier sees the
|
|
242
|
+
// unambiguous "no exit code observed" signal.
|
|
243
|
+
if (onWorldLifecycleEvent) {
|
|
244
|
+
const action = event.Action ?? '';
|
|
245
|
+
const rawExit = event.Actor?.Attributes?.exitCode;
|
|
246
|
+
const parsed = rawExit !== undefined ? Number(rawExit) : NaN;
|
|
247
|
+
const exitCode = Number.isFinite(parsed) ? parsed : undefined;
|
|
248
|
+
try {
|
|
249
|
+
onWorldLifecycleEvent({ worldId, action, exitCode });
|
|
250
|
+
} catch (err) {
|
|
251
|
+
// The lifecycle observer is best-effort instrumentation; a thrown
|
|
252
|
+
// error here must not break the cache-invalidate hot path.
|
|
253
|
+
log(`docker-events: onWorldLifecycleEvent threw for ${worldId}: ${err.message}`);
|
|
254
|
+
}
|
|
255
|
+
}
|
|
232
256
|
}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
// H6 (Phase G) — Linear outbound sync skeleton.
|
|
2
|
+
//
|
|
3
|
+
// When a planning_artifacts row is created via H2 chunk extraction AND
|
|
4
|
+
// the operator has Linear MCP active, host-cp posts a new Linear issue
|
|
5
|
+
// (OR appends a comment to an existing linked issue if linear_issue_url
|
|
6
|
+
// is already populated on the row).
|
|
7
|
+
//
|
|
8
|
+
// PHASE G SHIPS SKELETON ONLY. The MCP-from-host wiring is not yet in
|
|
9
|
+
// host-cp; full Linear outbound posting lands in a follow-up commit
|
|
10
|
+
// when the MCP runtime story for host-cp is settled (today MCP lives in
|
|
11
|
+
// the operator's Claude Code runtime, not host-cp).
|
|
12
|
+
//
|
|
13
|
+
// Reverse channel (incoming Linear webhooks update artifact row status):
|
|
14
|
+
// EXPLICITLY OUT OF SCOPE for Phase G per plan body Out of scope list.
|
|
15
|
+
// Deferred to follow-up plan.
|
|
16
|
+
|
|
17
|
+
/**
|
|
18
|
+
* Best-effort Linear outbound sync. Returns null when MCP is unavailable
|
|
19
|
+
* (the typical case today — silent no-op). When MCP wires in, this
|
|
20
|
+
* function resolves to the posted issue URL which the caller can persist
|
|
21
|
+
* back to planning_artifacts.linear_issue_url.
|
|
22
|
+
*
|
|
23
|
+
* @param {object} args
|
|
24
|
+
* @param {string} args.artifactId
|
|
25
|
+
* @param {string} args.title
|
|
26
|
+
* @param {unknown} args.body — JSON body of the artifact
|
|
27
|
+
* @param {string} args.sessionId
|
|
28
|
+
* @returns {Promise<string | null>} — Linear issue URL or null
|
|
29
|
+
*/
|
|
30
|
+
export async function syncArtifactToLinear({ artifactId, title, body, sessionId }) {
|
|
31
|
+
// Probe for Linear MCP availability via host-cp's bootstrap config (TBD).
|
|
32
|
+
// Until that surface exists, return null. Logging the intent surfaces
|
|
33
|
+
// the wiring gap to operators inspecting host-cp logs.
|
|
34
|
+
void artifactId;
|
|
35
|
+
void title;
|
|
36
|
+
void body;
|
|
37
|
+
void sessionId;
|
|
38
|
+
console.log(
|
|
39
|
+
`[linear-sync] outbound skip — Linear MCP not yet wired to host-cp; ` +
|
|
40
|
+
`artifactId=${artifactId} sessionId=${sessionId}`,
|
|
41
|
+
);
|
|
42
|
+
return null;
|
|
43
|
+
}
|