agent-relay-orchestrator 0.10.19 → 0.10.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/relay.ts CHANGED
@@ -1,10 +1,13 @@
1
1
  import type { OrchestratorConfig } from "./config";
2
- import { GIT_SHA, ORCHESTRATOR_PROTOCOL_VERSION, VERSION } from "./version";
2
+ import type { ProviderProbeCache } from "./provider-probe";
3
+ import { detectSelfSupervision } from "./self-supervision";
4
+ import { GIT_SHA, ORCHESTRATOR_PROTOCOL_VERSION, VERSION, runtimeMetadata } from "./version";
5
+ import type { WorkspaceMetadata, WorkspaceMode } from "agent-relay-sdk";
3
6
 
4
7
  export interface RelayClient {
5
8
  register(): Promise<void>;
6
9
  heartbeat(): Promise<void>;
7
- updateManagedAgents(agents: ManagedAgentReport[]): Promise<void>;
10
+ updateManagedAgents(agents: ManagedAgentReport[], exitedAgents?: ManagedSessionExitDiagnostics[]): Promise<void>;
8
11
  pollCommands(): Promise<RelayCommand[]>;
9
12
  updateCommand(commandId: string, status: string, result?: Record<string, unknown>, error?: string): Promise<boolean>;
10
13
  setApiUrl(url: string): void;
@@ -16,16 +19,68 @@ export interface RelayClient {
16
19
  export interface ManagedAgentReport {
17
20
  agentId: string;
18
21
  provider: "claude" | "codex";
22
+ model?: string;
23
+ effort?: string;
24
+ profile?: string;
25
+ workspaceMode?: WorkspaceMode;
26
+ workspace?: WorkspaceMetadata;
27
+ sessionName?: string;
28
+ supervisor?: "process" | "systemd" | "unknown";
29
+ systemdUnit?: string;
30
+ terminalSession?: string;
31
+ terminalAvailable?: boolean;
19
32
  tmuxSession: string;
20
33
  cwd: string;
21
34
  label?: string;
22
35
  approvalMode: string;
23
36
  policyName?: string;
24
37
  spawnRequestId?: string;
38
+ automationRunId?: string;
25
39
  pid?: number;
26
40
  startedAt: number;
27
41
  }
28
42
 
43
+ export interface ManagedSessionExitDiagnostics {
44
+ agentId: string;
45
+ provider: "claude" | "codex";
46
+ workspaceMode?: WorkspaceMode;
47
+ workspace?: WorkspaceMetadata;
48
+ sessionName?: string;
49
+ tmuxSession: string;
50
+ cwd: string;
51
+ label?: string;
52
+ policyName?: string;
53
+ spawnRequestId?: string;
54
+ automationRunId?: string;
55
+ supervisor: "process" | "systemd" | "unknown";
56
+ systemdUnit?: string;
57
+ terminalSession?: string;
58
+ terminalAvailable?: boolean;
59
+ pid?: number;
60
+ currentPid?: number;
61
+ startedAt: number;
62
+ detectedAt: number;
63
+ runtimeMs: number;
64
+ logFile?: string;
65
+ logBytes?: number;
66
+ logEmpty?: boolean;
67
+ logTail?: string[];
68
+ runnerInfoFile?: string;
69
+ runnerInfoPresent?: boolean;
70
+ systemd?: {
71
+ unit: string;
72
+ activeState?: string;
73
+ subState?: string;
74
+ result?: string;
75
+ execMainCode?: string;
76
+ execMainStatus?: string;
77
+ mainPid?: number;
78
+ unavailable?: string;
79
+ };
80
+ unavailable?: string[];
81
+ lastError: string;
82
+ }
83
+
29
84
  export interface RelayCommand {
30
85
  id: string;
31
86
  type: string;
@@ -39,17 +94,40 @@ const BACKOFF_SCHEDULE_MS = [
39
94
  ];
40
95
  const MAX_BACKOFF_MS = 3_600_000; // 1 hour
41
96
 
42
- export function createRelayClient(config: OrchestratorConfig): RelayClient {
97
+ export function buildRegistrationMeta(
98
+ config: Pick<OrchestratorConfig, "tmuxPrefix">,
99
+ runtime: ReturnType<typeof runtimeMetadata>,
100
+ now = Date.now,
101
+ pid = process.pid,
102
+ ): Record<string, unknown> {
103
+ const supervision = detectSelfSupervision();
104
+ return {
105
+ ...runtime,
106
+ pid,
107
+ tmuxPrefix: config.tmuxPrefix,
108
+ startedAt: now(),
109
+ version: VERSION,
110
+ protocolVersion: ORCHESTRATOR_PROTOCOL_VERSION,
111
+ gitSha: GIT_SHA,
112
+ supervisor: supervision.supervisor,
113
+ ...(supervision.selfUnit ? { selfUnit: supervision.selfUnit } : {}),
114
+ ...(supervision.runtimePrefix ? { runtimePrefix: supervision.runtimePrefix } : {}),
115
+ };
116
+ }
117
+
118
+ export function createRelayClient(config: OrchestratorConfig, probeCache: ProviderProbeCache): RelayClient {
43
119
  const agentId = `orchestrator-${config.id}`;
44
120
  let heartbeatTimer: Timer | null = null;
45
121
  let connected = false;
46
122
  let backoffIndex = 0;
47
123
  let cursorFloor = 0;
48
124
  let apiUrl: string | undefined;
125
+ let runtimeToken: string | undefined;
49
126
 
50
127
  function headers(): Record<string, string> {
51
128
  const h: Record<string, string> = { "Content-Type": "application/json" };
52
- if (config.token) h["X-Agent-Relay-Token"] = config.token;
129
+ const token = runtimeToken ?? config.token;
130
+ if (token) h["X-Agent-Relay-Token"] = token;
53
131
  return h;
54
132
  }
55
133
 
@@ -64,29 +142,31 @@ export function createRelayClient(config: OrchestratorConfig): RelayClient {
64
142
  }
65
143
 
66
144
  async function register(): Promise<void> {
145
+ const runtime = runtimeMetadata();
146
+ const providerSnapshot = await probeCache.getSnapshot(true);
67
147
  const res = await apiCall("POST", "/orchestrators", {
68
148
  id: config.id,
69
149
  hostname: config.hostname,
70
- providers: config.providers,
150
+ providers: providerSnapshot.providers,
151
+ providerStatus: providerSnapshot.providerStatus,
152
+ providerCatalog: providerSnapshot.providerCatalog,
71
153
  baseDir: config.baseDir,
72
154
  apiUrl,
73
155
  envKeys: Object.keys(config.env),
156
+ package: runtime.package,
157
+ contracts: runtime.contracts,
158
+ capabilities: runtime.capabilities,
74
159
  version: VERSION,
75
160
  protocolVersion: ORCHESTRATOR_PROTOCOL_VERSION,
76
161
  gitSha: GIT_SHA,
77
- meta: {
78
- pid: process.pid,
79
- tmuxPrefix: config.tmuxPrefix,
80
- startedAt: Date.now(),
81
- version: VERSION,
82
- protocolVersion: ORCHESTRATOR_PROTOCOL_VERSION,
83
- gitSha: GIT_SHA,
84
- },
162
+ meta: buildRegistrationMeta(config, runtime),
85
163
  });
86
164
  if (!res.ok) {
87
165
  const err = await res.text();
88
166
  throw new Error(`Failed to register orchestrator: ${res.status} ${err}`);
89
167
  }
168
+ const registered = await res.json().catch(() => null) as { runtimeToken?: { token?: string } } | null;
169
+ if (registered?.runtimeToken?.token) runtimeToken = registered.runtimeToken.token;
90
170
  connected = true;
91
171
  backoffIndex = 0;
92
172
 
@@ -102,10 +182,18 @@ export function createRelayClient(config: OrchestratorConfig): RelayClient {
102
182
 
103
183
  async function heartbeat(): Promise<void> {
104
184
  try {
185
+ const runtime = runtimeMetadata();
186
+ const providerSnapshot = await probeCache.getSnapshot();
105
187
  const res = await apiCall("POST", `/orchestrators/${config.id}/heartbeat`, {
188
+ package: runtime.package,
189
+ contracts: runtime.contracts,
190
+ capabilities: runtime.capabilities,
106
191
  version: VERSION,
107
192
  protocolVersion: ORCHESTRATOR_PROTOCOL_VERSION,
108
193
  gitSha: GIT_SHA,
194
+ providers: providerSnapshot.providers,
195
+ providerStatus: providerSnapshot.providerStatus,
196
+ providerCatalog: providerSnapshot.providerCatalog,
109
197
  });
110
198
  if (!res.ok) throw new Error(`heartbeat failed: ${res.status}`);
111
199
  if (!connected) {
@@ -136,8 +224,11 @@ export function createRelayClient(config: OrchestratorConfig): RelayClient {
136
224
  }
137
225
  }
138
226
 
139
- async function updateManagedAgents(agents: ManagedAgentReport[]): Promise<void> {
140
- await apiCall("PATCH", `/orchestrators/${config.id}/agents`, { agents });
227
+ async function updateManagedAgents(agents: ManagedAgentReport[], exitedAgents: ManagedSessionExitDiagnostics[] = []): Promise<void> {
228
+ await apiCall("PATCH", `/orchestrators/${config.id}/agents`, {
229
+ agents,
230
+ ...(exitedAgents.length ? { exitedAgents } : {}),
231
+ });
141
232
  }
142
233
 
143
234
  async function pollCommands(): Promise<RelayCommand[]> {
@@ -0,0 +1,82 @@
1
+ import { readFileSync } from "node:fs";
2
+ import { fileURLToPath } from "node:url";
3
+
4
+ export interface SelfSupervision {
5
+ supervisor: "process" | "systemd" | "unknown";
6
+ selfUnit?: string;
7
+ runtimePrefix?: string;
8
+ }
9
+
10
+ let cached: SelfSupervision | undefined;
11
+
12
/**
 * Report how this orchestrator process is supervised: the supervisor kind, the
 * owning systemd unit (when one is found in the cgroup file), and the install
 * prefix derived from `moduleUrl`.
 *
 * The first call computes the answer and stores it in the module-level cache;
 * every later call returns that same object, regardless of the `moduleUrl`
 * passed.
 *
 * @param moduleUrl URL of the running module; defaults to this module's URL.
 * @returns The cached supervision description.
 */
export function detectSelfSupervision(moduleUrl: string = import.meta.url): SelfSupervision {
  if (cached) return cached;

  const detected: SelfSupervision = {
    supervisor: detectSupervisorRaw(),
    runtimePrefix: detectRuntimePrefix(moduleUrl),
  };

  // A unit found in the cgroup file overrides the coarse "process"/"unknown" guess.
  const owningUnit = detectSystemdUnit();
  if (owningUnit) {
    detected.supervisor = "systemd";
    detected.selfUnit = owningUnit;
  }

  cached = detected;
  return detected;
}
27
+
28
+ /** Reset the cache. Test-only. */
29
+ export function resetSelfSupervisionCache(): void {
30
+ cached = undefined;
31
+ }
32
+
33
+ function detectSupervisorRaw(): SelfSupervision["supervisor"] {
34
+ // /proc only exists on Linux; on macOS (launchd) we can't introspect cheaply.
35
+ try {
36
+ readFileSync("/proc/self/cgroup", "utf8");
37
+ return "process";
38
+ } catch {
39
+ return "unknown";
40
+ }
41
+ }
42
+
43
/**
 * Extract the systemd service unit that owns this process from the text of a
 * cgroup file such as /proc/self/cgroup. A --user service cgroup looks like:
 *   0::/user.slice/user-1000.slice/user@1000.service/app.slice/agent-relay-orchestrator.service
 * The result is the LAST `*.service` path segment that is not the per-user
 * manager itself (`user@<uid>.service`).
 *
 * Lines are trimmed before splitting so a trailing CR or space cannot hide the
 * final segment, and a bare ".service" segment is never treated as a unit.
 *
 * @param cgroup Raw cgroup file contents, one hierarchy per line.
 * @returns The owning unit name, or undefined when no service segment exists.
 */
export function parseSystemdUnitFromCgroup(cgroup: string): string | undefined {
  const suffix = ".service";
  const units = cgroup
    .split(/\r?\n/)
    .flatMap((line) => line.trim().split("/"))
    // Require a non-empty name in front of the suffix.
    .filter((segment) => segment.length > suffix.length && segment.endsWith(suffix))
    // The user manager hosts the service; it is not the service we are after.
    .filter((segment) => !/^user@\d+\.service$/.test(segment));
  return units.length > 0 ? units[units.length - 1] : undefined;
}
58
+
59
+ function detectSystemdUnit(): string | undefined {
60
+ try {
61
+ return parseSystemdUnitFromCgroup(readFileSync("/proc/self/cgroup", "utf8"));
62
+ } catch {
63
+ return undefined;
64
+ }
65
+ }
66
+
67
/**
 * Derive the install prefix: the directory above the first `node_modules`
 * component of the module's file path (e.g. ~/.agent-relay/runtime).
 *
 * @param moduleUrl A `file:` URL identifying the running module.
 * @returns The prefix directory; `/` when `node_modules` sits directly under
 *   the filesystem root; undefined when the URL cannot be converted to a path
 *   or the path has no `/node_modules/` component (source/workspace checkout).
 */
export function detectRuntimePrefix(moduleUrl: string): string | undefined {
  let modulePath: string;
  try {
    modulePath = fileURLToPath(moduleUrl);
  } catch {
    // Not a usable file: URL — there is no on-disk location to inspect.
    return undefined;
  }
  const markerIndex = modulePath.indexOf("/node_modules/");
  if (markerIndex < 0) return undefined;
  // Index 0 means the parent directory is the root; never hand back "".
  return modulePath.slice(0, markerIndex) || "/";
}
@@ -0,0 +1,143 @@
1
+ import { existsSync } from "node:fs";
2
+ import { join } from "node:path";
3
+ import type { OrchestratorConfig } from "./config";
4
+ import type { RelayClient, RelayCommand } from "./relay";
5
+ import { detectSelfSupervision } from "./self-supervision";
6
+
7
+ const VALID_PROVIDERS = new Set(["auto", "all", "codex", "claude", "orchestrator"]);
8
+ const SEMVER_RE = /^\d+\.\d+\.\d+(?:-[0-9A-Za-z.-]+)?$/;
9
+
10
+ export interface SelfUpgradeRunner {
11
+ run(cmd: string[]): Promise<{ exitCode: number; stdout: string; stderr: string }>;
12
+ commandExists(name: string): boolean;
13
+ }
14
+
15
/** Default runner: executes commands and looks up executables through Bun. */
const defaultRunner: SelfUpgradeRunner = {
  /** Run `cmd` to completion, capturing its exit code and both output streams. */
  async run(cmd) {
    const proc = Bun.spawn({ cmd, stdout: "pipe", stderr: "pipe" });
    // Read both pipes concurrently so neither stream blocks the other.
    const [stdout, stderr] = await Promise.all([
      new Response(proc.stdout).text(),
      new Response(proc.stderr).text(),
    ]);
    const exitCode = await proc.exited;
    return { exitCode, stdout, stderr };
  },
  /** Report whether `name` resolves to an executable on PATH. */
  commandExists(name) {
    try {
      // In-process PATH lookup: does not depend on a `which` binary being
      // installed, so a missing `which` can no longer be mistaken for a
      // missing command.
      return Bun.which(name) !== null;
    } catch {
      return false;
    }
  },
};
33
+
34
+ export interface SelfUpgradePlan {
35
+ targetVersion: string;
36
+ providers: string[];
37
+ unit: string;
38
+ runtimePrefix?: string;
39
+ binary: string;
40
+ installCmd: string[];
41
+ restartCmd: string[];
42
+ /** restart runs decoupled from this process's cgroup (transient unit) */
43
+ restartDetached: boolean;
44
+ }
45
+
46
/**
 * Turn an upgrade command's params plus the detected supervision into a
 * concrete plan: which binary to invoke, the install argv, and the restart argv.
 *
 * @param params Raw command params; `targetVersion` and `providers` are read.
 * @param supervision Supervision description; defaults to the detected one.
 * @param runner Used to check whether `systemd-run` is available.
 * @returns The plan to execute.
 * @throws Error with an operator-facing message when `targetVersion` does not
 *   match the version pattern, or when the process is not running under a
 *   systemd unit.
 */
export function planSelfUpgrade(
  params: Record<string, unknown>,
  supervision = detectSelfSupervision(),
  runner: SelfUpgradeRunner = defaultRunner,
): SelfUpgradePlan {
  const requestedVersion = params.targetVersion;
  const targetVersion = typeof requestedVersion === "string" ? requestedVersion.trim() : "";
  if (!SEMVER_RE.test(targetVersion)) {
    throw new Error(`invalid targetVersion "${targetVersion}" (expected x.y.z)`);
  }
  const providers = normalizeProviders(params.providers);

  const { selfUnit: unit, runtimePrefix } = supervision;
  if (supervision.supervisor !== "systemd" || !unit) {
    throw new Error("orchestrator is not under systemd --user; remote self-upgrade requires a systemd unit (P1)");
  }
  const binary = resolveBinary(runtimePrefix);

  // Install only; the restart is issued separately once the install succeeded.
  const installCmd = [
    binary,
    "upgrade",
    "--version",
    targetVersion,
    "--providers",
    providers.join(","),
    "--no-restart",
    "--yes",
    ...(runtimePrefix ? ["--runtime-prefix", runtimePrefix] : []),
  ];

  // Restarting our own unit SIGTERMs this process, and a child living in our
  // cgroup would be killed along with it. When systemd-run is available the
  // restart is scheduled as an independent transient unit that outlives us;
  // otherwise fall back to a new session via setsid.
  const restartDetached = runner.commandExists("systemd-run");
  const restartUnit = ["systemctl", "--user", "restart", unit];
  const restartCmd = restartDetached
    ? [
        "systemd-run",
        "--user",
        "--collect",
        "--description",
        "agent-relay orchestrator self-upgrade restart",
        ...restartUnit,
      ]
    : ["setsid", ...restartUnit];

  return {
    targetVersion,
    providers,
    unit,
    runtimePrefix,
    binary,
    installCmd,
    restartCmd,
    restartDetached,
  };
}
87
+
88
/**
 * Execute an orchestrator.upgrade command in three steps: build the plan, run
 * the install command (which is invoked with --no-restart, so a failed install
 * is reported while this process is still alive), then run the restart command.
 *
 * Progress is reported to the relay with status "running" before the install
 * ("installing") and before the restart ("restart-pending"). This function
 * never reports a terminal status itself; per the original design note, the
 * relay settles the command from the version reported after the restart.
 *
 * @param command The upgrade command; `id` and `params` are read.
 * @param _config Unused.
 * @param relay Client used to post progress updates.
 * @param runner Executes the install and restart commands.
 * @throws Error when planning fails or when either command exits non-zero.
 */
export async function handleSelfUpgrade(
  command: RelayCommand,
  _config: OrchestratorConfig,
  relay: RelayClient,
  runner: SelfUpgradeRunner = defaultRunner,
): Promise<void> {
  const plan = planSelfUpgrade(command.params, detectSelfSupervision(), runner);
  const { targetVersion, unit } = plan;

  // Prefer stderr, fall back to stdout, and keep only the last 500 characters.
  const outputTail = (outcome: { stdout: string; stderr: string }): string =>
    (outcome.stderr || outcome.stdout).trim().slice(-500);

  await relay.updateCommand(command.id, "running", {
    phase: "installing",
    targetVersion,
    providers: plan.providers,
    unit,
  });

  const installOutcome = await runner.run(plan.installCmd);
  if (installOutcome.exitCode !== 0) {
    throw new Error(`install failed (exit ${installOutcome.exitCode}): ${outputTail(installOutcome)}`);
  }

  await relay.updateCommand(command.id, "running", {
    phase: "restart-pending",
    targetVersion,
    unit,
    restartDetached: plan.restartDetached,
  });

  // Only the restart command's own exit is awaited here, not the effect of the
  // restart — that effect is this process being torn down.
  const restartOutcome = await runner.run(plan.restartCmd);
  if (restartOutcome.exitCode !== 0) {
    throw new Error(`restart failed (exit ${restartOutcome.exitCode}): ${outputTail(restartOutcome)}`);
  }
  console.error(`[orchestrator] self-upgrade to ${targetVersion} installed; restart dispatched for ${unit}`);
}
127
+
128
/**
 * Reduce an untrusted `providers` param to a de-duplicated list of recognised
 * provider names, kept in first-seen order. Non-array input, non-string
 * entries, blank entries and names outside VALID_PROVIDERS are dropped; when
 * nothing survives the result is ["orchestrator"].
 */
function normalizeProviders(value: unknown): string[] {
  const accepted = new Set<string>();
  if (Array.isArray(value)) {
    for (const entry of value) {
      if (typeof entry !== "string") continue;
      const name = entry.trim();
      if (name && VALID_PROVIDERS.has(name)) accepted.add(name);
    }
  }
  return accepted.size > 0 ? [...accepted] : ["orchestrator"];
}
136
+
137
/**
 * Choose the `agent-relay` executable to invoke: the copy under
 * `<runtimePrefix>/node_modules/.bin` when a prefix is given and that file
 * exists, otherwise the bare command name.
 */
function resolveBinary(runtimePrefix?: string): string {
  const fallback = "agent-relay";
  if (!runtimePrefix) return fallback;
  const candidate = join(runtimePrefix, "node_modules", ".bin", "agent-relay");
  return existsSync(candidate) ? candidate : fallback;
}