buildhive-agent 1.0.0-beta.11 → 1.0.0-beta.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -14,6 +14,7 @@
14
14
  * agent-enrollment.platform_url — bound at enroll; refuses cross-instance reuse
15
15
  */
16
16
  import type { SecretStore } from '../security/secretStore.js';
17
+ import { CredentialFileStore } from './credentialFileStore.js';
17
18
  import { PlatformUrlMismatchError } from './types.js';
18
19
  export declare const AGENT_ENROLLMENT_KEY_PREFIX = "agent-enrollment.";
19
20
  export declare const KEY_JWT = "agent-enrollment.jwt";
@@ -31,6 +32,12 @@ export interface AgentEnrollmentCredentials {
31
32
  export interface AgentEnrollmentKeyringStoreOptions {
32
33
  /** Override the SecretStore (tests + environments without OS keyring). */
33
34
  readonly store?: SecretStore;
35
+ /**
36
+ * Override the 0600 credential-file mirror (row 21b / F2 daemon fallback).
37
+ * Tests inject a fake; production uses the real `~/.buildhive/` file.
38
+ * Pass `null` to disable the file mirror entirely (keyring-only).
39
+ */
40
+ readonly fileStore?: CredentialFileStore | null;
34
41
  }
35
42
  /**
36
43
  * Custom error thrown when the stored platform_url doesn't match the current
@@ -42,6 +49,8 @@ export declare class AgentPlatformUrlMismatchError extends PlatformUrlMismatchEr
42
49
  }
43
50
  export declare class AgentEnrollmentKeyringStore {
44
51
  private readonly store;
52
+ /** 0600 file mirror for the launchd daemon; null when explicitly disabled. */
53
+ private readonly fileStore;
45
54
  constructor(opts?: AgentEnrollmentKeyringStoreOptions);
46
55
  /** Throws KeyringUnavailableError if the OS keyring is unreachable. */
47
56
  static assertAvailable(): Promise<void>;
@@ -63,16 +72,21 @@ export declare class AgentEnrollmentKeyringStore {
63
72
  * Throws NotLoggedInError if any required key is missing.
64
73
  */
65
74
  readAll(): Promise<AgentEnrollmentCredentials>;
75
+ /**
76
+ * Read the 0600 file mirror, returning null on any error. Centralised so
77
+ * every read path (readJwt / readAll / hasEnrollment) shares one fallback.
78
+ */
79
+ private readFromFileMirror;
66
80
  /**
67
81
  * Cross-platform-url check. Refuses to use credentials minted against a
68
82
  * different BuildHive instance.
69
83
  */
70
84
  assertPlatformUrlMatches(currentPlatformUrl: string): Promise<void>;
71
- /** Returns true if any enrollment key exists in the keyring. */
85
+ /** Returns true if the keyring holds an enrollment JWT OR the file mirror holds readable, complete credentials. */
72
86
  hasEnrollment(): Promise<boolean>;
73
87
  /**
74
- * Delete all 5 `agent-enrollment.*` keys. Idempotent.
75
- * Called by a hypothetical `buildhive-agent leave` command (out of scope for v1).
88
+ * Delete all 5 `agent-enrollment.*` keys AND the 0600 file mirror. Idempotent.
89
+ * Called by `buildhive-agent logout` (row 21b / F4) to fully de-enroll.
76
90
  */
77
91
  clear(): Promise<void>;
78
92
  }
@@ -14,7 +14,10 @@
14
14
  * agent-enrollment.platform_url — bound at enroll; refuses cross-instance reuse
15
15
  */
16
16
  import { KeyringSecretStore } from '../security/keyringSecretStore.js';
17
+ import { CredentialFileStore } from './credentialFileStore.js';
18
+ import { createLogger } from '../utils/logger.js';
17
19
  import { KeyringUnavailableError, NotLoggedInError, PlatformUrlMismatchError, } from './types.js';
20
+ const logger = createLogger('auth.agentEnrollmentKeyringStore');
18
21
  export const AGENT_ENROLLMENT_KEY_PREFIX = 'agent-enrollment.';
19
22
  export const KEY_JWT = `${AGENT_ENROLLMENT_KEY_PREFIX}jwt`;
20
23
  export const KEY_JWT_EXP = `${AGENT_ENROLLMENT_KEY_PREFIX}jwt_exp`;
@@ -44,8 +47,29 @@ export class AgentPlatformUrlMismatchError extends PlatformUrlMismatchError {
44
47
  }
45
48
  export class AgentEnrollmentKeyringStore {
46
49
  store;
50
+ /** 0600 file mirror for the launchd daemon; null when explicitly disabled. */
51
+ fileStore;
47
52
  constructor(opts = {}) {
48
53
  this.store = opts.store ?? new KeyringSecretStore();
54
+ // File-mirror resolution (row 21b / F2):
55
+ // - explicit `null` → disabled (keyring-only)
56
+ // - explicit CredentialFileStore → use it
57
+ // - omitted + real keyring → real `~/.buildhive/` mirror (production)
58
+ // - omitted + INJECTED keyring (tests) → disabled, so unit tests never
59
+ // read/write the real user home. A test exercising the mirror must
60
+ // inject its own `fileStore` explicitly.
61
+ if (opts.fileStore === null) {
62
+ this.fileStore = null;
63
+ }
64
+ else if (opts.fileStore) {
65
+ this.fileStore = opts.fileStore;
66
+ }
67
+ else if (opts.store) {
68
+ this.fileStore = null;
69
+ }
70
+ else {
71
+ this.fileStore = new CredentialFileStore();
72
+ }
49
73
  }
50
74
  /** Throws KeyringUnavailableError if the OS keyring is unreachable. */
51
75
  static async assertAvailable() {
@@ -75,6 +99,21 @@ export class AgentEnrollmentKeyringStore {
75
99
  await this.clear().catch(() => undefined);
76
100
  throw err;
77
101
  }
102
+ // Row 21b / F2: mirror to the 0600 file so the launchd daemon (which
103
+ // cannot read the interactive login keychain) can resolve the JWT.
104
+ // The keychain write above is the source of truth; a file-mirror failure
105
+ // is non-fatal here (the agent is enrolled for foreground use) but it
106
+ // leaves the daemon unable to start — `join`'s post-bootstrap self-check
107
+ // (Fix D) and the doctor will surface that loudly rather than silently.
108
+ if (this.fileStore) {
109
+ try {
110
+ await this.fileStore.write(creds);
111
+ }
112
+ catch (err) {
113
+ logger.warn('Could not write the 0600 credential mirror — the background daemon may ' +
114
+ 'not be able to read the JWT. Foreground `buildhive-agent start` is unaffected.', { err: err instanceof Error ? err.message : String(err) });
115
+ }
116
+ }
78
117
  }
79
118
  /**
80
119
  * Read the stored JWT + its cached exp. Throws NotLoggedInError if absent
@@ -85,13 +124,17 @@ export class AgentEnrollmentKeyringStore {
85
124
  this.store.getSecret(KEY_JWT),
86
125
  this.store.getSecret(KEY_JWT_EXP),
87
126
  ]);
88
- if (!token || !expStr)
89
- throw new NotLoggedInError();
90
- const exp = Number(expStr);
91
- return {
92
- jwt: token,
93
- expiresAtUnix: Number.isFinite(exp) ? exp : 0,
94
- };
127
+ if (token && expStr) {
128
+ const exp = Number(expStr);
129
+ return { jwt: token, expiresAtUnix: Number.isFinite(exp) ? exp : 0 };
130
+ }
131
+ // Row 21b / F2: keychain returned nothing (the daemon case) — fall back
132
+ // to the 0600 file mirror before declaring "not enrolled".
133
+ const fromFile = await this.readFromFileMirror();
134
+ if (fromFile) {
135
+ return { jwt: fromFile.jwt, expiresAtUnix: fromFile.jwtExpiresAtUnix };
136
+ }
137
+ throw new NotLoggedInError();
95
138
  }
96
139
  /**
97
140
  * Read the full stored credentials (for display or row-17b supervisor).
@@ -105,43 +148,79 @@ export class AgentEnrollmentKeyringStore {
105
148
  this.store.getSecret(KEY_TENANT_ID),
106
149
  this.store.getSecret(KEY_PLATFORM_URL),
107
150
  ]);
108
- if (!jwt || !expStr || !agentId || !tenantId || !platformUrl) {
109
- throw new NotLoggedInError();
110
- }
111
- const exp = Number(expStr);
112
- return {
113
- jwt,
114
- jwtExpiresAtUnix: Number.isFinite(exp) ? exp : 0,
115
- agentId,
116
- tenantId,
117
- platformUrl,
118
- };
151
+ if (jwt && expStr && agentId && tenantId && platformUrl) {
152
+ const exp = Number(expStr);
153
+ return {
154
+ jwt,
155
+ jwtExpiresAtUnix: Number.isFinite(exp) ? exp : 0,
156
+ agentId,
157
+ tenantId,
158
+ platformUrl,
159
+ };
160
+ }
161
+ // Row 21b / F2: the launchd daemon's keychain read comes back empty.
162
+ // Fall back to the 0600 file mirror written by `join` before throwing.
163
+ const fromFile = await this.readFromFileMirror();
164
+ if (fromFile)
165
+ return fromFile;
166
+ throw new NotLoggedInError();
167
+ }
168
+ /**
169
+ * Read the 0600 file mirror, returning null on any error. Centralised so
170
+ * every read path (readJwt / readAll / hasEnrollment) shares one fallback.
171
+ */
172
+ async readFromFileMirror() {
173
+ if (!this.fileStore)
174
+ return null;
175
+ try {
176
+ const creds = await this.fileStore.read();
177
+ if (creds) {
178
+ logger.info('Resolved agent credentials from the 0600 file mirror (keychain unavailable in this session)');
179
+ }
180
+ return creds;
181
+ }
182
+ catch {
183
+ return null;
184
+ }
119
185
  }
120
186
  /**
121
187
  * Cross-platform-url check. Refuses to use credentials minted against a
122
188
  * different BuildHive instance.
123
189
  */
124
190
  async assertPlatformUrlMatches(currentPlatformUrl) {
125
- const stored = await this.store.getSecret(KEY_PLATFORM_URL);
126
- if (!stored)
127
- throw new NotLoggedInError();
191
+ let stored = await this.store.getSecret(KEY_PLATFORM_URL);
192
+ if (!stored) {
193
+ // Row 21b / F2: keep the cross-instance guard working in the daemon
194
+ // session, where the keychain read comes back empty — fall back to the
195
+ // 0600 file mirror before declaring "not enrolled".
196
+ const fromFile = await this.readFromFileMirror();
197
+ if (!fromFile)
198
+ throw new NotLoggedInError();
199
+ stored = fromFile.platformUrl;
200
+ }
128
201
  if (normalizeUrl(stored) !== normalizeUrl(currentPlatformUrl)) {
129
202
  throw new AgentPlatformUrlMismatchError(stored, currentPlatformUrl);
130
203
  }
131
204
  }
132
- /** Returns true if any enrollment key exists in the keyring. */
205
+ /** Returns true if the keyring holds an enrollment JWT OR the file mirror holds readable, complete credentials. */
133
206
  async hasEnrollment() {
134
207
  const jwt = await this.store.getSecret(KEY_JWT);
135
- return jwt !== null && jwt.length > 0;
208
+ if (jwt !== null && jwt.length > 0)
209
+ return true;
210
+ const fromFile = await this.readFromFileMirror();
211
+ return fromFile !== null;
136
212
  }
137
213
  /**
138
- * Delete all 5 `agent-enrollment.*` keys. Idempotent.
139
- * Called by a hypothetical `buildhive-agent leave` command (out of scope for v1).
214
+ * Delete all 5 `agent-enrollment.*` keys AND the 0600 file mirror. Idempotent.
215
+ * Called by `buildhive-agent logout` (row 21b / F4) to fully de-enroll.
140
216
  */
141
217
  async clear() {
142
218
  for (const k of ALL_KEYS) {
143
219
  await this.store.deleteSecret(k).catch(() => false);
144
220
  }
221
+ if (this.fileStore) {
222
+ await this.fileStore.delete().catch(() => undefined);
223
+ }
145
224
  }
146
225
  }
147
226
  function normalizeUrl(u) {
@@ -0,0 +1,79 @@
1
+ /**
2
+ * credentialFileStore — 0600 on-disk fallback for the agent-enrollment
3
+ * credentials, so the **launchd-managed daemon** can read its JWT.
4
+ *
5
+ * ── Why this exists (row 21b / F2, 2026-06-02) ───────────────────────────────
6
+ * The 2026-06-01 verification walk found the LaunchAgent daemon crashlooping
7
+ * with `last exit code = 1` and empty logs. Root cause (proven in
8
+ * docs/ops/diagnosis-f3-agent-daemon-2026-06-02.html): the macOS Keychain item
9
+ * holding the JWT is created by the *foreground* `join` process (Terminal
10
+ * security session). The launchd-spawned daemon runs in a different security
11
+ * session (`SessionCreate=true`), so `@napi-rs/keyring`'s `getPassword()`
12
+ * returns `null` for it (silent access-deny). The daemon then reads "no JWT" →
13
+ * "Not enrolled" → exits 1 → launchd respawns → silent crashloop.
14
+ *
15
+ * GitHub's own `actions/runner` `svc.sh` stores its `.credentials` as files in
16
+ * the runner directory for exactly this reason — a launchd/systemd service
17
+ * cannot rely on the interactive login keychain.
18
+ *
19
+ * ── Design ───────────────────────────────────────────────────────────────────
20
+ * The OS keyring stays the PRIMARY store (used by foreground `start`, where it
21
+ * works and is the more-secure option). This file is a daemon-readable MIRROR:
22
+ * `join` writes both; the resolver falls back to the file only when the keyring
23
+ * returns nothing (the daemon case).
24
+ *
25
+ * Security:
26
+ * - File mode 0600 (owner read/write only), parent dir 0700.
27
+ * - Lives under the user's own `~/.buildhive/` — same trust boundary as the
28
+ * actions/runner `.credentials` file and the workspaces dir.
29
+ * - Never packaged (runtime state in $HOME, not in the npm tarball).
30
+ *
31
+ * This module is pure-where-possible: fs + homedir are injectable for tests.
32
+ */
33
+ import type { AgentEnrollmentCredentials } from './agentEnrollmentKeyringStore.js';
34
+ /** Basename of the credential mirror file under `~/.buildhive/`. */
35
+ export declare const CREDENTIAL_FILE_NAME = "agent-enrollment.cred";
36
+ /** Injectable fs surface (subset of node:fs/promises we use). */
37
+ export interface CredentialFileFs {
38
+ readonly mkdir: (path: string, opts: {
39
+ recursive: boolean;
40
+ mode?: number;
41
+ }) => Promise<string | undefined>;
42
+ readonly writeFile: (path: string, data: string, opts: {
43
+ encoding: 'utf8';
44
+ mode?: number;
45
+ }) => Promise<void>;
46
+ readonly chmod: (path: string, mode: number) => Promise<void>;
47
+ readonly rename: (from: string, to: string) => Promise<void>;
48
+ readonly readFile: (path: string, encoding: 'utf8') => Promise<string>;
49
+ readonly unlink: (path: string) => Promise<void>;
50
+ }
51
+ export interface CredentialFileStoreOptions {
52
+ /** Override the home directory (tests + multi-user). */
53
+ readonly homeDir?: string;
54
+ /** Override the fs surface (tests). */
55
+ readonly fs?: CredentialFileFs;
56
+ }
57
+ /**
58
+ * Reads/writes the agent-enrollment credentials as a single 0600 JSON file.
59
+ */
60
+ export declare class CredentialFileStore {
61
+ private readonly fs;
62
+ private readonly dir;
63
+ private readonly filePath;
64
+ constructor(opts?: CredentialFileStoreOptions);
65
+ /** Absolute path to the credential file (for diagnostics + doctor). */
66
+ get path(): string;
67
+ /**
68
+ * Atomically write the credential mirror with 0600 perms.
69
+ * tmp-file → chmod 0600 → rename avoids a torn read by the daemon.
70
+ */
71
+ write(creds: AgentEnrollmentCredentials): Promise<void>;
72
+ /**
73
+ * Read the credential mirror. Returns null when absent or malformed — never
74
+ * throws, so the resolver can cleanly fall through to "not enrolled".
75
+ */
76
+ read(): Promise<AgentEnrollmentCredentials | null>;
77
+ /** Delete the credential mirror. Idempotent (ENOENT is not an error). */
78
+ delete(): Promise<void>;
79
+ }
@@ -0,0 +1,140 @@
1
+ /**
2
+ * credentialFileStore — 0600 on-disk fallback for the agent-enrollment
3
+ * credentials, so the **launchd-managed daemon** can read its JWT.
4
+ *
5
+ * ── Why this exists (row 21b / F2, 2026-06-02) ───────────────────────────────
6
+ * The 2026-06-01 verification walk found the LaunchAgent daemon crashlooping
7
+ * with `last exit code = 1` and empty logs. Root cause (proven in
8
+ * docs/ops/diagnosis-f3-agent-daemon-2026-06-02.html): the macOS Keychain item
9
+ * holding the JWT is created by the *foreground* `join` process (Terminal
10
+ * security session). The launchd-spawned daemon runs in a different security
11
+ * session (`SessionCreate=true`), so `@napi-rs/keyring`'s `getPassword()`
12
+ * returns `null` for it (silent access-deny). The daemon then reads "no JWT" →
13
+ * "Not enrolled" → exits 1 → launchd respawns → silent crashloop.
14
+ *
15
+ * GitHub's own `actions/runner` `svc.sh` stores its `.credentials` as files in
16
+ * the runner directory for exactly this reason — a launchd/systemd service
17
+ * cannot rely on the interactive login keychain.
18
+ *
19
+ * ── Design ───────────────────────────────────────────────────────────────────
20
+ * The OS keyring stays the PRIMARY store (used by foreground `start`, where it
21
+ * works and is the more-secure option). This file is a daemon-readable MIRROR:
22
+ * `join` writes both; the resolver falls back to the file only when the keyring
23
+ * returns nothing (the daemon case).
24
+ *
25
+ * Security:
26
+ * - File mode 0600 (owner read/write only), parent dir 0700.
27
+ * - Lives under the user's own `~/.buildhive/` — same trust boundary as the
28
+ * actions/runner `.credentials` file and the workspaces dir.
29
+ * - Never packaged (runtime state in $HOME, not in the npm tarball).
30
+ *
31
+ * This module is pure-where-possible: fs + homedir are injectable for tests.
32
+ */
33
+ import { homedir } from 'node:os';
34
+ import { join } from 'node:path';
35
+ import { promises as defaultFsPromises } from 'node:fs';
36
+ /** Basename of the credential mirror file under `~/.buildhive/`. */
37
+ export const CREDENTIAL_FILE_NAME = 'agent-enrollment.cred';
38
+ const defaultFs = {
39
+ mkdir: (p, o) => defaultFsPromises.mkdir(p, o),
40
+ writeFile: (p, d, o) => defaultFsPromises.writeFile(p, d, o),
41
+ chmod: (p, m) => defaultFsPromises.chmod(p, m),
42
+ rename: (a, b) => defaultFsPromises.rename(a, b),
43
+ readFile: (p, e) => defaultFsPromises.readFile(p, e),
44
+ unlink: (p) => defaultFsPromises.unlink(p),
45
+ };
46
+ /**
47
+ * Reads/writes the agent-enrollment credentials as a single 0600 JSON file.
48
+ */
49
+ export class CredentialFileStore {
50
+ fs;
51
+ dir;
52
+ filePath;
53
+ constructor(opts = {}) {
54
+ this.fs = opts.fs ?? defaultFs;
55
+ const home = opts.homeDir ?? homedir();
56
+ this.dir = join(home, '.buildhive');
57
+ this.filePath = join(this.dir, CREDENTIAL_FILE_NAME);
58
+ }
59
+ /** Absolute path to the credential file (for diagnostics + doctor). */
60
+ get path() {
61
+ return this.filePath;
62
+ }
63
+ /**
64
+ * Atomically write the credential mirror with 0600 perms.
65
+ * tmp-file → chmod 0600 → rename avoids a torn read by the daemon.
66
+ */
67
+ async write(creds) {
68
+ await this.fs.mkdir(this.dir, { recursive: true, mode: 0o700 });
69
+ const tmpPath = `${this.filePath}.tmp.${process.pid}`;
70
+ const payload = JSON.stringify({
71
+ jwt: creds.jwt,
72
+ jwtExpiresAtUnix: creds.jwtExpiresAtUnix,
73
+ agentId: creds.agentId,
74
+ tenantId: creds.tenantId,
75
+ platformUrl: creds.platformUrl,
76
+ });
77
+ await this.fs.writeFile(tmpPath, payload, { encoding: 'utf8', mode: 0o600 });
78
+ try {
79
+ // Belt-and-suspenders over the writeFile mode (umask can clear bits).
80
+ await this.fs.chmod(tmpPath, 0o600);
81
+ await this.fs.rename(tmpPath, this.filePath);
82
+ }
83
+ catch (err) {
84
+ // Don't leave a tmp file holding the JWT on a chmod/rename failure
85
+ // (ENOSPC, cross-device, mid-op crash). Best-effort cleanup, re-throw.
86
+ await this.fs.unlink(tmpPath).catch(() => undefined);
87
+ throw err;
88
+ }
89
+ }
90
+ /**
91
+ * Read the credential mirror. Returns null when absent or malformed — never
92
+ * throws, so the resolver can cleanly fall through to "not enrolled".
93
+ */
94
+ async read() {
95
+ let raw;
96
+ try {
97
+ raw = await this.fs.readFile(this.filePath, 'utf8');
98
+ }
99
+ catch {
100
+ return null; // ENOENT or unreadable
101
+ }
102
+ let parsed;
103
+ try {
104
+ parsed = JSON.parse(raw);
105
+ }
106
+ catch {
107
+ return null;
108
+ }
109
+ if (!parsed || typeof parsed !== 'object')
110
+ return null;
111
+ const o = parsed;
112
+ if (typeof o.jwt !== 'string' ||
113
+ typeof o.agentId !== 'string' ||
114
+ typeof o.tenantId !== 'string' ||
115
+ typeof o.platformUrl !== 'string' ||
116
+ o.jwt.length === 0 ||
117
+ o.agentId.length === 0 ||
118
+ o.tenantId.length === 0 ||
119
+ o.platformUrl.length === 0) {
120
+ return null;
121
+ }
122
+ const exp = typeof o.jwtExpiresAtUnix === 'number' ? o.jwtExpiresAtUnix : 0;
123
+ return {
124
+ jwt: o.jwt,
125
+ jwtExpiresAtUnix: Number.isFinite(exp) ? exp : 0,
126
+ agentId: o.agentId,
127
+ tenantId: o.tenantId,
128
+ platformUrl: o.platformUrl,
129
+ };
130
+ }
131
+ /** Delete the credential mirror. Idempotent (ENOENT is not an error). */
132
+ async delete() {
133
+ try {
134
+ await this.fs.unlink(this.filePath);
135
+ }
136
+ catch {
137
+ // best-effort — file may already be gone
138
+ }
139
+ }
140
+ }
@@ -38,6 +38,21 @@ export interface ServiceInstallerInjection {
38
38
  paths: ServicePaths;
39
39
  bootstrapStdout: string;
40
40
  }>;
41
+ /**
42
+ * Row 21b / Fix D — read the launchd service health after bootstrap so
43
+ * `join` can self-verify the daemon actually started (instead of silently
44
+ * installing a crashlooping daemon). Optional: when omitted, the real
45
+ * `getServiceStatus` is used via dynamic import.
46
+ */
47
+ readonly getServiceStatus?: (opts: {
48
+ mode: 'user' | 'system';
49
+ label?: string;
50
+ homeDir?: string;
51
+ }) => Promise<{
52
+ loaded: boolean;
53
+ state: 'running' | 'waiting' | 'unknown' | null;
54
+ lastExitCode: number | null;
55
+ }>;
41
56
  }
42
57
  export interface JoinOptions {
43
58
  readonly token: string;
@@ -69,6 +84,14 @@ export interface JoinOptions {
69
84
  * the test runner, not at dist/cli.js).
70
85
  */
71
86
  readonly cliEntryPathOverride?: string;
87
+ /**
88
+ * Row 21b / Fix D — how long to wait after bootstrap before reading the
89
+ * daemon's launchd health (gives RunAtLoad + the FB16131937 startup guard
90
+ * time to fire). Default 3000ms; tests pass 0.
91
+ */
92
+ readonly verifyDelayMs?: number;
93
+ /** Sleep injection for the Fix-D verify delay (tests pass a no-op). */
94
+ readonly sleepFn?: (ms: number) => Promise<void>;
72
95
  }
73
96
  export interface JoinResult {
74
97
  readonly exitCode: 0 | 1 | 2 | 3;
@@ -195,19 +195,24 @@ export async function runJoin(opts) {
195
195
  }
196
196
  // 8. Print enrollment success.
197
197
  const agentIdShort = agentId.slice(0, 8);
198
- console.log(`✓ Agent enrolled in team "${teamName}" as agent ${agentIdShort}…`);
198
+ // teamName is cosmetic; guard against an older/forward backend that omits it.
199
+ console.log(`✓ Agent enrolled in team "${teamName ?? '(unknown)'}" as agent ${agentIdShort}…`);
199
200
  console.log(`✓ JWT stored in OS keyring (expires ${expiresDate}).`);
200
201
  // 9. S-1: Auto-install LaunchAgent so the dev never thinks about persistence.
201
202
  // Idempotent — re-running `join` cleanly upgrades the plist (the underlying
202
203
  // installer uses an atomic tmp-file → rename).
204
+ // Row 21b / Fix D: maybeInstallLaunchAgent ALSO self-verifies the daemon
205
+ // actually came up; it returns false (with a loud message) when the daemon
206
+ // installed but crashed, so we never claim "set and forget" falsely.
203
207
  const serviceInstalled = await maybeInstallLaunchAgent(opts);
204
208
  if (serviceInstalled) {
205
- console.log('✓ Agent installed and will auto-start on every login. ' +
209
+ console.log('✓ Agent installed and running — it will auto-start on every login. ' +
206
210
  'Inspect logs with `buildhive-agent logs`.');
207
211
  }
208
212
  else {
209
- // Either non-macOS, opt-out, or install failed. Fall back to the
210
- // foreground-start guidance the user can still execute.
213
+ // Either non-macOS, opt-out, install failed, OR the daemon did not start
214
+ // cleanly (Fix D). maybeInstallLaunchAgent already printed the specific
215
+ // reason; give the foreground fallback the user can always run.
211
216
  console.log('Run `buildhive-agent start` to begin picking up workflow jobs.');
212
217
  }
213
218
  return { exitCode: 0, serviceInstalled };
@@ -265,7 +270,6 @@ async function maybeInstallLaunchAgent(opts) {
265
270
  }
266
271
  try {
267
272
  await installFn({ mode: 'user', cliEntryPath });
268
- return true;
269
273
  }
270
274
  catch (err) {
271
275
  // Already-loaded-service is a benign case — installService's atomic
@@ -277,6 +281,56 @@ async function maybeInstallLaunchAgent(opts) {
277
281
  'or run the agent in the foreground via `buildhive-agent start`.');
278
282
  return false;
279
283
  }
284
+ // Row 21b / Fix D — self-verify the daemon actually started. The 2026-06-01
285
+ // walk installed a daemon that crashlooped silently (last exit code = 1) yet
286
+ // join reported success. Now we wait, read launchd health, and fail LOUD.
287
+ return verifyDaemonStarted(opts);
288
+ }
289
+ /**
290
+ * Fix D: after bootstrap, give launchd's RunAtLoad + the FB16131937 startup
291
+ * guard a moment to fire, then read the daemon's health. Returns true only when
292
+ * the daemon is genuinely up (loaded, not crashed). On a crash it prints an
293
+ * actionable message and returns false so `join` recommends the foreground
294
+ * fallback instead of claiming "set and forget".
295
+ */
296
+ async function verifyDaemonStarted(opts) {
297
+ const sleep = opts.sleepFn ?? ((ms) => new Promise((r) => setTimeout(r, ms)));
298
+ const delay = opts.verifyDelayMs ?? 3000;
299
+ // Resolve the status reader (injected for tests, real one otherwise).
300
+ let statusFn;
301
+ if (opts.serviceInstaller?.getServiceStatus) {
302
+ statusFn = opts.serviceInstaller.getServiceStatus;
303
+ }
304
+ else {
305
+ try {
306
+ const mod = await import('../service/serviceInstaller.js');
307
+ statusFn = mod.getServiceStatus;
308
+ }
309
+ catch {
310
+ // Can't verify — assume installed (don't block enrollment), but say so.
311
+ console.error('[join] Service installed; could not verify daemon health automatically.');
312
+ return true;
313
+ }
314
+ }
315
+ await sleep(delay);
316
+ let status;
317
+ try {
318
+ status = await statusFn({ mode: 'user' });
319
+ }
320
+ catch {
321
+ console.error('[join] Service installed; could not read daemon health (launchctl print failed).');
322
+ return true;
323
+ }
324
+ const crashed = status.lastExitCode !== null && status.lastExitCode !== 0;
325
+ if (!status.loaded || crashed) {
326
+ console.error('[join] ⚠ The background service was installed but did NOT start cleanly' +
327
+ (crashed ? ` (last exit code = ${status.lastExitCode}).` : ' (not loaded).'));
328
+ console.error('[join] Diagnose with `buildhive-agent doctor`, read the log with ' +
329
+ '`buildhive-agent logs` (~/.buildhive/logs/buildhive-agent.log), ' +
330
+ 'or run `buildhive-agent start` in the foreground for now.');
331
+ return false;
332
+ }
333
+ return true;
280
334
  }
281
335
  /** Pull tenant_id from JWT payload without signature verification. */
282
336
  function extractTenantId(jwtToken) {
@@ -291,14 +291,29 @@ export function checkServiceLoaded(deps) {
291
291
  };
292
292
  }
293
293
  }
294
- // 16: agent process is actively running (as reported by launchctl print's
295
- // `state` field). Read-only on the cached print output from check 15.
294
+ // 16: agent process is actually healthy (as reported by launchctl print's
295
+ // `state` AND `last exit code` fields). Read-only on the cached print output
296
+ // from check 15.
296
297
  //
297
- // `state = running` → pass
298
- // `state = waiting` → warn (KeepAlive will restart it; not necessarily a bug)
299
- // anything else → warn (unknown launchd state surface verbatim)
300
- // not loaded → fail (upstream check 15 already failed; chain here so
301
- // we don't double-report)
298
+ // Row 21b / Fix C — HONESTY. The old version returned a soft WARNING for
299
+ // `state = spawn scheduled`, which HID a crashlooping daemon (the 2026-06-01
300
+ // walk saw 14/15 "healthy" while the daemon was dead, last exit code = 1).
301
+ // New contract:
302
+ // last exit code != 0 → FAIL (daemon crashed / is crashlooping)
303
+ // state = running, exit 0/absent → PASS
304
+ // state = spawn scheduled → WARN (launchd is throttling respawns)
305
+ // state = waiting → WARN (KeepAlive should restart it)
306
+ // anything else → WARN (unknown — surface verbatim)
307
+ // not loaded → FAIL (upstream check 15 already failed)
308
+ const SERVICE_CRASH_FIX =
309
+ // NB: the daemon logs to ~/.buildhive/logs/buildhive-agent.log (via pino-roll),
310
+ // NOT the launchd StandardOutPath file, which is empty in production. Point
311
+ // operators at `buildhive-agent logs` so they don't chase the empty stdout
312
+ // file the 2026-06-01 walk got stuck on.
313
+ 'The background daemon crashed. Inspect its log with `buildhive-agent logs` ' +
314
+ '(file: ~/.buildhive/logs/buildhive-agent.log), then re-run ' +
315
+ '`buildhive-agent join <token>` (re-enrolls + reinstalls). ' +
316
+ 'You can always run `buildhive-agent start` in the foreground meanwhile.';
302
317
  export function checkServiceRunning(deps, printOutput) {
303
318
  if (deps.platform !== 'darwin') {
304
319
  return {
@@ -312,18 +327,39 @@ export function checkServiceRunning(deps, printOutput) {
312
327
  message: 'Service not loaded (see previous check)',
313
328
  };
314
329
  }
315
- const stateMatch = printOutput.stdout.match(/state\s*=\s*(\w+)/);
330
+ // Parse `last exit code = N` first — a non-zero value means the daemon died,
331
+ // and takes precedence over whatever transient `state` launchd reports.
332
+ const exitMatch = printOutput.stdout.match(/last exit code\s*=\s*(-?\d+)/i);
333
+ const lastExitCode = exitMatch ? parseInt(exitMatch[1], 10) : null;
334
+ if (lastExitCode !== null && lastExitCode !== 0) {
335
+ return {
336
+ name: 'Agent process running', status: 'fail',
337
+ message: `Daemon is not healthy — last exit code = ${lastExitCode} (crashing on start)`,
338
+ fix: SERVICE_CRASH_FIX,
339
+ };
340
+ }
341
+ // `state = spawn scheduled` has a space — capture the whole value, not just
342
+ // the first word (the old `\w+` regex truncated it to "spawn"). Anchor on
343
+ // start-of-string OR newline so the first (top-level) `state =` is taken.
344
+ const stateMatch = printOutput.stdout.match(/(?:^|\n)\s*state\s*=\s*([^\n]+)/);
316
345
  if (!stateMatch) {
317
346
  return {
318
347
  name: 'Agent process running', status: 'warn',
319
348
  message: 'Could not parse `state =` from launchctl print output',
320
349
  };
321
350
  }
322
- const stateRaw = stateMatch[1];
351
+ const stateRaw = stateMatch[1].trim();
323
352
  const state = stateRaw.toLowerCase();
324
353
  if (state.includes('run')) {
325
354
  return { name: 'Agent process running', status: 'pass', message: `state = ${stateRaw}` };
326
355
  }
356
+ if (state.includes('spawn')) {
357
+ return {
358
+ name: 'Agent process running', status: 'warn',
359
+ message: `Daemon is being (re)spawned by launchd (state = ${stateRaw}); re-run doctor in a few seconds`,
360
+ fix: SERVICE_CRASH_FIX,
361
+ };
362
+ }
327
363
  if (state.includes('wait')) {
328
364
  return {
329
365
  name: 'Agent process running', status: 'warn',
@@ -14,15 +14,25 @@
14
14
  *
15
15
  * Row 17b — Wave 1 (agent supervisor + actions/runner spawn).
16
16
  * Row 18b S7 — Wave 2 (cache env injection into runner spawn).
17
+ * Row 21b — Fix A: the default single-slot `start` now REPLENISHES. Each
18
+ * ephemeral runner still handles one job then exits (binding constraint #5),
19
+ * but the supervisor re-registers and keeps listening until SIGTERM, so the
20
+ * agent runs forever ("set and forget") instead of stopping after one job.
17
21
  *
18
22
  * Exit-code contract:
19
- * 0 — runner completed a job and exited cleanly (--ephemeral one-shot)
20
- * 1 fatal error (auth, config, binary download, registration)
23
+ * 0 — supervisor drained cleanly after SIGTERM/SIGINT (the normal stop path),
24
+ * OR (pre-start) a successful no-op. The supervisor does NOT exit after a
25
+ * single job anymore.
26
+ * 1 — fatal pre-supervisor error (auth, no JWT, no enrolled repo, binary
27
+ * download, unsupported platform handled below)
21
28
  * 2 — platform not supported
22
29
  */
23
30
  import { AgentEnrollmentKeyringStore } from '../auth/agentEnrollmentKeyringStore.js';
31
+ import { fetchRunnerToken } from './tokenClient.js';
24
32
  import { fetchMyRepos } from './myReposClient.js';
25
33
  import { ensureRunner } from './binaryFetcher.js';
34
+ import { configureRunner, runRunner } from './supervisor.js';
35
+ import { startHeartbeat } from './heartbeat.js';
26
36
  import { type PoolDeps } from './pool.js';
27
37
  import type { EnsureRunnerOptions } from './binaryFetcher.js';
28
38
  import type { CacheEnvDeps } from './cacheEnv.js';
@@ -61,6 +71,23 @@ export interface StartOptions {
61
71
  * Allows injecting fake configureRunner / runRunner / fetchRunnerToken / prepareSlotDir.
62
72
  */
63
73
  readonly poolDeps?: Partial<PoolDeps>;
74
+ /**
75
+ * Dependency injection for the N==1 replenishing supervisor (Row 21b / Fix A).
76
+ * Tests inject fakes to drive the replenishment loop deterministically and
77
+ * assert backoff / replenish behaviour. Omit in production (real impls used).
78
+ */
79
+ readonly supervisorDeps?: Partial<{
80
+ configureRunner: typeof configureRunner;
81
+ runRunner: typeof runRunner;
82
+ fetchRunnerToken: typeof fetchRunnerToken;
83
+ }>;
84
+ /**
85
+ * Dependency injection for the heartbeat (tests only). The real
86
+ * {@link startHeartbeat} POSTs to the backend on a 30s timer and once more
87
+ * on `stop()`; injecting a no-op keeps the supervisor-loop tests off the
88
+ * network and deterministic. Omit in production.
89
+ */
90
+ readonly startHeartbeatFn?: typeof startHeartbeat;
64
91
  }
65
92
  export interface StartResult {
66
93
  readonly exitCode: 0 | 1 | 2;
@@ -14,10 +14,17 @@
14
14
  *
15
15
  * Row 17b — Wave 1 (agent supervisor + actions/runner spawn).
16
16
  * Row 18b S7 — Wave 2 (cache env injection into runner spawn).
17
+ * Row 21b — Fix A: the default single-slot `start` now REPLENISHES. Each
18
+ * ephemeral runner still handles one job then exits (binding constraint #5),
19
+ * but the supervisor re-registers and keeps listening until SIGTERM, so the
20
+ * agent runs forever ("set and forget") instead of stopping after one job.
17
21
  *
18
22
  * Exit-code contract:
19
- * 0 — runner completed a job and exited cleanly (--ephemeral one-shot)
20
- * 1 fatal error (auth, config, binary download, registration)
23
+ * 0 — supervisor drained cleanly after SIGTERM/SIGINT (the normal stop path),
24
+ * OR (pre-start) a successful no-op. The supervisor does NOT exit after a
25
+ * single job anymore.
26
+ * 1 — fatal pre-supervisor error (auth, no JWT, no enrolled repo, binary
27
+ * download). An unsupported platform is NOT reported here; it exits with 2.
21
28
  * 2 — platform not supported
22
29
  */
23
30
  import os from 'os';
@@ -32,6 +39,29 @@ import { startHeartbeat } from './heartbeat.js';
32
39
  import { collectDeviceInfo } from './deviceInfo.js';
33
40
  import { runPool } from './pool.js';
34
41
  const logger = createLogger('runner.startCommand');
42
+ /**
43
+ * Row 21b / Fix A — supervisor backoff bounds.
44
+ *
45
+ * When a registration cycle fails (token fetch or config.sh), the supervisor
46
+ * does NOT exit (that would crashloop under launchd KeepAlive). It backs off
47
+ * with capped exponential delay and retries, so a transient backend blip
48
+ * self-heals and a persistent fault (e.g. repo de-enrolled) loops calmly —
49
+ * surfaced loudly by the dashboard (OFFLINE) and `buildhive-agent doctor`.
50
+ */
51
+ const SUPERVISOR_BACKOFF_BASE_MS = 2_000;
52
+ const SUPERVISOR_BACKOFF_MAX_MS = 30_000;
53
/**
 * Cancellable delay used by the supervisor's backoff.
 *
 * Resolves once `ms` milliseconds have elapsed, or earlier if `isCancelled()`
 * is observed returning true. The flag is polled between timer slices of at
 * most 250ms, so a shutdown request that arrives mid-backoff is noticed within
 * roughly a quarter of a second rather than after the whole delay.
 *
 * @param ms          Maximum time to wait, in milliseconds. A non-positive or
 *                    NaN value resolves without waiting and without polling.
 * @param isCancelled Zero-argument predicate; a truthy result ends the wait.
 * @returns A promise that resolves (never rejects on its own) with no value.
 */
async function interruptibleSleep(ms, isCancelled) {
    const wakeAt = Date.now() + ms;
    for (;;) {
        const remaining = wakeAt - Date.now();
        // `!(remaining > 0)` rather than `remaining <= 0` so that a NaN delay
        // ends the wait immediately instead of looping forever.
        if (!(remaining > 0) || isCancelled()) {
            return;
        }
        // Never sleep longer than one 250ms slice before re-checking the flag.
        const slice = Math.min(250, remaining);
        await new Promise((resolve) => setTimeout(resolve, slice));
    }
}
35
65
  /**
36
66
  * Attempt to read a JWT from the keyring.
37
67
  * Prefers agent_enrollment_jwt (row 17c path); falls back to device_flow_jwt
@@ -221,11 +251,30 @@ export async function runStart(opts = {}) {
221
251
  // Device specs (cpuCores, memoryGB) are collected once here and ride every
222
252
  // heartbeat so the dashboard shows real hardware (Task #28).
223
253
  const deviceInfo = collectDeviceInfo();
224
- const heartbeat = startHeartbeat({ platformUrl, jwt, deviceInfo });
254
+ const startHeartbeatFn = opts.startHeartbeatFn ?? startHeartbeat;
255
+ const heartbeat = startHeartbeatFn({ platformUrl, jwt, deviceInfo });
225
256
  // ── Step 7: Branch on concurrency ─────────────────────────────────────────────
226
257
  if (concurrency === 1) {
227
- // ── N==1: original single-shot path (byte-for-byte preserved) ──────────────
228
- // Steps below mirror the original startCommand.ts flow exactly.
258
+ // ── N==1: replenishing single-slot supervisor (Row 21b / Fix A) ────────────
259
+ //
260
+ // Each ephemeral `actions/runner` still handles exactly one job then exits
261
+ // (binding constraint #5 preserved). What changed in row 21b: the OUTER
262
+ // supervisor no longer exits with it — it re-registers a fresh ephemeral
263
+ // runner and keeps listening, so `buildhive-agent start` runs forever
264
+ // ("set and forget") instead of stopping after the first job (walk finding
265
+ // F3). The loop exits only on SIGTERM/SIGINT (user logout / `launchctl
266
+ // bootout` / Ctrl-C).
267
+ let shuttingDown = false;
268
+ const onShutdown = () => {
269
+ if (!shuttingDown) {
270
+ shuttingDown = true;
271
+ logger.info('Supervisor received shutdown signal — finishing current cycle then exiting');
272
+ }
273
+ };
274
+ // Multi-label per 2026-05-17 operator correction:
275
+ // runs-on: [self-hosted, buildhive, ubuntu-latest] requires all three labels.
276
+ const labels = 'self-hosted,buildhive,ubuntu-latest';
277
+ const runnerRegistrationUrl = `https://github.com/${owner}/${repo}`;
229
278
  const tokenFetchOpts = {
230
279
  platformUrl,
231
280
  jwt,
@@ -233,56 +282,11 @@ export async function runStart(opts = {}) {
233
282
  repo,
234
283
  fetchFn: opts.fetchFn,
235
284
  };
236
- let runnerToken;
237
- try {
238
- runnerToken = await fetchRunnerToken(tokenFetchOpts);
239
- }
240
- catch (err) {
241
- const msg = `Failed to fetch runner token: ${err instanceof Error ? err.message : String(err)}`;
242
- logger.error(msg);
243
- await heartbeat.stop('OFFLINE').catch(() => undefined);
244
- return { exitCode: 1, message: msg };
245
- }
246
- const runnerName = generateRunnerName();
247
- // Multi-label per 2026-05-17 operator correction:
248
- // runs-on: [self-hosted, buildhive, ubuntu-latest] requires all three labels.
249
- const labels = 'self-hosted,buildhive,ubuntu-latest';
250
- const runnerRegistrationUrl = `https://github.com/${owner}/${repo}`;
251
- const configOpts = {
252
- configSh: runnerPaths.configSh,
253
- runnerRegistrationUrl,
254
- registrationToken: runnerToken.token,
255
- runnerName,
256
- labels,
257
- };
258
- logger.info('Registering runner with GitHub', { runnerName, runnerRegistrationUrl, labels });
259
- let configExitCode = await configureRunner(configOpts);
260
- if (configExitCode !== 0) {
261
- logger.warn('config.sh failed — retrying with a fresh token once', { configExitCode });
262
- let retryToken;
263
- try {
264
- retryToken = await fetchRunnerToken(tokenFetchOpts);
265
- }
266
- catch (err) {
267
- const msg = `Retry token fetch failed: ${err instanceof Error ? err.message : String(err)}`;
268
- logger.error(msg);
269
- await heartbeat.stop('OFFLINE').catch(() => undefined);
270
- return { exitCode: 1, message: msg };
271
- }
272
- configExitCode = await configureRunner({
273
- ...configOpts,
274
- registrationToken: retryToken.token,
275
- });
276
- if (configExitCode !== 0) {
277
- const msg = `Runner configuration failed after retry (config.sh exited ${configExitCode}). ` +
278
- `Check your enrollment credentials and ensure the repo is enrolled in BuildHive.`;
279
- logger.error(msg);
280
- await heartbeat.stop('OFFLINE').catch(() => undefined);
281
- return { exitCode: 1, message: msg };
282
- }
283
- }
284
- logger.info('Runner configured successfully — starting run.sh', { runnerName });
285
- // ── Cache server (Row 18b S7, Wave 2) — N==1 only ──────────────────────────
285
+ // DI seam (tests) — production uses the real supervisor primitives.
286
+ const fetchTokenFn = opts.supervisorDeps?.fetchRunnerToken ?? fetchRunnerToken;
287
+ const configureFn = opts.supervisorDeps?.configureRunner ?? configureRunner;
288
+ const runFn = opts.supervisorDeps?.runRunner ?? runRunner;
289
+ // ── Cache server (Row 18b S7, Wave 2) — started ONCE, lives across cycles ──
286
290
  const defaultCacheConfig = {
287
291
  enabled: false,
288
292
  directory: '/tmp/buildhive/cache',
@@ -299,17 +303,124 @@ export async function runStart(opts = {}) {
299
303
  catch {
300
304
  logger.warn('Could not load agent config for cache check — proceeding without cache');
301
305
  }
302
- const effectiveCacheDeps = opts.cacheEnvDeps ?? await buildProductionCacheEnvDeps();
303
- const cacheResult = await prepareCache(cacheConfig, effectiveCacheDeps);
304
- const runOpts = {
305
- runSh: runnerPaths.runSh,
306
- extraEnv: cacheResult.enabled ? cacheResult.patch : undefined,
306
+ // Cache prep + signal handlers are set up INSIDE the try below so a throw
307
+ // (e.g. a dynamic import failing in buildProductionCacheEnvDeps) can never
308
+ // leak the SIGTERM/SIGINT listeners or skip heartbeat shutdown.
309
+ let cacheResult;
310
+ let runExtraEnv;
311
+ /**
312
+ * Register a fresh ephemeral runner (one fresh token, one-time retry on
313
+ * non-zero config.sh exit). Returns true once registered, false on failure
314
+ * (caller backs off). Honours `shuttingDown` between awaits.
315
+ */
316
+ const registerOnce = async () => {
317
+ let token;
318
+ try {
319
+ token = await fetchTokenFn(tokenFetchOpts);
320
+ }
321
+ catch (err) {
322
+ logger.error('Failed to fetch runner token', {
323
+ err: err instanceof Error ? err.message : String(err),
324
+ });
325
+ return false;
326
+ }
327
+ if (shuttingDown)
328
+ return false;
329
+ const configOpts = {
330
+ configSh: runnerPaths.configSh,
331
+ runnerRegistrationUrl,
332
+ registrationToken: token.token,
333
+ runnerName: generateRunnerName(),
334
+ labels,
335
+ };
336
+ logger.info('Registering runner with GitHub', {
337
+ runnerName: configOpts.runnerName,
338
+ runnerRegistrationUrl,
339
+ labels,
340
+ });
341
+ let configExitCode = await configureFn(configOpts);
342
+ if (configExitCode !== 0) {
343
+ logger.warn('config.sh failed — retrying with a fresh token once', { configExitCode });
344
+ let retryToken;
345
+ try {
346
+ retryToken = await fetchTokenFn(tokenFetchOpts);
347
+ }
348
+ catch (err) {
349
+ logger.error('Retry token fetch failed', {
350
+ err: err instanceof Error ? err.message : String(err),
351
+ });
352
+ return false;
353
+ }
354
+ if (shuttingDown)
355
+ return false;
356
+ configExitCode = await configureFn({
357
+ ...configOpts,
358
+ registrationToken: retryToken.token,
359
+ runnerName: generateRunnerName(),
360
+ });
361
+ if (configExitCode !== 0) {
362
+ logger.error('Runner configuration failed after retry — check enrollment credentials and ' +
363
+ 'that the repo is enrolled in BuildHive.', { configExitCode });
364
+ return false;
365
+ }
366
+ }
367
+ return true;
307
368
  };
308
- let result;
369
+ let consecutiveFailures = 0;
370
+ process.on('SIGTERM', onShutdown);
371
+ process.on('SIGINT', onShutdown);
309
372
  try {
310
- result = await runRunner(runOpts);
373
+ // Cache server (Row 18b S7) — started once, lives across cycles. Inside
374
+ // the try so a failure still hits the finally (listener + heartbeat cleanup).
375
+ const effectiveCacheDeps = opts.cacheEnvDeps ?? (await buildProductionCacheEnvDeps());
376
+ cacheResult = await prepareCache(cacheConfig, effectiveCacheDeps);
377
+ runExtraEnv = cacheResult.enabled ? cacheResult.patch : undefined;
378
+ while (!shuttingDown) {
379
+ // Yield to the macrotask queue each cycle so SIGTERM/timers always get
380
+ // a turn — a fast path (e.g. config.sh exiting instantly) must never
381
+ // starve the event loop on the microtask queue.
382
+ await new Promise((r) => setTimeout(r, 0));
383
+ if (shuttingDown)
384
+ break;
385
+ const registered = await registerOnce();
386
+ if (shuttingDown)
387
+ break;
388
+ if (!registered) {
389
+ // Bounded exponential backoff so a transient outage self-heals and a
390
+ // persistent fault loops calmly (no respawn storm). Surfaced as
391
+ // OFFLINE on the dashboard + a doctor failure.
392
+ consecutiveFailures += 1;
393
+ const backoff = Math.min(SUPERVISOR_BACKOFF_MAX_MS, SUPERVISOR_BACKOFF_BASE_MS * 2 ** (consecutiveFailures - 1));
394
+ logger.warn('Registration failed — backing off before retry', {
395
+ consecutiveFailures,
396
+ backoffMs: backoff,
397
+ });
398
+ await interruptibleSleep(backoff, () => shuttingDown);
399
+ continue;
400
+ }
401
+ consecutiveFailures = 0;
402
+ logger.info('Runner configured successfully — starting run.sh');
403
+ const result = await runFn({ runSh: runnerPaths.runSh, extraEnv: runExtraEnv });
404
+ if (shuttingDown)
405
+ break;
406
+ if (result.ok) {
407
+ logger.info('Runner completed a job and exited cleanly — replenishing');
408
+ }
409
+ else {
410
+ // A non-zero run.sh exit reflects the runner's own health, not the
411
+ // build result. Replenish (consistent with the pool path) but apply
412
+ // a small backoff so a crash-looping runner doesn't spin hot.
413
+ logger.warn('Runner exited non-zero — replenishing after short backoff', {
414
+ exitCode: result.exitCode,
415
+ signal: result.signal,
416
+ });
417
+ await interruptibleSleep(SUPERVISOR_BACKOFF_BASE_MS, () => shuttingDown);
418
+ }
419
+ }
311
420
  }
312
421
  finally {
422
+ process.removeListener('SIGTERM', onShutdown);
423
+ process.removeListener('SIGINT', onShutdown);
313
424
  try {
314
425
  await heartbeat.stop('OFFLINE');
315
426
  }
@@ -318,7 +429,7 @@ export async function runStart(opts = {}) {
318
429
  err: err instanceof Error ? err.message : String(err),
319
430
  });
320
431
  }
321
- if (cacheResult.enabled) {
432
+ if (cacheResult?.enabled) {
322
433
  try {
323
434
  await stopCache(cacheResult.handle);
324
435
  }
@@ -329,13 +440,8 @@ export async function runStart(opts = {}) {
329
440
  }
330
441
  }
331
442
  }
332
- if (result.ok) {
333
- logger.info('Runner completed a job and exited cleanly');
334
- return { exitCode: 0 };
335
- }
336
- const msg = `Runner exited with code ${result.exitCode ?? 'null'} signal ${result.signal ?? 'none'}`;
337
- logger.warn(msg);
338
- return { exitCode: result.exitCode === 0 ? 0 : 1, message: msg };
443
+ logger.info('Supervisor exited cleanly after shutdown signal');
444
+ return { exitCode: 0 };
339
445
  }
340
446
  // ── N>1: pool path ──────────────────────────────────────────────────────────
341
447
  // Cache is disabled for N>1 — shared cache server is a follow-up item.
@@ -96,6 +96,15 @@ export declare function configureRunner(opts: ConfigureRunnerOptions): Promise<n
96
96
  * completes. The resolved RunnerExitResult reflects that final exit.
97
97
  */
98
98
  export declare function runRunner(opts: RunRunnerOptions): Promise<RunnerExitResult>;
99
+ /**
100
+ * Remove a stale `actions/runner` local registration so a fresh `config.sh`
101
+ * can register cleanly. Best-effort (ENOENT is fine). Exported for tests.
102
+ *
103
+ * These are the files config.sh writes on registration; a leftover set (from an
104
+ * unclean shutdown of an idle-Listening ephemeral runner) makes a subsequent
105
+ * config.sh fail with "already configured".
106
+ */
107
+ export declare function removeStaleRunnerConfig(runnerDir: string): Promise<void>;
99
108
  /**
100
109
  * Generate a runner name: "buildhive-<8-char random hex>".
101
110
  * Short enough to be readable in GitHub's runner list.
@@ -41,8 +41,9 @@
41
41
  */
42
42
  import { spawn } from 'child_process';
43
43
  import { createInterface } from 'readline';
44
- import { dirname } from 'path';
44
+ import { dirname, join } from 'path';
45
45
  import { randomBytes } from 'crypto';
46
+ import { promises as fsPromises } from 'fs';
46
47
  import { createLogger } from '../utils/logger.js';
47
48
  const logger = createLogger('runner.supervisor');
48
49
  /** Timeout in ms to wait for the runner to exit after SIGTERM before SIGKILL */
@@ -64,6 +65,7 @@ export function sleep(ms) {
64
65
  export async function configureRunner(opts) {
65
66
  const { configSh, runnerRegistrationUrl, registrationToken, runnerName, labels, } = opts;
66
67
  const spawnFn = opts.spawnFn ?? spawn;
68
+ const runnerDir = dirname(configSh);
67
69
  const args = [
68
70
  '--unattended',
69
71
  '--ephemeral',
@@ -95,36 +97,61 @@ export async function configureRunner(opts) {
95
97
  '--labels', labels,
96
98
  '--replace',
97
99
  ];
98
- logger.info('Configuring runner', { configSh, runnerName, runnerRegistrationUrl, labels });
99
- // SECURITY M-1: The GitHub registration token is passed as a process argument
100
- // (--token <value>) and is briefly visible to local users via 'ps aux' for the
101
- // lifetime of config.sh (~few seconds). This is acceptable per Apple/GitHub
102
- // guidance: the token is single-use and expires in ~1 hour. See P6 security
103
- // review M-1.
104
- const child = spawnFn(configSh, args, {
105
- cwd: dirname(configSh),
106
- env: { ...process.env },
107
- stdio: ['ignore', 'pipe', 'pipe'],
108
- });
109
- const stdoutLog = createLogger('runner.config.stdout');
110
- const stderrLog = createLogger('runner.config.stderr');
111
- if (child.stdout) {
112
- const rl = createInterface({ input: child.stdout, crlfDelay: Infinity });
113
- rl.on('line', (line) => stdoutLog.info(line));
114
- }
115
- if (child.stderr) {
116
- const rl = createInterface({ input: child.stderr, crlfDelay: Infinity });
117
- rl.on('line', (line) => stderrLog.warn(line));
118
- }
119
- return new Promise((resolve) => {
120
- child.on('close', (code) => {
121
- resolve(code ?? 1);
100
+ // Spawn config.sh once; stream its output to the log AND watch for the
101
+ // "already configured" marker so the caller can self-heal a stale/raced
102
+ // local registration. SECURITY M-1: the GH registration token is passed as a
103
+ // process arg (--token), briefly visible via `ps aux`; acceptable per
104
+ // Apple/GitHub guidance (single-use, ~1h TTL). See P6 review M-1.
105
+ const runConfigSh = () => {
106
+ logger.info('Configuring runner', { configSh, runnerName, runnerRegistrationUrl, labels });
107
+ const child = spawnFn(configSh, args, {
108
+ cwd: runnerDir,
109
+ env: { ...process.env },
110
+ stdio: ['ignore', 'pipe', 'pipe'],
122
111
  });
123
- child.on('error', (err) => {
124
- logger.error('config.sh spawn error', err);
125
- resolve(1);
112
+ let alreadyConfigured = false;
113
+ const markIfAlreadyConfigured = (line) => {
114
+ if (/already configured/i.test(line))
115
+ alreadyConfigured = true;
116
+ };
117
+ const stdoutLog = createLogger('runner.config.stdout');
118
+ const stderrLog = createLogger('runner.config.stderr');
119
+ if (child.stdout) {
120
+ const rl = createInterface({ input: child.stdout, crlfDelay: Infinity });
121
+ rl.on('line', (line) => { markIfAlreadyConfigured(line); stdoutLog.info(line); });
122
+ }
123
+ if (child.stderr) {
124
+ const rl = createInterface({ input: child.stderr, crlfDelay: Infinity });
125
+ rl.on('line', (line) => { markIfAlreadyConfigured(line); stderrLog.warn(line); });
126
+ }
127
+ return new Promise((resolve) => {
128
+ child.on('close', (code) => resolve({ code: code ?? 1, alreadyConfigured }));
129
+ child.on('error', (err) => {
130
+ logger.error('config.sh spawn error', err);
131
+ resolve({ code: 1, alreadyConfigured });
132
+ });
126
133
  });
127
- });
134
+ };
135
+ // Row 21b — reboot robustness. We ALWAYS register a fresh ephemeral runner
136
+ // each cycle, so any local registration is stale. An ephemeral runner killed
137
+ // while idle-Listening (a reboot, or the SIGTERM grace window on `launchctl
138
+ // bootout`) leaves `.runner`/`.credentials*` behind; config.sh then refuses
139
+ // with "already configured".
140
+ //
141
+ // 1. Pre-clean the common leftover case.
142
+ await removeStaleRunnerConfig(runnerDir);
143
+ let result = await runConfigSh();
144
+ // 2. Fast self-heal: if config.sh STILL reports "already configured" (a
145
+ // leftover the pre-clean missed, or a brief post-reboot race), clean and
146
+ // retry ONCE immediately — instead of returning failure and waiting for
147
+ // the supervisor's backoff + a launchd respawn (the 2026-06-02 reboot
148
+ // walk recovered only after ~6 min via a respawn; this makes it seconds).
149
+ if (result.code !== 0 && result.alreadyConfigured) {
150
+ logger.warn('config.sh reported "already configured" — cleaning stale registration and retrying immediately');
151
+ await removeStaleRunnerConfig(runnerDir);
152
+ result = await runConfigSh();
153
+ }
154
+ return result.code;
128
155
  }
129
156
  /**
130
157
  * Spawn run.sh (the GitHub runner listen loop) and manage its lifecycle.
@@ -256,6 +283,32 @@ export async function runRunner(opts) {
256
283
  });
257
284
  });
258
285
  }
286
/**
 * Remove a stale `actions/runner` local registration so a fresh `config.sh`
 * can register cleanly. Exported for tests.
 *
 * The files removed (`.runner`, `.credentials`, `.credentials_rsaparams`) are
 * the ones config.sh writes on registration; a leftover set (from an unclean
 * shutdown of an idle-Listening ephemeral runner) makes a subsequent config.sh
 * fail with "already configured".
 *
 * Best-effort: this function never throws. A missing file (ENOENT) is the
 * normal clean-slate case and is silent. Any other unlink failure (for example
 * a permissions error) is logged as a warning, because the stale file is then
 * still on disk and config.sh is likely to keep refusing to register.
 *
 * @param runnerDir Directory that contains config.sh and the registration files.
 * @returns A promise that resolves once every candidate file has been tried.
 */
export async function removeStaleRunnerConfig(runnerDir) {
    const stale = ['.runner', '.credentials', '.credentials_rsaparams'];
    let removedAny = false;
    for (const name of stale) {
        try {
            await fsPromises.unlink(join(runnerDir, name));
            removedAny = true;
        }
        catch (err) {
            // Node attaches the errno name (e.g. 'ENOENT', 'EACCES') as `code`.
            const code = err instanceof Error && 'code' in err ? err.code : undefined;
            if (code !== 'ENOENT') {
                // Still best-effort, but make the failure visible: the file is
                // probably still there and will trip "already configured".
                logger.warn('Could not remove stale runner registration file', {
                    runnerDir,
                    file: name,
                    code,
                    err: err instanceof Error ? err.message : String(err),
                });
            }
        }
    }
    if (removedAny) {
        logger.info('Removed stale runner registration before re-config (reboot/unclean-exit recovery)', {
            runnerDir,
        });
    }
}
259
312
  /**
260
313
  * Generate a runner name: "buildhive-<8-char random hex>".
261
314
  * Short enough to be readable in GitHub's runner list.
@@ -168,9 +168,14 @@ export function generatePlist(mode, config) {
168
168
  // and the CLI looking at different ~/.buildhive/ paths).
169
169
  // System mode: HOME=/var/_buildhive so the dropped-priv user can read
170
170
  // its own state.
171
+ // Row 21b / Fix B: prepend `/opt/homebrew/bin` (Apple-Silicon Homebrew) and
172
+ // keep `/usr/local/bin` (Intel Homebrew). launchd does NOT inherit the
173
+ // interactive shell PATH, so without this the daemon-spawned `actions/runner`
174
+ // can't find `git`, `node`, etc. installed via Homebrew on Apple Silicon —
175
+ // the runner's config.sh/run.sh fail and the job never runs.
171
176
  const envDict = [
172
177
  ['NODE_ENV', 'production'],
173
- ['PATH', '/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin'],
178
+ ['PATH', '/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin'],
174
179
  ];
175
180
  if (mode === 'system') {
176
181
  envDict.push(['HOME', `/var/${config.systemUserName ?? DEFAULT_SYSTEM_USER}`]);
@@ -37,6 +37,11 @@ export interface ServiceStatus {
37
37
  readonly state: 'running' | 'waiting' | 'unknown' | null;
38
38
  /** Most recent exit reason if available. */
39
39
  readonly lastExitReason: string | null;
40
+ /**
41
+ * Most recent `last exit code` per `launchctl print` (or null if absent).
42
+ * Row 21b / Fix C: a non-zero value means the daemon is crashing on start.
43
+ */
44
+ readonly lastExitCode: number | null;
40
45
  /** Computed install paths for this mode (single source of truth). */
41
46
  readonly paths: ServicePaths;
42
47
  }
@@ -158,12 +158,16 @@ export async function getServiceStatus(opts, deps = {}) {
158
158
  loaded: false,
159
159
  state: null,
160
160
  lastExitReason: null,
161
+ lastExitCode: null,
161
162
  paths,
162
163
  };
163
164
  }
164
165
  const out = printResult.stdout;
165
- const stateMatch = out.match(/state\s*=\s*(\w+)/);
166
+ // Anchored + `[^\n]+` so multi-word states like `spawn scheduled` aren't
167
+ // truncated to `spawn` (the exact truncation Fix C corrected in the doctor).
168
+ const stateMatch = out.match(/(?:^|\n)\s*state\s*=\s*([^\n]+)/);
166
169
  const exitReasonMatch = out.match(/last exit reason\s*=\s*([^\n]+)/);
170
+ const exitCodeMatch = out.match(/last exit code\s*=\s*(-?\d+)/i);
167
171
  let state = 'unknown';
168
172
  if (stateMatch) {
169
173
  const raw = stateMatch[1].toLowerCase();
@@ -179,6 +183,7 @@ export async function getServiceStatus(opts, deps = {}) {
179
183
  loaded: true,
180
184
  state,
181
185
  lastExitReason: exitReasonMatch ? exitReasonMatch[1].trim() : null,
186
+ lastExitCode: exitCodeMatch ? parseInt(exitCodeMatch[1], 10) : null,
182
187
  paths,
183
188
  };
184
189
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "buildhive-agent",
3
- "version": "1.0.0-beta.11",
3
+ "version": "1.0.0-beta.12",
4
4
  "description": "BuildHive CI Agent - Distributed build execution agent",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -74,4 +74,4 @@
74
74
  "overrides": {
75
75
  "minimatch": "^10.0.1"
76
76
  }
77
- }
77
+ }