npm - buildhive-agent - Versions diffs - 1.0.0-beta.11 → 1.0.0-beta.12 - Mend

buildhive-agent 1.0.0-beta.11 → 1.0.0-beta.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15)

package/dist/auth/agentEnrollmentKeyringStore.d.ts +17 -3
package/dist/auth/agentEnrollmentKeyringStore.js +104 -25
package/dist/auth/credentialFileStore.d.ts +79 -0
package/dist/auth/credentialFileStore.js +140 -0
package/dist/auth/joinCommand.d.ts +23 -0
package/dist/auth/joinCommand.js +59 -5
package/dist/doctor/runChecks.js +45 -9
package/dist/runner/startCommand.d.ts +29 -2
package/dist/runner/startCommand.js +176 -70
package/dist/runner/supervisor.d.ts +9 -0
package/dist/runner/supervisor.js +82 -29
package/dist/service/plistGenerator.js +6 -1
package/dist/service/serviceInstaller.d.ts +5 -0
package/dist/service/serviceInstaller.js +6 -1
package/package.json +2 -2

package/dist/auth/agentEnrollmentKeyringStore.d.ts CHANGED Viewed

@@ -14,6 +14,7 @@
  *   agent-enrollment.platform_url     — bound at enroll; refuses cross-instance reuse
  */
 import type { SecretStore } from '../security/secretStore.js';
+import { CredentialFileStore } from './credentialFileStore.js';
 import { PlatformUrlMismatchError } from './types.js';
 export declare const AGENT_ENROLLMENT_KEY_PREFIX = "agent-enrollment.";
 export declare const KEY_JWT = "agent-enrollment.jwt";
@@ -31,6 +32,12 @@ export interface AgentEnrollmentCredentials {
 export interface AgentEnrollmentKeyringStoreOptions {
     /** Override the SecretStore (tests + environments without OS keyring). */
     readonly store?: SecretStore;
+    /**
+     * Override the 0600 credential-file mirror (row 21b / F2 daemon fallback).
+     * Tests inject a fake; production uses the real `~/.buildhive/` file.
+     * Pass `null` to disable the file mirror entirely (keyring-only).
+     */
+    readonly fileStore?: CredentialFileStore | null;
 }
 /**
  * Custom error thrown when the stored platform_url doesn't match the current
@@ -42,6 +49,8 @@ export declare class AgentPlatformUrlMismatchError extends PlatformUrlMismatchEr
 }
 export declare class AgentEnrollmentKeyringStore {
     private readonly store;
+    /** 0600 file mirror for the launchd daemon; null when explicitly disabled. */
+    private readonly fileStore;
     constructor(opts?: AgentEnrollmentKeyringStoreOptions);
     /** Throws KeyringUnavailableError if the OS keyring is unreachable. */
     static assertAvailable(): Promise<void>;
@@ -63,16 +72,21 @@ export declare class AgentEnrollmentKeyringStore {
      * Throws NotLoggedInError if any required key is missing.
      */
     readAll(): Promise<AgentEnrollmentCredentials>;
+    /**
+     * Read the 0600 file mirror, swallowing any error to null. Centralised so
+     * every read path (readJwt / readAll / hasEnrollment) shares one fallback.
+     */
+    private readFromFileMirror;
     /**
      * Cross-platform-url check. Refuses to use credentials minted against a
      * different BuildHive instance.
      */
     assertPlatformUrlMatches(currentPlatformUrl: string): Promise<void>;
-    /** Returns true if any enrollment key exists in the keyring. */
+    /** Returns true if an enrollment JWT exists in the keyring OR the file mirror. */
     hasEnrollment(): Promise<boolean>;
     /**
-     * Delete all 5 `agent-enrollment.*` keys. Idempotent.
-     * Called by a hypothetical `buildhive-agent leave` command (out of scope for v1).
+     * Delete all 5 `agent-enrollment.*` keys AND the 0600 file mirror. Idempotent.
+     * Called by `buildhive-agent logout` (row 21b / F4) to fully de-enroll.
      */
     clear(): Promise<void>;
 }

package/dist/auth/agentEnrollmentKeyringStore.js CHANGED Viewed

@@ -14,7 +14,10 @@
  *   agent-enrollment.platform_url     — bound at enroll; refuses cross-instance reuse
  */
 import { KeyringSecretStore } from '../security/keyringSecretStore.js';
+import { CredentialFileStore } from './credentialFileStore.js';
+import { createLogger } from '../utils/logger.js';
 import { KeyringUnavailableError, NotLoggedInError, PlatformUrlMismatchError, } from './types.js';
+const logger = createLogger('auth.agentEnrollmentKeyringStore');
 export const AGENT_ENROLLMENT_KEY_PREFIX = 'agent-enrollment.';
 export const KEY_JWT = `${AGENT_ENROLLMENT_KEY_PREFIX}jwt`;
 export const KEY_JWT_EXP = `${AGENT_ENROLLMENT_KEY_PREFIX}jwt_exp`;
@@ -44,8 +47,29 @@ export class AgentPlatformUrlMismatchError extends PlatformUrlMismatchError {
 }
 export class AgentEnrollmentKeyringStore {
     store;
+    /** 0600 file mirror for the launchd daemon; null when explicitly disabled. */
+    fileStore;
     constructor(opts = {}) {
         this.store = opts.store ?? new KeyringSecretStore();
+        // File-mirror resolution (row 21b / F2):
+        //   - explicit `null`        → disabled (keyring-only)
+        //   - explicit CredentialFileStore → use it
+        //   - omitted + real keyring → real `~/.buildhive/` mirror (production)
+        //   - omitted + INJECTED keyring (tests) → disabled, so unit tests never
+        //     read/write the real user home. A test exercising the mirror must
+        //     inject its own `fileStore` explicitly.
+        if (opts.fileStore === null) {
+            this.fileStore = null;
+        }
+        else if (opts.fileStore) {
+            this.fileStore = opts.fileStore;
+        }
+        else if (opts.store) {
+            this.fileStore = null;
+        }
+        else {
+            this.fileStore = new CredentialFileStore();
+        }
     }
     /** Throws KeyringUnavailableError if the OS keyring is unreachable. */
     static async assertAvailable() {
@@ -75,6 +99,21 @@ export class AgentEnrollmentKeyringStore {
             await this.clear().catch(() => undefined);
             throw err;
         }
+        // Row 21b / F2: mirror to the 0600 file so the launchd daemon (which
+        // cannot read the interactive login keychain) can resolve the JWT.
+        // The keychain write above is the source of truth; a file-mirror failure
+        // is non-fatal here (the agent is enrolled for foreground use) but it
+        // leaves the daemon unable to start — `join`'s post-bootstrap self-check
+        // (Fix D) and the doctor will surface that loudly rather than silently.
+        if (this.fileStore) {
+            try {
+                await this.fileStore.write(creds);
+            }
+            catch (err) {
+                logger.warn('Could not write the 0600 credential mirror — the background daemon may ' +
+                    'not be able to read the JWT. Foreground `buildhive-agent start` is unaffected.', { err: err instanceof Error ? err.message : String(err) });
+            }
+        }
     }
     /**
      * Read the stored JWT + its cached exp. Throws NotLoggedInError if absent
@@ -85,13 +124,17 @@ export class AgentEnrollmentKeyringStore {
             this.store.getSecret(KEY_JWT),
             this.store.getSecret(KEY_JWT_EXP),
         ]);
-        if (!token || !expStr)
-            throw new NotLoggedInError();
-        const exp = Number(expStr);
-        return {
-            jwt: token,
-            expiresAtUnix: Number.isFinite(exp) ? exp : 0,
-        };
+        if (token && expStr) {
+            const exp = Number(expStr);
+            return { jwt: token, expiresAtUnix: Number.isFinite(exp) ? exp : 0 };
+        }
+        // Row 21b / F2: keychain returned nothing (the daemon case) — fall back
+        // to the 0600 file mirror before declaring "not enrolled".
+        const fromFile = await this.readFromFileMirror();
+        if (fromFile) {
+            return { jwt: fromFile.jwt, expiresAtUnix: fromFile.jwtExpiresAtUnix };
+        }
+        throw new NotLoggedInError();
     }
     /**
      * Read the full stored credentials (for display or row-17b supervisor).
@@ -105,43 +148,79 @@ export class AgentEnrollmentKeyringStore {
             this.store.getSecret(KEY_TENANT_ID),
             this.store.getSecret(KEY_PLATFORM_URL),
         ]);
-        if (!jwt || !expStr || !agentId || !tenantId || !platformUrl) {
-            throw new NotLoggedInError();
-        }
-        const exp = Number(expStr);
-        return {
-            jwt,
-            jwtExpiresAtUnix: Number.isFinite(exp) ? exp : 0,
-            agentId,
-            tenantId,
-            platformUrl,
-        };
+        if (jwt && expStr && agentId && tenantId && platformUrl) {
+            const exp = Number(expStr);
+            return {
+                jwt,
+                jwtExpiresAtUnix: Number.isFinite(exp) ? exp : 0,
+                agentId,
+                tenantId,
+                platformUrl,
+            };
+        }
+        // Row 21b / F2: the launchd daemon's keychain read comes back empty.
+        // Fall back to the 0600 file mirror written by `join` before throwing.
+        const fromFile = await this.readFromFileMirror();
+        if (fromFile)
+            return fromFile;
+        throw new NotLoggedInError();
+    }
+    /**
+     * Read the 0600 file mirror, swallowing any error to null. Centralised so
+     * every read path (readJwt / readAll / hasEnrollment) shares one fallback.
+     */
+    async readFromFileMirror() {
+        if (!this.fileStore)
+            return null;
+        try {
+            const creds = await this.fileStore.read();
+            if (creds) {
+                logger.info('Resolved agent credentials from the 0600 file mirror (keychain unavailable in this session)');
+            }
+            return creds;
+        }
+        catch {
+            return null;
+        }
     }
     /**
      * Cross-platform-url check. Refuses to use credentials minted against a
      * different BuildHive instance.
      */
     async assertPlatformUrlMatches(currentPlatformUrl) {
-        const stored = await this.store.getSecret(KEY_PLATFORM_URL);
-        if (!stored)
-            throw new NotLoggedInError();
+        let stored = await this.store.getSecret(KEY_PLATFORM_URL);
+        if (!stored) {
+            // Row 21b / F2: keep the cross-instance guard working in the daemon
+            // session, where the keychain read comes back empty — fall back to the
+            // 0600 file mirror before declaring "not enrolled".
+            const fromFile = await this.readFromFileMirror();
+            if (!fromFile)
+                throw new NotLoggedInError();
+            stored = fromFile.platformUrl;
+        }
         if (normalizeUrl(stored) !== normalizeUrl(currentPlatformUrl)) {
             throw new AgentPlatformUrlMismatchError(stored, currentPlatformUrl);
         }
     }
-    /** Returns true if any enrollment key exists in the keyring. */
+    /** Returns true if an enrollment JWT exists in the keyring OR the file mirror. */
     async hasEnrollment() {
         const jwt = await this.store.getSecret(KEY_JWT);
-        return jwt !== null && jwt.length > 0;
+        if (jwt !== null && jwt.length > 0)
+            return true;
+        const fromFile = await this.readFromFileMirror();
+        return fromFile !== null;
     }
     /**
-     * Delete all 5 `agent-enrollment.*` keys. Idempotent.
-     * Called by a hypothetical `buildhive-agent leave` command (out of scope for v1).
+     * Delete all 5 `agent-enrollment.*` keys AND the 0600 file mirror. Idempotent.
+     * Called by `buildhive-agent logout` (row 21b / F4) to fully de-enroll.
      */
     async clear() {
         for (const k of ALL_KEYS) {
             await this.store.deleteSecret(k).catch(() => false);
         }
+        if (this.fileStore) {
+            await this.fileStore.delete().catch(() => undefined);
+        }
     }
 }
 function normalizeUrl(u) {

package/dist/auth/credentialFileStore.d.ts ADDED Viewed

@@ -0,0 +1,79 @@
+/**
+ * credentialFileStore — 0600 on-disk fallback for the agent-enrollment
+ * credentials, so the **launchd-managed daemon** can read its JWT.
+ *
+ * ── Why this exists (row 21b / F2, 2026-06-02) ───────────────────────────────
+ * The 2026-06-01 verification walk found the LaunchAgent daemon crashlooping
+ * with `last exit code = 1` and empty logs. Root cause (proven in
+ * docs/ops/diagnosis-f3-agent-daemon-2026-06-02.html): the macOS Keychain item
+ * holding the JWT is created by the *foreground* `join` process (Terminal
+ * security session). The launchd-spawned daemon runs in a different security
+ * session (`SessionCreate=true`), so `@napi-rs/keyring`'s `getPassword()`
+ * returns `null` for it (silent access-deny). The daemon then reads "no JWT" →
+ * "Not enrolled" → exits 1 → launchd respawns → silent crashloop.
+ *
+ * GitHub's own `actions/runner` `svc.sh` stores its `.credentials` as files in
+ * the runner directory for exactly this reason — a launchd/systemd service
+ * cannot rely on the interactive login keychain.
+ *
+ * ── Design ───────────────────────────────────────────────────────────────────
+ * The OS keyring stays the PRIMARY store (used by foreground `start`, where it
+ * works and is the more-secure option). This file is a daemon-readable MIRROR:
+ * `join` writes both; the resolver falls back to the file only when the keyring
+ * returns nothing (the daemon case).
+ *
+ * Security:
+ *   - File mode 0600 (owner read/write only), parent dir 0700.
+ *   - Lives under the user's own `~/.buildhive/` — same trust boundary as the
+ *     actions/runner `.credentials` file and the workspaces dir.
+ *   - Never packaged (runtime state in $HOME, not in the npm tarball).
+ *
+ * This module is pure-where-possible: fs + homedir are injectable for tests.
+ */
+import type { AgentEnrollmentCredentials } from './agentEnrollmentKeyringStore.js';
+/** Basename of the credential mirror file under `~/.buildhive/`. */
+export declare const CREDENTIAL_FILE_NAME = "agent-enrollment.cred";
+/** Injectable fs surface (subset of node:fs/promises we use). */
+export interface CredentialFileFs {
+    readonly mkdir: (path: string, opts: {
+        recursive: boolean;
+        mode?: number;
+    }) => Promise<string | undefined>;
+    readonly writeFile: (path: string, data: string, opts: {
+        encoding: 'utf8';
+        mode?: number;
+    }) => Promise<void>;
+    readonly chmod: (path: string, mode: number) => Promise<void>;
+    readonly rename: (from: string, to: string) => Promise<void>;
+    readonly readFile: (path: string, encoding: 'utf8') => Promise<string>;
+    readonly unlink: (path: string) => Promise<void>;
+}
+export interface CredentialFileStoreOptions {
+    /** Override the home directory (tests + multi-user). */
+    readonly homeDir?: string;
+    /** Override the fs surface (tests). */
+    readonly fs?: CredentialFileFs;
+}
+/**
+ * Reads/writes the agent-enrollment credentials as a single 0600 JSON file.
+ */
+export declare class CredentialFileStore {
+    private readonly fs;
+    private readonly dir;
+    private readonly filePath;
+    constructor(opts?: CredentialFileStoreOptions);
+    /** Absolute path to the credential file (for diagnostics + doctor). */
+    get path(): string;
+    /**
+     * Atomically write the credential mirror with 0600 perms.
+     * tmp-file → chmod 0600 → rename avoids a torn read by the daemon.
+     */
+    write(creds: AgentEnrollmentCredentials): Promise<void>;
+    /**
+     * Read the credential mirror. Returns null when absent or malformed — never
+     * throws, so the resolver can cleanly fall through to "not enrolled".
+     */
+    read(): Promise<AgentEnrollmentCredentials | null>;
+    /** Delete the credential mirror. Idempotent (ENOENT is not an error). */
+    delete(): Promise<void>;
+}

package/dist/auth/credentialFileStore.js ADDED Viewed

@@ -0,0 +1,140 @@
+/**
+ * credentialFileStore — 0600 on-disk fallback for the agent-enrollment
+ * credentials, so the **launchd-managed daemon** can read its JWT.
+ *
+ * ── Why this exists (row 21b / F2, 2026-06-02) ───────────────────────────────
+ * The 2026-06-01 verification walk found the LaunchAgent daemon crashlooping
+ * with `last exit code = 1` and empty logs. Root cause (proven in
+ * docs/ops/diagnosis-f3-agent-daemon-2026-06-02.html): the macOS Keychain item
+ * holding the JWT is created by the *foreground* `join` process (Terminal
+ * security session). The launchd-spawned daemon runs in a different security
+ * session (`SessionCreate=true`), so `@napi-rs/keyring`'s `getPassword()`
+ * returns `null` for it (silent access-deny). The daemon then reads "no JWT" →
+ * "Not enrolled" → exits 1 → launchd respawns → silent crashloop.
+ *
+ * GitHub's own `actions/runner` `svc.sh` stores its `.credentials` as files in
+ * the runner directory for exactly this reason — a launchd/systemd service
+ * cannot rely on the interactive login keychain.
+ *
+ * ── Design ───────────────────────────────────────────────────────────────────
+ * The OS keyring stays the PRIMARY store (used by foreground `start`, where it
+ * works and is the more-secure option). This file is a daemon-readable MIRROR:
+ * `join` writes both; the resolver falls back to the file only when the keyring
+ * returns nothing (the daemon case).
+ *
+ * Security:
+ *   - File mode 0600 (owner read/write only), parent dir 0700.
+ *   - Lives under the user's own `~/.buildhive/` — same trust boundary as the
+ *     actions/runner `.credentials` file and the workspaces dir.
+ *   - Never packaged (runtime state in $HOME, not in the npm tarball).
+ *
+ * This module is pure-where-possible: fs + homedir are injectable for tests.
+ */
+import { homedir } from 'node:os';
+import { join } from 'node:path';
+import { promises as defaultFsPromises } from 'node:fs';
+/** Basename of the credential mirror file under `~/.buildhive/`. */
+export const CREDENTIAL_FILE_NAME = 'agent-enrollment.cred';
+const defaultFs = {
+    mkdir: (p, o) => defaultFsPromises.mkdir(p, o),
+    writeFile: (p, d, o) => defaultFsPromises.writeFile(p, d, o),
+    chmod: (p, m) => defaultFsPromises.chmod(p, m),
+    rename: (a, b) => defaultFsPromises.rename(a, b),
+    readFile: (p, e) => defaultFsPromises.readFile(p, e),
+    unlink: (p) => defaultFsPromises.unlink(p),
+};
+/**
+ * Reads/writes the agent-enrollment credentials as a single 0600 JSON file.
+ */
+export class CredentialFileStore {
+    fs;
+    dir;
+    filePath;
+    constructor(opts = {}) {
+        this.fs = opts.fs ?? defaultFs;
+        const home = opts.homeDir ?? homedir();
+        this.dir = join(home, '.buildhive');
+        this.filePath = join(this.dir, CREDENTIAL_FILE_NAME);
+    }
+    /** Absolute path to the credential file (for diagnostics + doctor). */
+    get path() {
+        return this.filePath;
+    }
+    /**
+     * Atomically write the credential mirror with 0600 perms.
+     * tmp-file → chmod 0600 → rename avoids a torn read by the daemon.
+     */
+    async write(creds) {
+        await this.fs.mkdir(this.dir, { recursive: true, mode: 0o700 });
+        const tmpPath = `${this.filePath}.tmp.${process.pid}`;
+        const payload = JSON.stringify({
+            jwt: creds.jwt,
+            jwtExpiresAtUnix: creds.jwtExpiresAtUnix,
+            agentId: creds.agentId,
+            tenantId: creds.tenantId,
+            platformUrl: creds.platformUrl,
+        });
+        await this.fs.writeFile(tmpPath, payload, { encoding: 'utf8', mode: 0o600 });
+        try {
+            // Belt-and-suspenders over the writeFile mode (umask can clear bits).
+            await this.fs.chmod(tmpPath, 0o600);
+            await this.fs.rename(tmpPath, this.filePath);
+        }
+        catch (err) {
+            // Don't leave a tmp file holding the JWT on a chmod/rename failure
+            // (ENOSPC, cross-device, mid-op crash). Best-effort cleanup, re-throw.
+            await this.fs.unlink(tmpPath).catch(() => undefined);
+            throw err;
+        }
+    }
+    /**
+     * Read the credential mirror. Returns null when absent or malformed — never
+     * throws, so the resolver can cleanly fall through to "not enrolled".
+     */
+    async read() {
+        let raw;
+        try {
+            raw = await this.fs.readFile(this.filePath, 'utf8');
+        }
+        catch {
+            return null; // ENOENT or unreadable
+        }
+        let parsed;
+        try {
+            parsed = JSON.parse(raw);
+        }
+        catch {
+            return null;
+        }
+        if (!parsed || typeof parsed !== 'object')
+            return null;
+        const o = parsed;
+        if (typeof o.jwt !== 'string' ||
+            typeof o.agentId !== 'string' ||
+            typeof o.tenantId !== 'string' ||
+            typeof o.platformUrl !== 'string' ||
+            o.jwt.length === 0 ||
+            o.agentId.length === 0 ||
+            o.tenantId.length === 0 ||
+            o.platformUrl.length === 0) {
+            return null;
+        }
+        const exp = typeof o.jwtExpiresAtUnix === 'number' ? o.jwtExpiresAtUnix : 0;
+        return {
+            jwt: o.jwt,
+            jwtExpiresAtUnix: Number.isFinite(exp) ? exp : 0,
+            agentId: o.agentId,
+            tenantId: o.tenantId,
+            platformUrl: o.platformUrl,
+        };
+    }
+    /** Delete the credential mirror. Idempotent (ENOENT is not an error). */
+    async delete() {
+        try {
+            await this.fs.unlink(this.filePath);
+        }
+        catch {
+            // best-effort — file may already be gone
+        }
+    }
+}

package/dist/auth/joinCommand.d.ts CHANGED Viewed

@@ -38,6 +38,21 @@ export interface ServiceInstallerInjection {
         paths: ServicePaths;
         bootstrapStdout: string;
     }>;
+    /**
+     * Row 21b / Fix D — read the launchd service health after bootstrap so
+     * `join` can self-verify the daemon actually started (instead of silently
+     * installing a crashlooping daemon). Optional: when omitted, the real
+     * `getServiceStatus` is used via dynamic import.
+     */
+    readonly getServiceStatus?: (opts: {
+        mode: 'user' | 'system';
+        label?: string;
+        homeDir?: string;
+    }) => Promise<{
+        loaded: boolean;
+        state: 'running' | 'waiting' | 'unknown' | null;
+        lastExitCode: number | null;
+    }>;
 }
 export interface JoinOptions {
     readonly token: string;
@@ -69,6 +84,14 @@ export interface JoinOptions {
      * the test runner, not at dist/cli.js).
      */
     readonly cliEntryPathOverride?: string;
+    /**
+     * Row 21b / Fix D — how long to wait after bootstrap before reading the
+     * daemon's launchd health (gives RunAtLoad + the FB16131937 startup guard
+     * time to fire). Default 3000ms; tests pass 0.
+     */
+    readonly verifyDelayMs?: number;
+    /** Sleep injection for the Fix-D verify delay (tests pass a no-op). */
+    readonly sleepFn?: (ms: number) => Promise<void>;
 }
 export interface JoinResult {
     readonly exitCode: 0 | 1 | 2 | 3;

package/dist/auth/joinCommand.js CHANGED Viewed

@@ -195,19 +195,24 @@ export async function runJoin(opts) {
     }
     // 8. Print enrollment success.
     const agentIdShort = agentId.slice(0, 8);
-    console.log(`✓ Agent enrolled in team "${teamName}" as agent ${agentIdShort}…`);
+    // teamName is cosmetic; guard against an older/forward backend that omits it.
+    console.log(`✓ Agent enrolled in team "${teamName ?? '(unknown)'}" as agent ${agentIdShort}…`);
     console.log(`✓ JWT stored in OS keyring (expires ${expiresDate}).`);
     // 9. S-1: Auto-install LaunchAgent so the dev never thinks about persistence.
     //    Idempotent — re-running `join` cleanly upgrades the plist (the underlying
     //    installer uses an atomic tmp-file → rename).
+    //    Row 21b / Fix D: maybeInstallLaunchAgent ALSO self-verifies the daemon
+    //    actually came up; it returns false (with a loud message) when the daemon
+    //    installed but crashed, so we never claim "set and forget" falsely.
     const serviceInstalled = await maybeInstallLaunchAgent(opts);
     if (serviceInstalled) {
-        console.log('✓ Agent installed and will auto-start on every login. ' +
+        console.log('✓ Agent installed and running — it will auto-start on every login. ' +
             'Inspect logs with `buildhive-agent logs`.');
     }
     else {
-        // Either non-macOS, opt-out, or install failed. Fall back to the
-        // foreground-start guidance the user can still execute.
+        // Either non-macOS, opt-out, install failed, OR the daemon did not start
+        // cleanly (Fix D). maybeInstallLaunchAgent already printed the specific
+        // reason; give the foreground fallback the user can always run.
         console.log('Run `buildhive-agent start` to begin picking up workflow jobs.');
     }
     return { exitCode: 0, serviceInstalled };
@@ -265,7 +270,6 @@ async function maybeInstallLaunchAgent(opts) {
     }
     try {
         await installFn({ mode: 'user', cliEntryPath });
-        return true;
     }
     catch (err) {
         // Already-loaded-service is a benign case — installService's atomic
@@ -277,6 +281,56 @@ async function maybeInstallLaunchAgent(opts) {
             'or run the agent in the foreground via `buildhive-agent start`.');
         return false;
     }
+    // Row 21b / Fix D — self-verify the daemon actually started. The 2026-06-01
+    // walk installed a daemon that crashlooped silently (last exit code = 1) yet
+    // join reported success. Now we wait, read launchd health, and fail LOUD.
+    return verifyDaemonStarted(opts);
+}
+/**
+ * Fix D: after bootstrap, give launchd's RunAtLoad + the FB16131937 startup
+ * guard a moment to fire, then read the daemon's health. Returns true only when
+ * the daemon is genuinely up (loaded, not crashed). On a crash it prints an
+ * actionable message and returns false so `join` recommends the foreground
+ * fallback instead of claiming "set and forget".
+ */
+async function verifyDaemonStarted(opts) {
+    const sleep = opts.sleepFn ?? ((ms) => new Promise((r) => setTimeout(r, ms)));
+    const delay = opts.verifyDelayMs ?? 3000;
+    // Resolve the status reader (injected for tests, real one otherwise).
+    let statusFn;
+    if (opts.serviceInstaller?.getServiceStatus) {
+        statusFn = opts.serviceInstaller.getServiceStatus;
+    }
+    else {
+        try {
+            const mod = await import('../service/serviceInstaller.js');
+            statusFn = mod.getServiceStatus;
+        }
+        catch {
+            // Can't verify — assume installed (don't block enrollment), but say so.
+            console.error('[join] Service installed; could not verify daemon health automatically.');
+            return true;
+        }
+    }
+    await sleep(delay);
+    let status;
+    try {
+        status = await statusFn({ mode: 'user' });
+    }
+    catch {
+        console.error('[join] Service installed; could not read daemon health (launchctl print failed).');
+        return true;
+    }
+    const crashed = status.lastExitCode !== null && status.lastExitCode !== 0;
+    if (!status.loaded || crashed) {
+        console.error('[join] ⚠ The background service was installed but did NOT start cleanly' +
+            (crashed ? ` (last exit code = ${status.lastExitCode}).` : ' (not loaded).'));
+        console.error('[join]   Diagnose with `buildhive-agent doctor`, read the log with ' +
+            '`buildhive-agent logs` (~/.buildhive/logs/buildhive-agent.log), ' +
+            'or run `buildhive-agent start` in the foreground for now.');
+        return false;
+    }
+    return true;
 }
 /** Pull tenant_id from JWT payload without signature verification. */
 function extractTenantId(jwtToken) {

package/dist/doctor/runChecks.js CHANGED Viewed

@@ -291,14 +291,29 @@ export function checkServiceLoaded(deps) {
         };
     }
 }
-// 16: agent process is actively running (as reported by launchctl print's
-// `state` field). Read-only on the cached print output from check 15.
+// 16: agent process is actually healthy (as reported by launchctl print's
+// `state` AND `last exit code` fields). Read-only on the cached print output
+// from check 15.
 //
-// `state = running`  → pass
-// `state = waiting`  → warn (KeepAlive will restart it; not necessarily a bug)
-// anything else      → warn (unknown launchd state — surface verbatim)
-// not loaded         → fail (upstream check 15 already failed; chain here so
-//                      we don't double-report)
+// Row 21b / Fix C — HONESTY. The old version returned a soft WARNING for
+// `state = spawn scheduled`, which HID a crashlooping daemon (the 2026-06-01
+// walk saw 14/15 "healthy" while the daemon was dead, last exit code = 1).
+// New contract:
+//   last exit code != 0          → FAIL (daemon crashed / is crashlooping)
+//   state = running, exit 0/absent → PASS
+//   state = spawn scheduled        → WARN (launchd is throttling respawns)
+//   state = waiting                → WARN (KeepAlive should restart it)
+//   anything else                  → WARN (unknown — surface verbatim)
+//   not loaded                     → FAIL (upstream check 15 already failed)
+const SERVICE_CRASH_FIX =
+// NB: the daemon logs to ~/.buildhive/logs/buildhive-agent.log (via pino-roll),
+// NOT the launchd StandardOutPath file, which is empty in production. Point
+// operators at `buildhive-agent logs` so they don\'t chase the empty stdout
+// file the 2026-06-01 walk got stuck on.
+'The background daemon crashed. Inspect its log with `buildhive-agent logs` ' +
+    '(file: ~/.buildhive/logs/buildhive-agent.log), then re-run ' +
+    '`buildhive-agent join <token>` (re-enrolls + reinstalls). ' +
+    'You can always run `buildhive-agent start` in the foreground meanwhile.';
 export function checkServiceRunning(deps, printOutput) {
     if (deps.platform !== 'darwin') {
         return {
@@ -312,18 +327,39 @@ export function checkServiceRunning(deps, printOutput) {
             message: 'Service not loaded (see previous check)',
         };
     }
-    const stateMatch = printOutput.stdout.match(/state\s*=\s*(\w+)/);
+    // Parse `last exit code = N` first — a non-zero value means the daemon died,
+    // and takes precedence over whatever transient `state` launchd reports.
+    const exitMatch = printOutput.stdout.match(/last exit code\s*=\s*(-?\d+)/i);
+    const lastExitCode = exitMatch ? parseInt(exitMatch[1], 10) : null;
+    if (lastExitCode !== null && lastExitCode !== 0) {
+        return {
+            name: 'Agent process running', status: 'fail',
+            message: `Daemon is not healthy — last exit code = ${lastExitCode} (crashing on start)`,
+            fix: SERVICE_CRASH_FIX,
+        };
+    }
+    // `state = spawn scheduled` has a space — capture the whole value, not just
+    // the first word (the old `\w+` regex truncated it to "spawn"). Anchor on
+    // start-of-string OR newline so the first (top-level) `state =` is taken.
+    const stateMatch = printOutput.stdout.match(/(?:^|\n)\s*state\s*=\s*([^\n]+)/);
     if (!stateMatch) {
         return {
             name: 'Agent process running', status: 'warn',
             message: 'Could not parse `state =` from launchctl print output',
         };
     }
-    const stateRaw = stateMatch[1];
+    const stateRaw = stateMatch[1].trim();
     const state = stateRaw.toLowerCase();
     if (state.includes('run')) {
         return { name: 'Agent process running', status: 'pass', message: `state = ${stateRaw}` };
     }
+    if (state.includes('spawn')) {
+        return {
+            name: 'Agent process running', status: 'warn',
+            message: `Daemon is being (re)spawned by launchd (state = ${stateRaw}); re-run doctor in a few seconds`,
+            fix: SERVICE_CRASH_FIX,
+        };
+    }
     if (state.includes('wait')) {
         return {
             name: 'Agent process running', status: 'warn',

package/dist/runner/startCommand.d.ts CHANGED Viewed

@@ -14,15 +14,25 @@
  *
  * Row 17b — Wave 1 (agent supervisor + actions/runner spawn).
  * Row 18b S7 — Wave 2 (cache env injection into runner spawn).
+ * Row 21b — Fix A: the default single-slot `start` now REPLENISHES. Each
+ *   ephemeral runner still handles one job then exits (binding constraint #5),
+ *   but the supervisor re-registers and keeps listening until SIGTERM, so the
+ *   agent runs forever ("set and forget") instead of stopping after one job.
  *
  * Exit-code contract:
- *   0 — runner completed a job and exited cleanly (--ephemeral one-shot)
- *   1 — fatal error (auth, config, binary download, registration)
+ *   0 — supervisor drained cleanly after SIGTERM/SIGINT (the normal stop path),
+ *       OR (pre-start) a successful no-op. The supervisor does NOT exit after a
+ *       single job anymore.
+ *   1 — fatal pre-supervisor error (auth, no JWT, no enrolled repo, binary
+ *       download); an unsupported platform is reported as exit code 2, below
  *   2 — platform not supported
  */
 import { AgentEnrollmentKeyringStore } from '../auth/agentEnrollmentKeyringStore.js';
+import { fetchRunnerToken } from './tokenClient.js';
 import { fetchMyRepos } from './myReposClient.js';
 import { ensureRunner } from './binaryFetcher.js';
+import { configureRunner, runRunner } from './supervisor.js';
+import { startHeartbeat } from './heartbeat.js';
 import { type PoolDeps } from './pool.js';
 import type { EnsureRunnerOptions } from './binaryFetcher.js';
 import type { CacheEnvDeps } from './cacheEnv.js';
@@ -61,6 +71,23 @@ export interface StartOptions {
      * Allows injecting fake configureRunner / runRunner / fetchRunnerToken / prepareSlotDir.
      */
     readonly poolDeps?: Partial<PoolDeps>;
+    /**
+     * Dependency injection for the N==1 replenishing supervisor (Row 21b / Fix A).
+     * Tests inject fakes to drive the replenishment loop deterministically and
+     * assert backoff / replenish behaviour. Omit in production (real impls used).
+     */
+    readonly supervisorDeps?: Partial<{
+        configureRunner: typeof configureRunner;
+        runRunner: typeof runRunner;
+        fetchRunnerToken: typeof fetchRunnerToken;
+    }>;
+    /**
+     * Dependency injection for the heartbeat (tests only). The real
+     * {@link startHeartbeat} POSTs to the backend on a 30s timer and once more
+     * on `stop()`; injecting a no-op keeps the supervisor-loop tests off the
+     * network and deterministic. Omit in production.
+     */
+    readonly startHeartbeatFn?: typeof startHeartbeat;
 }
 export interface StartResult {
     readonly exitCode: 0 | 1 | 2;

package/dist/runner/startCommand.js CHANGED Viewed

@@ -14,10 +14,17 @@
  *
  * Row 17b — Wave 1 (agent supervisor + actions/runner spawn).
  * Row 18b S7 — Wave 2 (cache env injection into runner spawn).
+ * Row 21b — Fix A: the default single-slot `start` now REPLENISHES. Each
+ *   ephemeral runner still handles one job then exits (binding constraint #5),
+ *   but the supervisor re-registers and keeps listening until SIGTERM, so the
+ *   agent runs forever ("set and forget") instead of stopping after one job.
  *
  * Exit-code contract:
- *   0 — runner completed a job and exited cleanly (--ephemeral one-shot)
- *   1 — fatal error (auth, config, binary download, registration)
+ *   0 — supervisor drained cleanly after SIGTERM/SIGINT (the normal stop path),
+ *       OR (pre-start) a successful no-op. The supervisor does NOT exit after a
+ *       single job anymore.
+ *   1 — fatal pre-supervisor error (auth, no JWT, no enrolled repo, binary
+ *       download); an unsupported platform is reported as exit code 2, below
  *   2 — platform not supported
  */
 import os from 'os';
@@ -32,6 +39,29 @@ import { startHeartbeat } from './heartbeat.js';
 import { collectDeviceInfo } from './deviceInfo.js';
 import { runPool } from './pool.js';
 const logger = createLogger('runner.startCommand');
+/**
+ * Row 21b / Fix A — supervisor backoff bounds.
+ *
+ * When a registration cycle fails (token fetch or config.sh), the supervisor
+ * does NOT exit (that would crashloop under launchd KeepAlive). It backs off
+ * with capped exponential delay and retries, so a transient backend blip
+ * self-heals and a persistent fault (e.g. repo de-enrolled) loops calmly —
+ * surfaced loudly by the dashboard (OFFLINE) and `buildhive-agent doctor`.
+ */
+const SUPERVISOR_BACKOFF_BASE_MS = 2_000;
+const SUPERVISOR_BACKOFF_MAX_MS = 30_000;
+/**
+ * Sleep up to `ms`, re-checking `isCancelled()` at most every 250ms and
+ * returning early once it is true, so a SIGTERM during a backoff drains
+ * promptly instead of waiting the full delay.
+ */
+async function interruptibleSleep(ms, isCancelled) {
+    const deadline = Date.now() + ms;
+    while (Date.now() < deadline) {
+        if (isCancelled())
+            return;
+        await new Promise((r) => setTimeout(r, Math.min(250, deadline - Date.now())));
+    }
+}
 /**
  * Attempt to read a JWT from the keyring.
  * Prefers agent_enrollment_jwt (row 17c path); falls back to device_flow_jwt
@@ -221,11 +251,30 @@ export async function runStart(opts = {}) {
     // Device specs (cpuCores, memoryGB) are collected once here and ride every
     // heartbeat so the dashboard shows real hardware (Task #28).
     const deviceInfo = collectDeviceInfo();
-    const heartbeat = startHeartbeat({ platformUrl, jwt, deviceInfo });
+    const startHeartbeatFn = opts.startHeartbeatFn ?? startHeartbeat;
+    const heartbeat = startHeartbeatFn({ platformUrl, jwt, deviceInfo });
     // ── Step 7: Branch on concurrency ─────────────────────────────────────────────
     if (concurrency === 1) {
-        // ── N==1: original single-shot path (byte-for-byte preserved) ──────────────
-        // Steps below mirror the original startCommand.ts flow exactly.
+        // ── N==1: replenishing single-slot supervisor (Row 21b / Fix A) ────────────
+        //
+        // Each ephemeral `actions/runner` still handles exactly one job then exits
+        // (binding constraint #5 preserved). What changed in row 21b: the OUTER
+        // supervisor no longer exits with it — it re-registers a fresh ephemeral
+        // runner and keeps listening, so `buildhive-agent start` runs forever
+        // ("set and forget") instead of stopping after the first job (walk finding
+        // F3). The loop exits only on SIGTERM/SIGINT (user logout / `launchctl
+        // bootout` / Ctrl-C).
+        let shuttingDown = false;
+        const onShutdown = () => {
+            if (!shuttingDown) {
+                shuttingDown = true;
+                logger.info('Supervisor received shutdown signal — finishing current cycle then exiting');
+            }
+        };
+        // Multi-label per 2026-05-17 operator correction:
+        // runs-on: [self-hosted, buildhive, ubuntu-latest] requires all three labels.
+        const labels = 'self-hosted,buildhive,ubuntu-latest';
+        const runnerRegistrationUrl = `https://github.com/${owner}/${repo}`;
         const tokenFetchOpts = {
             platformUrl,
             jwt,
@@ -233,56 +282,11 @@ export async function runStart(opts = {}) {
             repo,
             fetchFn: opts.fetchFn,
         };
-        let runnerToken;
-        try {
-            runnerToken = await fetchRunnerToken(tokenFetchOpts);
-        }
-        catch (err) {
-            const msg = `Failed to fetch runner token: ${err instanceof Error ? err.message : String(err)}`;
-            logger.error(msg);
-            await heartbeat.stop('OFFLINE').catch(() => undefined);
-            return { exitCode: 1, message: msg };
-        }
-        const runnerName = generateRunnerName();
-        // Multi-label per 2026-05-17 operator correction:
-        // runs-on: [self-hosted, buildhive, ubuntu-latest] requires all three labels.
-        const labels = 'self-hosted,buildhive,ubuntu-latest';
-        const runnerRegistrationUrl = `https://github.com/${owner}/${repo}`;
-        const configOpts = {
-            configSh: runnerPaths.configSh,
-            runnerRegistrationUrl,
-            registrationToken: runnerToken.token,
-            runnerName,
-            labels,
-        };
-        logger.info('Registering runner with GitHub', { runnerName, runnerRegistrationUrl, labels });
-        let configExitCode = await configureRunner(configOpts);
-        if (configExitCode !== 0) {
-            logger.warn('config.sh failed — retrying with a fresh token once', { configExitCode });
-            let retryToken;
-            try {
-                retryToken = await fetchRunnerToken(tokenFetchOpts);
-            }
-            catch (err) {
-                const msg = `Retry token fetch failed: ${err instanceof Error ? err.message : String(err)}`;
-                logger.error(msg);
-                await heartbeat.stop('OFFLINE').catch(() => undefined);
-                return { exitCode: 1, message: msg };
-            }
-            configExitCode = await configureRunner({
-                ...configOpts,
-                registrationToken: retryToken.token,
-            });
-            if (configExitCode !== 0) {
-                const msg = `Runner configuration failed after retry (config.sh exited ${configExitCode}). ` +
-                    `Check your enrollment credentials and ensure the repo is enrolled in BuildHive.`;
-                logger.error(msg);
-                await heartbeat.stop('OFFLINE').catch(() => undefined);
-                return { exitCode: 1, message: msg };
-            }
-        }
-        logger.info('Runner configured successfully — starting run.sh', { runnerName });
-        // ── Cache server (Row 18b S7, Wave 2) — N==1 only ──────────────────────────
+        // DI seam (tests) — production uses the real supervisor primitives.
+        const fetchTokenFn = opts.supervisorDeps?.fetchRunnerToken ?? fetchRunnerToken;
+        const configureFn = opts.supervisorDeps?.configureRunner ?? configureRunner;
+        const runFn = opts.supervisorDeps?.runRunner ?? runRunner;
+        // ── Cache server (Row 18b S7, Wave 2) — started ONCE, lives across cycles ──
         const defaultCacheConfig = {
             enabled: false,
             directory: '/tmp/buildhive/cache',
@@ -299,17 +303,124 @@ export async function runStart(opts = {}) {
         catch {
             logger.warn('Could not load agent config for cache check — proceeding without cache');
         }
-        const effectiveCacheDeps = opts.cacheEnvDeps ?? await buildProductionCacheEnvDeps();
-        const cacheResult = await prepareCache(cacheConfig, effectiveCacheDeps);
-        const runOpts = {
-            runSh: runnerPaths.runSh,
-            extraEnv: cacheResult.enabled ? cacheResult.patch : undefined,
+        // Cache prep runs INSIDE the try below, and the signal handlers are
+        // registered immediately before it, so a throw (e.g. a dynamic import
+        // failing in buildProductionCacheEnvDeps) can never leak the
+        // SIGTERM/SIGINT listeners or skip heartbeat shutdown.
+        let cacheResult;
+        let runExtraEnv;
+        /**
+         * Register a fresh ephemeral runner (one fresh token, one-time retry on
+         * non-zero config.sh exit). Returns true once registered, false on failure
+         * (caller backs off). Honours `shuttingDown` between awaits.
+         */
+        const registerOnce = async () => {
+            let token;
+            try {
+                token = await fetchTokenFn(tokenFetchOpts);
+            }
+            catch (err) {
+                logger.error('Failed to fetch runner token', {
+                    err: err instanceof Error ? err.message : String(err),
+                });
+                return false;
+            }
+            if (shuttingDown)
+                return false;
+            const configOpts = {
+                configSh: runnerPaths.configSh,
+                runnerRegistrationUrl,
+                registrationToken: token.token,
+                runnerName: generateRunnerName(),
+                labels,
+            };
+            logger.info('Registering runner with GitHub', {
+                runnerName: configOpts.runnerName,
+                runnerRegistrationUrl,
+                labels,
+            });
+            let configExitCode = await configureFn(configOpts);
+            if (configExitCode !== 0) {
+                logger.warn('config.sh failed — retrying with a fresh token once', { configExitCode });
+                let retryToken;
+                try {
+                    retryToken = await fetchTokenFn(tokenFetchOpts);
+                }
+                catch (err) {
+                    logger.error('Retry token fetch failed', {
+                        err: err instanceof Error ? err.message : String(err),
+                    });
+                    return false;
+                }
+                if (shuttingDown)
+                    return false;
+                configExitCode = await configureFn({
+                    ...configOpts,
+                    registrationToken: retryToken.token,
+                    runnerName: generateRunnerName(),
+                });
+                if (configExitCode !== 0) {
+                    logger.error('Runner configuration failed after retry — check enrollment credentials and ' +
+                        'that the repo is enrolled in BuildHive.', { configExitCode });
+                    return false;
+                }
+            }
+            return true;
         };
-        let result;
+        let consecutiveFailures = 0;
+        process.on('SIGTERM', onShutdown);
+        process.on('SIGINT', onShutdown);
         try {
-            result = await runRunner(runOpts);
+            // Cache server (Row 18b S7) — started once, lives across cycles. Inside
+            // the try so a failure still hits the finally (listener + heartbeat cleanup).
+            const effectiveCacheDeps = opts.cacheEnvDeps ?? (await buildProductionCacheEnvDeps());
+            cacheResult = await prepareCache(cacheConfig, effectiveCacheDeps);
+            runExtraEnv = cacheResult.enabled ? cacheResult.patch : undefined;
+            while (!shuttingDown) {
+                // Yield to the macrotask queue each cycle so SIGTERM/timers always get
+                // a turn — a fast path (e.g. config.sh exiting instantly) must never
+                // starve the event loop on the microtask queue.
+                await new Promise((r) => setTimeout(r, 0));
+                if (shuttingDown)
+                    break;
+                const registered = await registerOnce();
+                if (shuttingDown)
+                    break;
+                if (!registered) {
+                    // Bounded exponential backoff so a transient outage self-heals and a
+                    // persistent fault loops calmly (no respawn storm). Surfaced as
+                    // OFFLINE on the dashboard + a doctor failure.
+                    consecutiveFailures += 1;
+                    const backoff = Math.min(SUPERVISOR_BACKOFF_MAX_MS, SUPERVISOR_BACKOFF_BASE_MS * 2 ** (consecutiveFailures - 1));
+                    logger.warn('Registration failed — backing off before retry', {
+                        consecutiveFailures,
+                        backoffMs: backoff,
+                    });
+                    await interruptibleSleep(backoff, () => shuttingDown);
+                    continue;
+                }
+                consecutiveFailures = 0;
+                logger.info('Runner configured successfully — starting run.sh');
+                const result = await runFn({ runSh: runnerPaths.runSh, extraEnv: runExtraEnv });
+                if (shuttingDown)
+                    break;
+                if (result.ok) {
+                    logger.info('Runner completed a job and exited cleanly — replenishing');
+                }
+                else {
+                    // A non-zero run.sh exit reflects the runner's own health, not the
+                    // build result. Replenish (consistent with the pool path) but apply
+                    // a small backoff so a crash-looping runner doesn't spin hot.
+                    logger.warn('Runner exited non-zero — replenishing after short backoff', {
+                        exitCode: result.exitCode,
+                        signal: result.signal,
+                    });
+                    await interruptibleSleep(SUPERVISOR_BACKOFF_BASE_MS, () => shuttingDown);
+                }
+            }
         }
         finally {
+            process.removeListener('SIGTERM', onShutdown);
+            process.removeListener('SIGINT', onShutdown);
             try {
                 await heartbeat.stop('OFFLINE');
             }
@@ -318,7 +429,7 @@ export async function runStart(opts = {}) {
                     err: err instanceof Error ? err.message : String(err),
                 });
             }
-            if (cacheResult.enabled) {
+            if (cacheResult?.enabled) {
                 try {
                     await stopCache(cacheResult.handle);
                 }
@@ -329,13 +440,8 @@ export async function runStart(opts = {}) {
                 }
             }
         }
-        if (result.ok) {
-            logger.info('Runner completed a job and exited cleanly');
-            return { exitCode: 0 };
-        }
-        const msg = `Runner exited with code ${result.exitCode ?? 'null'} signal ${result.signal ?? 'none'}`;
-        logger.warn(msg);
-        return { exitCode: result.exitCode === 0 ? 0 : 1, message: msg };
+        logger.info('Supervisor exited cleanly after shutdown signal');
+        return { exitCode: 0 };
     }
     // ── N>1: pool path ──────────────────────────────────────────────────────────
     // Cache is disabled for N>1 — shared cache server is a follow-up item.

package/dist/runner/supervisor.d.ts CHANGED Viewed

@@ -96,6 +96,15 @@ export declare function configureRunner(opts: ConfigureRunnerOptions): Promise<n
  * completes. The resolved RunnerExitResult reflects that final exit.
  */
 export declare function runRunner(opts: RunRunnerOptions): Promise<RunnerExitResult>;
+/**
+ * Remove a stale `actions/runner` local registration so a fresh `config.sh`
+ * can register cleanly. Best-effort (ENOENT is fine). Exported for tests.
+ *
+ * These are the files config.sh writes on registration; a leftover set (from an
+ * unclean shutdown of an idle-Listening ephemeral runner) makes a subsequent
+ * config.sh fail with "already configured".
+ */
+export declare function removeStaleRunnerConfig(runnerDir: string): Promise<void>;
 /**
  * Generate a runner name: "buildhive-<8-char random hex>".
  * Short enough to be readable in GitHub's runner list.

package/dist/runner/supervisor.js CHANGED Viewed

@@ -41,8 +41,9 @@
  */
 import { spawn } from 'child_process';
 import { createInterface } from 'readline';
-import { dirname } from 'path';
+import { dirname, join } from 'path';
 import { randomBytes } from 'crypto';
+import { promises as fsPromises } from 'fs';
 import { createLogger } from '../utils/logger.js';
 const logger = createLogger('runner.supervisor');
 /** Timeout in ms to wait for the runner to exit after SIGTERM before SIGKILL */
@@ -64,6 +65,7 @@ export function sleep(ms) {
 export async function configureRunner(opts) {
     const { configSh, runnerRegistrationUrl, registrationToken, runnerName, labels, } = opts;
     const spawnFn = opts.spawnFn ?? spawn;
+    const runnerDir = dirname(configSh);
     const args = [
         '--unattended',
         '--ephemeral',
@@ -95,36 +97,61 @@ export async function configureRunner(opts) {
         '--labels', labels,
         '--replace',
     ];
-    logger.info('Configuring runner', { configSh, runnerName, runnerRegistrationUrl, labels });
-    // SECURITY M-1: The GitHub registration token is passed as a process argument
-    // (--token <value>) and is briefly visible to local users via 'ps aux' for the
-    // lifetime of config.sh (~few seconds). This is acceptable per Apple/GitHub
-    // guidance: the token is single-use and expires in ~1 hour. See P6 security
-    // review M-1.
-    const child = spawnFn(configSh, args, {
-        cwd: dirname(configSh),
-        env: { ...process.env },
-        stdio: ['ignore', 'pipe', 'pipe'],
-    });
-    const stdoutLog = createLogger('runner.config.stdout');
-    const stderrLog = createLogger('runner.config.stderr');
-    if (child.stdout) {
-        const rl = createInterface({ input: child.stdout, crlfDelay: Infinity });
-        rl.on('line', (line) => stdoutLog.info(line));
-    }
-    if (child.stderr) {
-        const rl = createInterface({ input: child.stderr, crlfDelay: Infinity });
-        rl.on('line', (line) => stderrLog.warn(line));
-    }
-    return new Promise((resolve) => {
-        child.on('close', (code) => {
-            resolve(code ?? 1);
+    // Spawn config.sh once; stream its output to the log AND watch for the
+    // "already configured" marker so the caller can self-heal a stale/raced
+    // local registration. SECURITY M-1: the GH registration token is passed as a
+    // process arg (--token), briefly visible via `ps aux`; acceptable per
+    // Apple/GitHub guidance (single-use, ~1h TTL). See P6 review M-1.
+    const runConfigSh = () => {
+        logger.info('Configuring runner', { configSh, runnerName, runnerRegistrationUrl, labels });
+        const child = spawnFn(configSh, args, {
+            cwd: runnerDir,
+            env: { ...process.env },
+            stdio: ['ignore', 'pipe', 'pipe'],
         });
-        child.on('error', (err) => {
-            logger.error('config.sh spawn error', err);
-            resolve(1);
+        let alreadyConfigured = false;
+        const markIfAlreadyConfigured = (line) => {
+            if (/already configured/i.test(line))
+                alreadyConfigured = true;
+        };
+        const stdoutLog = createLogger('runner.config.stdout');
+        const stderrLog = createLogger('runner.config.stderr');
+        if (child.stdout) {
+            const rl = createInterface({ input: child.stdout, crlfDelay: Infinity });
+            rl.on('line', (line) => { markIfAlreadyConfigured(line); stdoutLog.info(line); });
+        }
+        if (child.stderr) {
+            const rl = createInterface({ input: child.stderr, crlfDelay: Infinity });
+            rl.on('line', (line) => { markIfAlreadyConfigured(line); stderrLog.warn(line); });
+        }
+        return new Promise((resolve) => {
+            child.on('close', (code) => resolve({ code: code ?? 1, alreadyConfigured }));
+            child.on('error', (err) => {
+                logger.error('config.sh spawn error', err);
+                resolve({ code: 1, alreadyConfigured });
+            });
         });
-    });
+    };
+    // Row 21b — reboot robustness. We ALWAYS register a fresh ephemeral runner
+    // each cycle, so any local registration is stale. An ephemeral runner killed
+    // while idle-Listening (a reboot, or the SIGTERM grace window on `launchctl
+    // bootout`) leaves `.runner`/`.credentials*` behind; config.sh then refuses
+    // with "already configured".
+    //
+    // 1. Pre-clean the common leftover case.
+    await removeStaleRunnerConfig(runnerDir);
+    let result = await runConfigSh();
+    // 2. Fast self-heal: if config.sh STILL reports "already configured" (a
+    //    leftover the pre-clean missed, or a brief post-reboot race), clean and
+    //    retry ONCE immediately — instead of returning failure and waiting for
+    //    the supervisor's backoff + a launchd respawn (the 2026-06-02 reboot
+    //    walk recovered only after ~6 min via a respawn; this makes it seconds).
+    if (result.code !== 0 && result.alreadyConfigured) {
+        logger.warn('config.sh reported "already configured" — cleaning stale registration and retrying immediately');
+        await removeStaleRunnerConfig(runnerDir);
+        result = await runConfigSh();
+    }
+    return result.code;
 }
 /**
  * Spawn run.sh (the GitHub runner listen loop) and manage its lifecycle.
@@ -256,6 +283,32 @@ export async function runRunner(opts) {
         });
     });
 }
+/**
+ * Remove a stale `actions/runner` local registration so a fresh `config.sh`
+ * can register cleanly. Best-effort (ENOENT is fine). Exported for tests.
+ *
+ * These are the files config.sh writes on registration; a leftover set (from an
+ * unclean shutdown of an idle-Listening ephemeral runner) makes a subsequent
+ * config.sh fail with "already configured".
+ */
+export async function removeStaleRunnerConfig(runnerDir) {
+    const stale = ['.runner', '.credentials', '.credentials_rsaparams'];
+    let removedAny = false;
+    for (const name of stale) {
+        try {
+            await fsPromises.unlink(join(runnerDir, name));
+            removedAny = true;
+        }
+        catch {
+            // ENOENT (normal — clean slate) or any other unlink failure; ignored (best-effort).
+        }
+    }
+    if (removedAny) {
+        logger.info('Removed stale runner registration before re-config (reboot/unclean-exit recovery)', {
+            runnerDir,
+        });
+    }
+}
 /**
  * Generate a runner name: "buildhive-<8-char random hex>".
  * Short enough to be readable in GitHub's runner list.

package/dist/service/plistGenerator.js CHANGED Viewed

@@ -168,9 +168,14 @@ export function generatePlist(mode, config) {
     // and the CLI looking at different ~/.buildhive/ paths).
     // System mode: HOME=/var/_buildhive so the dropped-priv user can read
     // its own state.
+    // Row 21b / Fix B: prepend `/opt/homebrew/bin` (Apple-Silicon Homebrew) and
+    // keep `/usr/local/bin` (Intel Homebrew). launchd does NOT inherit the
+    // interactive shell PATH, so without this the daemon-spawned `actions/runner`
+    // can't find `git`, `node`, etc. installed via Homebrew on Apple Silicon —
+    // the runner's config.sh/run.sh fail and the job never runs.
     const envDict = [
         ['NODE_ENV', 'production'],
-        ['PATH', '/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin'],
+        ['PATH', '/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin'],
     ];
     if (mode === 'system') {
         envDict.push(['HOME', `/var/${config.systemUserName ?? DEFAULT_SYSTEM_USER}`]);

package/dist/service/serviceInstaller.d.ts CHANGED Viewed

@@ -37,6 +37,11 @@ export interface ServiceStatus {
     readonly state: 'running' | 'waiting' | 'unknown' | null;
     /** Most recent exit reason if available. */
     readonly lastExitReason: string | null;
+    /**
+     * Most recent `last exit code` per `launchctl print` (or null if absent).
+     * Row 21b / Fix C: a non-zero value means the daemon is crashing on start.
+     */
+    readonly lastExitCode: number | null;
     /** Computed install paths for this mode (single source of truth). */
     readonly paths: ServicePaths;
 }

package/dist/service/serviceInstaller.js CHANGED Viewed

@@ -158,12 +158,16 @@ export async function getServiceStatus(opts, deps = {}) {
             loaded: false,
             state: null,
             lastExitReason: null,
+            lastExitCode: null,
             paths,
         };
     }
     const out = printResult.stdout;
-    const stateMatch = out.match(/state\s*=\s*(\w+)/);
+    // Anchored + `[^\n]+` so multi-word states like `spawn scheduled` aren't
+    // truncated to `spawn` (the exact truncation Fix C corrected in the doctor).
+    const stateMatch = out.match(/(?:^|\n)\s*state\s*=\s*([^\n]+)/);
     const exitReasonMatch = out.match(/last exit reason\s*=\s*([^\n]+)/);
+    const exitCodeMatch = out.match(/last exit code\s*=\s*(-?\d+)/i);
     let state = 'unknown';
     if (stateMatch) {
         const raw = stateMatch[1].toLowerCase();
@@ -179,6 +183,7 @@ export async function getServiceStatus(opts, deps = {}) {
         loaded: true,
         state,
         lastExitReason: exitReasonMatch ? exitReasonMatch[1].trim() : null,
+        lastExitCode: exitCodeMatch ? parseInt(exitCodeMatch[1], 10) : null,
         paths,
     };
 }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "buildhive-agent",
-  "version": "1.0.0-beta.11",
+  "version": "1.0.0-beta.12",
   "description": "BuildHive CI Agent - Distributed build execution agent",
   "type": "module",
   "main": "dist/index.js",
@@ -74,4 +74,4 @@
   "overrides": {
     "minimatch": "^10.0.1"
   }
-}
+}