buildhive-agent 1.0.0-beta.11 → 1.0.0-beta.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/auth/agentEnrollmentKeyringStore.d.ts +17 -3
- package/dist/auth/agentEnrollmentKeyringStore.js +104 -25
- package/dist/auth/credentialFileStore.d.ts +79 -0
- package/dist/auth/credentialFileStore.js +140 -0
- package/dist/auth/joinCommand.d.ts +23 -0
- package/dist/auth/joinCommand.js +59 -5
- package/dist/doctor/runChecks.js +45 -9
- package/dist/runner/startCommand.d.ts +29 -2
- package/dist/runner/startCommand.js +176 -70
- package/dist/runner/supervisor.d.ts +9 -0
- package/dist/runner/supervisor.js +82 -29
- package/dist/service/plistGenerator.js +6 -1
- package/dist/service/serviceInstaller.d.ts +5 -0
- package/dist/service/serviceInstaller.js +6 -1
- package/package.json +2 -2
|
@@ -14,6 +14,7 @@
|
|
|
14
14
|
* agent-enrollment.platform_url — bound at enroll; refuses cross-instance reuse
|
|
15
15
|
*/
|
|
16
16
|
import type { SecretStore } from '../security/secretStore.js';
|
|
17
|
+
import { CredentialFileStore } from './credentialFileStore.js';
|
|
17
18
|
import { PlatformUrlMismatchError } from './types.js';
|
|
18
19
|
export declare const AGENT_ENROLLMENT_KEY_PREFIX = "agent-enrollment.";
|
|
19
20
|
export declare const KEY_JWT = "agent-enrollment.jwt";
|
|
@@ -31,6 +32,12 @@ export interface AgentEnrollmentCredentials {
|
|
|
31
32
|
export interface AgentEnrollmentKeyringStoreOptions {
|
|
32
33
|
/** Override the SecretStore (tests + environments without OS keyring). */
|
|
33
34
|
readonly store?: SecretStore;
|
|
35
|
+
/**
|
|
36
|
+
* Override the 0600 credential-file mirror (row 21b / F2 daemon fallback).
|
|
37
|
+
* Tests inject a fake; production uses the real `~/.buildhive/` file.
|
|
38
|
+
* Pass `null` to disable the file mirror entirely (keyring-only).
|
|
39
|
+
*/
|
|
40
|
+
readonly fileStore?: CredentialFileStore | null;
|
|
34
41
|
}
|
|
35
42
|
/**
|
|
36
43
|
* Custom error thrown when the stored platform_url doesn't match the current
|
|
@@ -42,6 +49,8 @@ export declare class AgentPlatformUrlMismatchError extends PlatformUrlMismatchEr
|
|
|
42
49
|
}
|
|
43
50
|
export declare class AgentEnrollmentKeyringStore {
|
|
44
51
|
private readonly store;
|
|
52
|
+
/** 0600 file mirror for the launchd daemon; null when explicitly disabled. */
|
|
53
|
+
private readonly fileStore;
|
|
45
54
|
constructor(opts?: AgentEnrollmentKeyringStoreOptions);
|
|
46
55
|
/** Throws KeyringUnavailableError if the OS keyring is unreachable. */
|
|
47
56
|
static assertAvailable(): Promise<void>;
|
|
@@ -63,16 +72,21 @@ export declare class AgentEnrollmentKeyringStore {
|
|
|
63
72
|
* Throws NotLoggedInError if any required key is missing.
|
|
64
73
|
*/
|
|
65
74
|
readAll(): Promise<AgentEnrollmentCredentials>;
|
|
75
|
+
/**
|
|
76
|
+
* Read the 0600 file mirror, swallowing any error to null. Centralised so
|
|
77
|
+
* every read path (readJwt / readAll / hasEnrollment) shares one fallback.
|
|
78
|
+
*/
|
|
79
|
+
private readFromFileMirror;
|
|
66
80
|
/**
|
|
67
81
|
* Cross-platform-url check. Refuses to use credentials minted against a
|
|
68
82
|
* different BuildHive instance.
|
|
69
83
|
*/
|
|
70
84
|
assertPlatformUrlMatches(currentPlatformUrl: string): Promise<void>;
|
|
71
|
-
/** Returns true if
|
|
85
|
+
/** Returns true if an enrollment JWT exists in the keyring OR the file mirror. */
|
|
72
86
|
hasEnrollment(): Promise<boolean>;
|
|
73
87
|
/**
|
|
74
|
-
* Delete all 5 `agent-enrollment.*` keys. Idempotent.
|
|
75
|
-
* Called by
|
|
88
|
+
* Delete all 5 `agent-enrollment.*` keys AND the 0600 file mirror. Idempotent.
|
|
89
|
+
* Called by `buildhive-agent logout` (row 21b / F4) to fully de-enroll.
|
|
76
90
|
*/
|
|
77
91
|
clear(): Promise<void>;
|
|
78
92
|
}
|
|
@@ -14,7 +14,10 @@
|
|
|
14
14
|
* agent-enrollment.platform_url — bound at enroll; refuses cross-instance reuse
|
|
15
15
|
*/
|
|
16
16
|
import { KeyringSecretStore } from '../security/keyringSecretStore.js';
|
|
17
|
+
import { CredentialFileStore } from './credentialFileStore.js';
|
|
18
|
+
import { createLogger } from '../utils/logger.js';
|
|
17
19
|
import { KeyringUnavailableError, NotLoggedInError, PlatformUrlMismatchError, } from './types.js';
|
|
20
|
+
const logger = createLogger('auth.agentEnrollmentKeyringStore');
|
|
18
21
|
export const AGENT_ENROLLMENT_KEY_PREFIX = 'agent-enrollment.';
|
|
19
22
|
export const KEY_JWT = `${AGENT_ENROLLMENT_KEY_PREFIX}jwt`;
|
|
20
23
|
export const KEY_JWT_EXP = `${AGENT_ENROLLMENT_KEY_PREFIX}jwt_exp`;
|
|
@@ -44,8 +47,29 @@ export class AgentPlatformUrlMismatchError extends PlatformUrlMismatchError {
|
|
|
44
47
|
}
|
|
45
48
|
export class AgentEnrollmentKeyringStore {
|
|
46
49
|
store;
|
|
50
|
+
/** 0600 file mirror for the launchd daemon; null when explicitly disabled. */
|
|
51
|
+
fileStore;
|
|
47
52
|
constructor(opts = {}) {
|
|
48
53
|
this.store = opts.store ?? new KeyringSecretStore();
|
|
54
|
+
// File-mirror resolution (row 21b / F2):
|
|
55
|
+
// - explicit `null` → disabled (keyring-only)
|
|
56
|
+
// - explicit CredentialFileStore → use it
|
|
57
|
+
// - omitted + real keyring → real `~/.buildhive/` mirror (production)
|
|
58
|
+
// - omitted + INJECTED keyring (tests) → disabled, so unit tests never
|
|
59
|
+
// read/write the real user home. A test exercising the mirror must
|
|
60
|
+
// inject its own `fileStore` explicitly.
|
|
61
|
+
if (opts.fileStore === null) {
|
|
62
|
+
this.fileStore = null;
|
|
63
|
+
}
|
|
64
|
+
else if (opts.fileStore) {
|
|
65
|
+
this.fileStore = opts.fileStore;
|
|
66
|
+
}
|
|
67
|
+
else if (opts.store) {
|
|
68
|
+
this.fileStore = null;
|
|
69
|
+
}
|
|
70
|
+
else {
|
|
71
|
+
this.fileStore = new CredentialFileStore();
|
|
72
|
+
}
|
|
49
73
|
}
|
|
50
74
|
/** Throws KeyringUnavailableError if the OS keyring is unreachable. */
|
|
51
75
|
static async assertAvailable() {
|
|
@@ -75,6 +99,21 @@ export class AgentEnrollmentKeyringStore {
|
|
|
75
99
|
await this.clear().catch(() => undefined);
|
|
76
100
|
throw err;
|
|
77
101
|
}
|
|
102
|
+
// Row 21b / F2: mirror to the 0600 file so the launchd daemon (which
|
|
103
|
+
// cannot read the interactive login keychain) can resolve the JWT.
|
|
104
|
+
// The keychain write above is the source of truth; a file-mirror failure
|
|
105
|
+
// is non-fatal here (the agent is enrolled for foreground use) but it
|
|
106
|
+
// leaves the daemon unable to start — `join`'s post-bootstrap self-check
|
|
107
|
+
// (Fix D) and the doctor will surface that loudly rather than silently.
|
|
108
|
+
if (this.fileStore) {
|
|
109
|
+
try {
|
|
110
|
+
await this.fileStore.write(creds);
|
|
111
|
+
}
|
|
112
|
+
catch (err) {
|
|
113
|
+
logger.warn('Could not write the 0600 credential mirror — the background daemon may ' +
|
|
114
|
+
'not be able to read the JWT. Foreground `buildhive-agent start` is unaffected.', { err: err instanceof Error ? err.message : String(err) });
|
|
115
|
+
}
|
|
116
|
+
}
|
|
78
117
|
}
|
|
79
118
|
/**
|
|
80
119
|
* Read the stored JWT + its cached exp. Throws NotLoggedInError if absent
|
|
@@ -85,13 +124,17 @@ export class AgentEnrollmentKeyringStore {
|
|
|
85
124
|
this.store.getSecret(KEY_JWT),
|
|
86
125
|
this.store.getSecret(KEY_JWT_EXP),
|
|
87
126
|
]);
|
|
88
|
-
if (
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
127
|
+
if (token && expStr) {
|
|
128
|
+
const exp = Number(expStr);
|
|
129
|
+
return { jwt: token, expiresAtUnix: Number.isFinite(exp) ? exp : 0 };
|
|
130
|
+
}
|
|
131
|
+
// Row 21b / F2: keychain returned nothing (the daemon case) — fall back
|
|
132
|
+
// to the 0600 file mirror before declaring "not enrolled".
|
|
133
|
+
const fromFile = await this.readFromFileMirror();
|
|
134
|
+
if (fromFile) {
|
|
135
|
+
return { jwt: fromFile.jwt, expiresAtUnix: fromFile.jwtExpiresAtUnix };
|
|
136
|
+
}
|
|
137
|
+
throw new NotLoggedInError();
|
|
95
138
|
}
|
|
96
139
|
/**
|
|
97
140
|
* Read the full stored credentials (for display or row-17b supervisor).
|
|
@@ -105,43 +148,79 @@ export class AgentEnrollmentKeyringStore {
|
|
|
105
148
|
this.store.getSecret(KEY_TENANT_ID),
|
|
106
149
|
this.store.getSecret(KEY_PLATFORM_URL),
|
|
107
150
|
]);
|
|
108
|
-
if (
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
151
|
+
if (jwt && expStr && agentId && tenantId && platformUrl) {
|
|
152
|
+
const exp = Number(expStr);
|
|
153
|
+
return {
|
|
154
|
+
jwt,
|
|
155
|
+
jwtExpiresAtUnix: Number.isFinite(exp) ? exp : 0,
|
|
156
|
+
agentId,
|
|
157
|
+
tenantId,
|
|
158
|
+
platformUrl,
|
|
159
|
+
};
|
|
160
|
+
}
|
|
161
|
+
// Row 21b / F2: the launchd daemon's keychain read comes back empty.
|
|
162
|
+
// Fall back to the 0600 file mirror written by `join` before throwing.
|
|
163
|
+
const fromFile = await this.readFromFileMirror();
|
|
164
|
+
if (fromFile)
|
|
165
|
+
return fromFile;
|
|
166
|
+
throw new NotLoggedInError();
|
|
167
|
+
}
|
|
168
|
+
/**
|
|
169
|
+
* Read the 0600 file mirror, swallowing any error to null. Centralised so
|
|
170
|
+
* every read path (readJwt / readAll / hasEnrollment) shares one fallback.
|
|
171
|
+
*/
|
|
172
|
+
async readFromFileMirror() {
|
|
173
|
+
if (!this.fileStore)
|
|
174
|
+
return null;
|
|
175
|
+
try {
|
|
176
|
+
const creds = await this.fileStore.read();
|
|
177
|
+
if (creds) {
|
|
178
|
+
logger.info('Resolved agent credentials from the 0600 file mirror (keychain unavailable in this session)');
|
|
179
|
+
}
|
|
180
|
+
return creds;
|
|
181
|
+
}
|
|
182
|
+
catch {
|
|
183
|
+
return null;
|
|
184
|
+
}
|
|
119
185
|
}
|
|
120
186
|
/**
|
|
121
187
|
* Cross-platform-url check. Refuses to use credentials minted against a
|
|
122
188
|
* different BuildHive instance.
|
|
123
189
|
*/
|
|
124
190
|
async assertPlatformUrlMatches(currentPlatformUrl) {
|
|
125
|
-
|
|
126
|
-
if (!stored)
|
|
127
|
-
|
|
191
|
+
let stored = await this.store.getSecret(KEY_PLATFORM_URL);
|
|
192
|
+
if (!stored) {
|
|
193
|
+
// Row 21b / F2: keep the cross-instance guard working in the daemon
|
|
194
|
+
// session, where the keychain read comes back empty — fall back to the
|
|
195
|
+
// 0600 file mirror before declaring "not enrolled".
|
|
196
|
+
const fromFile = await this.readFromFileMirror();
|
|
197
|
+
if (!fromFile)
|
|
198
|
+
throw new NotLoggedInError();
|
|
199
|
+
stored = fromFile.platformUrl;
|
|
200
|
+
}
|
|
128
201
|
if (normalizeUrl(stored) !== normalizeUrl(currentPlatformUrl)) {
|
|
129
202
|
throw new AgentPlatformUrlMismatchError(stored, currentPlatformUrl);
|
|
130
203
|
}
|
|
131
204
|
}
|
|
132
|
-
/** Returns true if
|
|
205
|
+
/** Returns true if an enrollment JWT exists in the keyring OR the file mirror. */
|
|
133
206
|
async hasEnrollment() {
|
|
134
207
|
const jwt = await this.store.getSecret(KEY_JWT);
|
|
135
|
-
|
|
208
|
+
if (jwt !== null && jwt.length > 0)
|
|
209
|
+
return true;
|
|
210
|
+
const fromFile = await this.readFromFileMirror();
|
|
211
|
+
return fromFile !== null;
|
|
136
212
|
}
|
|
137
213
|
/**
|
|
138
|
-
* Delete all 5 `agent-enrollment.*` keys. Idempotent.
|
|
139
|
-
* Called by
|
|
214
|
+
* Delete all 5 `agent-enrollment.*` keys AND the 0600 file mirror. Idempotent.
|
|
215
|
+
* Called by `buildhive-agent logout` (row 21b / F4) to fully de-enroll.
|
|
140
216
|
*/
|
|
141
217
|
async clear() {
|
|
142
218
|
for (const k of ALL_KEYS) {
|
|
143
219
|
await this.store.deleteSecret(k).catch(() => false);
|
|
144
220
|
}
|
|
221
|
+
if (this.fileStore) {
|
|
222
|
+
await this.fileStore.delete().catch(() => undefined);
|
|
223
|
+
}
|
|
145
224
|
}
|
|
146
225
|
}
|
|
147
226
|
function normalizeUrl(u) {
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* credentialFileStore — 0600 on-disk fallback for the agent-enrollment
|
|
3
|
+
* credentials, so the **launchd-managed daemon** can read its JWT.
|
|
4
|
+
*
|
|
5
|
+
* ── Why this exists (row 21b / F2, 2026-06-02) ───────────────────────────────
|
|
6
|
+
* The 2026-06-01 verification walk found the LaunchAgent daemon crashlooping
|
|
7
|
+
* with `last exit code = 1` and empty logs. Root cause (proven in
|
|
8
|
+
* docs/ops/diagnosis-f3-agent-daemon-2026-06-02.html): the macOS Keychain item
|
|
9
|
+
* holding the JWT is created by the *foreground* `join` process (Terminal
|
|
10
|
+
* security session). The launchd-spawned daemon runs in a different security
|
|
11
|
+
* session (`SessionCreate=true`), so `@napi-rs/keyring`'s `getPassword()`
|
|
12
|
+
* returns `null` for it (silent access-deny). The daemon then reads "no JWT" →
|
|
13
|
+
* "Not enrolled" → exits 1 → launchd respawns → silent crashloop.
|
|
14
|
+
*
|
|
15
|
+
* GitHub's own `actions/runner` `svc.sh` stores its `.credentials` as files in
|
|
16
|
+
* the runner directory for exactly this reason — a launchd/systemd service
|
|
17
|
+
* cannot rely on the interactive login keychain.
|
|
18
|
+
*
|
|
19
|
+
* ── Design ───────────────────────────────────────────────────────────────────
|
|
20
|
+
* The OS keyring stays the PRIMARY store (used by foreground `start`, where it
|
|
21
|
+
* works and is the more-secure option). This file is a daemon-readable MIRROR:
|
|
22
|
+
* `join` writes both; the resolver falls back to the file only when the keyring
|
|
23
|
+
* returns nothing (the daemon case).
|
|
24
|
+
*
|
|
25
|
+
* Security:
|
|
26
|
+
* - File mode 0600 (owner read/write only), parent dir 0700.
|
|
27
|
+
* - Lives under the user's own `~/.buildhive/` — same trust boundary as the
|
|
28
|
+
* actions/runner `.credentials` file and the workspaces dir.
|
|
29
|
+
* - Never packaged (runtime state in $HOME, not in the npm tarball).
|
|
30
|
+
*
|
|
31
|
+
* This module is pure-where-possible: fs + homedir are injectable for tests.
|
|
32
|
+
*/
|
|
33
|
+
import type { AgentEnrollmentCredentials } from './agentEnrollmentKeyringStore.js';
|
|
34
|
+
/** Basename of the credential mirror file under `~/.buildhive/`. */
|
|
35
|
+
export declare const CREDENTIAL_FILE_NAME = "agent-enrollment.cred";
|
|
36
|
+
/** Injectable fs surface (subset of node:fs/promises we use). */
|
|
37
|
+
export interface CredentialFileFs {
|
|
38
|
+
readonly mkdir: (path: string, opts: {
|
|
39
|
+
recursive: boolean;
|
|
40
|
+
mode?: number;
|
|
41
|
+
}) => Promise<string | undefined>;
|
|
42
|
+
readonly writeFile: (path: string, data: string, opts: {
|
|
43
|
+
encoding: 'utf8';
|
|
44
|
+
mode?: number;
|
|
45
|
+
}) => Promise<void>;
|
|
46
|
+
readonly chmod: (path: string, mode: number) => Promise<void>;
|
|
47
|
+
readonly rename: (from: string, to: string) => Promise<void>;
|
|
48
|
+
readonly readFile: (path: string, encoding: 'utf8') => Promise<string>;
|
|
49
|
+
readonly unlink: (path: string) => Promise<void>;
|
|
50
|
+
}
|
|
51
|
+
export interface CredentialFileStoreOptions {
|
|
52
|
+
/** Override the home directory (tests + multi-user). */
|
|
53
|
+
readonly homeDir?: string;
|
|
54
|
+
/** Override the fs surface (tests). */
|
|
55
|
+
readonly fs?: CredentialFileFs;
|
|
56
|
+
}
|
|
57
|
+
/**
|
|
58
|
+
* Reads/writes the agent-enrollment credentials as a single 0600 JSON file.
|
|
59
|
+
*/
|
|
60
|
+
export declare class CredentialFileStore {
|
|
61
|
+
private readonly fs;
|
|
62
|
+
private readonly dir;
|
|
63
|
+
private readonly filePath;
|
|
64
|
+
constructor(opts?: CredentialFileStoreOptions);
|
|
65
|
+
/** Absolute path to the credential file (for diagnostics + doctor). */
|
|
66
|
+
get path(): string;
|
|
67
|
+
/**
|
|
68
|
+
* Atomically write the credential mirror with 0600 perms.
|
|
69
|
+
* tmp-file → chmod 0600 → rename avoids a torn read by the daemon.
|
|
70
|
+
*/
|
|
71
|
+
write(creds: AgentEnrollmentCredentials): Promise<void>;
|
|
72
|
+
/**
|
|
73
|
+
* Read the credential mirror. Returns null when absent or malformed — never
|
|
74
|
+
* throws, so the resolver can cleanly fall through to "not enrolled".
|
|
75
|
+
*/
|
|
76
|
+
read(): Promise<AgentEnrollmentCredentials | null>;
|
|
77
|
+
/** Delete the credential mirror. Idempotent (ENOENT is not an error). */
|
|
78
|
+
delete(): Promise<void>;
|
|
79
|
+
}
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* credentialFileStore — 0600 on-disk fallback for the agent-enrollment
|
|
3
|
+
* credentials, so the **launchd-managed daemon** can read its JWT.
|
|
4
|
+
*
|
|
5
|
+
* ── Why this exists (row 21b / F2, 2026-06-02) ───────────────────────────────
|
|
6
|
+
* The 2026-06-01 verification walk found the LaunchAgent daemon crashlooping
|
|
7
|
+
* with `last exit code = 1` and empty logs. Root cause (proven in
|
|
8
|
+
* docs/ops/diagnosis-f3-agent-daemon-2026-06-02.html): the macOS Keychain item
|
|
9
|
+
* holding the JWT is created by the *foreground* `join` process (Terminal
|
|
10
|
+
* security session). The launchd-spawned daemon runs in a different security
|
|
11
|
+
* session (`SessionCreate=true`), so `@napi-rs/keyring`'s `getPassword()`
|
|
12
|
+
* returns `null` for it (silent access-deny). The daemon then reads "no JWT" →
|
|
13
|
+
* "Not enrolled" → exits 1 → launchd respawns → silent crashloop.
|
|
14
|
+
*
|
|
15
|
+
* GitHub's own `actions/runner` `svc.sh` stores its `.credentials` as files in
|
|
16
|
+
* the runner directory for exactly this reason — a launchd/systemd service
|
|
17
|
+
* cannot rely on the interactive login keychain.
|
|
18
|
+
*
|
|
19
|
+
* ── Design ───────────────────────────────────────────────────────────────────
|
|
20
|
+
* The OS keyring stays the PRIMARY store (used by foreground `start`, where it
|
|
21
|
+
* works and is the more-secure option). This file is a daemon-readable MIRROR:
|
|
22
|
+
* `join` writes both; the resolver falls back to the file only when the keyring
|
|
23
|
+
* returns nothing (the daemon case).
|
|
24
|
+
*
|
|
25
|
+
* Security:
|
|
26
|
+
* - File mode 0600 (owner read/write only), parent dir 0700.
|
|
27
|
+
* - Lives under the user's own `~/.buildhive/` — same trust boundary as the
|
|
28
|
+
* actions/runner `.credentials` file and the workspaces dir.
|
|
29
|
+
* - Never packaged (runtime state in $HOME, not in the npm tarball).
|
|
30
|
+
*
|
|
31
|
+
* This module is pure-where-possible: fs + homedir are injectable for tests.
|
|
32
|
+
*/
|
|
33
|
+
import { homedir } from 'node:os';
|
|
34
|
+
import { join } from 'node:path';
|
|
35
|
+
import { promises as defaultFsPromises } from 'node:fs';
|
|
36
|
+
/** Basename of the credential mirror file under `~/.buildhive/`. */
|
|
37
|
+
export const CREDENTIAL_FILE_NAME = 'agent-enrollment.cred';
|
|
38
|
+
const defaultFs = {
|
|
39
|
+
mkdir: (p, o) => defaultFsPromises.mkdir(p, o),
|
|
40
|
+
writeFile: (p, d, o) => defaultFsPromises.writeFile(p, d, o),
|
|
41
|
+
chmod: (p, m) => defaultFsPromises.chmod(p, m),
|
|
42
|
+
rename: (a, b) => defaultFsPromises.rename(a, b),
|
|
43
|
+
readFile: (p, e) => defaultFsPromises.readFile(p, e),
|
|
44
|
+
unlink: (p) => defaultFsPromises.unlink(p),
|
|
45
|
+
};
|
|
46
|
+
/**
|
|
47
|
+
* Reads/writes the agent-enrollment credentials as a single 0600 JSON file.
|
|
48
|
+
*/
|
|
49
|
+
export class CredentialFileStore {
|
|
50
|
+
fs;
|
|
51
|
+
dir;
|
|
52
|
+
filePath;
|
|
53
|
+
constructor(opts = {}) {
|
|
54
|
+
this.fs = opts.fs ?? defaultFs;
|
|
55
|
+
const home = opts.homeDir ?? homedir();
|
|
56
|
+
this.dir = join(home, '.buildhive');
|
|
57
|
+
this.filePath = join(this.dir, CREDENTIAL_FILE_NAME);
|
|
58
|
+
}
|
|
59
|
+
/** Absolute path to the credential file (for diagnostics + doctor). */
|
|
60
|
+
get path() {
|
|
61
|
+
return this.filePath;
|
|
62
|
+
}
|
|
63
|
+
/**
|
|
64
|
+
* Atomically write the credential mirror with 0600 perms.
|
|
65
|
+
* tmp-file → chmod 0600 → rename avoids a torn read by the daemon.
|
|
66
|
+
*/
|
|
67
|
+
async write(creds) {
|
|
68
|
+
await this.fs.mkdir(this.dir, { recursive: true, mode: 0o700 });
|
|
69
|
+
const tmpPath = `${this.filePath}.tmp.${process.pid}`;
|
|
70
|
+
const payload = JSON.stringify({
|
|
71
|
+
jwt: creds.jwt,
|
|
72
|
+
jwtExpiresAtUnix: creds.jwtExpiresAtUnix,
|
|
73
|
+
agentId: creds.agentId,
|
|
74
|
+
tenantId: creds.tenantId,
|
|
75
|
+
platformUrl: creds.platformUrl,
|
|
76
|
+
});
|
|
77
|
+
await this.fs.writeFile(tmpPath, payload, { encoding: 'utf8', mode: 0o600 });
|
|
78
|
+
try {
|
|
79
|
+
// Belt-and-suspenders over the writeFile mode (umask can clear bits).
|
|
80
|
+
await this.fs.chmod(tmpPath, 0o600);
|
|
81
|
+
await this.fs.rename(tmpPath, this.filePath);
|
|
82
|
+
}
|
|
83
|
+
catch (err) {
|
|
84
|
+
// Don't leave a tmp file holding the JWT on a chmod/rename failure
|
|
85
|
+
// (ENOSPC, cross-device). A writeFile failure above is outside this try and is not cleaned up here. Best-effort cleanup, re-throw.
|
|
86
|
+
await this.fs.unlink(tmpPath).catch(() => undefined);
|
|
87
|
+
throw err;
|
|
88
|
+
}
|
|
89
|
+
}
|
|
90
|
+
/**
|
|
91
|
+
* Read the credential mirror. Returns null when absent or malformed — never
|
|
92
|
+
* throws, so the resolver can cleanly fall through to "not enrolled".
|
|
93
|
+
*/
|
|
94
|
+
async read() {
|
|
95
|
+
let raw;
|
|
96
|
+
try {
|
|
97
|
+
raw = await this.fs.readFile(this.filePath, 'utf8');
|
|
98
|
+
}
|
|
99
|
+
catch {
|
|
100
|
+
return null; // ENOENT or unreadable
|
|
101
|
+
}
|
|
102
|
+
let parsed;
|
|
103
|
+
try {
|
|
104
|
+
parsed = JSON.parse(raw);
|
|
105
|
+
}
|
|
106
|
+
catch {
|
|
107
|
+
return null;
|
|
108
|
+
}
|
|
109
|
+
if (!parsed || typeof parsed !== 'object')
|
|
110
|
+
return null;
|
|
111
|
+
const o = parsed;
|
|
112
|
+
if (typeof o.jwt !== 'string' ||
|
|
113
|
+
typeof o.agentId !== 'string' ||
|
|
114
|
+
typeof o.tenantId !== 'string' ||
|
|
115
|
+
typeof o.platformUrl !== 'string' ||
|
|
116
|
+
o.jwt.length === 0 ||
|
|
117
|
+
o.agentId.length === 0 ||
|
|
118
|
+
o.tenantId.length === 0 ||
|
|
119
|
+
o.platformUrl.length === 0) {
|
|
120
|
+
return null;
|
|
121
|
+
}
|
|
122
|
+
const exp = typeof o.jwtExpiresAtUnix === 'number' ? o.jwtExpiresAtUnix : 0;
|
|
123
|
+
return {
|
|
124
|
+
jwt: o.jwt,
|
|
125
|
+
jwtExpiresAtUnix: Number.isFinite(exp) ? exp : 0,
|
|
126
|
+
agentId: o.agentId,
|
|
127
|
+
tenantId: o.tenantId,
|
|
128
|
+
platformUrl: o.platformUrl,
|
|
129
|
+
};
|
|
130
|
+
}
|
|
131
|
+
/** Delete the credential mirror. Idempotent (ENOENT is not an error). */
|
|
132
|
+
async delete() {
|
|
133
|
+
try {
|
|
134
|
+
await this.fs.unlink(this.filePath);
|
|
135
|
+
}
|
|
136
|
+
catch {
|
|
137
|
+
// best-effort — file may already be gone
|
|
138
|
+
}
|
|
139
|
+
}
|
|
140
|
+
}
|
|
@@ -38,6 +38,21 @@ export interface ServiceInstallerInjection {
|
|
|
38
38
|
paths: ServicePaths;
|
|
39
39
|
bootstrapStdout: string;
|
|
40
40
|
}>;
|
|
41
|
+
/**
|
|
42
|
+
* Row 21b / Fix D — read the launchd service health after bootstrap so
|
|
43
|
+
* `join` can self-verify the daemon actually started (instead of silently
|
|
44
|
+
* installing a crashlooping daemon). Optional: when omitted, the real
|
|
45
|
+
* `getServiceStatus` is used via dynamic import.
|
|
46
|
+
*/
|
|
47
|
+
readonly getServiceStatus?: (opts: {
|
|
48
|
+
mode: 'user' | 'system';
|
|
49
|
+
label?: string;
|
|
50
|
+
homeDir?: string;
|
|
51
|
+
}) => Promise<{
|
|
52
|
+
loaded: boolean;
|
|
53
|
+
state: 'running' | 'waiting' | 'unknown' | null;
|
|
54
|
+
lastExitCode: number | null;
|
|
55
|
+
}>;
|
|
41
56
|
}
|
|
42
57
|
export interface JoinOptions {
|
|
43
58
|
readonly token: string;
|
|
@@ -69,6 +84,14 @@ export interface JoinOptions {
|
|
|
69
84
|
* the test runner, not at dist/cli.js).
|
|
70
85
|
*/
|
|
71
86
|
readonly cliEntryPathOverride?: string;
|
|
87
|
+
/**
|
|
88
|
+
* Row 21b / Fix D — how long to wait after bootstrap before reading the
|
|
89
|
+
* daemon's launchd health (gives RunAtLoad + the FB16131937 startup guard
|
|
90
|
+
* time to fire). Default 3000ms; tests pass 0.
|
|
91
|
+
*/
|
|
92
|
+
readonly verifyDelayMs?: number;
|
|
93
|
+
/** Sleep injection for the Fix-D verify delay (tests pass a no-op). */
|
|
94
|
+
readonly sleepFn?: (ms: number) => Promise<void>;
|
|
72
95
|
}
|
|
73
96
|
export interface JoinResult {
|
|
74
97
|
readonly exitCode: 0 | 1 | 2 | 3;
|
package/dist/auth/joinCommand.js
CHANGED
|
@@ -195,19 +195,24 @@ export async function runJoin(opts) {
|
|
|
195
195
|
}
|
|
196
196
|
// 8. Print enrollment success.
|
|
197
197
|
const agentIdShort = agentId.slice(0, 8);
|
|
198
|
-
|
|
198
|
+
// teamName is cosmetic; guard against an older/forward backend that omits it.
|
|
199
|
+
console.log(`✓ Agent enrolled in team "${teamName ?? '(unknown)'}" as agent ${agentIdShort}…`);
|
|
199
200
|
console.log(`✓ JWT stored in OS keyring (expires ${expiresDate}).`);
|
|
200
201
|
// 9. S-1: Auto-install LaunchAgent so the dev never thinks about persistence.
|
|
201
202
|
// Idempotent — re-running `join` cleanly upgrades the plist (the underlying
|
|
202
203
|
// installer uses an atomic tmp-file → rename).
|
|
204
|
+
// Row 21b / Fix D: maybeInstallLaunchAgent ALSO self-verifies the daemon
|
|
205
|
+
// actually came up; it returns false (with a loud message) when the daemon
|
|
206
|
+
// installed but crashed, so we never claim "set and forget" falsely.
|
|
203
207
|
const serviceInstalled = await maybeInstallLaunchAgent(opts);
|
|
204
208
|
if (serviceInstalled) {
|
|
205
|
-
console.log('✓ Agent installed and will auto-start on every login. ' +
|
|
209
|
+
console.log('✓ Agent installed and running — it will auto-start on every login. ' +
|
|
206
210
|
'Inspect logs with `buildhive-agent logs`.');
|
|
207
211
|
}
|
|
208
212
|
else {
|
|
209
|
-
// Either non-macOS, opt-out,
|
|
210
|
-
//
|
|
213
|
+
// Either non-macOS, opt-out, install failed, OR the daemon did not start
|
|
214
|
+
// cleanly (Fix D). maybeInstallLaunchAgent already printed the specific
|
|
215
|
+
// reason; give the foreground fallback the user can always run.
|
|
211
216
|
console.log('Run `buildhive-agent start` to begin picking up workflow jobs.');
|
|
212
217
|
}
|
|
213
218
|
return { exitCode: 0, serviceInstalled };
|
|
@@ -265,7 +270,6 @@ async function maybeInstallLaunchAgent(opts) {
|
|
|
265
270
|
}
|
|
266
271
|
try {
|
|
267
272
|
await installFn({ mode: 'user', cliEntryPath });
|
|
268
|
-
return true;
|
|
269
273
|
}
|
|
270
274
|
catch (err) {
|
|
271
275
|
// Already-loaded-service is a benign case — installService's atomic
|
|
@@ -277,6 +281,56 @@ async function maybeInstallLaunchAgent(opts) {
|
|
|
277
281
|
'or run the agent in the foreground via `buildhive-agent start`.');
|
|
278
282
|
return false;
|
|
279
283
|
}
|
|
284
|
+
// Row 21b / Fix D — self-verify the daemon actually started. The 2026-06-01
|
|
285
|
+
// walk installed a daemon that crashlooped silently (last exit code = 1) yet
|
|
286
|
+
// join reported success. Now we wait, read launchd health, and fail LOUD.
|
|
287
|
+
return verifyDaemonStarted(opts);
|
|
288
|
+
}
|
|
289
|
+
/**
|
|
290
|
+
* Fix D: after bootstrap, give launchd's RunAtLoad + the FB16131937 startup
|
|
291
|
+
* guard a moment to fire, then read the daemon's health. Returns true only when
|
|
292
|
+
* the daemon looks up (loaded, no non-zero last exit code) or its health cannot be read. On a crash it prints an
|
|
293
|
+
* actionable message and returns false so `join` recommends the foreground
|
|
294
|
+
* fallback instead of claiming "set and forget".
|
|
295
|
+
*/
|
|
296
|
+
async function verifyDaemonStarted(opts) {
|
|
297
|
+
const sleep = opts.sleepFn ?? ((ms) => new Promise((r) => setTimeout(r, ms)));
|
|
298
|
+
const delay = opts.verifyDelayMs ?? 3000;
|
|
299
|
+
// Resolve the status reader (injected for tests, real one otherwise).
|
|
300
|
+
let statusFn;
|
|
301
|
+
if (opts.serviceInstaller?.getServiceStatus) {
|
|
302
|
+
statusFn = opts.serviceInstaller.getServiceStatus;
|
|
303
|
+
}
|
|
304
|
+
else {
|
|
305
|
+
try {
|
|
306
|
+
const mod = await import('../service/serviceInstaller.js');
|
|
307
|
+
statusFn = mod.getServiceStatus;
|
|
308
|
+
}
|
|
309
|
+
catch {
|
|
310
|
+
// Can't verify — assume installed (don't block enrollment), but say so.
|
|
311
|
+
console.error('[join] Service installed; could not verify daemon health automatically.');
|
|
312
|
+
return true;
|
|
313
|
+
}
|
|
314
|
+
}
|
|
315
|
+
await sleep(delay);
|
|
316
|
+
let status;
|
|
317
|
+
try {
|
|
318
|
+
status = await statusFn({ mode: 'user' });
|
|
319
|
+
}
|
|
320
|
+
catch {
|
|
321
|
+
console.error('[join] Service installed; could not read daemon health (launchctl print failed).');
|
|
322
|
+
return true;
|
|
323
|
+
}
|
|
324
|
+
const crashed = status.lastExitCode !== null && status.lastExitCode !== 0;
|
|
325
|
+
if (!status.loaded || crashed) {
|
|
326
|
+
console.error('[join] ⚠ The background service was installed but did NOT start cleanly' +
|
|
327
|
+
(crashed ? ` (last exit code = ${status.lastExitCode}).` : ' (not loaded).'));
|
|
328
|
+
console.error('[join] Diagnose with `buildhive-agent doctor`, read the log with ' +
|
|
329
|
+
'`buildhive-agent logs` (~/.buildhive/logs/buildhive-agent.log), ' +
|
|
330
|
+
'or run `buildhive-agent start` in the foreground for now.');
|
|
331
|
+
return false;
|
|
332
|
+
}
|
|
333
|
+
return true;
|
|
280
334
|
}
|
|
281
335
|
/** Pull tenant_id from JWT payload without signature verification. */
|
|
282
336
|
function extractTenantId(jwtToken) {
|
package/dist/doctor/runChecks.js
CHANGED
|
@@ -291,14 +291,29 @@ export function checkServiceLoaded(deps) {
|
|
|
291
291
|
};
|
|
292
292
|
}
|
|
293
293
|
}
|
|
294
|
-
// 16: agent process is
|
|
295
|
-
// `state`
|
|
294
|
+
// 16: agent process is actually healthy (as reported by launchctl print's
|
|
295
|
+
// `state` AND `last exit code` fields). Read-only on the cached print output
|
|
296
|
+
// from check 15.
|
|
296
297
|
//
|
|
297
|
-
//
|
|
298
|
-
// `state =
|
|
299
|
-
//
|
|
300
|
-
//
|
|
301
|
-
//
|
|
298
|
+
// Row 21b / Fix C — HONESTY. The old version returned a soft WARNING for
|
|
299
|
+
// `state = spawn scheduled`, which HID a crashlooping daemon (the 2026-06-01
|
|
300
|
+
// walk saw 14/15 "healthy" while the daemon was dead, last exit code = 1).
|
|
301
|
+
// New contract:
|
|
302
|
+
// last exit code != 0 → FAIL (daemon crashed / is crashlooping)
|
|
303
|
+
// state = running, exit 0/absent → PASS
|
|
304
|
+
// state = spawn scheduled → WARN (launchd is throttling respawns)
|
|
305
|
+
// state = waiting → WARN (KeepAlive should restart it)
|
|
306
|
+
// anything else → WARN (unknown — surface verbatim)
|
|
307
|
+
// not loaded → FAIL (upstream check 15 already failed)
|
|
308
|
+
const SERVICE_CRASH_FIX =
|
|
309
|
+
// NB: the daemon logs to ~/.buildhive/logs/buildhive-agent.log (via pino-roll),
|
|
310
|
+
// NOT the launchd StandardOutPath file, which is empty in production. Point
|
|
311
|
+
// operators at `buildhive-agent logs` so they don't chase the empty stdout
|
|
312
|
+
// file the 2026-06-01 walk got stuck on.
|
|
313
|
+
'The background daemon crashed. Inspect its log with `buildhive-agent logs` ' +
|
|
314
|
+
'(file: ~/.buildhive/logs/buildhive-agent.log), then re-run ' +
|
|
315
|
+
'`buildhive-agent join <token>` (re-enrolls + reinstalls). ' +
|
|
316
|
+
'You can always run `buildhive-agent start` in the foreground meanwhile.';
|
|
302
317
|
export function checkServiceRunning(deps, printOutput) {
|
|
303
318
|
if (deps.platform !== 'darwin') {
|
|
304
319
|
return {
|
|
@@ -312,18 +327,39 @@ export function checkServiceRunning(deps, printOutput) {
|
|
|
312
327
|
message: 'Service not loaded (see previous check)',
|
|
313
328
|
};
|
|
314
329
|
}
|
|
315
|
-
|
|
330
|
+
// Parse `last exit code = N` first — a non-zero value means the daemon died,
|
|
331
|
+
// and takes precedence over whatever transient `state` launchd reports.
|
|
332
|
+
const exitMatch = printOutput.stdout.match(/last exit code\s*=\s*(-?\d+)/i);
|
|
333
|
+
const lastExitCode = exitMatch ? parseInt(exitMatch[1], 10) : null;
|
|
334
|
+
if (lastExitCode !== null && lastExitCode !== 0) {
|
|
335
|
+
return {
|
|
336
|
+
name: 'Agent process running', status: 'fail',
|
|
337
|
+
message: `Daemon is not healthy — last exit code = ${lastExitCode} (crashing on start)`,
|
|
338
|
+
fix: SERVICE_CRASH_FIX,
|
|
339
|
+
};
|
|
340
|
+
}
|
|
341
|
+
// `state = spawn scheduled` has a space — capture the whole value, not just
|
|
342
|
+
// the first word (the old `\w+` regex truncated it to "spawn"). Anchor on
|
|
343
|
+
// start-of-string OR newline so the first (top-level) `state =` is taken.
|
|
344
|
+
const stateMatch = printOutput.stdout.match(/(?:^|\n)\s*state\s*=\s*([^\n]+)/);
|
|
316
345
|
if (!stateMatch) {
|
|
317
346
|
return {
|
|
318
347
|
name: 'Agent process running', status: 'warn',
|
|
319
348
|
message: 'Could not parse `state =` from launchctl print output',
|
|
320
349
|
};
|
|
321
350
|
}
|
|
322
|
-
const stateRaw = stateMatch[1];
|
|
351
|
+
const stateRaw = stateMatch[1].trim();
|
|
323
352
|
const state = stateRaw.toLowerCase();
|
|
324
353
|
if (state.includes('run')) {
|
|
325
354
|
return { name: 'Agent process running', status: 'pass', message: `state = ${stateRaw}` };
|
|
326
355
|
}
|
|
356
|
+
if (state.includes('spawn')) {
|
|
357
|
+
return {
|
|
358
|
+
name: 'Agent process running', status: 'warn',
|
|
359
|
+
message: `Daemon is being (re)spawned by launchd (state = ${stateRaw}); re-run doctor in a few seconds`,
|
|
360
|
+
fix: SERVICE_CRASH_FIX,
|
|
361
|
+
};
|
|
362
|
+
}
|
|
327
363
|
if (state.includes('wait')) {
|
|
328
364
|
return {
|
|
329
365
|
name: 'Agent process running', status: 'warn',
|
|
@@ -14,15 +14,25 @@
|
|
|
14
14
|
*
|
|
15
15
|
* Row 17b — Wave 1 (agent supervisor + actions/runner spawn).
|
|
16
16
|
* Row 18b S7 — Wave 2 (cache env injection into runner spawn).
|
|
17
|
+
* Row 21b — Fix A: the default single-slot `start` now REPLENISHES. Each
|
|
18
|
+
* ephemeral runner still handles one job then exits (binding constraint #5),
|
|
19
|
+
* but the supervisor re-registers and keeps listening until SIGTERM, so the
|
|
20
|
+
* agent runs forever ("set and forget") instead of stopping after one job.
|
|
17
21
|
*
|
|
18
22
|
* Exit-code contract:
|
|
19
|
-
* 0 —
|
|
20
|
-
*
|
|
23
|
+
* 0 — supervisor drained cleanly after SIGTERM/SIGINT (the normal stop path),
|
|
24
|
+
* OR (pre-start) a successful no-op. The supervisor does NOT exit after a
|
|
25
|
+
* single job anymore.
|
|
26
|
+
* 1 — fatal pre-supervisor error (auth, no JWT, no enrolled repo, binary
|
|
27
|
+
* download); an unsupported platform is reported separately as exit code 2
|
|
21
28
|
* 2 — platform not supported
|
|
22
29
|
*/
|
|
23
30
|
import { AgentEnrollmentKeyringStore } from '../auth/agentEnrollmentKeyringStore.js';
|
|
31
|
+
import { fetchRunnerToken } from './tokenClient.js';
|
|
24
32
|
import { fetchMyRepos } from './myReposClient.js';
|
|
25
33
|
import { ensureRunner } from './binaryFetcher.js';
|
|
34
|
+
import { configureRunner, runRunner } from './supervisor.js';
|
|
35
|
+
import { startHeartbeat } from './heartbeat.js';
|
|
26
36
|
import { type PoolDeps } from './pool.js';
|
|
27
37
|
import type { EnsureRunnerOptions } from './binaryFetcher.js';
|
|
28
38
|
import type { CacheEnvDeps } from './cacheEnv.js';
|
|
@@ -61,6 +71,23 @@ export interface StartOptions {
|
|
|
61
71
|
* Allows injecting fake configureRunner / runRunner / fetchRunnerToken / prepareSlotDir.
|
|
62
72
|
*/
|
|
63
73
|
readonly poolDeps?: Partial<PoolDeps>;
|
|
74
|
+
/**
|
|
75
|
+
* Dependency injection for the N==1 replenishing supervisor (Row 21b / Fix A).
|
|
76
|
+
* Tests inject fakes to drive the replenishment loop deterministically and
|
|
77
|
+
* assert backoff / replenish behaviour. Omit in production (real impls used).
|
|
78
|
+
*/
|
|
79
|
+
readonly supervisorDeps?: Partial<{
|
|
80
|
+
configureRunner: typeof configureRunner;
|
|
81
|
+
runRunner: typeof runRunner;
|
|
82
|
+
fetchRunnerToken: typeof fetchRunnerToken;
|
|
83
|
+
}>;
|
|
84
|
+
/**
|
|
85
|
+
* Dependency injection for the heartbeat (tests only). The real
|
|
86
|
+
* {@link startHeartbeat} POSTs to the backend on a 30s timer and once more
|
|
87
|
+
* on `stop()`; injecting a no-op keeps the supervisor-loop tests off the
|
|
88
|
+
* network and deterministic. Omit in production.
|
|
89
|
+
*/
|
|
90
|
+
readonly startHeartbeatFn?: typeof startHeartbeat;
|
|
64
91
|
}
|
|
65
92
|
export interface StartResult {
|
|
66
93
|
readonly exitCode: 0 | 1 | 2;
|
|
@@ -14,10 +14,17 @@
|
|
|
14
14
|
*
|
|
15
15
|
* Row 17b — Wave 1 (agent supervisor + actions/runner spawn).
|
|
16
16
|
* Row 18b S7 — Wave 2 (cache env injection into runner spawn).
|
|
17
|
+
* Row 21b — Fix A: the default single-slot `start` now REPLENISHES. Each
|
|
18
|
+
* ephemeral runner still handles one job then exits (binding constraint #5),
|
|
19
|
+
* but the supervisor re-registers and keeps listening until SIGTERM, so the
|
|
20
|
+
* agent runs forever ("set and forget") instead of stopping after one job.
|
|
17
21
|
*
|
|
18
22
|
* Exit-code contract:
|
|
19
|
-
* 0 —
|
|
20
|
-
*
|
|
23
|
+
* 0 — supervisor drained cleanly after SIGTERM/SIGINT (the normal stop path),
|
|
24
|
+
* OR (pre-start) a successful no-op. The supervisor does NOT exit after a
|
|
25
|
+
* single job anymore.
|
|
26
|
+
* 1 — fatal pre-supervisor error (auth, no JWT, no enrolled repo, binary
|
|
27
|
+
* download); an unsupported platform is reported separately as exit code 2
|
|
21
28
|
* 2 — platform not supported
|
|
22
29
|
*/
|
|
23
30
|
import os from 'os';
|
|
@@ -32,6 +39,29 @@ import { startHeartbeat } from './heartbeat.js';
|
|
|
32
39
|
import { collectDeviceInfo } from './deviceInfo.js';
|
|
33
40
|
import { runPool } from './pool.js';
|
|
34
41
|
const logger = createLogger('runner.startCommand');
|
|
42
|
+
/**
|
|
43
|
+
* Row 21b / Fix A — supervisor backoff bounds.
|
|
44
|
+
*
|
|
45
|
+
* When a registration cycle fails (token fetch or config.sh), the supervisor
|
|
46
|
+
* does NOT exit (that would crashloop under launchd KeepAlive). It backs off
|
|
47
|
+
* with capped exponential delay and retries, so a transient backend blip
|
|
48
|
+
* self-heals and a persistent fault (e.g. repo de-enrolled) loops calmly —
|
|
49
|
+
* surfaced loudly by the dashboard (OFFLINE) and `buildhive-agent doctor`.
|
|
50
|
+
*/
|
|
51
|
+
const SUPERVISOR_BACKOFF_BASE_MS = 2_000;
|
|
52
|
+
const SUPERVISOR_BACKOFF_MAX_MS = 30_000;
|
|
53
|
+
/**
 * Sleep for up to `ms` milliseconds, returning early once `isCancelled()`
 * reports true, so a SIGTERM during a backoff drains promptly instead of
 * waiting out the full delay.
 *
 * The wait is cut into timer slices of at most `pollMs`; `isCancelled` is
 * polled before each slice. A non-positive (or NaN) `ms` returns immediately
 * without polling at all.
 *
 * @param ms Maximum time to sleep, in milliseconds.
 * @param isCancelled Polled before every slice; return true to stop waiting.
 * @param pollMs Upper bound for one slice. Defaults to 250, the value that
 *   used to be hard-coded. Anything below 1 (or NaN) is treated as 1 so the
 *   loop can never spin without yielding.
 * @returns Resolves with no value once the deadline has passed or
 *   cancellation was observed.
 */
async function interruptibleSleep(ms, isCancelled, pollMs = 250) {
    const slice = pollMs >= 1 ? pollMs : 1;
    const deadline = Date.now() + ms;
    for (;;) {
        // Read the clock once per iteration so the delay handed to setTimeout is
        // always the positive value that was just checked.
        const remaining = deadline - Date.now();
        // `!(remaining > 0)` also covers NaN, which must not loop forever.
        if (!(remaining > 0) || isCancelled())
            return;
        await new Promise((resolve) => setTimeout(resolve, Math.min(slice, remaining)));
    }
}
|
|
35
65
|
/**
|
|
36
66
|
* Attempt to read a JWT from the keyring.
|
|
37
67
|
* Prefers agent_enrollment_jwt (row 17c path); falls back to device_flow_jwt
|
|
@@ -221,11 +251,30 @@ export async function runStart(opts = {}) {
|
|
|
221
251
|
// Device specs (cpuCores, memoryGB) are collected once here and ride every
|
|
222
252
|
// heartbeat so the dashboard shows real hardware (Task #28).
|
|
223
253
|
const deviceInfo = collectDeviceInfo();
|
|
224
|
-
const
|
|
254
|
+
const startHeartbeatFn = opts.startHeartbeatFn ?? startHeartbeat;
|
|
255
|
+
const heartbeat = startHeartbeatFn({ platformUrl, jwt, deviceInfo });
|
|
225
256
|
// ── Step 7: Branch on concurrency ─────────────────────────────────────────────
|
|
226
257
|
if (concurrency === 1) {
|
|
227
|
-
// ── N==1:
|
|
228
|
-
//
|
|
258
|
+
// ── N==1: replenishing single-slot supervisor (Row 21b / Fix A) ────────────
|
|
259
|
+
//
|
|
260
|
+
// Each ephemeral `actions/runner` still handles exactly one job then exits
|
|
261
|
+
// (binding constraint #5 preserved). What changed in row 21b: the OUTER
|
|
262
|
+
// supervisor no longer exits with it — it re-registers a fresh ephemeral
|
|
263
|
+
// runner and keeps listening, so `buildhive-agent start` runs forever
|
|
264
|
+
// ("set and forget") instead of stopping after the first job (walk finding
|
|
265
|
+
// F3). The loop exits only on SIGTERM/SIGINT (user logout / `launchctl
|
|
266
|
+
// bootout` / Ctrl-C).
|
|
267
|
+
let shuttingDown = false;
|
|
268
|
+
const onShutdown = () => {
|
|
269
|
+
if (!shuttingDown) {
|
|
270
|
+
shuttingDown = true;
|
|
271
|
+
logger.info('Supervisor received shutdown signal — finishing current cycle then exiting');
|
|
272
|
+
}
|
|
273
|
+
};
|
|
274
|
+
// Multi-label per 2026-05-17 operator correction:
|
|
275
|
+
// runs-on: [self-hosted, buildhive, ubuntu-latest] requires all three labels.
|
|
276
|
+
const labels = 'self-hosted,buildhive,ubuntu-latest';
|
|
277
|
+
const runnerRegistrationUrl = `https://github.com/${owner}/${repo}`;
|
|
229
278
|
const tokenFetchOpts = {
|
|
230
279
|
platformUrl,
|
|
231
280
|
jwt,
|
|
@@ -233,56 +282,11 @@ export async function runStart(opts = {}) {
|
|
|
233
282
|
repo,
|
|
234
283
|
fetchFn: opts.fetchFn,
|
|
235
284
|
};
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
const msg = `Failed to fetch runner token: ${err instanceof Error ? err.message : String(err)}`;
|
|
242
|
-
logger.error(msg);
|
|
243
|
-
await heartbeat.stop('OFFLINE').catch(() => undefined);
|
|
244
|
-
return { exitCode: 1, message: msg };
|
|
245
|
-
}
|
|
246
|
-
const runnerName = generateRunnerName();
|
|
247
|
-
// Multi-label per 2026-05-17 operator correction:
|
|
248
|
-
// runs-on: [self-hosted, buildhive, ubuntu-latest] requires all three labels.
|
|
249
|
-
const labels = 'self-hosted,buildhive,ubuntu-latest';
|
|
250
|
-
const runnerRegistrationUrl = `https://github.com/${owner}/${repo}`;
|
|
251
|
-
const configOpts = {
|
|
252
|
-
configSh: runnerPaths.configSh,
|
|
253
|
-
runnerRegistrationUrl,
|
|
254
|
-
registrationToken: runnerToken.token,
|
|
255
|
-
runnerName,
|
|
256
|
-
labels,
|
|
257
|
-
};
|
|
258
|
-
logger.info('Registering runner with GitHub', { runnerName, runnerRegistrationUrl, labels });
|
|
259
|
-
let configExitCode = await configureRunner(configOpts);
|
|
260
|
-
if (configExitCode !== 0) {
|
|
261
|
-
logger.warn('config.sh failed — retrying with a fresh token once', { configExitCode });
|
|
262
|
-
let retryToken;
|
|
263
|
-
try {
|
|
264
|
-
retryToken = await fetchRunnerToken(tokenFetchOpts);
|
|
265
|
-
}
|
|
266
|
-
catch (err) {
|
|
267
|
-
const msg = `Retry token fetch failed: ${err instanceof Error ? err.message : String(err)}`;
|
|
268
|
-
logger.error(msg);
|
|
269
|
-
await heartbeat.stop('OFFLINE').catch(() => undefined);
|
|
270
|
-
return { exitCode: 1, message: msg };
|
|
271
|
-
}
|
|
272
|
-
configExitCode = await configureRunner({
|
|
273
|
-
...configOpts,
|
|
274
|
-
registrationToken: retryToken.token,
|
|
275
|
-
});
|
|
276
|
-
if (configExitCode !== 0) {
|
|
277
|
-
const msg = `Runner configuration failed after retry (config.sh exited ${configExitCode}). ` +
|
|
278
|
-
`Check your enrollment credentials and ensure the repo is enrolled in BuildHive.`;
|
|
279
|
-
logger.error(msg);
|
|
280
|
-
await heartbeat.stop('OFFLINE').catch(() => undefined);
|
|
281
|
-
return { exitCode: 1, message: msg };
|
|
282
|
-
}
|
|
283
|
-
}
|
|
284
|
-
logger.info('Runner configured successfully — starting run.sh', { runnerName });
|
|
285
|
-
// ── Cache server (Row 18b S7, Wave 2) — N==1 only ──────────────────────────
|
|
285
|
+
// DI seam (tests) — production uses the real supervisor primitives.
|
|
286
|
+
const fetchTokenFn = opts.supervisorDeps?.fetchRunnerToken ?? fetchRunnerToken;
|
|
287
|
+
const configureFn = opts.supervisorDeps?.configureRunner ?? configureRunner;
|
|
288
|
+
const runFn = opts.supervisorDeps?.runRunner ?? runRunner;
|
|
289
|
+
// ── Cache server (Row 18b S7, Wave 2) — started ONCE, lives across cycles ──
|
|
286
290
|
const defaultCacheConfig = {
|
|
287
291
|
enabled: false,
|
|
288
292
|
directory: '/tmp/buildhive/cache',
|
|
@@ -299,17 +303,124 @@ export async function runStart(opts = {}) {
|
|
|
299
303
|
catch {
|
|
300
304
|
logger.warn('Could not load agent config for cache check — proceeding without cache');
|
|
301
305
|
}
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
306
|
+
// Cache prep runs INSIDE the try below, and the signal handlers are registered
|
|
307
|
+
// immediately before it, so a throw (e.g. a dynamic import failing in
|
|
308
|
+
// buildProductionCacheEnvDeps) can never leak the SIGTERM/SIGINT listeners or skip heartbeat shutdown.
|
|
309
|
+
let cacheResult;
|
|
310
|
+
let runExtraEnv;
|
|
311
|
+
/**
|
|
312
|
+
* Register a fresh ephemeral runner (one fresh token, one-time retry on
|
|
313
|
+
* non-zero config.sh exit). Returns true once registered, false on failure
|
|
314
|
+
* (caller backs off). Honours `shuttingDown` between awaits.
|
|
315
|
+
*/
|
|
316
|
+
const registerOnce = async () => {
|
|
317
|
+
let token;
|
|
318
|
+
try {
|
|
319
|
+
token = await fetchTokenFn(tokenFetchOpts);
|
|
320
|
+
}
|
|
321
|
+
catch (err) {
|
|
322
|
+
logger.error('Failed to fetch runner token', {
|
|
323
|
+
err: err instanceof Error ? err.message : String(err),
|
|
324
|
+
});
|
|
325
|
+
return false;
|
|
326
|
+
}
|
|
327
|
+
if (shuttingDown)
|
|
328
|
+
return false;
|
|
329
|
+
const configOpts = {
|
|
330
|
+
configSh: runnerPaths.configSh,
|
|
331
|
+
runnerRegistrationUrl,
|
|
332
|
+
registrationToken: token.token,
|
|
333
|
+
runnerName: generateRunnerName(),
|
|
334
|
+
labels,
|
|
335
|
+
};
|
|
336
|
+
logger.info('Registering runner with GitHub', {
|
|
337
|
+
runnerName: configOpts.runnerName,
|
|
338
|
+
runnerRegistrationUrl,
|
|
339
|
+
labels,
|
|
340
|
+
});
|
|
341
|
+
let configExitCode = await configureFn(configOpts);
|
|
342
|
+
if (configExitCode !== 0) {
|
|
343
|
+
logger.warn('config.sh failed — retrying with a fresh token once', { configExitCode });
|
|
344
|
+
let retryToken;
|
|
345
|
+
try {
|
|
346
|
+
retryToken = await fetchTokenFn(tokenFetchOpts);
|
|
347
|
+
}
|
|
348
|
+
catch (err) {
|
|
349
|
+
logger.error('Retry token fetch failed', {
|
|
350
|
+
err: err instanceof Error ? err.message : String(err),
|
|
351
|
+
});
|
|
352
|
+
return false;
|
|
353
|
+
}
|
|
354
|
+
if (shuttingDown)
|
|
355
|
+
return false;
|
|
356
|
+
configExitCode = await configureFn({
|
|
357
|
+
...configOpts,
|
|
358
|
+
registrationToken: retryToken.token,
|
|
359
|
+
runnerName: generateRunnerName(),
|
|
360
|
+
});
|
|
361
|
+
if (configExitCode !== 0) {
|
|
362
|
+
logger.error('Runner configuration failed after retry — check enrollment credentials and ' +
|
|
363
|
+
'that the repo is enrolled in BuildHive.', { configExitCode });
|
|
364
|
+
return false;
|
|
365
|
+
}
|
|
366
|
+
}
|
|
367
|
+
return true;
|
|
307
368
|
};
|
|
308
|
-
let
|
|
369
|
+
let consecutiveFailures = 0;
|
|
370
|
+
process.on('SIGTERM', onShutdown);
|
|
371
|
+
process.on('SIGINT', onShutdown);
|
|
309
372
|
try {
|
|
310
|
-
|
|
373
|
+
// Cache server (Row 18b S7) — started once, lives across cycles. Inside
|
|
374
|
+
// the try so a failure still hits the finally (listener + heartbeat cleanup).
|
|
375
|
+
const effectiveCacheDeps = opts.cacheEnvDeps ?? (await buildProductionCacheEnvDeps());
|
|
376
|
+
cacheResult = await prepareCache(cacheConfig, effectiveCacheDeps);
|
|
377
|
+
runExtraEnv = cacheResult.enabled ? cacheResult.patch : undefined;
|
|
378
|
+
while (!shuttingDown) {
|
|
379
|
+
// Yield to the macrotask queue each cycle so SIGTERM/timers always get
|
|
380
|
+
// a turn — a fast path (e.g. config.sh exiting instantly) must never
|
|
381
|
+
// starve the event loop on the microtask queue.
|
|
382
|
+
await new Promise((r) => setTimeout(r, 0));
|
|
383
|
+
if (shuttingDown)
|
|
384
|
+
break;
|
|
385
|
+
const registered = await registerOnce();
|
|
386
|
+
if (shuttingDown)
|
|
387
|
+
break;
|
|
388
|
+
if (!registered) {
|
|
389
|
+
// Bounded exponential backoff so a transient outage self-heals and a
|
|
390
|
+
// persistent fault loops calmly (no respawn storm). Surfaced as
|
|
391
|
+
// OFFLINE on the dashboard + a doctor failure.
|
|
392
|
+
consecutiveFailures += 1;
|
|
393
|
+
const backoff = Math.min(SUPERVISOR_BACKOFF_MAX_MS, SUPERVISOR_BACKOFF_BASE_MS * 2 ** (consecutiveFailures - 1));
|
|
394
|
+
logger.warn('Registration failed — backing off before retry', {
|
|
395
|
+
consecutiveFailures,
|
|
396
|
+
backoffMs: backoff,
|
|
397
|
+
});
|
|
398
|
+
await interruptibleSleep(backoff, () => shuttingDown);
|
|
399
|
+
continue;
|
|
400
|
+
}
|
|
401
|
+
consecutiveFailures = 0;
|
|
402
|
+
logger.info('Runner configured successfully — starting run.sh');
|
|
403
|
+
const result = await runFn({ runSh: runnerPaths.runSh, extraEnv: runExtraEnv });
|
|
404
|
+
if (shuttingDown)
|
|
405
|
+
break;
|
|
406
|
+
if (result.ok) {
|
|
407
|
+
logger.info('Runner completed a job and exited cleanly — replenishing');
|
|
408
|
+
}
|
|
409
|
+
else {
|
|
410
|
+
// A non-zero run.sh exit reflects the runner's own health, not the
|
|
411
|
+
// build result. Replenish (consistent with the pool path) but apply
|
|
412
|
+
// a small backoff so a crash-looping runner doesn't spin hot.
|
|
413
|
+
logger.warn('Runner exited non-zero — replenishing after short backoff', {
|
|
414
|
+
exitCode: result.exitCode,
|
|
415
|
+
signal: result.signal,
|
|
416
|
+
});
|
|
417
|
+
await interruptibleSleep(SUPERVISOR_BACKOFF_BASE_MS, () => shuttingDown);
|
|
418
|
+
}
|
|
419
|
+
}
|
|
311
420
|
}
|
|
312
421
|
finally {
|
|
422
|
+
process.removeListener('SIGTERM', onShutdown);
|
|
423
|
+
process.removeListener('SIGINT', onShutdown);
|
|
313
424
|
try {
|
|
314
425
|
await heartbeat.stop('OFFLINE');
|
|
315
426
|
}
|
|
@@ -318,7 +429,7 @@ export async function runStart(opts = {}) {
|
|
|
318
429
|
err: err instanceof Error ? err.message : String(err),
|
|
319
430
|
});
|
|
320
431
|
}
|
|
321
|
-
if (cacheResult
|
|
432
|
+
if (cacheResult?.enabled) {
|
|
322
433
|
try {
|
|
323
434
|
await stopCache(cacheResult.handle);
|
|
324
435
|
}
|
|
@@ -329,13 +440,8 @@ export async function runStart(opts = {}) {
|
|
|
329
440
|
}
|
|
330
441
|
}
|
|
331
442
|
}
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
return { exitCode: 0 };
|
|
335
|
-
}
|
|
336
|
-
const msg = `Runner exited with code ${result.exitCode ?? 'null'} signal ${result.signal ?? 'none'}`;
|
|
337
|
-
logger.warn(msg);
|
|
338
|
-
return { exitCode: result.exitCode === 0 ? 0 : 1, message: msg };
|
|
443
|
+
logger.info('Supervisor exited cleanly after shutdown signal');
|
|
444
|
+
return { exitCode: 0 };
|
|
339
445
|
}
|
|
340
446
|
// ── N>1: pool path ──────────────────────────────────────────────────────────
|
|
341
447
|
// Cache is disabled for N>1 — shared cache server is a follow-up item.
|
|
@@ -96,6 +96,15 @@ export declare function configureRunner(opts: ConfigureRunnerOptions): Promise<n
|
|
|
96
96
|
* completes. The resolved RunnerExitResult reflects that final exit.
|
|
97
97
|
*/
|
|
98
98
|
export declare function runRunner(opts: RunRunnerOptions): Promise<RunnerExitResult>;
|
|
99
|
+
/**
|
|
100
|
+
* Remove a stale `actions/runner` local registration so a fresh `config.sh`
|
|
101
|
+
* can register cleanly. Best-effort (ENOENT is fine). Exported for tests.
|
|
102
|
+
*
|
|
103
|
+
* These are the files config.sh writes on registration; a leftover set (from an
|
|
104
|
+
* unclean shutdown of an idle-Listening ephemeral runner) makes a subsequent
|
|
105
|
+
* config.sh fail with "already configured".
|
|
106
|
+
*/
|
|
107
|
+
export declare function removeStaleRunnerConfig(runnerDir: string): Promise<void>;
|
|
99
108
|
/**
|
|
100
109
|
* Generate a runner name: "buildhive-<8-char random hex>".
|
|
101
110
|
* Short enough to be readable in GitHub's runner list.
|
|
@@ -41,8 +41,9 @@
|
|
|
41
41
|
*/
|
|
42
42
|
import { spawn } from 'child_process';
|
|
43
43
|
import { createInterface } from 'readline';
|
|
44
|
-
import { dirname } from 'path';
|
|
44
|
+
import { dirname, join } from 'path';
|
|
45
45
|
import { randomBytes } from 'crypto';
|
|
46
|
+
import { promises as fsPromises } from 'fs';
|
|
46
47
|
import { createLogger } from '../utils/logger.js';
|
|
47
48
|
const logger = createLogger('runner.supervisor');
|
|
48
49
|
/** Timeout in ms to wait for the runner to exit after SIGTERM before SIGKILL */
|
|
@@ -64,6 +65,7 @@ export function sleep(ms) {
|
|
|
64
65
|
export async function configureRunner(opts) {
|
|
65
66
|
const { configSh, runnerRegistrationUrl, registrationToken, runnerName, labels, } = opts;
|
|
66
67
|
const spawnFn = opts.spawnFn ?? spawn;
|
|
68
|
+
const runnerDir = dirname(configSh);
|
|
67
69
|
const args = [
|
|
68
70
|
'--unattended',
|
|
69
71
|
'--ephemeral',
|
|
@@ -95,36 +97,61 @@ export async function configureRunner(opts) {
|
|
|
95
97
|
'--labels', labels,
|
|
96
98
|
'--replace',
|
|
97
99
|
];
|
|
98
|
-
|
|
99
|
-
//
|
|
100
|
-
//
|
|
101
|
-
//
|
|
102
|
-
// guidance
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
const stdoutLog = createLogger('runner.config.stdout');
|
|
110
|
-
const stderrLog = createLogger('runner.config.stderr');
|
|
111
|
-
if (child.stdout) {
|
|
112
|
-
const rl = createInterface({ input: child.stdout, crlfDelay: Infinity });
|
|
113
|
-
rl.on('line', (line) => stdoutLog.info(line));
|
|
114
|
-
}
|
|
115
|
-
if (child.stderr) {
|
|
116
|
-
const rl = createInterface({ input: child.stderr, crlfDelay: Infinity });
|
|
117
|
-
rl.on('line', (line) => stderrLog.warn(line));
|
|
118
|
-
}
|
|
119
|
-
return new Promise((resolve) => {
|
|
120
|
-
child.on('close', (code) => {
|
|
121
|
-
resolve(code ?? 1);
|
|
100
|
+
// Spawn config.sh once; stream its output to the log AND watch for the
|
|
101
|
+
// "already configured" marker so the caller can self-heal a stale/raced
|
|
102
|
+
// local registration. SECURITY M-1: the GH registration token is passed as a
|
|
103
|
+
// process arg (--token), briefly visible via `ps aux`; acceptable per
|
|
104
|
+
// Apple/GitHub guidance (single-use, ~1h TTL). See P6 review M-1.
|
|
105
|
+
const runConfigSh = () => {
|
|
106
|
+
logger.info('Configuring runner', { configSh, runnerName, runnerRegistrationUrl, labels });
|
|
107
|
+
const child = spawnFn(configSh, args, {
|
|
108
|
+
cwd: runnerDir,
|
|
109
|
+
env: { ...process.env },
|
|
110
|
+
stdio: ['ignore', 'pipe', 'pipe'],
|
|
122
111
|
});
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
112
|
+
let alreadyConfigured = false;
|
|
113
|
+
const markIfAlreadyConfigured = (line) => {
|
|
114
|
+
if (/already configured/i.test(line))
|
|
115
|
+
alreadyConfigured = true;
|
|
116
|
+
};
|
|
117
|
+
const stdoutLog = createLogger('runner.config.stdout');
|
|
118
|
+
const stderrLog = createLogger('runner.config.stderr');
|
|
119
|
+
if (child.stdout) {
|
|
120
|
+
const rl = createInterface({ input: child.stdout, crlfDelay: Infinity });
|
|
121
|
+
rl.on('line', (line) => { markIfAlreadyConfigured(line); stdoutLog.info(line); });
|
|
122
|
+
}
|
|
123
|
+
if (child.stderr) {
|
|
124
|
+
const rl = createInterface({ input: child.stderr, crlfDelay: Infinity });
|
|
125
|
+
rl.on('line', (line) => { markIfAlreadyConfigured(line); stderrLog.warn(line); });
|
|
126
|
+
}
|
|
127
|
+
return new Promise((resolve) => {
|
|
128
|
+
child.on('close', (code) => resolve({ code: code ?? 1, alreadyConfigured }));
|
|
129
|
+
child.on('error', (err) => {
|
|
130
|
+
logger.error('config.sh spawn error', err);
|
|
131
|
+
resolve({ code: 1, alreadyConfigured });
|
|
132
|
+
});
|
|
126
133
|
});
|
|
127
|
-
}
|
|
134
|
+
};
|
|
135
|
+
// Row 21b — reboot robustness. We ALWAYS register a fresh ephemeral runner
|
|
136
|
+
// each cycle, so any local registration is stale. An ephemeral runner killed
|
|
137
|
+
// while idle-Listening (a reboot, or the SIGTERM grace window on `launchctl
|
|
138
|
+
// bootout`) leaves `.runner`/`.credentials*` behind; config.sh then refuses
|
|
139
|
+
// with "already configured".
|
|
140
|
+
//
|
|
141
|
+
// 1. Pre-clean the common leftover case.
|
|
142
|
+
await removeStaleRunnerConfig(runnerDir);
|
|
143
|
+
let result = await runConfigSh();
|
|
144
|
+
// 2. Fast self-heal: if config.sh STILL reports "already configured" (a
|
|
145
|
+
// leftover the pre-clean missed, or a brief post-reboot race), clean and
|
|
146
|
+
// retry ONCE immediately — instead of returning failure and waiting for
|
|
147
|
+
// the supervisor's backoff + a launchd respawn (the 2026-06-02 reboot
|
|
148
|
+
// walk recovered only after ~6 min via a respawn; this makes it seconds).
|
|
149
|
+
if (result.code !== 0 && result.alreadyConfigured) {
|
|
150
|
+
logger.warn('config.sh reported "already configured" — cleaning stale registration and retrying immediately');
|
|
151
|
+
await removeStaleRunnerConfig(runnerDir);
|
|
152
|
+
result = await runConfigSh();
|
|
153
|
+
}
|
|
154
|
+
return result.code;
|
|
128
155
|
}
|
|
129
156
|
/**
|
|
130
157
|
* Spawn run.sh (the GitHub runner listen loop) and manage its lifecycle.
|
|
@@ -256,6 +283,32 @@ export async function runRunner(opts) {
|
|
|
256
283
|
});
|
|
257
284
|
});
|
|
258
285
|
}
|
|
286
|
+
/**
 * Remove a stale `actions/runner` local registration so a fresh `config.sh`
 * can register cleanly. Best-effort: unlink failures never propagate to the
 * caller. Exported for tests.
 *
 * These are the files config.sh writes on registration; a leftover set (from an
 * unclean shutdown of an idle-Listening ephemeral runner) makes a subsequent
 * config.sh fail with "already configured".
 *
 * A missing file (ENOENT) is the normal clean-slate case and is skipped
 * silently. Any other unlink failure (for example a permission error) is
 * logged at warn level: the file is still on disk, so the next config.sh run
 * is likely to fail and the log should say why.
 *
 * @param runnerDir Directory that holds the runner's `config.sh` and its
 *   registration files.
 * @returns Resolves with no value once every candidate file was attempted.
 */
export async function removeStaleRunnerConfig(runnerDir) {
    const stale = ['.runner', '.credentials', '.credentials_rsaparams'];
    let removedAny = false;
    for (const name of stale) {
        try {
            await fsPromises.unlink(join(runnerDir, name));
            removedAny = true;
        }
        catch (err) {
            const code = err instanceof Error && 'code' in err ? err.code : undefined;
            if (code !== 'ENOENT') {
                // Still best-effort, but a leftover we could not delete must be visible.
                logger.warn('Could not remove stale runner registration file', {
                    runnerDir,
                    file: name,
                    code,
                    err: err instanceof Error ? err.message : String(err),
                });
            }
        }
    }
    if (removedAny) {
        logger.info('Removed stale runner registration before re-config (reboot/unclean-exit recovery)', {
            runnerDir,
        });
    }
}
|
|
259
312
|
/**
|
|
260
313
|
* Generate a runner name: "buildhive-<8-char random hex>".
|
|
261
314
|
* Short enough to be readable in GitHub's runner list.
|
|
@@ -168,9 +168,14 @@ export function generatePlist(mode, config) {
|
|
|
168
168
|
// and the CLI looking at different ~/.buildhive/ paths).
|
|
169
169
|
// System mode: HOME=/var/_buildhive so the dropped-priv user can read
|
|
170
170
|
// its own state.
|
|
171
|
+
// Row 21b / Fix B: prepend `/opt/homebrew/bin` (Apple-Silicon Homebrew) and
|
|
172
|
+
// keep `/usr/local/bin` (Intel Homebrew). launchd does NOT inherit the
|
|
173
|
+
// interactive shell PATH, so without this the daemon-spawned `actions/runner`
|
|
174
|
+
// can't find `git`, `node`, etc. installed via Homebrew on Apple Silicon —
|
|
175
|
+
// the runner's config.sh/run.sh fail and the job never runs.
|
|
171
176
|
const envDict = [
|
|
172
177
|
['NODE_ENV', 'production'],
|
|
173
|
-
['PATH', '/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin'],
|
|
178
|
+
['PATH', '/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin'],
|
|
174
179
|
];
|
|
175
180
|
if (mode === 'system') {
|
|
176
181
|
envDict.push(['HOME', `/var/${config.systemUserName ?? DEFAULT_SYSTEM_USER}`]);
|
|
@@ -37,6 +37,11 @@ export interface ServiceStatus {
|
|
|
37
37
|
readonly state: 'running' | 'waiting' | 'unknown' | null;
|
|
38
38
|
/** Most recent exit reason if available. */
|
|
39
39
|
readonly lastExitReason: string | null;
|
|
40
|
+
/**
|
|
41
|
+
* Most recent `last exit code` per `launchctl print` (or null if absent).
|
|
42
|
+
* Row 21b / Fix C: a non-zero value means the daemon is crashing on start.
|
|
43
|
+
*/
|
|
44
|
+
readonly lastExitCode: number | null;
|
|
40
45
|
/** Computed install paths for this mode (single source of truth). */
|
|
41
46
|
readonly paths: ServicePaths;
|
|
42
47
|
}
|
|
@@ -158,12 +158,16 @@ export async function getServiceStatus(opts, deps = {}) {
|
|
|
158
158
|
loaded: false,
|
|
159
159
|
state: null,
|
|
160
160
|
lastExitReason: null,
|
|
161
|
+
lastExitCode: null,
|
|
161
162
|
paths,
|
|
162
163
|
};
|
|
163
164
|
}
|
|
164
165
|
const out = printResult.stdout;
|
|
165
|
-
|
|
166
|
+
// Anchored + `[^\n]+` so multi-word states like `spawn scheduled` aren't
|
|
167
|
+
// truncated to `spawn` (the exact truncation Fix C corrected in the doctor).
|
|
168
|
+
const stateMatch = out.match(/(?:^|\n)\s*state\s*=\s*([^\n]+)/);
|
|
166
169
|
const exitReasonMatch = out.match(/last exit reason\s*=\s*([^\n]+)/);
|
|
170
|
+
const exitCodeMatch = out.match(/last exit code\s*=\s*(-?\d+)/i);
|
|
167
171
|
let state = 'unknown';
|
|
168
172
|
if (stateMatch) {
|
|
169
173
|
const raw = stateMatch[1].toLowerCase();
|
|
@@ -179,6 +183,7 @@ export async function getServiceStatus(opts, deps = {}) {
|
|
|
179
183
|
loaded: true,
|
|
180
184
|
state,
|
|
181
185
|
lastExitReason: exitReasonMatch ? exitReasonMatch[1].trim() : null,
|
|
186
|
+
lastExitCode: exitCodeMatch ? parseInt(exitCodeMatch[1], 10) : null,
|
|
182
187
|
paths,
|
|
183
188
|
};
|
|
184
189
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "buildhive-agent",
|
|
3
|
-
"version": "1.0.0-beta.
|
|
3
|
+
"version": "1.0.0-beta.12",
|
|
4
4
|
"description": "BuildHive CI Agent - Distributed build execution agent",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|
|
@@ -74,4 +74,4 @@
|
|
|
74
74
|
"overrides": {
|
|
75
75
|
"minimatch": "^10.0.1"
|
|
76
76
|
}
|
|
77
|
-
}
|
|
77
|
+
}
|