@agfpd/iapeer 0.2.17 → 0.2.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@agfpd/iapeer",
3
- "version": "0.2.17",
3
+ "version": "0.2.19",
4
4
  "description": "Foundation core for the iapeer multi-agent ecosystem: identity, registry, storage, codec.",
5
5
  "type": "module",
6
6
  "bin": {
@@ -116,6 +116,9 @@ describe('remove (registry record via the locked writer)', () => {
116
116
  const o = await removePeerCli('zombie', { env: e })
117
117
  expect(o.action).toBe('removed')
118
118
  expect(findPeer(readPeersIndex({ env: e }), 'zombie')).toBeNull()
119
+ // the folder is deliberately KEPT; the outcome carries the cwd so the verb can
120
+ // say so instead of leaving silent orphans (boris 10.06)
121
+ expect(o.cwd).toBe('/tmp/zombie')
119
122
  })
120
123
  test('removing an absent peer is an idempotent no-op (not an error)', async () => {
121
124
  const o = await removePeerCli('never-existed', { env: env() })
package/src/cli/index.ts CHANGED
@@ -11,7 +11,7 @@
11
11
  // sentinel-marked always-on plist) are stop/start-able.
12
12
 
13
13
  import { spawnSync } from 'child_process'
14
- import { readFileSync } from 'fs'
14
+ import { existsSync, readFileSync } from 'fs'
15
15
  import { fileURLToPath } from 'url'
16
16
  import {
17
17
  isInfraRuntime,
@@ -40,7 +40,7 @@ import {
40
40
  wakeOrSpawn,
41
41
  } from '../lifecycle/index.ts'
42
42
  import { getAdapter } from '../launch/index.ts'
43
- import { isFoundationOwnedPlist, kickstartDaemon, launchdLabel, launchdPlistPath } from '../launch/launchd.ts'
43
+ import { isFoundationOwnedPlist, kickstartDaemon, launchctlBootstrap, launchdLabel, launchdPlistPath } from '../launch/launchd.ts'
44
44
  import { resolveCallerIdentity, resolveIdentity } from '../identity/index.ts'
45
45
  import { runAlwaysOn } from '../launch/launchdRun.ts'
46
46
  import { installDaemonPlist, startConfiguredDaemon } from '../daemon/main.ts'
@@ -231,10 +231,22 @@ export function startPeer(personality: string, runtime: string | undefined, opts
231
231
  const identity = buildProcessAddress(rt, personality)
232
232
  if (isInfraRuntime(rt)) {
233
233
  const plist = launchdPlistPath(personality, env)
234
- // Audit #13: a failed bootstrap means the peer did NOT start — surface it instead
235
- // of reporting success silently.
236
- const r = spawnSync('launchctl', ['bootstrap', `gui/${uid()}`, plist], { encoding: 'utf8' })
237
- out.push({ personality, runtime: rt, action: 'bootstrap', reason: r.status === 0 ? undefined : `launchctl bootstrap FAILED (exit ${r.status})${(r.stderr ?? '').trim() ? `: ${(r.stderr ?? '').trim()}` : ''} — peer not started` })
234
+ // UNDEAD-JOB-SAFE start (boris's connect-acceptance find 10.06): a bootstrap
235
+ // right after a bootout used to hit the still-dismantling job (exit 5 I/O
236
+ // error) and leave the router DOWN. launchctlBootstrap now waits for the
237
+ // job to vanish and retries with backoff (~22 s budget); a failure after
238
+ // every attempt is LOUD with the manual rescue recipe. (Also gains the
239
+ // sentinel fleet-guard + sandbox guard the raw spawn never had.)
240
+ const r = launchctlBootstrap(personality, plist, env)
241
+ const ok = r.state === 'loaded' || r.state === 'already-loaded' || r.state === 'skipped-sandbox'
242
+ out.push({
243
+ personality,
244
+ runtime: rt,
245
+ action: 'bootstrap',
246
+ reason: ok
247
+ ? undefined
248
+ : `launchctl bootstrap FAILED${r.detail ? `: ${r.detail}` : ''} — peer not started; manual rescue: launchctl bootstrap gui/$(id -u) ${plist}`,
249
+ })
238
250
  } else {
239
251
  clearStopped(cfg, identity)
240
252
  out.push({ personality, runtime: rt, action: 'started' })
@@ -255,6 +267,10 @@ export interface RemoveOutcome {
255
267
  personality: string
256
268
  action: 'removed' | 'absent' | 'refused-live'
257
269
  reason?: string
270
+ /** The removed peer's cwd (registry fact, captured BEFORE the removal). remove
271
+ * deliberately keeps the folder — user data is never deleted by a registry reap
272
+ * (boris's finding 10.06: say so in the output instead of leaving silent orphans). */
273
+ cwd?: string
258
274
  }
259
275
 
260
276
  /**
@@ -285,7 +301,7 @@ export async function removePeerCli(
285
301
  }
286
302
  }
287
303
  await removePeer(personality, { env })
288
- return { personality, action: 'removed' }
304
+ return { personality, action: 'removed', cwd: peer.cwd }
289
305
  }
290
306
 
291
307
  // ─────────────────────────────────────────────────────────────────────────────
@@ -657,8 +673,14 @@ export async function runCli(argv: string[], env: NodeJS.ProcessEnv = process.en
657
673
  // peer unless --force (orphaning a running session from routing is the risk).
658
674
  if (!positionals[0]) return usage(errOut)
659
675
  const o = await removePeerCli(positionals[0], { force: flags.force === true, env })
660
- if (o.action === 'removed') out(`removed "${o.personality}" from the registry\n`)
661
- else if (o.action === 'absent') out(`"${o.personality}" not registered — no-op\n`)
676
+ if (o.action === 'removed') {
677
+ out(`removed "${o.personality}" from the registry\n`)
678
+ // Deliberate: the registry reap never deletes user data — but SAY so, or
679
+ // the default-location peers leave silent orphan folders (boris 10.06).
680
+ if (o.cwd && existsSync(o.cwd)) {
681
+ out(`folder kept: ${o.cwd} (remove never deletes peer data — \`rm -rf\` it yourself if it was a throwaway)\n`)
682
+ }
683
+ } else if (o.action === 'absent') out(`"${o.personality}" not registered — no-op\n`)
662
684
  else errOut(`remove: ${o.reason}\n`)
663
685
  return o.action === 'refused-live' ? 1 : 0
664
686
  }
@@ -39,11 +39,12 @@ async function fixture(): Promise<{ env: NodeJS.ProcessEnv; calls: string[][]; r
39
39
  const runTg: TgRunner = (args, e) => {
40
40
  calls.push(args)
41
41
  if (args[0] === 'bot' && args[1] === 'add') {
42
- // the package's behavior: token → bots/<alias>/.env; prints the validated @username
42
+ // the package's behavior: token → bots/<alias>/.env (incl. the username
43
+ // field — the RELIABLE source, live-host fact); stdout also prints an @username
43
44
  const p = botEnvPath(args[2]!, e)
44
45
  mkdirSync(dirname(p), { recursive: true })
45
- writeFileSync(p, `TELEGRAM_BOT_TOKEN=${args[4]}\n`)
46
- return { status: 0, stdout: 'bot added: @leo_test_bot\n', stderr: '' }
46
+ writeFileSync(p, `TELEGRAM_BOT_TOKEN=${args[4]}\nTELEGRAM_BOT_USERNAME=leo_env_bot\n`)
47
+ return { status: 0, stdout: 'bot added: @leo_stdout_bot\n', stderr: '' }
47
48
  }
48
49
  return { status: 0, stdout: '', stderr: '' }
49
50
  }
@@ -63,7 +64,7 @@ describe('connectTelegram (one flow: bot add → interface → restart → activ
63
64
  const { env, calls, runTg, restarts } = await fixture()
64
65
  const r = await connectTelegram({ peer: 'leo', token: 'T1:abc', env, runTg, restart: okRestart(restarts) })
65
66
  expect(r.state).toBe('connected')
66
- expect(r.username).toBe('@leo_test_bot')
67
+ expect(r.username).toBe('@leo_env_bot') // .env field WINS over the stdout match
67
68
  expect(r.restart?.state).toBe('restarted')
68
69
  expect(restarts).toEqual(['arthur']) // the router = the natural telegram peer, not leo
69
70
  expect(calls[0]).toEqual(['bot', 'add', 'leo', '--token', 'T1:abc'])
@@ -116,6 +117,24 @@ describe('connectTelegram (one flow: bot add → interface → restart → activ
116
117
  expect(r2.state).toBe('refused-no-token')
117
118
  })
118
119
 
120
+ test('username falls back to the bot-add stdout when .env carries no username field', async () => {
121
+ const env = envFor(mkTmp())
122
+ writeRuntimeManifest({ runtime: 'telegram', selfConfig: '/stub/telegram-runtime self-config' }, { env })
123
+ await upsertPeer({ personality: 'leo', runtime: 'claude', cwd: '/tmp/leo', intelligence: 'artificial' }, { env })
124
+ await upsertPeer({ personality: 'arthur', runtime: 'telegram', cwd: '/tmp/arthur', intelligence: 'natural' }, { env })
125
+ const runTg: TgRunner = (args, e) => {
126
+ if (args[0] === 'bot') {
127
+ const p = botEnvPath('leo', e)
128
+ mkdirSync(dirname(p), { recursive: true })
129
+ writeFileSync(p, 'TELEGRAM_BOT_TOKEN=T\n') // no username field (older package)
130
+ return { status: 0, stdout: 'added @stdout_only_bot\n', stderr: '' }
131
+ }
132
+ return { status: 0, stdout: '', stderr: '' }
133
+ }
134
+ const r = await connectTelegram({ peer: 'leo', token: 'T', env, runTg, restart: okRestart([]) })
135
+ expect(r.username).toBe('@stdout_only_bot')
136
+ })
137
+
119
138
  test('bot add failure (getMe refusal on a bad token) → bot-add-failed with the package detail', async () => {
120
139
  const { env } = await fixture()
121
140
  const failTg: TgRunner = args =>
@@ -183,7 +183,17 @@ export async function connectTelegram(opts: ConnectTelegramOptions): Promise<Con
183
183
  if (add.status !== 0) {
184
184
  return { state: 'bot-add-failed', peer, detail: (add.stderr || add.stdout || `exit ${add.status}`).trim() }
185
185
  }
186
- const username = add.stdout.match(/@[A-Za-z0-9_]{3,}/)?.[0]
186
+ // @username: the bots/<alias>/.env TELEGRAM_BOT_USERNAME field is the RELIABLE
187
+ // source (present on the live host; survives a quiet bot-add stdout — boris's
188
+ // acceptance saw the activation line degrade to the BotFather hint). stdout
189
+ // match stays as the fallback.
190
+ const envAfterAdd = readBotEnv(alias, env)
191
+ const envUser = envAfterAdd?.match(/^TELEGRAM_BOT_USERNAME=(.+)$/m)?.[1]?.trim()
192
+ const username = envUser
193
+ ? envUser.startsWith('@')
194
+ ? envUser
195
+ : `@${envUser}`
196
+ : add.stdout.match(/@[A-Za-z0-9_]{3,}/)?.[0]
187
197
 
188
198
  // (2) interface bot — merge the channel binding into the peer's profile.
189
199
  const iface = runTg(['interface', 'bot', alias, '--peer', peer], env)
@@ -18,4 +18,15 @@ describe('resolveSockDir', () => {
18
18
  expect(resolveSockDir({ IAPEER_SOCK_DIR: ' ' })).toBe(DEFAULT_SOCK_DIR)
19
19
  expect(resolveSockDir({ IAPEER_SOCK_DIR: '' })).toBe(DEFAULT_SOCK_DIR)
20
20
  })
21
+ test('IAPEER_ROOT implies socket isolation: <root>/socks (boris e2e find 10.06)', () => {
22
+ // An alt-root used to inherit GLOBAL /tmp — a sandboxed list saw PROD sessions
23
+ // live by name collision, and sandboxed stop/start would have hit prod.
24
+ expect(resolveSockDir({ IAPEER_ROOT: '/tmp/sbx/iapeer' })).toBe('/tmp/sbx/iapeer/socks')
25
+ })
26
+ test('explicit IAPEER_SOCK_DIR wins over the root-derived dir', () => {
27
+ expect(resolveSockDir({ IAPEER_ROOT: '/tmp/sbx/iapeer', IAPEER_SOCK_DIR: '/tmp/elsewhere' })).toBe('/tmp/elsewhere')
28
+ })
29
+ test('prod shape (no IAPEER_ROOT, no IAPEER_SOCK_DIR) stays on /tmp — untouched', () => {
30
+ expect(resolveSockDir({ HOME: '/Users/x' })).toBe('/tmp')
31
+ })
21
32
  })
@@ -2,6 +2,8 @@
2
2
  // Consolidated from inter-agent-protocol/src/lib/constants.ts (wins as-is) and
3
3
  // extended with storage-layer path names (blueprint §1 core/constants).
4
4
 
5
+ import { join } from 'path'
6
+
5
7
  export const NAME_RE = /^[a-z][a-z0-9-]{0,31}$/
6
8
  export const NAME_RE_SOURCE = '^[a-z][a-z0-9-]{0,31}$'
7
9
  export const RUNTIME_RE = /^[a-z][a-z0-9]{0,31}$/
@@ -124,8 +126,18 @@ export const DEFAULT_SOCK_DIR = '/tmp'
124
126
  // scan/resolve, lifecycle, launchdRun) MUST resolve through this ONE helper so they
125
127
  // agree — a site that hardcodes DEFAULT_SOCK_DIR would look in /tmp while a sandbox
126
128
  // (IAPEER_SOCK_DIR set) created the session elsewhere → a false "offline".
129
+ //
130
+ // IAPEER_ROOT IMPLIES SOCKET ISOLATION (boris's e2e find 10.06): an alt-root used
131
+ // to inherit the GLOBAL /tmp, so a sandboxed `list` saw PROD sessions live by name
132
+ // collision, and a sandboxed stop/start would have HIT a prod session. A set root
133
+ // now derives `<root>/socks` unless IAPEER_SOCK_DIR explicitly says otherwise; the
134
+ // prod daemon (no IAPEER_ROOT) keeps the canonical /tmp untouched.
127
135
  export function resolveSockDir(env: NodeJS.ProcessEnv = process.env): string {
128
- return env.IAPEER_SOCK_DIR?.trim() || DEFAULT_SOCK_DIR
136
+ const explicit = env.IAPEER_SOCK_DIR?.trim()
137
+ if (explicit) return explicit
138
+ const root = env.IAPEER_ROOT?.trim()
139
+ if (root) return join(root, 'socks')
140
+ return DEFAULT_SOCK_DIR
129
141
  }
130
142
 
131
143
  // === per-peer cwd scope ===
@@ -57,6 +57,7 @@ export {
57
57
  installAlwaysOnPlist,
58
58
  isFoundationOwnedPlist,
59
59
  launchctlBootstrap,
60
+ bootstrapJobCore,
60
61
  resolveExecutable,
61
62
  IAPEER_PLIST_OWNER_KEY,
62
63
  } from './launchd.ts'
@@ -65,6 +66,9 @@ export type {
65
66
  InstallAlwaysOnPlistOptions,
66
67
  BootstrapResult,
67
68
  BootstrapState,
69
+ BootstrapCoreDeps,
70
+ BootstrapCoreResult,
71
+ LaunchctlRunner,
68
72
  } from './launchd.ts'
69
73
 
70
74
  // ─────────────────────────────────────────────────────────────────────────────
@@ -10,6 +10,7 @@ import { existsSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync
10
10
  import { tmpdir } from 'os'
11
11
  import { join } from 'path'
12
12
  import {
13
+ bootstrapJobCore,
13
14
  getAdapter,
14
15
  installAlwaysOnPlist,
15
16
  isFoundationOwnedPlist,
@@ -326,6 +327,74 @@ describe('resolveExecutable + runtime-bin pinning', () => {
326
327
  })
327
328
  })
328
329
 
330
+ // ─────────────────────────────────────────────────────────────────────────────
331
+ // bootstrapJobCore — the undead-job-safe bootstrap (boris's connect-acceptance
332
+ // find 10.06: bootout → immediate bootstrap → exit 5 I/O error → the whole
333
+ // fleet's telegram router stayed DOWN). Pure DI core: run/sleep injected.
334
+ // ─────────────────────────────────────────────────────────────────────────────
335
+
336
+ describe('bootstrapJobCore (undead-job race)', () => {
337
+ type Call = { args: string[] }
338
+ function harness(script: { printStatuses: number[]; bootstrapStatuses: number[] }) {
339
+ const calls: Call[] = []
340
+ const sleeps: number[] = []
341
+ let printI = 0
342
+ let bootI = 0
343
+ const run = (args: string[]) => {
344
+ calls.push({ args })
345
+ if (args[0] === 'print') {
346
+ const status = script.printStatuses[Math.min(printI, script.printStatuses.length - 1)]!
347
+ printI++
348
+ return { status, stderr: '' }
349
+ }
350
+ const status = script.bootstrapStatuses[Math.min(bootI, script.bootstrapStatuses.length - 1)]!
351
+ bootI++
352
+ return { status, stderr: status === 0 ? '' : 'Bootstrap failed: 5: Input/output error' }
353
+ }
354
+ return { calls, sleeps, deps: { run, sleepMs: (ms: number) => void sleeps.push(ms) } }
355
+ }
356
+
357
+ test('clean path: job not listed, first bootstrap succeeds — zero sleeps', () => {
358
+ const h = harness({ printStatuses: [1], bootstrapStatuses: [0] })
359
+ const r = bootstrapJobCore('501', 'com.iapeer.x', '/p.plist', h.deps)
360
+ expect(r).toEqual({ state: 'loaded', attempts: 1 })
361
+ expect(h.sleeps).toEqual([])
362
+ })
363
+
364
+ test("boris's repro: undead job vanishes after polls, first bootstrap exit 5, retry succeeds", () => {
365
+ // print: listed, listed, gone (the bootout dismantle window) → bootstrap:
366
+ // exit 5 once (still racy), success on the retry after backoff.
367
+ const h = harness({ printStatuses: [0, 0, 1, 1, 1], bootstrapStatuses: [5, 0] })
368
+ const r = bootstrapJobCore('501', 'com.iapeer.arthur', '/p.plist', h.deps)
369
+ expect(r.state).toBe('loaded')
370
+ expect(r.attempts).toBe(2)
371
+ expect(h.sleeps.length).toBeGreaterThan(0) // waited for gone + backoff before retry
372
+ })
373
+
374
+ test('genuinely LIVE job (stays listed through the gone budget) → already-loaded, bootstrap NEVER called', () => {
375
+ const h = harness({ printStatuses: [0], bootstrapStatuses: [0] }) // always listed
376
+ const r = bootstrapJobCore('501', 'com.iapeer.x', '/p.plist', { ...h.deps, goneTimeoutMs: 2_000 })
377
+ expect(r).toEqual({ state: 'already-loaded', attempts: 0 })
378
+ expect(h.calls.some(c => c.args[0] === 'bootstrap')).toBe(false)
379
+ })
380
+
381
+ test('every attempt fails → failed with the attempt count and the last stderr (LOUD, not silent)', () => {
382
+ const h = harness({ printStatuses: [1], bootstrapStatuses: [5] })
383
+ const r = bootstrapJobCore('501', 'com.iapeer.x', '/p.plist', h.deps)
384
+ expect(r.state).toBe('failed')
385
+ expect(r.attempts).toBe(4)
386
+ expect(r.detail).toContain('Input/output error')
387
+ expect(r.detail).toContain('4 bootstrap attempts')
388
+ })
389
+
390
+ test('a racing load between attempts reads already-loaded (idempotent success)', () => {
391
+ // first bootstrap fails; before the retry the job shows up listed (raced in)
392
+ const h = harness({ printStatuses: [1, 0], bootstrapStatuses: [5] })
393
+ const r = bootstrapJobCore('501', 'com.iapeer.x', '/p.plist', h.deps)
394
+ expect(r.state).toBe('already-loaded')
395
+ })
396
+ })
397
+
329
398
  describe('runAlwaysOn guard', () => {
330
399
  test('a non-infra runtime is rejected with exit code 1 (no tmux touched)', async () => {
331
400
  expect(await runAlwaysOn('boris', 'claude', '/tmp/whatever')).toBe(1)
@@ -215,6 +215,89 @@ function isLaunchdLoaded(label: string, uid: string): boolean {
215
215
  return spawnSync('launchctl', ['print', `gui/${uid}/${label}`], { stdio: 'ignore' }).status === 0
216
216
  }
217
217
 
218
+ // ─────────────────────────────────────────────────────────────────────────────
219
+ // UNDEAD-JOB-SAFE bootstrap core (boris's live find 10.06, connect acceptance):
220
+ // after `launchctl bootout` launchd dismantles the job ASYNCHRONOUSLY — an
221
+ // immediate `bootstrap` hits the still-listed "undead" job and fails with
222
+ // exit 5 "Input/output error" (the known PP race class, canon «Жизненный цикл
223
+ // запуска persistent-peer и точки гонки», i.e. "Persistent-peer launch lifecycle and race points"). On the connect flow that left the
224
+ // WHOLE fleet's telegram router down. This core makes every restart-shaped flow
225
+ // (stop→start, connect router restart, update-runtime) survive the race:
226
+ // (1) WAIT-FOR-GONE: while the job is still listed, poll `print` up to
227
+ // goneTimeoutMs. Vanished → proceed to bootstrap. STILL listed at the
228
+ // deadline → it is a genuinely LIVE job (KeepAlive running), not an undead
229
+ // one → 'already-loaded' (the idempotent no-op, same meaning as before).
230
+ // (2) BOOTSTRAP WITH BACKOFF: attempts with [0, 2 s, 5 s, 15 s] pauses
231
+ // (~22 s budget — covers the observed "manual retry succeeded after ~30 s"
232
+ // window), re-checking gone before each retry. All attempts failed →
233
+ // 'failed' with the attempt count and the last stderr, so the caller can
234
+ // print the manual rescue recipe LOUD instead of leaving the job down
235
+ // silently.
236
+ // Pure DI core (run/sleep injected) — unit-testable without launchctl and
237
+ // without tripping the test-sandbox guard that wraps the public function.
238
+ // ─────────────────────────────────────────────────────────────────────────────
239
+
240
+ export interface LaunchctlRunner {
241
+ (args: string[]): { status: number | null; stderr: string }
242
+ }
243
+
244
+ export interface BootstrapCoreDeps {
245
+ run: LaunchctlRunner
246
+ sleepMs: (ms: number) => void
247
+ /** Budget for the undead job to vanish after a bootout (default 10 000 ms). */
248
+ goneTimeoutMs?: number
249
+ /** Pauses BEFORE each bootstrap attempt (default [0, 2000, 5000, 15000]). */
250
+ backoffMs?: number[]
251
+ }
252
+
253
+ export interface BootstrapCoreResult {
254
+ state: 'loaded' | 'already-loaded' | 'failed'
255
+ attempts: number
256
+ detail?: string
257
+ }
258
+
259
+ export function bootstrapJobCore(
260
+ uid: string,
261
+ label: string,
262
+ plistPath: string,
263
+ deps: BootstrapCoreDeps,
264
+ ): BootstrapCoreResult {
265
+ const goneTimeout = deps.goneTimeoutMs ?? 10_000
266
+ const backoffs = deps.backoffMs ?? [0, 2_000, 5_000, 15_000]
267
+ const listed = () => deps.run(['print', `gui/${uid}/${label}`]).status === 0
268
+
269
+ // (1) wait-for-gone (an undead job vanishes within seconds; a LIVE KeepAlive
270
+ // job stays listed → idempotent no-op, exactly the old 'already-loaded').
271
+ if (listed()) {
272
+ const pollStep = 500
273
+ let waited = 0
274
+ while (waited < goneTimeout) {
275
+ deps.sleepMs(pollStep)
276
+ waited += pollStep
277
+ if (!listed()) break
278
+ }
279
+ if (listed()) return { state: 'already-loaded', attempts: 0 }
280
+ }
281
+
282
+ // (2) bootstrap with backoff; re-verify gone before each retry.
283
+ let last = ''
284
+ for (let attempt = 0; attempt < backoffs.length; attempt++) {
285
+ if (backoffs[attempt]! > 0) deps.sleepMs(backoffs[attempt]!)
286
+ if (attempt > 0 && listed()) {
287
+ // the failed attempt may have half-loaded it, or a race loaded it — success
288
+ return { state: 'already-loaded', attempts: attempt }
289
+ }
290
+ const r = deps.run(['bootstrap', `gui/${uid}`, plistPath])
291
+ if (r.status === 0) return { state: 'loaded', attempts: attempt + 1 }
292
+ last = r.stderr.trim() || `exit ${r.status}`
293
+ }
294
+ return {
295
+ state: 'failed',
296
+ attempts: backoffs.length,
297
+ detail: `${backoffs.length} bootstrap attempts failed (last: ${last})`,
298
+ }
299
+ }
300
+
218
301
  export type DaemonRestartState =
219
302
  | 'restarted' // kickstart -k succeeded → the daemon is now on the freshly-installed binary
220
303
  | 'not-loaded' // com.agfpd.iapeer is not in the gui domain → nothing to restart (new binary
@@ -286,13 +369,20 @@ export function launchctlBootstrap(
286
369
  return { state: 'skipped-sandbox', label, detail: 'IAPEER_TEST_SANDBOX=1 — not loading a real launchd job' }
287
370
  }
288
371
  const uid = currentUid()
289
- if (isLaunchdLoaded(label, uid)) return { state: 'already-loaded', label }
290
- const r = spawnSync('launchctl', ['bootstrap', `gui/${uid}`, plistPath], { encoding: 'utf8' })
291
- if (r.status === 0) return { state: 'loaded', label }
292
- // A race could have loaded it between the check and the bootstrap; treat a
293
- // now-loaded service as success (still idempotent).
294
- if (isLaunchdLoaded(label, uid)) return { state: 'already-loaded', label }
295
- return { state: 'failed', label, detail: (r.stderr ?? '').trim() || `launchctl bootstrap exited ${r.status}` }
372
+ // UNDEAD-JOB-SAFE core (boris's connect-acceptance find): wait for a booted-out
373
+ // job to actually vanish, then bootstrap with backoff. A genuinely LIVE job
374
+ // reads 'already-loaded' (idempotent no-op, same semantics as before); only a
375
+ // job that stays failing through every attempt reads 'failed'.
376
+ const core = bootstrapJobCore(uid, label, plistPath, {
377
+ run: args => {
378
+ const r = spawnSync('launchctl', args, { encoding: 'utf8' })
379
+ return { status: r.status, stderr: r.stderr ?? '' }
380
+ },
381
+ sleepMs: ms => spawnSync('sleep', [String(ms / 1000)]),
382
+ })
383
+ return core.state === 'failed'
384
+ ? { state: 'failed', label, detail: core.detail }
385
+ : { state: core.state, label }
296
386
  }
297
387
 
298
388
  export interface InstallAlwaysOnPlistOptions {
@@ -55,7 +55,7 @@ function runtimeBin(runtime: OnboardRuntime, env: NodeJS.ProcessEnv): string {
55
55
  return env.IAPEER_CODEX_BIN?.trim() || 'codex'
56
56
  }
57
57
 
58
- function isExecutable(binOrName: string): boolean {
58
+ function isExecutable(binOrName: string, env: NodeJS.ProcessEnv = process.env): boolean {
59
59
  if (binOrName.includes('/')) {
60
60
  try {
61
61
  accessSync(binOrName, FS.X_OK)
@@ -64,13 +64,22 @@ function isExecutable(binOrName: string): boolean {
64
64
  return false
65
65
  }
66
66
  }
67
- // bare name → resolved by spawnSync against PATH; probe with `which`-free spawn.
68
- // HARD TIMEOUT (live find 10.06): `codex --version` HANGS FOREVER in a non-tty
69
- // environment (three stray probes sat 25+ min; an onboard --dry-run piped to a
70
- // file never printed a byte). A hung probe must degrade to 'runtime-missing',
71
- // not wedge the whole onboard.
72
- const r = spawnSync(binOrName, ['--version'], { stdio: 'ignore', timeout: 10_000 })
73
- return r.error === undefined && r.status !== null
67
+ // bare name → PRESENCE probe over PATH (`command -v` semantics), NO spawn.
68
+ // History (both live finds 10.06): the original `--version` ANSWER probe HANGS
69
+ // FOREVER for codex in a non-tty (three stray probes sat 25+ min); the 10 s
70
+ // timeout that replaced it then DEGRADED a LIVE codex to 'runtime-missing'
71
+ // masking a working runtime (boris's catch). The skip-decision only asks "is
72
+ // the runtime installed", and presence answers that without executing anything.
73
+ for (const dir of (env.PATH ?? '').split(':')) {
74
+ if (!dir) continue
75
+ try {
76
+ accessSync(join(dir, binOrName), FS.X_OK)
77
+ return true
78
+ } catch {
79
+ /* not in this PATH segment */
80
+ }
81
+ }
82
+ return false
74
83
  }
75
84
 
76
85
  /**
@@ -81,7 +90,11 @@ function isExecutable(binOrName: string): boolean {
81
90
  */
82
91
  export function isMarketplaceRegistered(runtime: OnboardRuntime, env: NodeJS.ProcessEnv = process.env): boolean {
83
92
  const bin = runtimeBin(runtime, env)
84
- const r = spawnSync(bin, ['plugin', 'marketplace', 'list'], { encoding: 'utf8' })
93
+ // HARD TIMEOUT: the codex CLI hangs FOREVER in a non-tty on ANY subcommand
94
+ // (live 10.06: first `--version`, then `plugin marketplace list` after the
95
+ // presence-probe fix let a live codex through). Timeout → status null →
96
+ // "not registered" → the add (also time-bounded) decides; never a wedge.
97
+ const r = spawnSync(bin, ['plugin', 'marketplace', 'list'], { encoding: 'utf8', timeout: 60_000 })
85
98
  if (r.status !== 0) return false
86
99
  return isAgfpdInList(`${r.stdout ?? ''}`)
87
100
  }
@@ -102,8 +115,12 @@ export function isAgfpdInList(listOutput: string): boolean {
102
115
  /** Register OUR marketplace for this runtime (`<runtime> plugin marketplace add <ref>`). */
103
116
  function registerMarketplace(runtime: OnboardRuntime, env: NodeJS.ProcessEnv): { ok: boolean; detail?: string } {
104
117
  const bin = runtimeBin(runtime, env)
105
- const r = spawnSync(bin, ['plugin', 'marketplace', 'add', MARKETPLACE_REF], { encoding: 'utf8' })
106
- return r.status === 0 ? { ok: true } : { ok: false, detail: (r.stderr ?? '').trim() || `exit ${r.status}` }
118
+ // Same hard timeout as the list probe (codex non-tty hang class) — a wedged add
119
+ // degrades to a loud 'failed' line instead of freezing the host phase.
120
+ const r = spawnSync(bin, ['plugin', 'marketplace', 'add', MARKETPLACE_REF], { encoding: 'utf8', timeout: 120_000 })
121
+ return r.status === 0
122
+ ? { ok: true }
123
+ : { ok: false, detail: (r.stderr ?? '').trim() || (r.status === null ? 'timed out (non-tty hang?)' : `exit ${r.status}`) }
107
124
  }
108
125
 
109
126
  /**
@@ -119,7 +136,7 @@ export function onboardHost(opts: OnboardOptions = {}): OnboardResult {
119
136
  const runtimes = opts.runtimes ?? (['claude', 'codex'] as OnboardRuntime[])
120
137
  const marketplaces: OnboardRuntimeResult[] = []
121
138
  for (const runtime of runtimes) {
122
- if (!isExecutable(runtimeBin(runtime, env))) {
139
+ if (!isExecutable(runtimeBin(runtime, env), env)) {
123
140
  marketplaces.push({ runtime, state: 'runtime-missing' })
124
141
  continue
125
142
  }