npm - @agfpd/iapeer - Versions diffs - 0.2.18 → 0.2.19 - Mend

@agfpd/iapeer 0.2.18 → 0.2.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

package/package.json +1 -1
package/src/cli/index.ts +17 -5
package/src/connect/connect.test.ts +23 -4
package/src/connect/index.ts +11 -1
package/src/core/constants.test.ts +11 -0
package/src/core/constants.ts +13 -1
package/src/launch/index.ts +4 -0
package/src/launch/launchd.test.ts +69 -0
package/src/launch/launchd.ts +97 -7

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@agfpd/iapeer",
-  "version": "0.2.18",
+  "version": "0.2.19",
   "description": "Foundation core for the iapeer multi-agent ecosystem: identity, registry, storage, codec.",
   "type": "module",
   "bin": {

package/src/cli/index.ts CHANGED Viewed

@@ -40,7 +40,7 @@ import {
   wakeOrSpawn,
 } from '../lifecycle/index.ts'
 import { getAdapter } from '../launch/index.ts'
-import { isFoundationOwnedPlist, kickstartDaemon, launchdLabel, launchdPlistPath } from '../launch/launchd.ts'
+import { isFoundationOwnedPlist, kickstartDaemon, launchctlBootstrap, launchdLabel, launchdPlistPath } from '../launch/launchd.ts'
 import { resolveCallerIdentity, resolveIdentity } from '../identity/index.ts'
 import { runAlwaysOn } from '../launch/launchdRun.ts'
 import { installDaemonPlist, startConfiguredDaemon } from '../daemon/main.ts'
@@ -231,10 +231,22 @@ export function startPeer(personality: string, runtime: string | undefined, opts
     const identity = buildProcessAddress(rt, personality)
     if (isInfraRuntime(rt)) {
       const plist = launchdPlistPath(personality, env)
-      // Audit #13: a failed bootstrap means the peer did NOT start — surface it instead
-      // of reporting success silently.
-      const r = spawnSync('launchctl', ['bootstrap', `gui/${uid()}`, plist], { encoding: 'utf8' })
-      out.push({ personality, runtime: rt, action: 'bootstrap', reason: r.status === 0 ? undefined : `launchctl bootstrap FAILED (exit ${r.status})${(r.stderr ?? '').trim() ? `: ${(r.stderr ?? '').trim()}` : ''} — peer not started` })
+      // UNDEAD-JOB-SAFE start (boris's connect-acceptance find 10.06): a bootstrap
+      // right after a bootout used to hit the still-dismantling job (exit 5 I/O
+      // error) and leave the router DOWN. launchctlBootstrap now waits for the
+      // job to vanish and retries with backoff (~22 s budget); a failure after
+      // every attempt is LOUD with the manual rescue recipe. (Also gains the
+      // sentinel fleet-guard + sandbox guard the raw spawn never had.)
+      const r = launchctlBootstrap(personality, plist, env)
+      const ok = r.state === 'loaded' || r.state === 'already-loaded' || r.state === 'skipped-sandbox'
+      out.push({
+        personality,
+        runtime: rt,
+        action: 'bootstrap',
+        reason: ok
+          ? undefined
+          : `launchctl bootstrap FAILED${r.detail ? `: ${r.detail}` : ''} — peer not started; manual rescue: launchctl bootstrap gui/$(id -u) ${plist}`,
+      })
     } else {
       clearStopped(cfg, identity)
       out.push({ personality, runtime: rt, action: 'started' })

package/src/connect/connect.test.ts CHANGED Viewed

@@ -39,11 +39,12 @@ async function fixture(): Promise<{ env: NodeJS.ProcessEnv; calls: string[][]; r
   const runTg: TgRunner = (args, e) => {
     calls.push(args)
     if (args[0] === 'bot' && args[1] === 'add') {
-      // the package's behavior: token → bots/<alias>/.env; prints the validated @username
+      // the package's behavior: token → bots/<alias>/.env (incl. the username
+      // field — the RELIABLE source, live-host fact); stdout also prints an @username
       const p = botEnvPath(args[2]!, e)
       mkdirSync(dirname(p), { recursive: true })
-      writeFileSync(p, `TELEGRAM_BOT_TOKEN=${args[4]}\n`)
-      return { status: 0, stdout: 'bot added: @leo_test_bot\n', stderr: '' }
+      writeFileSync(p, `TELEGRAM_BOT_TOKEN=${args[4]}\nTELEGRAM_BOT_USERNAME=leo_env_bot\n`)
+      return { status: 0, stdout: 'bot added: @leo_stdout_bot\n', stderr: '' }
     }
     return { status: 0, stdout: '', stderr: '' }
   }
@@ -63,7 +64,7 @@ describe('connectTelegram (one flow: bot add → interface → restart → activ
     const { env, calls, runTg, restarts } = await fixture()
     const r = await connectTelegram({ peer: 'leo', token: 'T1:abc', env, runTg, restart: okRestart(restarts) })
     expect(r.state).toBe('connected')
-    expect(r.username).toBe('@leo_test_bot')
+    expect(r.username).toBe('@leo_env_bot') // .env field WINS over the stdout match
     expect(r.restart?.state).toBe('restarted')
     expect(restarts).toEqual(['arthur']) // the router = the natural telegram peer, not leo
     expect(calls[0]).toEqual(['bot', 'add', 'leo', '--token', 'T1:abc'])
@@ -116,6 +117,24 @@ describe('connectTelegram (one flow: bot add → interface → restart → activ
     expect(r2.state).toBe('refused-no-token')
   })
+  test('username falls back to the bot-add stdout when .env carries no username field', async () => {
+    const env = envFor(mkTmp())
+    writeRuntimeManifest({ runtime: 'telegram', selfConfig: '/stub/telegram-runtime self-config' }, { env })
+    await upsertPeer({ personality: 'leo', runtime: 'claude', cwd: '/tmp/leo', intelligence: 'artificial' }, { env })
+    await upsertPeer({ personality: 'arthur', runtime: 'telegram', cwd: '/tmp/arthur', intelligence: 'natural' }, { env })
+    const runTg: TgRunner = (args, e) => {
+      if (args[0] === 'bot') {
+        const p = botEnvPath('leo', e)
+        mkdirSync(dirname(p), { recursive: true })
+        writeFileSync(p, 'TELEGRAM_BOT_TOKEN=T\n') // no username field (older package)
+        return { status: 0, stdout: 'added @stdout_only_bot\n', stderr: '' }
+      }
+      return { status: 0, stdout: '', stderr: '' }
+    }
+    const r = await connectTelegram({ peer: 'leo', token: 'T', env, runTg, restart: okRestart([]) })
+    expect(r.username).toBe('@stdout_only_bot')
+  })
   test('bot add failure (getMe refusal on a bad token) → bot-add-failed with the package detail', async () => {
     const { env } = await fixture()
     const failTg: TgRunner = args =>

package/src/connect/index.ts CHANGED Viewed

@@ -183,7 +183,17 @@ export async function connectTelegram(opts: ConnectTelegramOptions): Promise<Con
   if (add.status !== 0) {
     return { state: 'bot-add-failed', peer, detail: (add.stderr || add.stdout || `exit ${add.status}`).trim() }
   }
-  const username = add.stdout.match(/@[A-Za-z0-9_]{3,}/)?.[0]
+  // @username: the bots/<alias>/.env TELEGRAM_BOT_USERNAME field is the RELIABLE
+  // source (present on the live host; survives a quiet bot-add stdout — boris's
+  // acceptance saw the activation line degrade to the BotFather hint). stdout
+  // match stays as the fallback.
+  const envAfterAdd = readBotEnv(alias, env)
+  const envUser = envAfterAdd?.match(/^TELEGRAM_BOT_USERNAME=(.+)$/m)?.[1]?.trim()
+  const username = envUser
+    ? envUser.startsWith('@')
+      ? envUser
+      : `@${envUser}`
+    : add.stdout.match(/@[A-Za-z0-9_]{3,}/)?.[0]
   // (2) interface bot — merge the channel binding into the peer's profile.
   const iface = runTg(['interface', 'bot', alias, '--peer', peer], env)

package/src/core/constants.test.ts CHANGED Viewed

@@ -18,4 +18,15 @@ describe('resolveSockDir', () => {
     expect(resolveSockDir({ IAPEER_SOCK_DIR: '   ' })).toBe(DEFAULT_SOCK_DIR)
     expect(resolveSockDir({ IAPEER_SOCK_DIR: '' })).toBe(DEFAULT_SOCK_DIR)
   })
+  test('IAPEER_ROOT implies socket isolation: <root>/socks (boris e2e find 10.06)', () => {
+    // An alt-root used to inherit GLOBAL /tmp — a sandboxed list saw PROD sessions
+    // live by name collision, and sandboxed stop/start would have hit prod.
+    expect(resolveSockDir({ IAPEER_ROOT: '/tmp/sbx/iapeer' })).toBe('/tmp/sbx/iapeer/socks')
+  })
+  test('explicit IAPEER_SOCK_DIR wins over the root-derived dir', () => {
+    expect(resolveSockDir({ IAPEER_ROOT: '/tmp/sbx/iapeer', IAPEER_SOCK_DIR: '/tmp/elsewhere' })).toBe('/tmp/elsewhere')
+  })
+  test('prod shape (no IAPEER_ROOT, no IAPEER_SOCK_DIR) stays on /tmp — untouched', () => {
+    expect(resolveSockDir({ HOME: '/Users/x' })).toBe('/tmp')
+  })
 })

package/src/core/constants.ts CHANGED Viewed

@@ -2,6 +2,8 @@
 // Consolidated from inter-agent-protocol/src/lib/constants.ts (wins as-is) and
 // extended with storage-layer path names (blueprint §1 core/constants).
+import { join } from 'path'
 export const NAME_RE = /^[a-z][a-z0-9-]{0,31}$/
 export const NAME_RE_SOURCE = '^[a-z][a-z0-9-]{0,31}$'
 export const RUNTIME_RE = /^[a-z][a-z0-9]{0,31}$/
@@ -124,8 +126,18 @@ export const DEFAULT_SOCK_DIR = '/tmp'
 // scan/resolve, lifecycle, launchdRun) MUST resolve through this ONE helper so they
 // agree — a site that hardcodes DEFAULT_SOCK_DIR would look in /tmp while a sandbox
 // (IAPEER_SOCK_DIR set) created the session elsewhere → a false "offline".
+//
+// IAPEER_ROOT IMPLIES SOCKET ISOLATION (boris's e2e find 10.06): an alt-root used
+// to inherit the GLOBAL /tmp, so a sandboxed `list` saw PROD sessions live by name
+// collision, and a sandboxed stop/start would have HIT a prod session. A set root
+// now derives `<root>/socks` unless IAPEER_SOCK_DIR explicitly says otherwise; the
+// prod daemon (no IAPEER_ROOT) keeps the canonical /tmp untouched.
 export function resolveSockDir(env: NodeJS.ProcessEnv = process.env): string {
-  return env.IAPEER_SOCK_DIR?.trim() || DEFAULT_SOCK_DIR
+  const explicit = env.IAPEER_SOCK_DIR?.trim()
+  if (explicit) return explicit
+  const root = env.IAPEER_ROOT?.trim()
+  if (root) return join(root, 'socks')
+  return DEFAULT_SOCK_DIR
 }
 // === per-peer cwd scope ===

package/src/launch/index.ts CHANGED Viewed

@@ -57,6 +57,7 @@ export {
   installAlwaysOnPlist,
   isFoundationOwnedPlist,
   launchctlBootstrap,
+  bootstrapJobCore,
   resolveExecutable,
   IAPEER_PLIST_OWNER_KEY,
 } from './launchd.ts'
@@ -65,6 +66,9 @@ export type {
   InstallAlwaysOnPlistOptions,
   BootstrapResult,
   BootstrapState,
+  BootstrapCoreDeps,
+  BootstrapCoreResult,
+  LaunchctlRunner,
 } from './launchd.ts'
 // ─────────────────────────────────────────────────────────────────────────────

package/src/launch/launchd.test.ts CHANGED Viewed

@@ -10,6 +10,7 @@ import { existsSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync
 import { tmpdir } from 'os'
 import { join } from 'path'
 import {
+  bootstrapJobCore,
   getAdapter,
   installAlwaysOnPlist,
   isFoundationOwnedPlist,
@@ -326,6 +327,74 @@ describe('resolveExecutable + runtime-bin pinning', () => {
   })
 })
+// ─────────────────────────────────────────────────────────────────────────────
+// bootstrapJobCore — the undead-job-safe bootstrap (boris's connect-acceptance
+// find 10.06: bootout → immediate bootstrap → exit 5 I/O error → the whole
+// fleet's telegram router stayed DOWN). Pure DI core: run/sleep injected.
+// ─────────────────────────────────────────────────────────────────────────────
+describe('bootstrapJobCore (undead-job race)', () => {
+  type Call = { args: string[] }
+  function harness(script: { printStatuses: number[]; bootstrapStatuses: number[] }) {
+    const calls: Call[] = []
+    const sleeps: number[] = []
+    let printI = 0
+    let bootI = 0
+    const run = (args: string[]) => {
+      calls.push({ args })
+      if (args[0] === 'print') {
+        const status = script.printStatuses[Math.min(printI, script.printStatuses.length - 1)]!
+        printI++
+        return { status, stderr: '' }
+      }
+      const status = script.bootstrapStatuses[Math.min(bootI, script.bootstrapStatuses.length - 1)]!
+      bootI++
+      return { status, stderr: status === 0 ? '' : 'Bootstrap failed: 5: Input/output error' }
+    }
+    return { calls, sleeps, deps: { run, sleepMs: (ms: number) => void sleeps.push(ms) } }
+  }
+  test('clean path: job not listed, first bootstrap succeeds — zero sleeps', () => {
+    const h = harness({ printStatuses: [1], bootstrapStatuses: [0] })
+    const r = bootstrapJobCore('501', 'com.iapeer.x', '/p.plist', h.deps)
+    expect(r).toEqual({ state: 'loaded', attempts: 1 })
+    expect(h.sleeps).toEqual([])
+  })
+  test("boris's repro: undead job vanishes after polls, first bootstrap exit 5, retry succeeds", () => {
+    // print: listed, listed, gone (the bootout dismantle window) → bootstrap:
+    // exit 5 once (still racy), success on the retry after backoff.
+    const h = harness({ printStatuses: [0, 0, 1, 1, 1], bootstrapStatuses: [5, 0] })
+    const r = bootstrapJobCore('501', 'com.iapeer.arthur', '/p.plist', h.deps)
+    expect(r.state).toBe('loaded')
+    expect(r.attempts).toBe(2)
+    expect(h.sleeps.length).toBeGreaterThan(0) // waited for gone + backoff before retry
+  })
+  test('genuinely LIVE job (stays listed through the gone budget) → already-loaded, bootstrap NEVER called', () => {
+    const h = harness({ printStatuses: [0], bootstrapStatuses: [0] }) // always listed
+    const r = bootstrapJobCore('501', 'com.iapeer.x', '/p.plist', { ...h.deps, goneTimeoutMs: 2_000 })
+    expect(r).toEqual({ state: 'already-loaded', attempts: 0 })
+    expect(h.calls.some(c => c.args[0] === 'bootstrap')).toBe(false)
+  })
+  test('every attempt fails → failed with the attempt count and the last stderr (LOUD, not silent)', () => {
+    const h = harness({ printStatuses: [1], bootstrapStatuses: [5] })
+    const r = bootstrapJobCore('501', 'com.iapeer.x', '/p.plist', h.deps)
+    expect(r.state).toBe('failed')
+    expect(r.attempts).toBe(4)
+    expect(r.detail).toContain('Input/output error')
+    expect(r.detail).toContain('4 bootstrap attempts')
+  })
+  test('a racing load between attempts reads already-loaded (idempotent success)', () => {
+    // first bootstrap fails; before the retry the job shows up listed (raced in)
+    const h = harness({ printStatuses: [1, 0], bootstrapStatuses: [5] })
+    const r = bootstrapJobCore('501', 'com.iapeer.x', '/p.plist', h.deps)
+    expect(r.state).toBe('already-loaded')
+  })
+})
 describe('runAlwaysOn guard', () => {
   test('a non-infra runtime is rejected with exit code 1 (no tmux touched)', async () => {
     expect(await runAlwaysOn('boris', 'claude', '/tmp/whatever')).toBe(1)

package/src/launch/launchd.ts CHANGED Viewed

@@ -215,6 +215,89 @@ function isLaunchdLoaded(label: string, uid: string): boolean {
   return spawnSync('launchctl', ['print', `gui/${uid}/${label}`], { stdio: 'ignore' }).status === 0
 }
// ─────────────────────────────────────────────────────────────────────────────
// UNDEAD-JOB-SAFE bootstrap core (boris's live find 10.06, connect acceptance):
// after `launchctl bootout` launchd dismantles the job ASYNCHRONOUSLY — an
// immediate `bootstrap` hits the still-listed "undead" job and fails with
// exit 5 "Input/output error" (the known PP race class, canon "Launch lifecycle
// of a persistent peer and its race points"). On the connect flow that left the
// WHOLE fleet's telegram router down. This core makes every restart-shaped flow
// (stop→start, connect router restart, update-runtime) survive the race:
//   (1) WAIT-FOR-GONE: while the job is still listed, poll `print` every 500 ms
//       for at most goneTimeoutMs (the last sleep is clamped so the budget is
//       never overshot). Vanished → proceed to bootstrap. STILL listed at the
//       deadline → it is treated as a genuinely LIVE job, not an undead one →
//       'already-loaded' with attempts 0 (the idempotent no-op).
//   (2) BOOTSTRAP WITH BACKOFF: one attempt per backoffMs entry, sleeping the
//       entry's pause first (default [0, 2 s, 5 s, 15 s] = ~22 s of pauses, on
//       top of the up-to-10 s wait-for-gone budget; the original report saw a
//       manual retry succeed after ~30 s). Before every retry AND after the
//       final failed attempt the job is re-checked: if it is listed by then
//       (half-loaded by the failed attempt, or loaded by a racing caller) the
//       result is 'already-loaded', never 'failed'. Only when every attempt
//       failed and the job is still absent is the result 'failed', carrying the
//       attempt count and the last stderr so the caller can print the manual
//       rescue recipe LOUD instead of leaving the job down silently.
// Pure DI core (run/sleep injected) — unit-testable without launchctl and
// without tripping the test-sandbox guard that wraps the public function.
// ─────────────────────────────────────────────────────────────────────────────

/**
 * Synchronous `launchctl` invoker: receives the argument vector (without the
 * `launchctl` binary name) and reports the exit status plus captured stderr.
 */
export interface LaunchctlRunner {
  (args: string[]): { status: number | null; stderr: string }
}

/** Injected dependencies and tunables for {@link bootstrapJobCore}. */
export interface BootstrapCoreDeps {
  /** Executes one launchctl command (`print` / `bootstrap`). */
  run: LaunchctlRunner
  /** Blocks the caller for the given number of milliseconds. */
  sleepMs: (ms: number) => void
  /** Budget for the undead job to vanish after a bootout (default 10 000 ms). */
  goneTimeoutMs?: number
  /** Pauses BEFORE each bootstrap attempt (default [0, 2000, 5000, 15000]). */
  backoffMs?: number[]
}

/** Outcome of {@link bootstrapJobCore}. */
export interface BootstrapCoreResult {
  state: 'loaded' | 'already-loaded' | 'failed'
  /** Number of `launchctl bootstrap` invocations actually made. */
  attempts: number
  /** Present only for 'failed': attempt count plus the last stderr / exit code. */
  detail?: string
}

/**
 * Loads a launchd job into the `gui/<uid>` domain, tolerating the window in
 * which a previously booted-out job is still listed.
 *
 * @param uid       numeric user id as a string, used to build `gui/<uid>` targets
 * @param label     launchd job label, probed via `print gui/<uid>/<label>`
 * @param plistPath plist handed to `bootstrap gui/<uid> <plistPath>`
 * @param deps      injected runner/sleeper and optional timing overrides
 * @returns 'loaded' when a bootstrap attempt exits 0; 'already-loaded' when the
 *          job is listed at the wait-for-gone deadline, before a retry, or after
 *          the final failed attempt; 'failed' otherwise (with `detail`).
 */
export function bootstrapJobCore(
  uid: string,
  label: string,
  plistPath: string,
  deps: BootstrapCoreDeps,
): BootstrapCoreResult {
  const goneTimeout = deps.goneTimeoutMs ?? 10_000
  const backoffs = deps.backoffMs ?? [0, 2_000, 5_000, 15_000]
  const pollStep = 500
  const listed = (): boolean => deps.run(['print', `gui/${uid}/${label}`]).status === 0

  // (1) wait-for-gone. The probe result is carried through the loop so a job
  //     that was just seen to vanish is not probed a second time, and the last
  //     sleep is clamped to the remaining budget.
  let stillListed = listed()
  for (let waited = 0; stillListed && waited < goneTimeout; ) {
    const step = Math.min(pollStep, goneTimeout - waited)
    deps.sleepMs(step)
    waited += step
    stillListed = listed()
  }
  if (stillListed) return { state: 'already-loaded', attempts: 0 }

  // (2) bootstrap with backoff; re-verify gone before each retry.
  let last = ''
  let attempts = 0
  for (const pause of backoffs) {
    if (pause > 0) deps.sleepMs(pause)
    if (attempts > 0 && listed()) {
      // the failed attempt may have half-loaded it, or a race loaded it — success
      return { state: 'already-loaded', attempts }
    }
    const r = deps.run(['bootstrap', `gui/${uid}`, plistPath])
    attempts++
    if (r.status === 0) return { state: 'loaded', attempts }
    last = r.stderr.trim() || `exit ${r.status}`
  }

  // The same idempotency check applies to the LAST attempt: a job that is
  // listed now must not be reported as down.
  if (attempts > 0 && listed()) return { state: 'already-loaded', attempts }

  return {
    state: 'failed',
    attempts,
    detail: `${attempts} bootstrap attempts failed (last: ${last})`,
  }
}
 export type DaemonRestartState =
   | 'restarted' // kickstart -k succeeded → the daemon is now on the freshly-installed binary
   | 'not-loaded' // com.agfpd.iapeer is not in the gui domain → nothing to restart (new binary
@@ -286,13 +369,20 @@ export function launchctlBootstrap(
     return { state: 'skipped-sandbox', label, detail: 'IAPEER_TEST_SANDBOX=1 — not loading a real launchd job' }
   }
   const uid = currentUid()
-  if (isLaunchdLoaded(label, uid)) return { state: 'already-loaded', label }
-  const r = spawnSync('launchctl', ['bootstrap', `gui/${uid}`, plistPath], { encoding: 'utf8' })
-  if (r.status === 0) return { state: 'loaded', label }
-  // A race could have loaded it between the check and the bootstrap; treat a
-  // now-loaded service as success (still idempotent).
-  if (isLaunchdLoaded(label, uid)) return { state: 'already-loaded', label }
-  return { state: 'failed', label, detail: (r.stderr ?? '').trim() || `launchctl bootstrap exited ${r.status}` }
+  // UNDEAD-JOB-SAFE core (boris's connect-acceptance find): wait for a booted-out
+  // job to actually vanish, then bootstrap with backoff. A genuinely LIVE job
+  // reads 'already-loaded' (idempotent no-op, same semantics as before); only a
+  // job that stays failing through every attempt reads 'failed'.
+  const core = bootstrapJobCore(uid, label, plistPath, {
+    run: args => {
+      const r = spawnSync('launchctl', args, { encoding: 'utf8' })
+      return { status: r.status, stderr: r.stderr ?? '' }
+    },
+    sleepMs: ms => spawnSync('sleep', [String(ms / 1000)]),
+  })
+  return core.state === 'failed'
+    ? { state: 'failed', label, detail: core.detail }
+    : { state: core.state, label }
 }
 export interface InstallAlwaysOnPlistOptions {