@agfpd/iapeer 0.2.18 → 0.2.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@agfpd/iapeer",
3
- "version": "0.2.18",
3
+ "version": "0.2.19",
4
4
  "description": "Foundation core for the iapeer multi-agent ecosystem: identity, registry, storage, codec.",
5
5
  "type": "module",
6
6
  "bin": {
package/src/cli/index.ts CHANGED
@@ -40,7 +40,7 @@ import {
40
40
  wakeOrSpawn,
41
41
  } from '../lifecycle/index.ts'
42
42
  import { getAdapter } from '../launch/index.ts'
43
- import { isFoundationOwnedPlist, kickstartDaemon, launchdLabel, launchdPlistPath } from '../launch/launchd.ts'
43
+ import { isFoundationOwnedPlist, kickstartDaemon, launchctlBootstrap, launchdLabel, launchdPlistPath } from '../launch/launchd.ts'
44
44
  import { resolveCallerIdentity, resolveIdentity } from '../identity/index.ts'
45
45
  import { runAlwaysOn } from '../launch/launchdRun.ts'
46
46
  import { installDaemonPlist, startConfiguredDaemon } from '../daemon/main.ts'
@@ -231,10 +231,22 @@ export function startPeer(personality: string, runtime: string | undefined, opts
231
231
  const identity = buildProcessAddress(rt, personality)
232
232
  if (isInfraRuntime(rt)) {
233
233
  const plist = launchdPlistPath(personality, env)
234
- // Audit #13: a failed bootstrap means the peer did NOT start — surface it instead
235
- // of reporting success silently.
236
- const r = spawnSync('launchctl', ['bootstrap', `gui/${uid()}`, plist], { encoding: 'utf8' })
237
- out.push({ personality, runtime: rt, action: 'bootstrap', reason: r.status === 0 ? undefined : `launchctl bootstrap FAILED (exit ${r.status})${(r.stderr ?? '').trim() ? `: ${(r.stderr ?? '').trim()}` : ''} — peer not started` })
234
+ // UNDEAD-JOB-SAFE start (boris's connect-acceptance find 10.06): a bootstrap
235
+ // right after a bootout used to hit the still-dismantling job (exit 5 I/O
236
+ // error) and leave the router DOWN. launchctlBootstrap now waits for the
237
+ // job to vanish and retries with backoff (~22 s budget); a failure after
238
+ // every attempt is LOUD with the manual rescue recipe. (Also gains the
239
+ // sentinel fleet-guard + sandbox guard the raw spawn never had.)
240
+ const r = launchctlBootstrap(personality, plist, env)
241
+ const ok = r.state === 'loaded' || r.state === 'already-loaded' || r.state === 'skipped-sandbox'
242
+ out.push({
243
+ personality,
244
+ runtime: rt,
245
+ action: 'bootstrap',
246
+ reason: ok
247
+ ? undefined
248
+ : `launchctl bootstrap FAILED${r.detail ? `: ${r.detail}` : ''} — peer not started; manual rescue: launchctl bootstrap gui/$(id -u) ${plist}`,
249
+ })
238
250
  } else {
239
251
  clearStopped(cfg, identity)
240
252
  out.push({ personality, runtime: rt, action: 'started' })
@@ -39,11 +39,12 @@ async function fixture(): Promise<{ env: NodeJS.ProcessEnv; calls: string[][]; r
39
39
  const runTg: TgRunner = (args, e) => {
40
40
  calls.push(args)
41
41
  if (args[0] === 'bot' && args[1] === 'add') {
42
- // the package's behavior: token → bots/<alias>/.env; prints the validated @username
42
+ // the package's behavior: token → bots/<alias>/.env (incl. the username
43
+ // field — the RELIABLE source, live-host fact); stdout also prints one
43
44
  const p = botEnvPath(args[2]!, e)
44
45
  mkdirSync(dirname(p), { recursive: true })
45
- writeFileSync(p, `TELEGRAM_BOT_TOKEN=${args[4]}\n`)
46
- return { status: 0, stdout: 'bot added: @leo_test_bot\n', stderr: '' }
46
+ writeFileSync(p, `TELEGRAM_BOT_TOKEN=${args[4]}\nTELEGRAM_BOT_USERNAME=leo_env_bot\n`)
47
+ return { status: 0, stdout: 'bot added: @leo_stdout_bot\n', stderr: '' }
47
48
  }
48
49
  return { status: 0, stdout: '', stderr: '' }
49
50
  }
@@ -63,7 +64,7 @@ describe('connectTelegram (one flow: bot add → interface → restart → activ
63
64
  const { env, calls, runTg, restarts } = await fixture()
64
65
  const r = await connectTelegram({ peer: 'leo', token: 'T1:abc', env, runTg, restart: okRestart(restarts) })
65
66
  expect(r.state).toBe('connected')
66
- expect(r.username).toBe('@leo_test_bot')
67
+ expect(r.username).toBe('@leo_env_bot') // .env field WINS over the stdout match
67
68
  expect(r.restart?.state).toBe('restarted')
68
69
  expect(restarts).toEqual(['arthur']) // the router = the natural telegram peer, not leo
69
70
  expect(calls[0]).toEqual(['bot', 'add', 'leo', '--token', 'T1:abc'])
@@ -116,6 +117,24 @@ describe('connectTelegram (one flow: bot add → interface → restart → activ
116
117
  expect(r2.state).toBe('refused-no-token')
117
118
  })
118
119
 
120
+ test('username falls back to the bot-add stdout when .env carries no username field', async () => {
121
+ const env = envFor(mkTmp())
122
+ writeRuntimeManifest({ runtime: 'telegram', selfConfig: '/stub/telegram-runtime self-config' }, { env })
123
+ await upsertPeer({ personality: 'leo', runtime: 'claude', cwd: '/tmp/leo', intelligence: 'artificial' }, { env })
124
+ await upsertPeer({ personality: 'arthur', runtime: 'telegram', cwd: '/tmp/arthur', intelligence: 'natural' }, { env })
125
+ const runTg: TgRunner = (args, e) => {
126
+ if (args[0] === 'bot') {
127
+ const p = botEnvPath('leo', e)
128
+ mkdirSync(dirname(p), { recursive: true })
129
+ writeFileSync(p, 'TELEGRAM_BOT_TOKEN=T\n') // no username field (older package)
130
+ return { status: 0, stdout: 'added @stdout_only_bot\n', stderr: '' }
131
+ }
132
+ return { status: 0, stdout: '', stderr: '' }
133
+ }
134
+ const r = await connectTelegram({ peer: 'leo', token: 'T', env, runTg, restart: okRestart([]) })
135
+ expect(r.username).toBe('@stdout_only_bot')
136
+ })
137
+
119
138
  test('bot add failure (getMe refusal on a bad token) → bot-add-failed with the package detail', async () => {
120
139
  const { env } = await fixture()
121
140
  const failTg: TgRunner = args =>
@@ -183,7 +183,17 @@ export async function connectTelegram(opts: ConnectTelegramOptions): Promise<Con
183
183
  if (add.status !== 0) {
184
184
  return { state: 'bot-add-failed', peer, detail: (add.stderr || add.stdout || `exit ${add.status}`).trim() }
185
185
  }
186
- const username = add.stdout.match(/@[A-Za-z0-9_]{3,}/)?.[0]
186
+ // @username: the bots/<alias>/.env TELEGRAM_BOT_USERNAME field is the RELIABLE
187
+ // source (present on the live host; survives a quiet bot-add stdout — boris's
188
+ // acceptance saw the activation line degrade to the BotFather hint). stdout
189
+ // match stays as the fallback.
190
+ const envAfterAdd = readBotEnv(alias, env)
191
+ const envUser = envAfterAdd?.match(/^TELEGRAM_BOT_USERNAME=(.+)$/m)?.[1]?.trim()
192
+ const username = envUser
193
+ ? envUser.startsWith('@')
194
+ ? envUser
195
+ : `@${envUser}`
196
+ : add.stdout.match(/@[A-Za-z0-9_]{3,}/)?.[0]
187
197
 
188
198
  // (2) interface bot — merge the channel binding into the peer's profile.
189
199
  const iface = runTg(['interface', 'bot', alias, '--peer', peer], env)
@@ -18,4 +18,15 @@ describe('resolveSockDir', () => {
18
18
  expect(resolveSockDir({ IAPEER_SOCK_DIR: ' ' })).toBe(DEFAULT_SOCK_DIR)
19
19
  expect(resolveSockDir({ IAPEER_SOCK_DIR: '' })).toBe(DEFAULT_SOCK_DIR)
20
20
  })
21
+ test('IAPEER_ROOT implies socket isolation: <root>/socks (boris e2e find 10.06)', () => {
22
+ // An alt-root used to inherit GLOBAL /tmp — a sandboxed list saw PROD sessions
23
+ // live by name collision, and sandboxed stop/start would have hit prod.
24
+ expect(resolveSockDir({ IAPEER_ROOT: '/tmp/sbx/iapeer' })).toBe('/tmp/sbx/iapeer/socks')
25
+ })
26
+ test('explicit IAPEER_SOCK_DIR wins over the root-derived dir', () => {
27
+ expect(resolveSockDir({ IAPEER_ROOT: '/tmp/sbx/iapeer', IAPEER_SOCK_DIR: '/tmp/elsewhere' })).toBe('/tmp/elsewhere')
28
+ })
29
+ test('prod shape (no IAPEER_ROOT, no IAPEER_SOCK_DIR) stays on /tmp — untouched', () => {
30
+ expect(resolveSockDir({ HOME: '/Users/x' })).toBe('/tmp')
31
+ })
21
32
  })
@@ -2,6 +2,8 @@
2
2
  // Consolidated from inter-agent-protocol/src/lib/constants.ts (wins as-is) and
3
3
  // extended with storage-layer path names (blueprint §1 core/constants).
4
4
 
5
+ import { join } from 'path'
6
+
5
7
  export const NAME_RE = /^[a-z][a-z0-9-]{0,31}$/
6
8
  export const NAME_RE_SOURCE = '^[a-z][a-z0-9-]{0,31}$'
7
9
  export const RUNTIME_RE = /^[a-z][a-z0-9]{0,31}$/
@@ -124,8 +126,18 @@ export const DEFAULT_SOCK_DIR = '/tmp'
124
126
  // scan/resolve, lifecycle, launchdRun) MUST resolve through this ONE helper so they
125
127
  // agree — a site that hardcodes DEFAULT_SOCK_DIR would look in /tmp while a sandbox
126
128
  // (IAPEER_SOCK_DIR set) created the session elsewhere → a false "offline".
129
+ //
130
+ // IAPEER_ROOT IMPLIES SOCKET ISOLATION (boris's e2e find 10.06): an alt-root used
131
+ // to inherit the GLOBAL /tmp, so a sandboxed `list` saw PROD sessions live by name
132
+ // collision, and a sandboxed stop/start would have HIT a prod session. A set root
133
+ // now derives `<root>/socks` unless IAPEER_SOCK_DIR explicitly says otherwise; the
134
+ // prod daemon (no IAPEER_ROOT) keeps the canonical /tmp untouched.
127
135
  export function resolveSockDir(env: NodeJS.ProcessEnv = process.env): string {
128
- return env.IAPEER_SOCK_DIR?.trim() || DEFAULT_SOCK_DIR
136
+ const explicit = env.IAPEER_SOCK_DIR?.trim()
137
+ if (explicit) return explicit
138
+ const root = env.IAPEER_ROOT?.trim()
139
+ if (root) return join(root, 'socks')
140
+ return DEFAULT_SOCK_DIR
129
141
  }
130
142
 
131
143
  // === per-peer cwd scope ===
@@ -57,6 +57,7 @@ export {
57
57
  installAlwaysOnPlist,
58
58
  isFoundationOwnedPlist,
59
59
  launchctlBootstrap,
60
+ bootstrapJobCore,
60
61
  resolveExecutable,
61
62
  IAPEER_PLIST_OWNER_KEY,
62
63
  } from './launchd.ts'
@@ -65,6 +66,9 @@ export type {
65
66
  InstallAlwaysOnPlistOptions,
66
67
  BootstrapResult,
67
68
  BootstrapState,
69
+ BootstrapCoreDeps,
70
+ BootstrapCoreResult,
71
+ LaunchctlRunner,
68
72
  } from './launchd.ts'
69
73
 
70
74
  // ─────────────────────────────────────────────────────────────────────────────
@@ -10,6 +10,7 @@ import { existsSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync
10
10
  import { tmpdir } from 'os'
11
11
  import { join } from 'path'
12
12
  import {
13
+ bootstrapJobCore,
13
14
  getAdapter,
14
15
  installAlwaysOnPlist,
15
16
  isFoundationOwnedPlist,
@@ -326,6 +327,74 @@ describe('resolveExecutable + runtime-bin pinning', () => {
326
327
  })
327
328
  })
328
329
 
330
+ // ─────────────────────────────────────────────────────────────────────────────
331
+ // bootstrapJobCore — the undead-job-safe bootstrap (boris's connect-acceptance
332
+ // find 10.06: bootout → immediate bootstrap → exit 5 I/O error → the whole
333
+ // fleet's telegram router stayed DOWN). Pure DI core: run/sleep injected.
334
+ // ─────────────────────────────────────────────────────────────────────────────
335
+
336
+ describe('bootstrapJobCore (undead-job race)', () => {
337
+ type Call = { args: string[] }
338
+ function harness(script: { printStatuses: number[]; bootstrapStatuses: number[] }) {
339
+ const calls: Call[] = []
340
+ const sleeps: number[] = []
341
+ let printI = 0
342
+ let bootI = 0
343
+ const run = (args: string[]) => {
344
+ calls.push({ args })
345
+ if (args[0] === 'print') {
346
+ const status = script.printStatuses[Math.min(printI, script.printStatuses.length - 1)]!
347
+ printI++
348
+ return { status, stderr: '' }
349
+ }
350
+ const status = script.bootstrapStatuses[Math.min(bootI, script.bootstrapStatuses.length - 1)]!
351
+ bootI++
352
+ return { status, stderr: status === 0 ? '' : 'Bootstrap failed: 5: Input/output error' }
353
+ }
354
+ return { calls, sleeps, deps: { run, sleepMs: (ms: number) => void sleeps.push(ms) } }
355
+ }
356
+
357
+ test('clean path: job not listed, first bootstrap succeeds — zero sleeps', () => {
358
+ const h = harness({ printStatuses: [1], bootstrapStatuses: [0] })
359
+ const r = bootstrapJobCore('501', 'com.iapeer.x', '/p.plist', h.deps)
360
+ expect(r).toEqual({ state: 'loaded', attempts: 1 })
361
+ expect(h.sleeps).toEqual([])
362
+ })
363
+
364
+ test("boris's repro: undead job vanishes after polls, first bootstrap exit 5, retry succeeds", () => {
365
+ // print: listed, listed, gone (the bootout dismantle window) → bootstrap:
366
+ // exit 5 once (still racy), success on the retry after backoff.
367
+ const h = harness({ printStatuses: [0, 0, 1, 1, 1], bootstrapStatuses: [5, 0] })
368
+ const r = bootstrapJobCore('501', 'com.iapeer.arthur', '/p.plist', h.deps)
369
+ expect(r.state).toBe('loaded')
370
+ expect(r.attempts).toBe(2)
371
+ expect(h.sleeps.length).toBeGreaterThan(0) // waited for gone + backoff before retry
372
+ })
373
+
374
+ test('genuinely LIVE job (stays listed through the gone budget) → already-loaded, bootstrap NEVER called', () => {
375
+ const h = harness({ printStatuses: [0], bootstrapStatuses: [0] }) // always listed
376
+ const r = bootstrapJobCore('501', 'com.iapeer.x', '/p.plist', { ...h.deps, goneTimeoutMs: 2_000 })
377
+ expect(r).toEqual({ state: 'already-loaded', attempts: 0 })
378
+ expect(h.calls.some(c => c.args[0] === 'bootstrap')).toBe(false)
379
+ })
380
+
381
+ test('every attempt fails → failed with the attempt count and the last stderr (LOUD, not silent)', () => {
382
+ const h = harness({ printStatuses: [1], bootstrapStatuses: [5] })
383
+ const r = bootstrapJobCore('501', 'com.iapeer.x', '/p.plist', h.deps)
384
+ expect(r.state).toBe('failed')
385
+ expect(r.attempts).toBe(4)
386
+ expect(r.detail).toContain('Input/output error')
387
+ expect(r.detail).toContain('4 bootstrap attempts')
388
+ })
389
+
390
+ test('a racing load between attempts reads already-loaded (idempotent success)', () => {
391
+ // first bootstrap fails; before the retry the job shows up listed (raced in)
392
+ const h = harness({ printStatuses: [1, 0], bootstrapStatuses: [5] })
393
+ const r = bootstrapJobCore('501', 'com.iapeer.x', '/p.plist', h.deps)
394
+ expect(r.state).toBe('already-loaded')
395
+ })
396
+ })
397
+
329
398
  describe('runAlwaysOn guard', () => {
330
399
  test('a non-infra runtime is rejected with exit code 1 (no tmux touched)', async () => {
331
400
  expect(await runAlwaysOn('boris', 'claude', '/tmp/whatever')).toBe(1)
@@ -215,6 +215,89 @@ function isLaunchdLoaded(label: string, uid: string): boolean {
215
215
  return spawnSync('launchctl', ['print', `gui/${uid}/${label}`], { stdio: 'ignore' }).status === 0
216
216
  }
217
217
 
218
+ // ─────────────────────────────────────────────────────────────────────────────
219
+ // UNDEAD-JOB-SAFE bootstrap core (boris's live find 10.06, connect acceptance):
220
+ // after `launchctl bootout` launchd dismantles the job ASYNCHRONOUSLY — an
221
+ // immediate `bootstrap` hits the still-listed "undead" job and fails with
222
+ // exit 5 "Input/output error" (the known PP race class, canon «Жизненный цикл
223
+ // запуска persistent-peer и точки гонки»). On the connect flow that left the
224
+ // WHOLE fleet's telegram router down. This core makes every restart-shaped flow
225
+ // (stop→start, connect router restart, update-runtime) survive the race:
226
+ // (1) WAIT-FOR-GONE: while the job is still listed, poll `print` up to
227
+ // goneTimeoutMs. Vanished → proceed to bootstrap. STILL listed at the
228
+ // deadline → it is a genuinely LIVE job (KeepAlive running), not an undead
229
+ // one → 'already-loaded' (the idempotent no-op, same meaning as before).
230
+ // (2) BOOTSTRAP WITH BACKOFF: attempts with [0, 2 s, 5 s, 15 s] pauses
231
+ // (~22 s budget — covers the observed "manual retry succeeded after ~30 s"
232
+ // window), re-checking gone before each retry. All attempts failed →
233
+ // 'failed' with the attempt count and the last stderr, so the caller can
234
+ // print the manual rescue recipe LOUD instead of leaving the job down
235
+ // silently.
236
+ // Pure DI core (run/sleep injected) — unit-testable without launchctl and
237
+ // without tripping the test-sandbox guard that wraps the public function.
238
+ // ─────────────────────────────────────────────────────────────────────────────
239
+
240
/**
 * Synchronous `launchctl` invoker: receives the argument vector (without the
 * `launchctl` binary name) and reports the exit status plus captured stderr.
 * `status` is `null` when the process produced no exit code.
 */
export interface LaunchctlRunner {
  (args: string[]): { status: number | null; stderr: string }
}

/** Injected dependencies and tunables for {@link bootstrapJobCore}. */
export interface BootstrapCoreDeps {
  /** Runs one `launchctl` command (`print …` or `bootstrap …`). */
  run: LaunchctlRunner
  /** Blocks the caller for the given number of milliseconds. */
  sleepMs: (ms: number) => void
  /** Budget for the undead job to vanish after a bootout (default 10 000 ms). */
  goneTimeoutMs?: number
  /**
   * Pauses BEFORE each bootstrap attempt (default [0, 2000, 5000, 15000]).
   * An empty array is treated as a single immediate attempt.
   */
  backoffMs?: number[]
}

/** Outcome of {@link bootstrapJobCore}. */
export interface BootstrapCoreResult {
  /**
   * - `loaded`: a `bootstrap` call exited 0.
   * - `already-loaded`: the job was found listed, so no (further) bootstrap was needed.
   * - `failed`: every attempt failed and the job is still not listed.
   */
  state: 'loaded' | 'already-loaded' | 'failed'
  /** Number of `bootstrap` invocations that were actually made. */
  attempts: number
  /** Human-readable failure summary; set only when `state` is `failed`. */
  detail?: string
}

/**
 * Bootstraps a launchd job while tolerating the "undead job" window that
 * follows a `bootout`.
 *
 * Phase 1 (wait-for-gone): if `print gui/<uid>/<label>` reports the job as
 * listed, poll every 500 ms for at most `goneTimeoutMs`. A job still listed at
 * the deadline is treated as live and reported as `already-loaded` without
 * calling `bootstrap`.
 *
 * Phase 2 (bootstrap with backoff): one `bootstrap gui/<uid> <plistPath>`
 * attempt per `backoffMs` entry, sleeping for that entry first. Before every
 * retry, and once more after the final failed attempt, the job is re-checked:
 * if it is listed by then (a concurrent load, or a failed attempt that still
 * registered it) the result is `already-loaded`.
 *
 * @param uid       Numeric user id (as a string) of the target `gui/<uid>` domain.
 * @param label     launchd job label.
 * @param plistPath Path of the plist handed to `launchctl bootstrap`.
 * @param deps      Injected runner/sleeper and optional timing overrides.
 * @returns The final state, the number of bootstrap calls made and, on
 *          failure, a detail string carrying the last stderr (or exit code).
 */
export function bootstrapJobCore(
  uid: string,
  label: string,
  plistPath: string,
  deps: BootstrapCoreDeps,
): BootstrapCoreResult {
  const goneTimeout = deps.goneTimeoutMs ?? 10_000
  const configured = deps.backoffMs ?? [0, 2_000, 5_000, 15_000]
  // An empty schedule must not turn into "failed without ever trying".
  const backoffs = configured.length > 0 ? configured : [0]
  const listed = (): boolean => deps.run(['print', `gui/${uid}/${label}`]).status === 0

  // (1) wait-for-gone: keep the result of the last poll instead of spawning a
  // second `print` right after the one that ended the loop.
  if (listed()) {
    const pollStep = 500
    let waited = 0
    let stillListed = true
    while (stillListed && waited < goneTimeout) {
      // Clamp the final step so the wait never exceeds the configured budget.
      const step = Math.min(pollStep, goneTimeout - waited)
      deps.sleepMs(step)
      waited += step
      stillListed = listed()
    }
    if (stillListed) return { state: 'already-loaded', attempts: 0 }
  }

  // (2) bootstrap with backoff; re-verify gone before each retry.
  let last = ''
  for (const [attempt, pause] of backoffs.entries()) {
    if (pause > 0) deps.sleepMs(pause)
    if (attempt > 0 && listed()) {
      // the failed attempt may have half-loaded it, or a race loaded it — success
      return { state: 'already-loaded', attempts: attempt }
    }
    const r = deps.run(['bootstrap', `gui/${uid}`, plistPath])
    if (r.status === 0) return { state: 'loaded', attempts: attempt + 1 }
    last = r.stderr.trim() || `exit ${r.status}`
  }

  // The final attempt deserves the same post-failure check as the earlier
  // ones: a job that is listed now counts as success (idempotent).
  if (listed()) return { state: 'already-loaded', attempts: backoffs.length }

  return {
    state: 'failed',
    attempts: backoffs.length,
    detail: `${backoffs.length} bootstrap attempts failed (last: ${last})`,
  }
}
300
+
218
301
  export type DaemonRestartState =
219
302
  | 'restarted' // kickstart -k succeeded → the daemon is now on the freshly-installed binary
220
303
  | 'not-loaded' // com.agfpd.iapeer is not in the gui domain → nothing to restart (new binary
@@ -286,13 +369,20 @@ export function launchctlBootstrap(
286
369
  return { state: 'skipped-sandbox', label, detail: 'IAPEER_TEST_SANDBOX=1 — not loading a real launchd job' }
287
370
  }
288
371
  const uid = currentUid()
289
- if (isLaunchdLoaded(label, uid)) return { state: 'already-loaded', label }
290
- const r = spawnSync('launchctl', ['bootstrap', `gui/${uid}`, plistPath], { encoding: 'utf8' })
291
- if (r.status === 0) return { state: 'loaded', label }
292
- // A race could have loaded it between the check and the bootstrap; treat a
293
- // now-loaded service as success (still idempotent).
294
- if (isLaunchdLoaded(label, uid)) return { state: 'already-loaded', label }
295
- return { state: 'failed', label, detail: (r.stderr ?? '').trim() || `launchctl bootstrap exited ${r.status}` }
372
+ // UNDEAD-JOB-SAFE core (boris's connect-acceptance find): wait for a booted-out
373
+ // job to actually vanish, then bootstrap with backoff. A genuinely LIVE job
374
+ // reads 'already-loaded' (idempotent no-op, same semantics as before); only a
375
+ // job that stays failing through every attempt reads 'failed'.
376
+ const core = bootstrapJobCore(uid, label, plistPath, {
377
+ run: args => {
378
+ const r = spawnSync('launchctl', args, { encoding: 'utf8' })
379
+ return { status: r.status, stderr: r.stderr ?? '' }
380
+ },
381
+ sleepMs: ms => spawnSync('sleep', [String(ms / 1000)]),
382
+ })
383
+ return core.state === 'failed'
384
+ ? { state: 'failed', label, detail: core.detail }
385
+ : { state: core.state, label }
296
386
  }
297
387
 
298
388
  export interface InstallAlwaysOnPlistOptions {