@triflux/remote 10.0.0-alpha.1 → 10.0.0-alpha.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,671 @@
1
+ // hub/team/conductor.mjs — 세션 오케스트레이션 Conductor
2
+ // native-supervisor.mjs의 spawn/kill을 래핑하되, 상태 머신 + health probe +
3
+ // auto-restart + event log를 추가하여 "조용한 실패"를 구조적으로 불가능하게 만든다.
4
+ //
5
+ // 기존 native-supervisor와의 차이:
6
+ // 1. 상태 머신 (alive/dead → 7 states + 2 terminal)
7
+ // 2. Health probe 4단계 (+ INPUT_WAIT 감지)
8
+ // 3. Auto-restart (maxRestarts=3)
9
+ // 4. JSONL event log (블랙박스 리코더)
10
+
11
+ import { spawn, execFile } from 'node:child_process';
12
+ import { dirname, join } from 'node:path';
13
+ import { homedir } from 'node:os';
14
+ import { mkdirSync, createWriteStream, readFileSync, copyFileSync } from 'node:fs';
15
+ import { EventEmitter } from 'node:events';
16
+
17
+ import { killProcess, IS_WINDOWS } from '@triflux/core/hub/platform.mjs';
18
+ import { createEventLog } from './event-log.mjs';
19
+ import { createHealthProbe } from './health-probe.mjs';
20
+ import { createRemoteProbe } from './remote-probe.mjs';
21
+ import { buildLauncher } from './launcher-template.mjs';
22
+ import { broker } from '@triflux/core/hub/account-broker.mjs';
23
+
24
/**
 * Session lifecycle states.
 *
 * Keys are the symbolic names used throughout the conductor; each value is
 * the lower-cased key and is what appears in `session.state` and in the
 * event log.
 */
export const STATES = Object.freeze(
  Object.fromEntries(
    [
      'INIT',
      'STARTING',
      'HEALTHY',
      'STALLED',
      'INPUT_WAIT',
      'FAILED',
      'RESTARTING',
      'DEAD',
      'COMPLETED',
    ].map((name) => [name, name.toLowerCase()]),
  ),
);
36
+
37
/**
 * Table of legal state transitions: `TRANSITIONS[from]` lists every state
 * that may directly follow `from`. DEAD and COMPLETED have no successors.
 */
const TRANSITIONS = (() => {
  const {
    INIT,
    STARTING,
    HEALTHY,
    STALLED,
    INPUT_WAIT,
    FAILED,
    RESTARTING,
    DEAD,
    COMPLETED,
  } = STATES;

  return Object.freeze({
    [INIT]: [STARTING],
    [STARTING]: [HEALTHY, FAILED],
    [HEALTHY]: [STALLED, INPUT_WAIT, FAILED, COMPLETED],
    [STALLED]: [HEALTHY, FAILED],
    [INPUT_WAIT]: [HEALTHY, FAILED],
    [FAILED]: [RESTARTING, DEAD],
    [RESTARTING]: [STARTING],
    [DEAD]: [],
    [COMPLETED]: [],
  });
})();
49
+
50
/** States that end a session's lifecycle (no outgoing transitions). */
const TERMINAL_STATES = new Set([STATES.DEAD, STATES.COMPLETED]);

/** Default number of automatic restarts before a session is declared DEAD. */
const DEFAULT_MAX_RESTARTS = 3;

/** Default time (ms) a child gets to exit after SIGTERM before it is force-killed. */
const DEFAULT_GRACE_MS = 10 * 1000;
53
+
54
+ /**
55
+ * Conductor 팩토리.
56
+ * @param {object} opts
57
+ * @param {string} opts.logsDir — 이벤트 로그 디렉토리
58
+ * @param {number} [opts.maxRestarts=3]
59
+ * @param {number} [opts.graceMs=10000] — shutdown grace period
60
+ * @param {object} [opts.probeOpts] — health-probe 옵션 오버라이드
61
+ * @returns {Conductor}
62
+ */
63
+ export function createConductor(opts = {}) {
64
+ const {
65
+ logsDir,
66
+ maxRestarts = DEFAULT_MAX_RESTARTS,
67
+ graceMs = DEFAULT_GRACE_MS,
68
+ probeOpts = {},
69
+ } = opts;
70
+
71
+ if (!logsDir) throw new Error('logsDir is required');
72
+ mkdirSync(logsDir, { recursive: true });
73
+
74
+ const emitter = new EventEmitter();
75
+ const sessions = new Map();
76
+ let shuttingDown = false;
77
+
78
+ // 공유 event log (모든 세션 이벤트를 하나의 JSONL에)
79
+ const eventLog = createEventLog(join(logsDir, 'conductor-events.jsonl'));
80
+
81
/**
 * Move a session to `nextState` if the transition table allows it.
 *
 * A rejected transition is recorded as an `invalid_transition` event and
 * leaves the session untouched. An accepted one updates `session.state`,
 * appends a `stateChange` event, emits `stateChange` on the conductor's
 * emitter and, when the new state is terminal, stops the session's probe.
 *
 * @param {object} session
 * @param {string} nextState
 * @param {string} [reason]
 * @returns {boolean} true when the transition was applied
 */
function transition(session, nextState, reason = '') {
  const from = session.state;
  const allowed = TRANSITIONS[from] || [];

  if (allowed.includes(nextState) === false) {
    eventLog.append('invalid_transition', {
      session: session.id,
      from,
      to: nextState,
      reason,
    });
    return false;
  }

  session.state = nextState;

  eventLog.append('stateChange', {
    session: session.id,
    from,
    to: nextState,
    reason,
    restarts: session.restarts,
  });
  emitter.emit('stateChange', { sessionId: session.id, from, to: nextState, reason });

  // A terminal session no longer needs health probing.
  if (TERMINAL_STATES.has(nextState)) {
    session.probe?.stop();
  }
  return true;
}
119
+
120
/**
 * Forcibly terminate a process (and its tree) through the platform helper.
 * Missing, zero or negative pids are ignored.
 *
 * NOTE(review): the original comment states that killProcess uses
 * `taskkill /T /F /PID` on Windows and SIGKILL on POSIX; that behaviour
 * lives in @triflux/core and is not visible from this file.
 *
 * @param {number} pid
 */
function forceKill(pid) {
  const hasUsablePid = Boolean(pid) && !(pid <= 0);
  if (!hasUsablePid) return;

  const killOptions = { signal: 'SIGKILL', tree: true, force: true, timeout: 5000 };
  killProcess(pid, killOptions);
}
128
+
129
/**
 * Kill a remote session's psmux session over SSH.
 *
 * Fire-and-forget: the ssh result is ignored and nothing is propagated to
 * the caller. The SSH user comes from `session.config.sshUser` or, failing
 * that, from `references/hosts.json` under `opts.repoRoot` (default: cwd);
 * the same file may also supply a tailscale IP to use instead of the host
 * name. Without a resolvable SSH user the function does nothing.
 *
 * @param {object} session
 */
function killRemoteSession(session) {
  const { host } = session.config;
  if (!host) return;

  const target = { sshUser: session.config.sshUser, sshIp: host };

  // Resolve ssh_user / IP from hosts.json when it is available.
  try {
    const hostsFile = join(opts.repoRoot || process.cwd(), 'references', 'hosts.json');
    const registry = JSON.parse(readFileSync(hostsFile, 'utf8'));
    const entry = registry.hosts?.[host];
    if (entry) {
      target.sshUser = target.sshUser || entry.ssh_user;
      target.sshIp = entry.tailscale?.ip || host;
    }
  } catch {
    /* no usable hosts.json — fall back to the values from the session config */
  }

  const { sshUser, sshIp } = target;
  if (!sshUser) return;

  const run = opts.deps?.execFile || execFile;
  const sshArgs = [`${sshUser}@${sshIp}`, 'psmux', 'kill-session', '-t', session.id];
  run('ssh', sshArgs, { timeout: 10_000 }, () => {});

  eventLog.append('remote_kill', { session: session.id, host, sshUser, sshIp });
}
154
+
155
/**
 * Stop a session's probe and terminate whatever is running for it.
 *
 * Remote sessions are cleaned up with a fire-and-forget psmux kill over SSH.
 * Local sessions get SIGTERM, then a force-kill if the child has not exited
 * within `graceMs`.
 *
 * A child that has already exited is left alone: its 'exit' event will never
 * fire again, so waiting for it would stall for the full grace period and
 * then force-kill a pid that may by now belong to an unrelated process.
 *
 * @param {object} session
 * @returns {Promise<void>} resolves once the child has exited or been force-killed
 */
async function cleanupChild(session) {
  session.probe?.stop();

  // Remote session — kill the psmux session over SSH.
  if (session.config.remote) {
    killRemoteSession(session);
    return;
  }

  const child = session.child;
  if (!child) return;

  const pid = child.pid;
  if (!pid) return;

  // Already exited (exit code or terminating signal recorded): nothing to kill.
  // Loose `!= null` so objects that do not define these fields are treated as running.
  if (child.exitCode != null || child.signalCode != null) return;

  // Ask politely first.
  try { child.kill('SIGTERM'); } catch { /* already dead */ }

  // Wait for the exit, escalating to a force-kill after the grace period.
  await new Promise((resolve) => {
    const onExit = () => {
      clearTimeout(timer);
      resolve();
    };
    const timer = setTimeout(() => {
      // Do not leave a dangling listener on a child we gave up waiting for.
      child.removeListener('exit', onExit);
      forceKill(pid);
      resolve();
    }, graceMs);
    timer.unref?.();
    child.once('exit', onExit);
  });
}
190
+
191
/**
 * Health-probe callback: decide which state transition (if any) a probe
 * result implies.
 *
 * Results are ignored for terminal sessions and for sessions in INIT or
 * RESTARTING. Every other result is appended to the event log as `health`.
 *
 * @param {object} session
 * @param {object} result — probe result; the fields read here are `l0`,
 *   `l1`, `l3` and `inputWaitPattern`
 */
function handleProbeResult(session, result) {
  if (TERMINAL_STATES.has(session.state)) return;
  if (session.state === STATES.INIT || session.state === STATES.RESTARTING) return;

  eventLog.append('health', {
    session: session.id,
    ...result,
  });

  // L0 failure — local sessions are handled by the child's exit handler;
  // for remote sessions the probe is the only way to notice.
  if (result.l0 === 'fail') {
    if (session.config.remote) {
      handleFailure(session, 'remote_L0_fail');
    }
    return;
  }

  // L3 completed (remote completion token detected).
  if (result.l3 === 'completed' && session.config.remote) {
    // COMPLETED is only reachable from HEALTHY. A session that finishes while
    // still STARTING / STALLED / INPUT_WAIT is first moved to HEALTHY so the
    // completion is not rejected by the transition table.
    if (session.state !== STATES.HEALTHY) {
      transition(session, STATES.HEALTHY, 'remote_completion_token');
    }
    // Only announce completion when the state machine accepted it; otherwise
    // the session stays non-terminal, the probe keeps running, and the
    // completion callbacks would fire again on every subsequent probe.
    if (!transition(session, STATES.COMPLETED, 'remote_completion_token')) return;

    emitter.emit('completed', { sessionId: session.id });
    if (typeof session.config.onCompleted === 'function') {
      session.config.onCompleted({ sessionId: session.id });
    }
    maybeAutoShutdown();
    return;
  }

  // L1 reports the session is waiting for input.
  if (result.l1 === 'input_wait' && session.state === STATES.HEALTHY) {
    transition(session, STATES.INPUT_WAIT, `input_wait:${result.inputWaitPattern}`);
    emitter.emit('inputWait', {
      sessionId: session.id,
      pattern: result.inputWaitPattern,
    });
    return;
  }

  // INPUT_WAIT → back to HEALTHY once output resumes.
  if (session.state === STATES.INPUT_WAIT && result.l1 === 'ok') {
    transition(session, STATES.HEALTHY, 'output_resumed');
    return;
  }

  // L1 stall.
  if (result.l1 === 'stall' && session.state === STATES.HEALTHY) {
    transition(session, STATES.STALLED, 'L1_stall');
    return;
  }

  // STALLED → back to HEALTHY once output resumes.
  if (session.state === STATES.STALLED && result.l1 === 'ok') {
    transition(session, STATES.HEALTHY, 'output_resumed');
    return;
  }

  // L3 timeout while still STARTING.
  if (result.l3 === 'timeout' && session.state === STATES.STARTING) {
    handleFailure(session, 'L3_timeout');
    return;
  }

  // STARTING with L0 ok and L3 ok → HEALTHY.
  if (session.state === STATES.STARTING && result.l0 === 'ok' && result.l3 === 'ok') {
    transition(session, STATES.HEALTHY, 'probe_healthy');
    return;
  }

  // STARTING with L0 ok but L3 undecided → stay in STARTING and keep waiting.
}
264
+
265
/**
 * Handle a session failure: schedule a restart, or declare the session DEAD.
 *
 * Nothing happens when the session is already terminal or when the state
 * machine rejects the move to FAILED. The latter occurs when the old child's
 * exit arrives while a restart is already pending (state RESTARTING);
 * continuing in that case would count a second restart and start a second,
 * concurrent respawn.
 *
 * No restart is scheduled once the conductor is shutting down; the session
 * goes straight to DEAD instead.
 *
 * On DEAD the `dead` event is emitted and, when the session holds a broker
 * account, the account is released as failed (and marked rate-limited for
 * five minutes when the last failure looked like a rate limit).
 *
 * @param {object} session
 * @param {string} reason
 */
function handleFailure(session, reason) {
  if (TERMINAL_STATES.has(session.state)) return;

  // Bail out when FAILED is not reachable from the current state.
  if (!transition(session, STATES.FAILED, reason)) return;

  if (!shuttingDown && session.restarts < maxRestarts) {
    transition(session, STATES.RESTARTING, `restart_${session.restarts + 1}/${maxRestarts}`);
    session.restarts += 1;
    void respawnSession(session);
    return;
  }

  const deadReason = shuttingDown
    ? 'shutting_down'
    : `maxRestarts(${maxRestarts})_exceeded`;
  transition(session, STATES.DEAD, deadReason);
  emitter.emit('dead', { sessionId: session.id, reason });

  // Release the broker account on final death.
  if (broker && session.config.accountId) {
    broker.release(session.config.accountId, { ok: false, failureMode: session.lastFailureMode });
    if (session.lastFailureMode === 'rate_limited') {
      broker.markRateLimited(session.config.accountId, 5 * 60 * 1000);
    }
  }
}
290
+
291
/**
 * (Re)start the local child process of a session.
 *
 * Steps: clean up the previous child, move the session to STARTING, spawn
 * the launcher command through a shell, pipe stdout/stderr into per-session
 * log files, wire the exit/error handlers, and start a fresh health probe.
 *
 * Nothing is spawned when the conductor is shutting down or when the state
 * machine rejects the move to STARTING (e.g. a duplicate respawn request).
 *
 * @param {object} session
 * @returns {Promise<void>}
 */
async function respawnSession(session) {
  // Tear down the previous child (a no-op on the very first spawn).
  await cleanupChild(session);

  // Shutdown may have begun while the old child was being cleaned up;
  // spawning now would leave an unsupervised process behind.
  if (shuttingDown) return;

  // Only spawn when STARTING is actually reachable. A rejected transition
  // means another respawn already took over or the session is finished.
  if (!transition(session, STATES.STARTING, session.restarts > 0 ? 'respawn' : 'initial')) {
    return;
  }

  const launcher = session.launcher;
  const outPath = join(logsDir, `${session.id}.out.log`);
  const errPath = join(logsDir, `${session.id}.err.log`);
  mkdirSync(logsDir, { recursive: true });

  // Spawn BEFORE opening the log streams so that a failed spawn does not
  // leak two open file descriptors per attempt.
  let child;
  try {
    child = spawn(launcher.command, {
      shell: true,
      env: { ...process.env, ...launcher.env, ...(session.config.env || {}) },
      stdio: ['pipe', 'pipe', 'pipe'],
      windowsHide: true,
    });
  } catch (err) {
    eventLog.append('spawn_error', { session: session.id, error: err.message });
    handleFailure(session, `spawn_error:${err.message}`);
    return;
  }

  const outWs = createWriteStream(outPath, { flags: 'a' });
  const errWs = createWriteStream(errPath, { flags: 'a' });

  // A stream without an 'error' listener turns any I/O error (full disk,
  // EPIPE on a closed stdin, …) into an uncaught exception.
  const logStreamError = (stream) => (err) => {
    eventLog.append('stream_error', { session: session.id, stream, error: err.message });
  };
  outWs.on('error', logStreamError('out_log'));
  errWs.on('error', logStreamError('err_log'));
  child.stdin?.on('error', logStreamError('stdin'));

  let outputBytes = 0;
  let recentOutput = '';

  session.child = child;
  session.outPath = outPath;
  session.errPath = errPath;

  eventLog.append('spawn', {
    session: session.id,
    agent: session.config.agent,
    pid: child.pid,
    command: launcher.command,
    restart: session.restarts,
  });

  // Track stdout and stderr together so a child that only writes to stderr
  // still counts as making progress.
  const trackOutput = (buf) => {
    outputBytes += buf.length;
    // Keep only the most recent 2 KB (used for input-wait pattern detection).
    recentOutput += String(buf);
    if (recentOutput.length > 2048) {
      recentOutput = recentOutput.slice(-2048);
    }
  };

  child.stdout?.on('data', (buf) => { outWs.write(buf); trackOutput(buf); });
  child.stderr?.on('data', (buf) => { errWs.write(buf); trackOutput(buf); });

  child.on('exit', (code, signal) => {
    // A late exit from a child that has already been replaced must not
    // affect the replacement's liveness or state.
    const isCurrent = session.child === child;
    if (isCurrent) session.alive = false;

    try { outWs.end(); } catch { /* ignore */ }
    try { errWs.end(); } catch { /* ignore */ }

    eventLog.append('exit', {
      session: session.id,
      code,
      signal,
      restart: session.restarts,
    });

    if (!isCurrent) return;
    if (TERMINAL_STATES.has(session.state)) return;
    // A restart is already pending; this exit is the old child being cleaned up.
    if (session.state === STATES.RESTARTING) return;

    if (code === 0 && !signal) {
      transition(session, STATES.COMPLETED, 'exit_0');
      emitter.emit('completed', { sessionId: session.id });
      if (typeof session.config.onCompleted === 'function') {
        session.config.onCompleted({ sessionId: session.id });
      }
      if (broker && session.config.accountId) {
        broker.release(session.config.accountId, { ok: true });
      }
    } else {
      // Detect a rate limit from the recent output before handling the failure.
      if (/(rate.?limit|quota|throttl|too.many.requests|429|usage.limit)/ui.test(recentOutput)) {
        session.lastFailureMode = 'rate_limited';
      }
      handleFailure(session, `exit_code:${code},signal:${signal}`);
    }

    maybeAutoShutdown();
  });

  child.on('error', (err) => {
    eventLog.append('child_error', { session: session.id, error: err.message });
    if (session.child !== child) return; // stale child, already replaced
    session.alive = false;
    if (!TERMINAL_STATES.has(session.state) && session.state !== STATES.RESTARTING) {
      handleFailure(session, `child_error:${err.message}`);
    }
  });

  session.alive = true;

  // Replace any previous health probe with one bound to the new child.
  session.probe?.stop();
  const probe = createHealthProbe(
    {
      get pid() { return child.pid; },
      get alive() { return session.alive; },
      getOutputBytes: () => outputBytes,
      getRecentOutput: () => recentOutput,
    },
    {
      ...probeOpts,
      onProbe: (result) => handleProbeResult(session, result),
    },
  );
  session.probe = probe;
  probe.start();
}
412
+
413
/**
 * Start supervising a remote session.
 *
 * No child process is created here: the psmux session is assumed to have
 * been created on the remote host already (by remote-spawn.mjs, per the
 * original note). The session is moved to STARTING and a remote probe is
 * attached that feeds its results into handleProbeResult.
 *
 * @param {object} session
 */
function startRemoteSession(session) {
  transition(session, STATES.STARTING, 'remote_initial');

  const { host, paneTarget, sessionName } = session.config;
  const name = sessionName || session.id;
  const target = {
    host,
    paneTarget: paneTarget || `${name}:0.0`,
    sessionName: name,
  };

  eventLog.append('remote_start', { session: session.id, ...target });

  session.alive = true;

  // Swap in a fresh remote health probe.
  session.probe?.stop();
  session.probe = createRemoteProbe(target, {
    ...probeOpts,
    onProbe: (result) => handleProbeResult(session, result),
  });
  session.probe.start();
}
449
+
450
/**
 * Emit `allCompleted` when at least one session exists and every session
 * is in a terminal state. Does nothing while the conductor is shutting down.
 */
function maybeAutoShutdown() {
  if (shuttingDown) return;

  for (const session of sessions.values()) {
    if (!TERMINAL_STATES.has(session.state)) return;
  }
  if (sessions.size === 0) return;

  emitter.emit('allCompleted');
}
462
+
463
+ // ── Public API ──────────────────────────────────────────────
464
+
465
/**
 * Register and start a new session.
 *
 * Local sessions lease an account from the broker (when one is configured),
 * apply the lease's profile/env, copy the lease's auth file into place when
 * the lease is in `auth` mode, build a launcher and spawn the child.
 * Remote sessions skip all of that and only attach a remote probe.
 *
 * When the broker has no account available the session is skipped: an event
 * is logged, nothing is registered, and the id is still returned.
 *
 * @param {object} config
 * @param {string} config.id — session id (unique)
 * @param {'codex'|'gemini'|'claude'} config.agent
 * @param {string} config.prompt
 * @param {string} [config.profile]
 * @param {string} [config.workdir]
 * @param {string} [config.model]
 * @param {boolean} [config.remote=false] — whether this is a remote session
 * @param {string} [config.host] — SSH host (required when remote=true)
 * @param {string} [config.paneTarget] — psmux pane target (remote only)
 * @param {string} [config.sessionName] — psmux session name (remote only)
 * @param {function} [config.onCompleted] — called with `{ sessionId }` on completion
 * @returns {string} session id
 * @throws {Error} when shutting down, the id is missing or already in use,
 *   or a remote session has no host
 */
function spawnSession(config) {
  if (shuttingDown) throw new Error('Conductor is shutting down');
  if (!config.id) throw new Error('session id is required');
  if (sessions.has(config.id)) throw new Error(`Session "${config.id}" already exists`);
  if (config.remote && !config.host) throw new Error('host is required for remote sessions');

  // Broker lease (graceful — broker is null when accounts.json is absent).
  let lease = null;
  if (broker && config.agent && !config.remote) {
    lease = broker.lease({ provider: config.agent });
    if (lease === null) {
      const eta = broker.nextAvailableEta(config.agent);
      eventLog.append('broker_no_lease', {
        session: config.id,
        agent: config.agent,
        eta: eta ? new Date(eta).toISOString() : 'unknown',
      });
      // PoC: skip session when all accounts in cooldown
      return config.id;
    }
  }

  // Apply the lease's profile/env to a copy of the config (input is not mutated).
  const resolvedConfig = lease
    ? {
      ...config,
      profile: lease.profile ?? config.profile,
      env: { ...(config.env || {}), ...(lease.env || {}) },
      accountId: lease.id,
    }
    : config;

  // Auth file copy — the broker resolved the absolute source path, the
  // conductor performs the copy.
  if (lease?.mode === 'auth' && lease.authFile) {
    const home = homedir();
    // Destinations are known per agent. Agents without an entry are skipped:
    // falling back to another agent's paths would overwrite that agent's
    // credentials with an unrelated auth file.
    const authDestsByAgent = new Map([
      ['codex', [join(home, '.codex', 'auth.json')]],
      ['gemini', [
        join(home, '.gemini', 'oauth_creds.json'),
        join(home, '.gemini', 'gemini-credentials.json'),
      ]],
    ]);
    const dests = authDestsByAgent.get(config.agent);
    if (!dests) {
      eventLog.append('auth_copy_error', {
        session: config.id,
        dest: null,
        error: `no auth destination known for agent "${config.agent}"`,
      });
    }
    for (const dest of dests ?? []) {
      try {
        mkdirSync(dirname(dest), { recursive: true });
        copyFileSync(lease.authFile, dest);
        eventLog.append('auth_copy', { session: config.id, agent: config.agent, dest });
      } catch (err) {
        eventLog.append('auth_copy_error', { session: config.id, dest, error: err.message });
      }
    }
  }

  // Remote sessions need no launcher (they are already running remotely).
  // NOTE(review): if buildLauncher throws, the lease taken above is never
  // released — confirm the intended broker.release semantics for that case.
  const launcher = resolvedConfig.remote
    ? null
    : buildLauncher({
      agent: resolvedConfig.agent,
      profile: resolvedConfig.profile,
      prompt: resolvedConfig.prompt,
      workdir: resolvedConfig.workdir,
      model: resolvedConfig.model,
    });

  const session = {
    id: resolvedConfig.id,
    config: resolvedConfig,
    launcher,
    state: STATES.INIT,
    child: null,
    probe: null,
    alive: false,
    restarts: 0,
    outPath: null,
    errPath: null,
    createdAt: Date.now(),
  };

  sessions.set(resolvedConfig.id, session);

  if (resolvedConfig.remote) {
    startRemoteSession(session);
  } else {
    // Fire-and-forget: failures surface through the state machine and event log.
    void respawnSession(session);
  }
  return resolvedConfig.id;
}
566
+
567
/**
 * Kill a session and mark it DEAD.
 *
 * The session is moved to DEAD *before* its child is signalled. The child's
 * exit handler treats any non-zero/signalled exit of a non-terminal session
 * as a failure and schedules a restart, so signalling first would turn a
 * kill into a restart.
 *
 * Unknown ids and already-terminal sessions are ignored.
 *
 * NOTE(review): no broker account is released here; confirm whether a
 * user-initiated kill should release `session.config.accountId`.
 *
 * @param {string} id
 * @param {string} [reason]
 * @returns {Promise<void>}
 */
async function killSession(id, reason = 'user_kill') {
  const session = sessions.get(id);
  if (!session) return;
  if (TERMINAL_STATES.has(session.state)) return;

  eventLog.append('kill', { session: id, reason });

  // FAILED is the only route to DEAD in the transition table.
  const markDead = () =>
    transition(session, STATES.FAILED, reason) && transition(session, STATES.DEAD, reason);

  markDead();
  await cleanupChild(session);

  // The first attempt is rejected for states that cannot fail (INIT,
  // RESTARTING). If the session has since moved on to a killable state,
  // mark it dead now and clean up whatever was spawned in the meantime.
  if (!TERMINAL_STATES.has(session.state) && markDead()) {
    await cleanupChild(session);
  }
}
582
+
583
/**
 * Write a line of text to a local session's stdin (used to answer an
 * input prompt).
 *
 * Remote sessions are not supported here; the attempt is logged and
 * rejected. Nothing is written when the session has no child or the
 * child's stdin is already closed — writing to a destroyed stream reports
 * its error asynchronously, where the try/catch below cannot see it.
 *
 * @param {string} id
 * @param {string} text — coerced to a string; a newline is appended
 * @returns {boolean} true when the text was handed to the child's stdin
 */
function sendInput(id, text) {
  const session = sessions.get(id);
  if (!session) return false;

  // Remote session — stdin is not supported (psmux send-keys is a separate path).
  if (session.config.remote) {
    eventLog.append('stdin_remote_unsupported', { session: id });
    return false;
  }

  const stdin = session.child?.stdin;
  if (!stdin || stdin.destroyed || stdin.writable === false) return false;

  // Coerce once so that logging a non-string input cannot fail after the
  // write has already succeeded.
  const payload = String(text);
  try {
    stdin.write(`${payload}\n`);
  } catch (err) {
    eventLog.append('stdin_error', { session: id, error: err.message });
    return false;
  }

  // Only the first 100 characters are logged.
  eventLog.append('stdin', { session: id, text: payload.slice(0, 100) });
  return true;
}
607
+
608
/**
 * Build a point-in-time summary of every registered session.
 *
 * @returns {object[]} one plain object per session with id, agent, state,
 *   pid, remote/host info, restart count, probe status, log paths and
 *   creation time
 */
function getSnapshot() {
  const snapshot = [];
  for (const session of sessions.values()) {
    const { config, child, probe } = session;
    snapshot.push({
      id: session.id,
      agent: config.agent,
      state: session.state,
      pid: child?.pid || null,
      remote: config.remote || false,
      host: config.host || null,
      restarts: session.restarts,
      health: probe?.getStatus() || null,
      outPath: session.outPath,
      errPath: session.errPath,
      createdAt: session.createdAt,
    });
  }
  return snapshot;
}
627
+
628
/**
 * Gracefully shut the conductor down: terminate every live session, flush
 * and close the event log, remove the process signal traps and emit
 * `shutdown`. Subsequent calls are no-ops.
 *
 * Each session is marked DEAD *before* its child is signalled; otherwise
 * the child's exit handler would see a non-terminal session dying from a
 * signal and schedule a restart in the middle of the shutdown.
 *
 * NOTE(review): broker accounts held by killed sessions are not released
 * here; confirm whether shutdown should release them.
 *
 * @param {string} [reason]
 * @returns {Promise<void>}
 */
async function shutdown(reason = 'shutdown') {
  if (shuttingDown) return;
  shuttingDown = true;

  eventLog.append('shutdown', { reason, sessions: sessions.size });

  const markDead = (s) => {
    transition(s, STATES.FAILED, reason);
    transition(s, STATES.DEAD, reason);
  };

  const cleanups = [...sessions.values()]
    .filter((s) => !TERMINAL_STATES.has(s.state))
    .map(async (s) => {
      s.probe?.stop();
      markDead(s);
      await cleanupChild(s);
      // Safety net for sessions whose state did not permit FAILED above but
      // changed while their child was being cleaned up.
      if (!TERMINAL_STATES.has(s.state)) {
        markDead(s);
      }
    });

  await Promise.allSettled(cleanups);

  try {
    await eventLog.flush();
    await eventLog.close();
  } finally {
    // Release the signal traps installed by this conductor so a finished
    // conductor neither accumulates listeners nor keeps swallowing
    // SIGINT/SIGTERM for the rest of the process lifetime.
    process.off('SIGINT', onSignal);
    process.off('SIGTERM', onSignal);
  }

  emitter.emit('shutdown');
}
653
+
654
+ // Shutdown traps
655
+ const onSignal = () => { void shutdown('signal'); };
656
+ process.on('SIGINT', onSignal);
657
+ process.on('SIGTERM', onSignal);
658
+
659
+ return Object.freeze({
660
+ spawnSession,
661
+ killSession,
662
+ sendInput,
663
+ getSnapshot,
664
+ shutdown,
665
+ on: emitter.on.bind(emitter),
666
+ off: emitter.off.bind(emitter),
667
+ get sessionCount() { return sessions.size; },
668
+ get isShuttingDown() { return shuttingDown; },
669
+ get eventLogPath() { return eventLog.filePath; },
670
+ });
671
+ }