triflux 10.14.0 → 10.14.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/bin/triflux.mjs CHANGED
@@ -61,6 +61,7 @@ import {
61
61
  probePsmuxSupport,
62
62
  } from "../scripts/lib/psmux-info.mjs";
63
63
  import {
64
+ buildWindowsHubAutostartCommand,
64
65
  cleanupStaleSkills,
65
66
  ensureCodexHubServerConfig,
66
67
  ensureCodexProfiles,
@@ -68,6 +69,7 @@ import {
68
69
  extractManagedHookFilename,
69
70
  getManagedRegistryHooks,
70
71
  getVersion,
72
+ getWindowsHubAutostartStatus,
71
73
  hasProfileSection,
72
74
  LEGACY_CODEX_MODELS,
73
75
  REQUIRED_CODEX_PROFILES,
@@ -123,7 +125,7 @@ const NORMALIZED_ARGS = RAW_ARGS.filter((arg) => arg !== "--json");
123
125
 
124
126
  const CLI_COMMAND_SCHEMAS = Object.freeze({
125
127
  setup: {
126
- usage: "tfx setup [--dry-run]",
128
+ usage: "tfx setup [--dry-run] [--enable-hub-autostart]",
127
129
  description: "파일 동기화 + HUD/MCP 설정",
128
130
  options: [
129
131
  {
@@ -131,6 +133,12 @@ const CLI_COMMAND_SCHEMAS = Object.freeze({
131
133
  type: "boolean",
132
134
  description: "실제 변경 없이 예정 작업을 JSON으로 출력",
133
135
  },
136
+ {
137
+ name: "--enable-hub-autostart",
138
+ type: "boolean",
139
+ description:
140
+ "Windows 로그인 시 tfx-hub를 보장하는 Task Scheduler 항목 등록",
141
+ },
134
142
  ],
135
143
  },
136
144
  doctor: {
@@ -1115,6 +1123,16 @@ function buildSetupDryRunPlan() {
1115
1123
  const defaultHubUrl = `http://127.0.0.1:${process.env.TFX_HUB_PORT || "27888"}/mcp`;
1116
1124
  actions.push(...previewMcpRegistrationActions(defaultHubUrl));
1117
1125
  actions.push(previewStatusLineAction());
1126
+ const autostart = getWindowsHubAutostartStatus();
1127
+ actions.push({
1128
+ type: "hub-autostart",
1129
+ platform: process.platform,
1130
+ taskName: autostart.taskName,
1131
+ change: autostart.supported && !autostart.registered ? "available" : "noop",
1132
+ registered: autostart.registered,
1133
+ command: autostart.supported ? buildWindowsHubAutostartCommand() : null,
1134
+ enableWith: "tfx setup --enable-hub-autostart",
1135
+ });
1118
1136
 
1119
1137
  return {
1120
1138
  dry_run: true,
@@ -1123,7 +1141,12 @@ function buildSetupDryRunPlan() {
1123
1141
  }
1124
1142
 
1125
1143
  function cmdSetup(options = {}) {
1126
- const { dryRun = false, overrideVersion, skipClaudeMdSync = false } = options;
1144
+ const {
1145
+ dryRun = false,
1146
+ overrideVersion,
1147
+ skipClaudeMdSync = false,
1148
+ enableHubAutostart = false,
1149
+ } = options;
1127
1150
  if (dryRun) {
1128
1151
  printJson(buildSetupDryRunPlan());
1129
1152
  return;
@@ -1351,6 +1374,67 @@ function cmdSetup(options = {}) {
1351
1374
  console.log("");
1352
1375
  }
1353
1376
 
1377
+ if (process.platform === "win32") {
1378
+ const status = getWindowsHubAutostartStatus();
1379
+ if (enableHubAutostart) {
1380
+ try {
1381
+ const script = join(PKG_ROOT, "scripts", "setup.mjs");
1382
+ execFileSync(
1383
+ process.execPath,
1384
+ [script, "--enable-hub-autostart", "--sync"],
1385
+ {
1386
+ stdio: ["ignore", "pipe", "pipe"],
1387
+ timeout: 10000,
1388
+ windowsHide: true,
1389
+ },
1390
+ );
1391
+ // subprocess silent-catch 회귀 가드: schtasks /Query 로 실제 등록 재검증.
1392
+ const verified = getWindowsHubAutostartStatus();
1393
+ if (verified.registered) {
1394
+ ok(`Hub autostart: ${verified.taskName} 등록됨`);
1395
+ summary.push({
1396
+ item: "Hub autostart",
1397
+ status: "✅",
1398
+ detail: `${verified.taskName} 등록됨`,
1399
+ });
1400
+ } else {
1401
+ warn(
1402
+ "Hub autostart 등록 실패: subprocess 성공했으나 /Query 에서 미발견",
1403
+ );
1404
+ summary.push({
1405
+ item: "Hub autostart",
1406
+ status: "⚠️",
1407
+ detail: "등록 실패 (subprocess silent catch 의심)",
1408
+ });
1409
+ }
1410
+ } catch (error) {
1411
+ warn(`Hub autostart 등록 실패: ${renderErrorMessage(error.message)}`);
1412
+ summary.push({
1413
+ item: "Hub autostart",
1414
+ status: "⚠️",
1415
+ detail: "등록 실패",
1416
+ });
1417
+ }
1418
+ } else if (status.registered) {
1419
+ ok(`Hub autostart: ${status.taskName} 이미 등록됨`);
1420
+ summary.push({
1421
+ item: "Hub autostart",
1422
+ status: "✅",
1423
+ detail: "이미 등록됨",
1424
+ });
1425
+ } else {
1426
+ warn(
1427
+ "Hub autostart 미등록 — Codex 단독 시작 전 hub가 죽어 있으면 MCP가 실패할 수 있음",
1428
+ );
1429
+ info("등록: tfx setup --enable-hub-autostart");
1430
+ summary.push({
1431
+ item: "Hub autostart",
1432
+ status: "⏭️",
1433
+ detail: "미등록",
1434
+ });
1435
+ }
1436
+ }
1437
+
1354
1438
  // HUD statusLine 설정
1355
1439
  console.log(`${CYAN}[HUD 설정]${RESET}`);
1356
1440
  const settingsPath = join(CLAUDE_DIR, "settings.json");
@@ -5579,7 +5663,10 @@ async function main() {
5579
5663
 
5580
5664
  switch (cmd) {
5581
5665
  case "setup":
5582
- cmdSetup({ dryRun: cmdArgs.includes("--dry-run") });
5666
+ cmdSetup({
5667
+ dryRun: cmdArgs.includes("--dry-run"),
5668
+ enableHubAutostart: cmdArgs.includes("--enable-hub-autostart"),
5669
+ });
5583
5670
  return;
5584
5671
  case "doctor": {
5585
5672
  if (cmdArgs.includes("--audit")) {
@@ -696,6 +696,10 @@ export function createConductor(opts = {}) {
696
696
  },
697
697
  {
698
698
  ...probeOpts,
699
+ // #165: default off → on. atomic write (#162) 로 race 제거됨.
700
+ // opt-out: TFX_PROBE_WRITE_STATE=0 명시.
701
+ writeStateFile:
702
+ probeOpts.writeStateFile ?? process.env.TFX_PROBE_WRITE_STATE !== "0",
699
703
  onProbe: (result) => handleProbeResult(session, result),
700
704
  },
701
705
  );
@@ -2,6 +2,10 @@
2
2
  // 기존 cli-adapter-base.mjs:stallThresholdMs(30s)와 headless.mjs:STALL_DEFAULTS(120s)를
3
3
  // 4단계 probe 모델로 교체. stdout+stderr 통합 스트림으로 평가 (F3 해결).
4
4
 
5
+ import { mkdirSync, renameSync, unlinkSync, writeFileSync } from "node:fs";
6
+ import { tmpdir } from "node:os";
7
+ import { dirname, join } from "node:path";
8
+
5
9
  /**
6
10
  * Health probe level 정의.
7
11
  * L0: Process alive (PID 존재 + exit code 없음)
@@ -25,6 +29,8 @@ export const PROBE_DEFAULTS = Object.freeze({
25
29
  l2ThresholdMs: 30_000,
26
30
  l3ThresholdMs: 120_000,
27
31
  enableL2: false,
32
+ writeStateFile: false,
33
+ stateDir: join(tmpdir(), "tfx-probe"),
28
34
  });
29
35
 
30
36
  /**
@@ -78,6 +84,16 @@ export function createHealthProbe(session, opts = {}) {
78
84
  const config = { ...PROBE_DEFAULTS, ...opts };
79
85
  let timer = null;
80
86
  let started = false;
87
+ // stopped flag는 in-flight probe()가 stop() 사이의 await 점에서
88
+ // writeState/unlink 와 race 하는 것을 막는다. start() 시 false 로 reset.
89
+ let stopped = false;
90
+ // P0 (#167 review): start() 마다 증가. probe() 가 시작 시 캡처 → writeState() 가 epoch
91
+ // 비교로 stop()→start() 사이의 in-flight 가 새 run 의 state 를 덮는 race 차단.
92
+ // stopped flag 만으로는 start() 가 stopped=false 로 reset 한 직후 race 못 막음.
93
+ let runEpoch = 0;
94
+ // P1-1 (#167 review): Set 으로 모든 in-flight probe 추적. interval 이 빨라 N+1 이 N 끝나기
95
+ // 전에 시작되면 단일 var 는 N 을 덮어써 stopAndDrain() 시 N 이 누락된다.
96
+ const inFlightProbes = new Set();
81
97
 
82
98
  // L1 tracking
83
99
  let lastOutputBytes = 0;
@@ -96,6 +112,97 @@ export function createHealthProbe(session, opts = {}) {
96
112
  inputWaitPattern: null,
97
113
  };
98
114
 
115
/**
 * Resolve the path of the advisory probe state file.
 *
 * An explicit, non-empty `config.stateFile` always wins. Otherwise the path
 * is `<config.stateDir>/<pid>.json`, derived from the session's pid.
 *
 * @returns {string|null} Target path, or null when no explicit file is
 *   configured and the session has no usable (positive) pid.
 */
function getStateFilePath() {
  const explicitPath = config.stateFile;
  if (typeof explicitPath === "string" && explicitPath.length > 0) {
    return explicitPath;
  }

  const sessionPid = session.pid;
  const hasUsablePid = !(sessionPid == null || sessionPid <= 0);
  return hasUsablePid ? join(config.stateDir, `${sessionPid}.json`) : null;
}
123
+
124
/**
 * Collapse a probe result into a single coarse state label.
 *
 * Rules are checked top to bottom and the first match wins, so a dead
 * process outranks everything, and an L2 failure outranks an L1 stall.
 *
 * @param {object} result - Probe result carrying `l0`..`l3` outcomes.
 * @returns {"exited"|"input_wait"|"mcp_initializing"|"stalled"|"reasoning"|"active"}
 */
function deriveState(result) {
  // [level, outcome that triggers the rule, resulting state]
  const orderedRules = [
    ["l0", "fail", "exited"],
    ["l1", "input_wait", "input_wait"],
    ["l2", "fail", "mcp_initializing"],
    ["l1", "stall", "stalled"],
    ["l3", "timeout", "reasoning"],
  ];
  const matched = orderedRules.find(
    ([level, outcome]) => result[level] === outcome,
  );
  return matched ? matched[2] : "active";
}
132
+
133
/**
 * Persist the latest probe result to the advisory state file.
 *
 * The write is skipped when the probe has been stopped, when the probe that
 * produced `result` belongs to an earlier run (its epoch differs from the
 * current `runEpoch`), when state-file output is not enabled, or when no
 * target path can be resolved.
 *
 * The file is written to a sibling temp path and renamed into place so that
 * readers never observe a partially written file. The state is advisory, so
 * every failure — including a failure to serialize the payload — is swallowed
 * here and never propagates into the probe itself.
 *
 * @param {object} result - Probe result; `result.ts` is an epoch-ms timestamp.
 * @param {number} [probeEpoch] - Run epoch captured when the probe started.
 *   When omitted, the stale-run check is skipped.
 * @returns {void}
 */
function writeState(result, probeEpoch) {
  // After stop(), an in-flight probe must not recreate the file.
  if (stopped) return;
  // A probe from a previous run must not overwrite the new run's state.
  if (probeEpoch !== undefined && probeEpoch !== runEpoch) return;
  if (!config.writeStateFile && !config.stateFile) return;
  const stateFile = getStateFilePath();
  if (!stateFile) return;

  // The temp file lives next to the target so the rename never crosses devices.
  const tmpPath = `${stateFile}.tmp-${process.pid}-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
  try {
    // Built inside the try: an invalid `result.ts` makes toISOString() throw,
    // and that must not escape an advisory-only writer.
    const payload = `${JSON.stringify(
      {
        pid: session.pid ?? null,
        state: deriveState(result),
        result,
        updatedAt: new Date(result.ts).toISOString(),
      },
      null,
      2,
    )}\n`;

    mkdirSync(dirname(stateFile), { recursive: true });
    writeFileSync(tmpPath, payload, "utf8");
    try {
      renameSync(tmpPath, stateFile);
    } catch (renameErr) {
      // Renaming onto an existing target can fail with these codes on Windows.
      const code = renameErr?.code;
      const targetInTheWay =
        code === "EEXIST" || code === "EPERM" || code === "EACCES";
      if (!targetInTheWay) throw renameErr;

      // Backup-then-swap: move the old file aside instead of deleting it, so
      // a failed second rename cannot lose both the old file and the new one.
      const backupPath = `${stateFile}.old-${process.pid}-${Date.now()}`;
      let backupCreated = false;
      try {
        renameSync(stateFile, backupPath);
        backupCreated = true;
      } catch {
        // Backup failed (target missing or locked) — still try the swap.
      }

      try {
        renameSync(tmpPath, stateFile);
      } catch (secondErr) {
        // Second attempt failed too: put the previous file back.
        if (backupCreated) {
          try {
            renameSync(backupPath, stateFile);
          } catch {
            // Nothing more can be done; the outer handler removes the temp file.
          }
        }
        throw secondErr;
      }

      if (backupCreated) {
        try {
          unlinkSync(backupPath);
        } catch {
          // A leftover backup is harmless.
        }
      }
    }
  } catch {
    // Probe state is advisory only — just make sure the temp file is gone.
    try {
      unlinkSync(tmpPath);
    } catch {
      // The temp file may never have been created.
    }
  }
}
205
+
99
206
  /**
100
207
  * L0: Process alive check.
101
208
  */
@@ -215,29 +322,44 @@ export function createHealthProbe(session, opts = {}) {
215
322
 
216
323
  /**
217
324
  * 전체 probe 실행 (L0→L1→L2→L3).
218
- * @returns {Promise<object>} probe 결과
325
+ * @returns {Promise<object|null>} probe 결과. stop() 이후 호출이면 null.
219
326
  */
220
327
  async function probe() {
221
- const result = {
222
- l0: probeL0(),
223
- l1: probeL1(),
224
- l2: await probeL2(),
225
- l3: probeL3(),
226
- inputWaitPattern: status.inputWaitPattern,
227
- ts: Date.now(),
228
- };
229
- status.lastProbeAt = result.ts;
230
-
231
- if (typeof config.onProbe === "function") {
232
- config.onProbe(result);
233
- }
328
+ if (stopped) return null;
329
+ // P0 (#167): probe 시작 시 epoch 캡처. start() 가 새 epoch 로 바꾸면 이 probe 의
330
+ // writeState 는 stale 로 판정 → skip.
331
+ const probeEpoch = runEpoch;
332
+ const promise = (async () => {
333
+ const result = {
334
+ l0: probeL0(),
335
+ l1: probeL1(),
336
+ l2: await probeL2(),
337
+ l3: probeL3(),
338
+ inputWaitPattern: status.inputWaitPattern,
339
+ ts: Date.now(),
340
+ };
341
+ status.lastProbeAt = result.ts;
342
+ writeState(result, probeEpoch);
343
+
344
+ if (typeof config.onProbe === "function") {
345
+ config.onProbe(result);
346
+ }
234
347
 
235
- return result;
348
+ return result;
349
+ })();
350
+ // P1-1 (#167): Set 으로 모든 in-flight 추적. 단일 var 패턴은 N+1 이 N 끝나기 전에
351
+ // 시작되면 N 을 덮어써 stopAndDrain() 시 N 이 누락된다.
352
+ inFlightProbes.add(promise);
353
+ promise.finally(() => inFlightProbes.delete(promise));
354
+ return promise;
236
355
  }
237
356
 
238
357
  function start() {
239
358
  if (started) return;
240
359
  started = true;
360
+ stopped = false;
361
+ // P0 (#167): epoch 증가 — 이전 run 의 in-flight probe 가 새 run 의 state 를 덮지 못하게.
362
+ runEpoch += 1;
241
363
  spawnedAt = Date.now();
242
364
  lastOutputChangeAt = Date.now();
243
365
  lastOutputBytes = 0;
@@ -255,10 +377,32 @@ export function createHealthProbe(session, opts = {}) {
255
377
  function stop() {
256
378
  if (!started) return;
257
379
  started = false;
380
+ // stopped 를 먼저 set 해야 in-flight probe()의 writeState() 가 skip 된다.
381
+ // 이후 unlink — in-flight 가 끝나도 writeState 가 no-op 이므로 재생성 없음.
382
+ stopped = true;
258
383
  if (timer) {
259
384
  clearInterval(timer);
260
385
  timer = null;
261
386
  }
387
+ if (config.writeStateFile || config.stateFile) {
388
+ try {
389
+ const stateFile = getStateFilePath();
390
+ if (stateFile) unlinkSync(stateFile);
391
+ } catch {}
392
+ }
393
+ }
394
+
395
/**
 * Call stop(), then wait until every in-flight probe() has settled.
 *
 * Meant for tests and teardown paths that need a deterministic end. Callers
 * that only need the synchronous behavior can keep calling stop() directly.
 *
 * @returns {Promise<void>} Resolves once all tracked probes have settled;
 *   probe failures are absorbed by allSettled and never reject this promise.
 */
async function stopAndDrain() {
  stop();
  if (inFlightProbes.size === 0) return;
  // Snapshot the set: entries remove themselves as they settle.
  await Promise.allSettled([...inFlightProbes]);
}
263
407
 
264
408
  /** L1 tracking 리셋 (restart 후 호출) */
@@ -277,6 +421,7 @@ export function createHealthProbe(session, opts = {}) {
277
421
  return Object.freeze({
278
422
  start,
279
423
  stop,
424
+ stopAndDrain,
280
425
  probe,
281
426
  resetTracking,
282
427
  getStatus: () => ({ ...status }),
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "triflux",
3
- "version": "10.14.0",
3
+ "version": "10.14.2",
4
4
  "description": "CLI-first multi-model orchestrator for Claude Code — route tasks to Codex, Gemini, and Claude",
5
5
  "type": "module",
6
6
  "bin": {
@@ -13,6 +13,7 @@ import { fileURLToPath } from "url";
13
13
  const LOOPBACK_HOSTS = new Set(["127.0.0.1", "localhost", "::1"]);
14
14
  const PLUGIN_ROOT = dirname(dirname(fileURLToPath(import.meta.url)));
15
15
  const HUB_PID_FILE = join(homedir(), ".claude", "cache", "tfx-hub", "hub.pid");
16
+ const HUB_DEFAULT_PORT = 27888;
16
17
 
17
18
  function formatHostForUrl(host) {
18
19
  return host.includes(":") ? `[${host}]` : host;
@@ -34,29 +35,33 @@ async function syncHubConfigsIfAvailable({ hubUrl }) {
34
35
  await mod.syncCodexHubUrl({ hubUrl });
35
36
  }
36
37
  if (typeof mod?.syncProjectMcpJson === "function") {
37
- await mod.syncProjectMcpJson({ hubUrl, projectRoot: PLUGIN_ROOT });
38
+ // 사용자 작업 디렉토리의 .mcp.json sync 대상으로 한다.
39
+ // 이전에는 PLUGIN_ROOT(triflux 설치 경로)를 넘겨서 설치 경로의 .mcp.json
40
+ // 만 sync 되고 사용자 실제 프로젝트는 drift 되던 증상이 있었다.
41
+ await mod.syncProjectMcpJson({ hubUrl, projectRoot: process.cwd() });
38
42
  }
39
43
  } catch {
40
44
  // sync는 best-effort이며 hub-ensure 성공/실패를 좌우하지 않는다.
41
45
  }
42
46
  }
43
47
 
44
- function resolveHubTarget() {
48
+ export function resolveHubTarget() {
45
49
  const envPortRaw = Number(process.env.TFX_HUB_PORT || "");
46
50
  const envPort =
47
51
  Number.isFinite(envPortRaw) && envPortRaw > 0 ? envPortRaw : null;
48
52
  const target = {
49
53
  host: "127.0.0.1",
50
- port: envPort || 27888,
54
+ port: envPort ?? HUB_DEFAULT_PORT,
51
55
  };
52
56
 
57
+ // PID 파일의 port는 source of truth가 아니다. host 힌트만 재사용한다.
58
+ // 과거에는 `!envPort`일 때 PID file의 port로 target.port를 덮었으나,
59
+ // 이는 이전 세션의 오염된 port(비표준 포트)가 cascade로 영속화되는 버그 원인이었다.
60
+ // 포트는 오직 TFX_HUB_PORT env(없으면 HUB_DEFAULT_PORT=27888)만 source of truth다.
61
+ // client config 는 sync-hub-mcp-settings.mjs가 이 hubUrl로 재동기화한다.
53
62
  if (existsSync(HUB_PID_FILE)) {
54
63
  try {
55
64
  const info = JSON.parse(readFileSync(HUB_PID_FILE, "utf8"));
56
- if (!envPort) {
57
- const pidPort = Number(info?.port);
58
- if (Number.isFinite(pidPort) && pidPort > 0) target.port = pidPort;
59
- }
60
65
  if (typeof info?.host === "string") {
61
66
  const host = info.host.trim();
62
67
  if (LOOPBACK_HOSTS.has(host)) target.host = host;
@@ -334,7 +334,7 @@ function makeInitializeRequest() {
334
334
 
335
335
  function isValidInitResponse(line) {
336
336
  const trimmed = line.trim();
337
- if (!trimmed || !trimmed.startsWith("{")) return false;
337
+ if (!trimmed?.startsWith("{")) return false;
338
338
  try {
339
339
  const msg = JSON.parse(trimmed);
340
340
  if (msg.jsonrpc !== "2.0") return false;
@@ -576,8 +576,7 @@ export async function probeAll({
576
576
  const prior = cachedResults[name];
577
577
  if (
578
578
  cacheWithinTtl &&
579
- prior &&
580
- prior.fingerprint &&
579
+ prior?.fingerprint &&
581
580
  fingerprintsEqual(prior.fingerprint, fingerprints[name])
582
581
  ) {
583
582
  hits[name] = prior;
package/scripts/setup.mjs CHANGED
@@ -47,6 +47,7 @@ function detectDevMode(root = PLUGIN_ROOT) {
47
47
  const BREADCRUMB_PATH = join(CLAUDE_DIR, "scripts", ".tfx-pkg-root");
48
48
  const SETTINGS_PATH = join(CLAUDE_DIR, "settings.json");
49
49
  const HUD_PATH = join(CLAUDE_DIR, "hud", "hud-qos-status.mjs");
50
+ const WINDOWS_HUB_AUTOSTART_TASK = "TrifluxHubEnsure";
50
51
 
51
52
  const REQUIRED_CODEX_PROFILES = [
52
53
  // gpt-5.5 — 새 main 플래그십. xhigh/high/med/low 4 tier 전부 보장.
@@ -747,6 +748,81 @@ function getSetupArgv(stdinData) {
747
748
  return Array.isArray(stdinData?.argv) ? stdinData.argv : [];
748
749
  }
749
750
 
751
/**
 * Wrap a value in double quotes for use inside a Windows command line.
 *
 * Follows the backslash/quote rules of the Microsoft C runtime argument
 * parser: an embedded `"` is escaped as `\"`, any run of backslashes that
 * directly precedes a quote is doubled, and a trailing run of backslashes is
 * doubled so it cannot escape the closing quote. Backslashes elsewhere (the
 * normal case for paths) are left untouched.
 *
 * @param {unknown} value - Value to quote; coerced with String().
 * @returns {string} The quoted argument.
 */
function quoteWindowsTaskArg(value) {
  const escaped = String(value)
    // N backslashes + quote  →  2N backslashes + \"
    .replace(/(\\*)"/g, '$1$1\\"')
    // trailing backslashes would otherwise swallow the closing quote
    .replace(/(\\+)$/, "$1$1");
  return `"${escaped}"`;
}
754
+
755
/**
 * Build the command line that the autostart task runs: the Node executable
 * followed by the hub-ensure script, each individually quoted.
 *
 * @param {object} [options]
 * @param {string} [options.nodePath=process.execPath] - Node executable.
 * @param {string} [options.pluginRoot=PLUGIN_ROOT] - Package root containing
 *   `scripts/hub-ensure.mjs`.
 * @returns {string} `"<node>" "<pluginRoot>/scripts/hub-ensure.mjs"`.
 */
function buildWindowsHubAutostartCommand({
  nodePath = process.execPath,
  pluginRoot = PLUGIN_ROOT,
} = {}) {
  const hubEnsureScript = join(pluginRoot, "scripts", "hub-ensure.mjs");
  return `${quoteWindowsTaskArg(nodePath)} ${quoteWindowsTaskArg(hubEnsureScript)}`;
}
764
+
765
/**
 * Report whether the hub autostart scheduled task exists.
 *
 * On Windows this runs `schtasks.exe /Query /TN <taskName>`; any failure of
 * that call (including the task not existing) is reported as not registered.
 * On other platforms nothing is executed.
 *
 * @param {object} [options]
 * @param {string} [options.taskName=WINDOWS_HUB_AUTOSTART_TASK] - Task name.
 * @returns {{supported: boolean, registered: boolean, taskName: string}}
 */
function getWindowsHubAutostartStatus({
  taskName = WINDOWS_HUB_AUTOSTART_TASK,
} = {}) {
  const supported = process.platform === "win32";
  let registered = false;

  if (supported) {
    try {
      execFileSync("schtasks.exe", ["/Query", "/TN", taskName], {
        stdio: "ignore",
        windowsHide: true,
      });
      registered = true;
    } catch {
      // Query failed — treated as "not registered".
      registered = false;
    }
  }

  return { supported, registered, taskName };
}
781
+
782
/**
 * Register the logon-triggered scheduled task that runs hub-ensure.
 *
 * On non-Windows platforms this is a no-op that reports `reason:
 * "non-windows"`. On Windows it runs `schtasks.exe /Create ... /SC ONLOGON
 * /RL LIMITED`, adding `/F` when `force` is set. A failing schtasks call
 * throws (execFileSync), so the caller decides how to report it.
 *
 * @param {object} [options]
 * @param {string} [options.taskName=WINDOWS_HUB_AUTOSTART_TASK] - Task name.
 * @param {string} [options.nodePath=process.execPath] - Node executable.
 * @param {string} [options.pluginRoot=PLUGIN_ROOT] - Package root.
 * @param {boolean} [options.force=true] - Pass `/F` to schtasks.
 * @returns {object} `{supported, changed, registered, taskName}` plus
 *   `reason` (non-Windows) or `command` (Windows).
 */
function ensureWindowsHubAutostart({
  taskName = WINDOWS_HUB_AUTOSTART_TASK,
  nodePath = process.execPath,
  pluginRoot = PLUGIN_ROOT,
  force = true,
} = {}) {
  const onWindows = process.platform === "win32";
  if (!onWindows) {
    return {
      supported: false,
      changed: false,
      registered: false,
      taskName,
      reason: "non-windows",
    };
  }

  const command = buildWindowsHubAutostartCommand({ nodePath, pluginRoot });
  const schtasksArgs = [
    "/Create",
    "/TN",
    taskName,
    "/SC",
    "ONLOGON",
    "/TR",
    command,
    "/RL",
    "LIMITED",
    ...(force ? ["/F"] : []),
  ];

  execFileSync("schtasks.exe", schtasksArgs, {
    stdio: ["ignore", "pipe", "pipe"],
    windowsHide: true,
  });

  return {
    supported: true,
    changed: true,
    registered: true,
    taskName,
    command,
  };
}
825
+
750
826
  function loadSettings() {
751
827
  if (!existsSync(SETTINGS_PATH)) return {};
752
828
 
@@ -980,6 +1056,7 @@ function ensureCriticalSetup() {
980
1056
 
981
1057
  export {
982
1058
  BREADCRUMB_PATH,
1059
+ buildWindowsHubAutostartCommand,
983
1060
  CLAUDE_DIR,
984
1061
  cleanupStaleSkills,
985
1062
  DEPRECATED_SKILLS,
@@ -987,9 +1064,11 @@ export {
987
1064
  ensureCodexHubServerConfig,
988
1065
  ensureCodexProfiles,
989
1066
  ensureHooksInSettings,
1067
+ ensureWindowsHubAutostart,
990
1068
  extractManagedHookFilename,
991
1069
  getManagedRegistryHooks,
992
1070
  getVersion,
1071
+ getWindowsHubAutostartStatus,
993
1072
  hasProfileSection,
994
1073
  LEGACY_CODEX_MODELS,
995
1074
  PLUGIN_ROOT,
@@ -1002,6 +1081,7 @@ export {
1002
1081
  SYNC_MAP,
1003
1082
  scanHudFiles,
1004
1083
  syncAliasedSkillDir,
1084
+ WINDOWS_HUB_AUTOSTART_TASK,
1005
1085
  writeMarker,
1006
1086
  };
1007
1087
 
@@ -1032,6 +1112,9 @@ export async function runDeferred(stdinData) {
1032
1112
  const argv = getSetupArgv(stdinData);
1033
1113
  const isSync = argv.includes("--sync");
1034
1114
  const isForce = argv.includes("--force");
1115
+ const enableHubAutostart =
1116
+ argv.includes("--enable-hub-autostart") ||
1117
+ process.env.TFX_HUB_AUTOSTART === "1";
1035
1118
  const isDev = detectDevMode();
1036
1119
 
1037
1120
  if (isDev) {
@@ -1668,6 +1751,22 @@ export async function runDeferred(stdinData) {
1668
1751
  synced++;
1669
1752
  }
1670
1753
 
1754
+ // ── Windows Codex 단독 실행 보호: 로그인 시 hub-ensure 등록 ──
1755
+ // Claude SessionStart 훅이 없는 순수 Codex 시작 경로에서도 tfx-hub가 살아있게 한다.
1756
+ if (enableHubAutostart) {
1757
+ try {
1758
+ const result = ensureWindowsHubAutostart();
1759
+ if (result.registered) {
1760
+ io.log(` \x1b[32m✓\x1b[0m Windows hub autostart: ${result.taskName}`);
1761
+ synced++;
1762
+ }
1763
+ } catch (error) {
1764
+ io.log(
1765
+ ` \x1b[33m⚠\x1b[0m Windows hub autostart 등록 실패: ${error.message}`,
1766
+ );
1767
+ }
1768
+ }
1769
+
1671
1770
  // ── CLAUDE.md 라우팅 섹션 자동 동기화 ──
1672
1771
 
1673
1772
  try {
@@ -25,7 +25,7 @@ function getCodexConfigPath(codexConfigPath) {
25
25
  return join(home, ...CODEX_CONFIG_FILE);
26
26
  }
27
27
 
28
- function getProjectMcpJsonPaths(projectRoot) {
28
+ export function getProjectMcpJsonPaths(projectRoot) {
29
29
  const root =
30
30
  typeof projectRoot === "string" && projectRoot.length > 0
31
31
  ? projectRoot
@@ -132,8 +132,11 @@ function parseTomlScalar(rawValue) {
132
132
  }
133
133
 
134
134
  function findMcpServerSection(raw, sectionName) {
135
+ // TOML 동치 표현 지원: [mcp_servers.name] / [mcp_servers."name"] / [mcp_servers . name]
136
+ // 미검출 시 appendCodexMcpServerSection이 중복 테이블 생성 → TOMLDecodeError 회귀 방지.
137
+ const escaped = escapeRegExp(sectionName);
135
138
  const headerRegex = new RegExp(
136
- `^\\[mcp_servers\\.${escapeRegExp(sectionName)}\\]\\s*$`,
139
+ `^\\[\\s*mcp_servers\\s*\\.\\s*(?:${escaped}|"${escaped}"|'${escaped}')\\s*\\]\\s*$`,
137
140
  "m",
138
141
  );
139
142
  const headerMatch = headerRegex.exec(raw);
@@ -153,6 +156,13 @@ function findMcpServerSection(raw, sectionName) {
153
156
  };
154
157
  }
155
158
 
159
/**
 * Append a `[mcp_servers.<sectionName>]` table with a `url` key to TOML text.
 *
 * Non-empty input is first terminated with a newline if needed and then
 * separated from the new table by one blank line; empty input gets the table
 * with no leading whitespace.
 *
 * @param {string} raw - Existing TOML file contents (may be empty).
 * @param {string} sectionName - Server name, used verbatim in the header.
 * @param {string} hubUrl - URL value, serialized via formatTomlString().
 * @returns {string} The new file contents, ending in a newline.
 */
function appendCodexMcpServerSection(raw, sectionName, hubUrl) {
  let prefix = raw;
  if (prefix.length > 0) {
    if (!prefix.endsWith("\n")) prefix = `${prefix}\n`;
    if (!prefix.endsWith("\n\n")) prefix = `${prefix}\n`;
  }

  const sectionLines = [
    `[mcp_servers.${sectionName}]`,
    `url = ${formatTomlString(hubUrl)}`,
  ];
  return `${prefix}${sectionLines.join("\n")}\n`;
}
165
+
156
166
  async function syncSingleFile({ filePath, hubUrl, dryRun, logger }) {
157
167
  return withFileLock(filePath, async () => {
158
168
  if (!(await fileExists(filePath))) {
@@ -243,8 +253,29 @@ async function syncCodexConfigFile({ filePath, hubUrl, dryRun, logger }) {
243
253
 
244
254
  const section = findMcpServerSection(raw, TFX_HUB_SECTION);
245
255
  if (!section) {
246
- log(logger, "info", `[codex-mcp-sync] skipped: ${filePath}`);
247
- return { kind: "skipped", path: filePath };
256
+ const nextRaw = appendCodexMcpServerSection(raw, TFX_HUB_SECTION, hubUrl);
257
+ log(
258
+ logger,
259
+ "debug",
260
+ `[codex-mcp-sync] ${filePath} add ${TFX_HUB_SECTION}: ${hubUrl}`,
261
+ );
262
+
263
+ if (!dryRun) {
264
+ try {
265
+ await writeTextAtomic(filePath, nextRaw);
266
+ } catch (error) {
267
+ const reason = getReason(error, "write failed");
268
+ log(
269
+ logger,
270
+ "error",
271
+ `[codex-mcp-sync] error: ${filePath} (${reason})`,
272
+ );
273
+ return { kind: "error", path: filePath, reason };
274
+ }
275
+ }
276
+
277
+ log(logger, "info", `[codex-mcp-sync] updated: ${filePath}`);
278
+ return { kind: "updated", path: filePath };
248
279
  }
249
280
 
250
281
  const urlMatch = /^(\s*url\s*=\s*)(.+?)(\s*(?:#.*)?)$/m.exec(section.body);
@@ -262,7 +262,7 @@ if [[ "$MCP_PROFILE" == --* ]]; then
262
262
  fi
263
263
 
264
264
  # ── CLI 경로 해석 (Windows npm global 대응) ──
265
- NODE_BIN="${NODE_BIN:-$(command -v node 2>/dev/null || echo node)}"
265
+ NODE_BIN="${NODE_BIN:-$(command -v node 2>/dev/null || command -v node.exe 2>/dev/null || echo node)}"
266
266
  CODEX_BIN="${CODEX_BIN:-$(command -v codex 2>/dev/null || echo codex)}"
267
267
  GEMINI_BIN="${GEMINI_BIN:-$(command -v gemini 2>/dev/null || echo gemini)}"
268
268
  CLAUDE_BIN="${CLAUDE_BIN:-$(command -v claude 2>/dev/null || echo claude)}"
@@ -278,6 +278,56 @@ GEMINI_PROFILES_PATH="${GEMINI_PROFILES_PATH:-${HOME}/.gemini/triflux-profiles.j
278
278
  # ── 상수 ──
279
279
  MAX_STDOUT_BYTES=51200 # 50KB — Claude 컨텍스트 절약
280
280
  TIMESTAMP=$(date +%s)
281
+ TFX_PROBE_DIR="${TFX_PROBE_DIR:-${TFX_TMP}/tfx-probe}"
282
+ mkdir -p "$TFX_PROBE_DIR" 2>/dev/null || true
283
+
284
# Estimate how long a routed task is expected to run, in seconds.
#
# Starts from a per-agent baseline, then only ever raises the value: the
# profile sets a floor, and keyword groups found in the prompt
# (case-insensitively) set further floors.
#
# Arguments:
#   $1  agent type (e.g. explore, executor, architect)
#   $2  MCP profile name
#   $3  prompt text
# Output:
#   the estimate as a decimal integer on stdout
estimate_expected_duration_sec() {
  local agent="${1:-}" profile="${2:-}" prompt="${3:-}"
  # Lowercase with tr rather than ${prompt,,}: the ",," expansion needs
  # Bash >= 4.0 and is a "bad substitution" on Bash 3.2. Only ASCII letters
  # are folded, which is all the keyword patterns below rely on.
  local text
  text="$(printf '%s' "$prompt" | LC_ALL=C tr 'A-Z' 'a-z')"
  local expected=30

  # Baseline by agent type; unknown agents keep the 30s default.
  case "$agent" in
    explore|style-reviewer) expected=30 ;;
    writer|verifier|qa-tester) expected=90 ;;
    executor|debugger|test-engineer) expected=300 ;;
    code-reviewer|security-reviewer|architect|planner|critic|analyst) expected=600 ;;
    scientist|scientist-deep|deep-executor|document-specialist) expected=900 ;;
  esac

  # Floor by profile.
  case "$profile" in
    minimal|default) [[ "$expected" -lt 60 ]] && expected=60 ;;
    analyze|review|full) [[ "$expected" -lt 300 ]] && expected=300 ;;
    implement|executor) [[ "$expected" -lt 300 ]] && expected=300 ;;
  esac

  # Floors by prompt keywords.
  if [[ "$text" =~ (deep|research|analy[sz]e|분석|리서치|조사|전체|전부|싹다|comprehensive) ]]; then
    [[ "$expected" -lt 600 ]] && expected=600
  fi
  if [[ "$text" =~ (refactor|migration|migrate|리팩터|마이그레이션|대규모|rewrite) ]]; then
    [[ "$expected" -lt 900 ]] && expected=900
  fi
  if [[ "$text" =~ (test|lint|build|npm|pnpm|pytest|검증|테스트) ]]; then
    [[ "$expected" -lt 180 ]] && expected=180
  fi
  if [[ "$text" =~ (mcp|browser|playwright|context7|exa|tavily|brave) ]]; then
    [[ "$expected" -lt 120 ]] && expected=120
  fi

  printf '%s\n' "$expected"
}
318
+
319
# Print the "state" value from a process's probe state file.
#
# The file is $TFX_PROBE_STATE_FILE when set, otherwise
# $TFX_PROBE_DIR/<pid>.json.
#
# Arguments:
#   $1  pid whose state file should be read
# Output:
#   the first "state" string found in the file (nothing if there is none)
# Returns:
#   1 when the file is missing or smaller than 20 bytes
read_probe_state() {
  local pid="$1"
  local probe_file="${TFX_PROBE_STATE_FILE:-${TFX_PROBE_DIR}/${pid}.json}"
  if [[ ! -f "$probe_file" ]]; then
    return 1
  fi

  # Size guard: even with atomic writes on the producer side, some
  # environments may still expose an empty or truncated file; anything too
  # small to be a real payload is ignored instead of being parsed.
  local byte_count
  byte_count=$(wc -c < "$probe_file" 2>/dev/null || printf '0')
  if ! [[ "$byte_count" -ge 20 ]]; then
    return 1
  fi

  sed -n 's/.*"state"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/p' "$probe_file" 2>/dev/null | head -1
}
281
331
  RUN_ID="${TIMESTAMP}-$$-${RANDOM}"
282
332
  STDERR_LOG="${TFX_TMP}/tfx-route-${AGENT_TYPE}-${RUN_ID}-stderr.log"
283
333
  STDOUT_LOG="${TFX_TMP}/tfx-route-${AGENT_TYPE}-${RUN_ID}-stdout.log"
@@ -833,7 +883,7 @@ route_agent() {
833
883
 
834
884
  # ── CLI_TYPE: 단일 소스 (agent-map.json) ──
835
885
  local _raw_type
836
- _raw_type=$(node -e "
886
+ _raw_type=$("$NODE_BIN" -e "
837
887
  const p=require('path').resolve(process.argv[1]);
838
888
  const m=JSON.parse(require('fs').readFileSync(p,'utf8'));
839
889
  const t=m[process.argv[2]];
@@ -842,7 +892,7 @@ route_agent() {
842
892
 
843
893
  if [[ -z "$_raw_type" ]]; then
844
894
  echo "ERROR: 알 수 없는 에이전트 타입: $agent" >&2
845
- echo "사용 가능: $(node -e "console.log(Object.keys(JSON.parse(require('fs').readFileSync(require('path').resolve(process.argv[1]),'utf8'))).join(', '))" "$map_file" 2>/dev/null)" >&2
895
+ echo "사용 가능: $("$NODE_BIN" -e "console.log(Object.keys(JSON.parse(require('fs').readFileSync(require('path').resolve(process.argv[1]),'utf8'))).join(', '))" "$map_file" 2>/dev/null)" >&2
846
896
  exit 1
847
897
  fi
848
898
 
@@ -1299,7 +1349,9 @@ heartbeat_monitor() {
1299
1349
  [[ "${TFX_HEARTBEAT:-1}" -eq 0 ]] && return 0
1300
1350
  local pid="$1"
1301
1351
  local interval="${2:-${TFX_HEARTBEAT_INTERVAL:-10}}"
1302
- local stall_threshold="${3:-${TFX_STALL_THRESHOLD:-60}}"
1352
+ # 땜빵(PLANNING P4 구현 전): 60 → 300. MCP init/재시도 여유 + false STALL 감소.
1353
+ local stall_threshold="${3:-${TFX_STALL_THRESHOLD:-300}}"
1354
+ local expected_duration="${TFX_EXPECTED_DURATION_SEC:-}"
1303
1355
  local last_size=0 stall_count=0
1304
1356
  local pid_gone=false
1305
1357
  local post_exit_checks=0
@@ -1330,18 +1382,27 @@ heartbeat_monitor() {
1330
1382
  [[ -f "$STDERR_LOG" ]] && stderr_size=$(wc -c < "$STDERR_LOG" 2>/dev/null || echo 0)
1331
1383
  current_size=$((current_size + stderr_size))
1332
1384
  local elapsed=$(($(date +%s) - TIMESTAMP))
1385
+ local expected_suffix=""
1386
+ if [[ -n "$expected_duration" && "$expected_duration" =~ ^[0-9]+$ && "$expected_duration" -gt 0 ]]; then
1387
+ expected_suffix=" expected=${expected_duration}s"
1388
+ if [[ "$elapsed" -gt $((expected_duration * 2)) ]]; then
1389
+ expected_suffix="${expected_suffix} anomaly=slow"
1390
+ fi
1391
+ fi
1333
1392
 
1334
1393
  if [[ "$current_size" -gt "$last_size" ]]; then
1335
1394
  stall_count=0
1336
1395
  if [[ "$pid_gone" == "true" ]]; then
1337
1396
  local _fi="forked"; [[ -n "$last_known_forks" ]] && _fi="forks:${last_known_forks// /,}"
1338
- echo "[tfx-heartbeat] pid=$pid elapsed=${elapsed}s output=${current_size}B status=active(${_fi})" >&2
1397
+ echo "[tfx-heartbeat] pid=$pid elapsed=${elapsed}s output=${current_size}B${expected_suffix} status=active(${_fi})" >&2
1339
1398
  post_exit_checks=0 # reset — still producing output
1340
1399
  else
1341
- echo "[tfx-heartbeat] pid=$pid elapsed=${elapsed}s output=${current_size}B status=active" >&2
1400
+ echo "[tfx-heartbeat] pid=$pid elapsed=${elapsed}s output=${current_size}B${expected_suffix} status=active" >&2
1342
1401
  fi
1343
1402
  else
1344
1403
  stall_count=$((stall_count + interval))
1404
+ local probe_state=""
1405
+ probe_state="$(read_probe_state "$pid" 2>/dev/null || true)"
1345
1406
  if [[ "$pid_gone" == "true" ]]; then
1346
1407
  if [[ -n "$last_known_forks" ]]; then
1347
1408
  # Direct fork tracking — terminate when all forks are dead
@@ -1350,26 +1411,43 @@ heartbeat_monitor() {
1350
1411
  kill -0 "$_fp" 2>/dev/null && _alive=true && break
1351
1412
  done
1352
1413
  if [[ "$_alive" == "false" ]]; then
1353
- echo "[tfx-heartbeat] pid=$pid elapsed=${elapsed}s output=${current_size}B status=terminated(forks-exited)" >&2
1414
+ echo "[tfx-heartbeat] pid=$pid elapsed=${elapsed}s output=${current_size}B${expected_suffix} status=terminated(forks-exited)" >&2
1354
1415
  break
1355
1416
  fi
1356
- echo "[tfx-heartbeat] pid=$pid elapsed=${elapsed}s output=${current_size}B status=fork-idle(${last_known_forks// /,})" >&2
1417
+ echo "[tfx-heartbeat] pid=$pid elapsed=${elapsed}s output=${current_size}B${expected_suffix} status=fork-idle(${last_known_forks// /,})" >&2
1357
1418
  else
1358
1419
  # Fallback: output-based drain (no fork PIDs found)
1359
1420
  post_exit_checks=$((post_exit_checks + 1))
1360
1421
  if [[ "$post_exit_checks" -ge "$max_post_exit_checks" ]]; then
1361
- echo "[tfx-heartbeat] pid=$pid elapsed=${elapsed}s output=${current_size}B status=terminated(drain-done)" >&2
1422
+ echo "[tfx-heartbeat] pid=$pid elapsed=${elapsed}s output=${current_size}B${expected_suffix} status=terminated(drain-done)" >&2
1362
1423
  break
1363
1424
  fi
1364
- echo "[tfx-heartbeat] pid=$pid elapsed=${elapsed}s output=${current_size}B status=draining(${post_exit_checks}/${max_post_exit_checks})" >&2
1425
+ echo "[tfx-heartbeat] pid=$pid elapsed=${elapsed}s output=${current_size}B${expected_suffix} status=draining(${post_exit_checks}/${max_post_exit_checks})" >&2
1365
1426
  fi
1427
+ elif [[ "$probe_state" =~ ^(mcp_initializing|input_wait)$ ]]; then
1428
+ stall_count=0
1429
+ echo "[tfx-heartbeat] pid=$pid elapsed=${elapsed}s output=${current_size}B${expected_suffix} status=${probe_state}(probe-grace)" >&2
1366
1430
  elif [[ "$stall_count" -ge "$stall_threshold" ]]; then
1367
- # STALL kill (#144/#66 regression guard): stall=threshold+grace 이상 지속 시 SIGTERM→SIGKILL.
1368
- # 기본 활성화. TFX_STALL_KILL=0 으로 opt-out. grace=30s (기본) SSE/MCP 정상 handshake 여유.
1369
- local kill_on_stall="${TFX_STALL_KILL:-1}"
1431
+ # STALL 판정 modes (#165 PLANNING P4):
1432
+ # off (alias: 0, disabled) — silent. kill 안 함, STALL_CLASSIFY 로그도 없음
1433
+ # classify (default) — kill 안 함. STALL_CLASSIFY 로그로 evidence 노출
1434
+ # kill (alias: 1, on) — threshold+grace 초과 시 SIGTERM→SIGKILL
1435
+ # PR #160 에서 default 1 → 0 으로 임시 후퇴 (false kill 방지). 본 PR(#165) 에서
1436
+ # classify 로 승격 — evidence 는 남기되 false kill 리스크 없음.
1437
+ local kill_on_stall="${TFX_STALL_KILL:-classify}"
1438
+ [[ -z "$kill_on_stall" ]] && kill_on_stall="classify"
1370
1439
  local kill_grace="${TFX_STALL_KILL_GRACE:-30}"
1371
- if [[ "$kill_on_stall" -eq 1 && "$stall_count" -ge $((stall_threshold + kill_grace)) ]]; then
1372
- echo "[tfx-heartbeat] pid=$pid elapsed=${elapsed}s output=${current_size}B status=STALL_KILL stall=${stall_count}s — SIGTERM" >&2
1440
+ local _should_kill=0
1441
+ case "$kill_on_stall" in
1442
+ 1|on|kill) _should_kill=1 ;;
1443
+ classify|0|off|disabled) ;;
1444
+ *)
1445
+ echo "[tfx-heartbeat] pid=$pid warning TFX_STALL_KILL=$kill_on_stall unknown, fallback classify" >&2
1446
+ kill_on_stall="classify"
1447
+ ;;
1448
+ esac
1449
+ if [[ "$_should_kill" -eq 1 && "$stall_count" -ge $((stall_threshold + kill_grace)) ]]; then
1450
+ echo "[tfx-heartbeat] pid=$pid elapsed=${elapsed}s output=${current_size}B${expected_suffix} status=STALL_KILL stall=${stall_count}s — SIGTERM" >&2
1373
1451
  # Snapshot child PIDs before SIGTERM — wrapper 가 SIGTERM 을 수용해 죽으면
1374
1452
  # 부모 소멸 후 taskkill /T 가 자식 트리를 탐색하지 못해 codex 자식이 orphan 으로 남는다.
1375
1453
  # 사용자 보고(2026-04-22): "tfx-route 래퍼 exit 이후에도 Codex 자식이 살아있음".
@@ -1417,9 +1495,12 @@ heartbeat_monitor() {
1417
1495
  fi
1418
1496
  break
1419
1497
  fi
1420
- echo "[tfx-heartbeat] pid=$pid elapsed=${elapsed}s output=${current_size}B status=STALL stall=${stall_count}s" >&2
1498
+ if [[ "$kill_on_stall" == "classify" && "$stall_count" -ge $((stall_threshold + kill_grace)) ]]; then
1499
+ echo "[tfx-heartbeat] pid=$pid elapsed=${elapsed}s output=${current_size}B${expected_suffix} status=STALL_CLASSIFY stall=${stall_count}s (no-kill — TFX_STALL_KILL=classify)" >&2
1500
+ fi
1501
+ echo "[tfx-heartbeat] pid=$pid elapsed=${elapsed}s output=${current_size}B${expected_suffix} status=STALL stall=${stall_count}s" >&2
1421
1502
  else
1422
- echo "[tfx-heartbeat] pid=$pid elapsed=${elapsed}s output=${current_size}B status=quiet stall=${stall_count}s" >&2
1503
+ echo "[tfx-heartbeat] pid=$pid elapsed=${elapsed}s output=${current_size}B${expected_suffix} status=quiet stall=${stall_count}s" >&2
1423
1504
  fi
1424
1505
  fi
1425
1506
  last_size=$current_size
@@ -1581,26 +1662,32 @@ _mcp_preflight_filter_dead() {
1581
1662
  CODEX_CONFIG_FLAGS=("${new_flags[@]}")
1582
1663
  echo "[tfx-route] MCP preflight: ${#dead_names[@]}개 dead MCP 제외 (${dead_list})" >&2
1583
1664
 
1584
- # #148: profile-allowed 전부 dead 인 all-dead 엣지케이스 조기 실패.
1585
- # allowed_pat _codex_config_swap fail-safe (#132) 의해 원본 config
1586
- # 전체를 유지 비필요 MCP 까지 전부 spawn 역효과.
1587
- # TFX_MCP_ALLOW_ALL_DEAD=1 명시적 opt-in MCP 없이 진행 (degraded).
1665
+ # #170 graceful degradation (회귀 fix):
1666
+ # all-dead 시 기본 동작은 exec mode 자동 fallback. TFX_MCP_FAIL_ON_ALL_DEAD=1
1667
+ # 명시 opt-in 시만 #148 기존 동작 (early fail). TFX_MCP_ALLOW_ALL_DEAD=1 은 호환성
1668
+ # 유지 (alias for graceful default). transport auto 인 채로 run_codex_mcp 를
1669
+ # 호출하면 dead MCP 와 connect 시도 → stall → 본 fix 의 _TFX_MCP_DEGRADED=1 marker
1670
+ # 가 호출자 에서 transport=exec 강제 + MCP_HINT 자동 주입 skip 을 유발한다.
1588
1671
  local remaining_alive=0
1589
1672
  local rflag
1590
1673
  for rflag in "${CODEX_CONFIG_FLAGS[@]}"; do
1591
- if [[ "$rflag" =~ ^mcp_servers\.[^.]+\.enabled=true$ ]]; then
1674
+ # #153 + #170 P1: candidate 추출 정규식 (line 1607) 과 일관 — dotted server 이름
1675
+ # (e.g. mcp_servers.foo.bar.enabled=true) 도 alive 로 카운트한다. `[^.]+` 는 첫 dot
1676
+ # 에서 끊겨 dotted alive 만 남은 경우 false all-dead 판정 → 불필요 degraded.
1677
+ if [[ "$rflag" =~ ^mcp_servers\..+\.enabled=true$ ]]; then
1592
1678
  remaining_alive=$((remaining_alive + 1))
1593
1679
  fi
1594
1680
  done
1595
1681
 
1596
1682
  if [[ "$remaining_alive" -eq 0 ]]; then
1597
- if [[ "${TFX_MCP_ALLOW_ALL_DEAD:-0}" == "1" ]]; then
1598
- echo "[tfx-route] TFX_MCP_ALLOW_ALL_DEAD=1 MCP 없이 계속 진행 (degraded)" >&2
1599
- return 0
1683
+ if [[ "${TFX_MCP_FAIL_ON_ALL_DEAD:-0}" == "1" ]]; then
1684
+ echo "[tfx-route] 조기 실패: TFX_MCP_FAIL_ON_ALL_DEAD=1 + MCP 전부 dead Codex 호출 중단" >&2
1685
+ echo " 복구: (1) dead MCP 복구 (2) TFX_MCP_HEALTH_CHECK=0 preflight 비활성 (3) TFX_MCP_FAIL_ON_ALL_DEAD=0 graceful degradation" >&2
1686
+ return 78
1600
1687
  fi
1601
- echo "[tfx-route] 조기 실패: profile 에서 허용한 MCP 전부 dead — Codex 호출 중단" >&2
1602
- echo " 복구: (1) dead MCP 복구 (2) TFX_MCP_HEALTH_CHECK=0 preflight 비활성 (3) TFX_MCP_ALLOW_ALL_DEAD=1 MCP 없이 진행" >&2
1603
- return 78
1688
+ export _TFX_MCP_DEGRADED=1
1689
+ echo "[tfx-route] graceful degradation: MCP 전부 dead exec mode 자동 전환 (set TFX_MCP_FAIL_ON_ALL_DEAD=1 to revert to early-fail)" >&2
1690
+ return 0
1604
1691
  fi
1605
1692
  }
1606
1693
 
@@ -1922,6 +2009,9 @@ main() {
1922
2009
  TIMEOUT_SEC="$DEFAULT_TIMEOUT"
1923
2010
  fi
1924
2011
 
2012
+ TFX_EXPECTED_DURATION_SEC="${TFX_EXPECTED_DURATION_SEC:-$(estimate_expected_duration_sec "$AGENT_TYPE" "$MCP_PROFILE" "$PROMPT")}"
2013
+ export TFX_EXPECTED_DURATION_SEC
2014
+
1925
2015
  # 컨텍스트 파일 → 프롬프트에 주입
1926
2016
  if [[ -n "$CONTEXT_FILE" && -f "$CONTEXT_FILE" ]]; then
1927
2017
  local ctx_content
@@ -2046,6 +2136,19 @@ FALLBACK_EOF
2046
2136
  # swap 후 config override 플래그 클리어 — 제거된 서버에 override 보내면 "invalid transport" 에러
2047
2137
  CODEX_CONFIG_FLAGS=()
2048
2138
  CODEX_CONFIG_JSON="{}"
2139
+ # #170 graceful degradation: MCP 전부 dead 면 transport 무관 exec 강제.
2140
+ # _mcp_preflight_filter_dead 가 _TFX_MCP_DEGRADED=1 를 export 했으면 이미 stall 보장 안 됨.
2141
+ # 사용자가 TFX_CODEX_TRANSPORT=mcp 명시했더라도 dead MCP 와 connect 시도 = stall →
2142
+ # warning + exec 강제 (transport 명시는 사용자 의도지만 stall 회피가 우선).
2143
+ # MCP_HINT (e.g. "context7으로 조회하세요") 도 prompt 에서 제거 — degraded 환경에서
2144
+ # 모델이 사용 불가 도구를 시도하면 stall/실패 trigger.
2145
+ if [[ "${_TFX_MCP_DEGRADED:-0}" == "1" ]]; then
2146
+ if [[ "$TFX_CODEX_TRANSPORT" == "mcp" ]]; then
2147
+ echo "[tfx-route] WARNING: TFX_CODEX_TRANSPORT=mcp + all-MCP-dead → exec 강제 (stall 회피)" >&2
2148
+ fi
2149
+ TFX_CODEX_TRANSPORT="exec"
2150
+ FULL_PROMPT="$PROMPT"
2151
+ fi
2049
2152
  codex_transport_effective="exec"
2050
2153
  if [[ "$TFX_CODEX_TRANSPORT" != "exec" ]]; then
2051
2154
  run_codex_mcp "$FULL_PROMPT" "$use_tee" || exit_code=$?
@@ -2055,6 +2158,11 @@ FALLBACK_EOF
2055
2158
  # MCP 실패 → exec fallback. run_codex_exec는 < /dev/null 로 stdin 블록 회피 (line 1639).
2056
2159
  # 정책: codex/gemini 강건성 — MCP 가용 시 MCP, 실패 시 그래도 워커 자체는 굴러간다.
2057
2160
  echo "[tfx-route] Codex MCP 실패(exit=${exit_code}). exec fallback 시도." >&2
2161
+ local _sd
2162
+ _sd="$(_get_script_dir)"
2163
+ if [[ -f "$_sd/hub-ensure.mjs" ]]; then
2164
+ "$NODE_BIN" "$_sd/hub-ensure.mjs" >/dev/null 2>&1 || true
2165
+ fi
2058
2166
  exit_code=0
2059
2167
  run_codex_exec "$FULL_PROMPT" "$use_tee" || exit_code=$?
2060
2168
  codex_transport_effective="exec-fallback"