@pushpalsdev/cli 1.1.26 → 1.1.28

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@pushpalsdev/cli",
3
- "version": "1.1.26",
3
+ "version": "1.1.28",
4
4
  "description": "PushPals terminal CLI for LocalBuddy -> RemoteBuddy orchestration",
5
5
  "license": "MIT",
6
6
  "repository": {
@@ -8408,6 +8408,7 @@ ${detail}`.toLowerCase();
8408
8408
  "codex cli is mandatory in this backend"
8409
8409
  ].some((needle) => text.includes(needle));
8410
8410
  }
8411
+ var CODEX_STARTUP_STALL_WORKER_EXIT_CODE = 87;
8411
8412
  function asAutonomyComponentArea2(value) {
8412
8413
  return normalizeAutonomyComponentArea(value) ?? undefined;
8413
8414
  }
@@ -8950,6 +8951,7 @@ class RemoteBuddyOrchestrator {
8950
8951
  workerpalsEnvFile;
8951
8952
  workerpalsEntrypoint;
8952
8953
  workerpalsUnavailableReason;
8954
+ workerDockerFallbackActivated = false;
8953
8955
  statusHeartbeatMs;
8954
8956
  fetchFailureLogsOnJobFailure;
8955
8957
  executionBudgetInteractiveMs;
@@ -9956,6 +9958,25 @@ Please reply with the missing details and I will enqueue a follow-up request.` :
9956
9958
  entrypoint: this.workerpalsEntrypoint
9957
9959
  });
9958
9960
  }
9961
+ maybeFallbackFromDockerAfterWorkerExit(workerId, code) {
9962
+ if (code !== CODEX_STARTUP_STALL_WORKER_EXIT_CODE)
9963
+ return false;
9964
+ if (!this.spawnWorkerDocker)
9965
+ return false;
9966
+ if (this.workerDockerFallbackActivated)
9967
+ return false;
9968
+ if (parseEnabledFlag(process.env.REMOTEBUDDY_DISABLE_WORKERPAL_DIRECT_FALLBACK, false)) {
9969
+ console.warn(`[RemoteBuddy] WorkerPal ${workerId} exited after a Docker Codex startup stall, but direct WorkerPal fallback is disabled.`);
9970
+ return false;
9971
+ }
9972
+ this.workerDockerFallbackActivated = true;
9973
+ this.spawnWorkerDocker = false;
9974
+ this.spawnWorkerRequireDocker = false;
9975
+ this.workerSpawnCooldownUntil = 0;
9976
+ this.workerpalsUnavailableReason = "Docker-backed WorkerPal Codex startup stalled; falling back to direct isolated-worktree WorkerPal.";
9977
+ console.warn(`[RemoteBuddy] WorkerPal ${workerId} exited after a Docker Codex startup stall; falling back to direct isolated-worktree WorkerPal for future spawns.`);
9978
+ return true;
9979
+ }
9959
9980
  async spawnWorker() {
9960
9981
  if (this.workerSpawnInFlight) {
9961
9982
  return await this.workerSpawnInFlight;
@@ -9983,6 +10004,9 @@ Please reply with the missing details and I will enqueue a follow-up request.` :
9983
10004
  this.managedWorkers.set(workerId, child);
9984
10005
  child.exited.then((code) => {
9985
10006
  this.managedWorkers.delete(workerId);
10007
+ if (this.maybeFallbackFromDockerAfterWorkerExit(workerId, code)) {
10008
+ this.ensureAutoscaledWorkerCapacity("docker codex startup fallback");
10009
+ }
9986
10010
  console.warn(`[RemoteBuddy] WorkerPal process ${workerId} exited with code ${code}`);
9987
10011
  });
9988
10012
  const ready = await this.waitForOnlineWorker(this.workerStartupTimeoutMs, workerId);
@@ -106,6 +106,7 @@ _MAX_WRAPPER_BOOTSTRAP_OUTPUT_CHARS = 1_200
106
106
  _MAX_WRAPPER_BOOTSTRAP_TOTAL_CHARS = 5_000
107
107
  _MAX_CREDIBLE_WRAPPER_LOOP_CHANGED_PATHS = 8
108
108
  _MAX_CREDIBLE_WRAPPER_LOOP_TOP_LEVELS = 4
109
+ _MAX_STARTUP_STALL_RECOVERY_ATTEMPTS = 1
109
110
  _MAX_NO_EDIT_RECOVERY_ATTEMPTS = 1
110
111
  _MAX_ROLLOUT_RECOVERY_ATTEMPTS = 1
111
112
  _DEFAULT_NO_EDIT_WATCHDOG_S = 480
@@ -121,6 +122,7 @@ _NARROW_TEST_TASK_ROLLOUT_WATCHDOG_S = 150
121
122
  _WEB_REVIEW_ROLLOUT_WATCHDOG_S = 180
122
123
  _BACKGROUND_ROLLOUT_WATCHDOG_S = 90
123
124
  _NO_PUBLISHABLE_FAILURE_COOLDOWN_MS = 10 * 60 * 1000
125
+ _CODEX_STARTUP_ONLY_EVENT_TYPES = {"thread.started", "turn.started"}
124
126
 
125
127
 
126
128
  def _model_supports_xhigh_reasoning(model: str) -> bool:
@@ -862,6 +864,19 @@ def _build_no_edit_recovery_guidance(trace_excerpt: str, artifact_only_paths: st
862
864
  return "\n".join(lines)
863
865
 
864
866
 
867
+ def _build_startup_stall_recovery_guidance(trace_excerpt: str) -> str:
868
+ lines = [
869
+ "Codex startup-stall recovery: the previous Codex subprocess started but emitted no assistant, tool, or reasoning progress before the watchdog.",
870
+ "Treat this as a fresh execution with a patch-first contract. After at most one narrow read of the hinted owner, make the smallest publishable edit.",
871
+ "Do not spend this recovery attempt re-reading broad repository topology or validating before an edit exists.",
872
+ "If the hinted path is absent, choose the nearest existing repo-native owner or test rather than creating unrelated scaffolding.",
873
+ ]
874
+ if trace_excerpt:
875
+ lines.append("Previous Codex event trace excerpt:")
876
+ lines.append(trace_excerpt)
877
+ return "\n".join(lines)
878
+
879
+
865
880
  def _trace_summaries_text(trace: Dict[str, Any]) -> str:
866
881
  summaries = trace.get("summaries")
867
882
  if not isinstance(summaries, list):
@@ -869,6 +884,36 @@ def _trace_summaries_text(trace: Dict[str, Any]) -> str:
869
884
  return "\n".join(str(item or "") for item in summaries[-80:]).lower()
870
885
 
871
886
 
887
+ def _codex_trace_has_work_progress(trace: Dict[str, Any]) -> bool:
888
+ if to_int(trace.get("reasoning_events"), 0) > 0:
889
+ return True
890
+
891
+ event_counts = trace.get("event_type_counts")
892
+ if isinstance(event_counts, dict):
893
+ for key, value in event_counts.items():
894
+ event_type = str(key or "").strip()
895
+ if to_int(value, 0) > 0 and event_type not in _CODEX_STARTUP_ONLY_EVENT_TYPES:
896
+ return True
897
+
898
+ summaries = trace.get("summaries")
899
+ if isinstance(summaries, list):
900
+ for item in summaries:
901
+ summary = str(item or "").strip()
902
+ if not summary:
903
+ continue
904
+ event_type = summary.split("|", 1)[0].strip()
905
+ if event_type not in _CODEX_STARTUP_ONLY_EVENT_TYPES:
906
+ return True
907
+
908
+ return False
909
+
910
+
911
+ def _codex_trace_is_startup_stall(trace: Dict[str, Any]) -> bool:
912
+ if to_int(trace.get("total_tokens"), 0) > 0:
913
+ return False
914
+ return not _codex_trace_has_work_progress(trace)
915
+
916
+
872
917
  def _detect_offtrack_rollout(trace: Dict[str, Any], artifact_only_paths: str = "") -> str:
873
918
  text = _trace_summaries_text(trace)
874
919
  if artifact_only_paths:
@@ -1981,6 +2026,7 @@ def _run_codex_task(
1981
2026
  *,
1982
2027
  wrapper_recovery_attempt: int = 0,
1983
2028
  model_compatibility_recovery_attempt: int = 0,
2029
+ startup_stall_recovery_attempt: int = 0,
1984
2030
  no_edit_recovery_attempt: int = 0,
1985
2031
  rollout_recovery_attempt: int = 0,
1986
2032
  model_override: Optional[str] = None,
@@ -2475,6 +2521,7 @@ def _run_codex_task(
2475
2521
  retry_guidance,
2476
2522
  wrapper_recovery_attempt=wrapper_recovery_attempt,
2477
2523
  model_compatibility_recovery_attempt=model_compatibility_recovery_attempt,
2524
+ startup_stall_recovery_attempt=startup_stall_recovery_attempt,
2478
2525
  no_edit_recovery_attempt=no_edit_recovery_attempt,
2479
2526
  rollout_recovery_attempt=rollout_recovery_attempt + 1,
2480
2527
  model_override=model_override,
@@ -2497,6 +2544,54 @@ def _run_codex_task(
2497
2544
  }
2498
2545
 
2499
2546
  if no_edit_watchdog_fired:
2547
+ startup_stall = _codex_trace_is_startup_stall(stdout_trace)
2548
+ if startup_stall and startup_stall_recovery_attempt < _MAX_STARTUP_STALL_RECOVERY_ATTEMPTS:
2549
+ retry_guidance = [
2550
+ *supplemental_guidance,
2551
+ _build_startup_stall_recovery_guidance(trace_excerpt),
2552
+ ]
2553
+ log.warning(
2554
+ "Codex emitted only startup events before the no-edit watchdog; "
2555
+ "restarting Codex once before classifying the job terminally."
2556
+ )
2557
+ retry_result = _run_codex_task(
2558
+ repo,
2559
+ instruction,
2560
+ retry_guidance,
2561
+ wrapper_recovery_attempt=wrapper_recovery_attempt,
2562
+ model_compatibility_recovery_attempt=model_compatibility_recovery_attempt,
2563
+ startup_stall_recovery_attempt=startup_stall_recovery_attempt + 1,
2564
+ no_edit_recovery_attempt=no_edit_recovery_attempt,
2565
+ rollout_recovery_attempt=rollout_recovery_attempt,
2566
+ model_override=model_override,
2567
+ baseline_changes=baseline_snapshot,
2568
+ )
2569
+ retry_result["usage"] = _merge_usage_records(usage, retry_result.get("usage"))
2570
+ if retry_result.get("ok"):
2571
+ recovered_stdout = str(retry_result.get("stdout") or "").strip()
2572
+ retry_result["stdout"] = _truncate(
2573
+ (
2574
+ "Recovered after the first Codex subprocess stalled before emitting "
2575
+ f"assistant/tool progress.\n\n{recovered_stdout}"
2576
+ ).strip()
2577
+ )
2578
+ return retry_result
2579
+ if startup_stall:
2580
+ detail = (
2581
+ "Codex subprocess started but did not emit assistant, tool, reasoning, "
2582
+ "or usage progress before the startup watchdog."
2583
+ )
2584
+ if trace_excerpt:
2585
+ detail = f"{detail}\n{trace_excerpt}"
2586
+ return {
2587
+ "ok": False,
2588
+ "summary": "openai_codex stalled before first response",
2589
+ "stdout": _truncate(stdout),
2590
+ "stderr": _truncate(f"{detail}\n{stderr}".strip()),
2591
+ "exitCode": 124,
2592
+ "usage": usage,
2593
+ "cooldownMs": _NO_PUBLISHABLE_FAILURE_COOLDOWN_MS,
2594
+ }
2500
2595
  if no_edit_recovery_attempt < _MAX_NO_EDIT_RECOVERY_ATTEMPTS:
2501
2596
  retry_guidance = [
2502
2597
  *supplemental_guidance,
@@ -2511,6 +2606,7 @@ def _run_codex_task(
2511
2606
  retry_guidance,
2512
2607
  wrapper_recovery_attempt=wrapper_recovery_attempt,
2513
2608
  model_compatibility_recovery_attempt=model_compatibility_recovery_attempt,
2609
+ startup_stall_recovery_attempt=startup_stall_recovery_attempt,
2514
2610
  no_edit_recovery_attempt=no_edit_recovery_attempt + 1,
2515
2611
  rollout_recovery_attempt=rollout_recovery_attempt,
2516
2612
  model_override=model_override,
@@ -2706,6 +2802,7 @@ def _run_codex_task(
2706
2802
  ],
2707
2803
  wrapper_recovery_attempt=wrapper_recovery_attempt + 1,
2708
2804
  model_compatibility_recovery_attempt=model_compatibility_recovery_attempt,
2805
+ startup_stall_recovery_attempt=startup_stall_recovery_attempt,
2709
2806
  no_edit_recovery_attempt=no_edit_recovery_attempt,
2710
2807
  rollout_recovery_attempt=rollout_recovery_attempt,
2711
2808
  model_override=model_override,
@@ -2820,6 +2917,7 @@ def _run_codex_task(
2820
2917
  effective_supplemental_guidance,
2821
2918
  wrapper_recovery_attempt=wrapper_recovery_attempt,
2822
2919
  model_compatibility_recovery_attempt=model_compatibility_recovery_attempt + 1,
2920
+ startup_stall_recovery_attempt=startup_stall_recovery_attempt,
2823
2921
  no_edit_recovery_attempt=no_edit_recovery_attempt,
2824
2922
  rollout_recovery_attempt=rollout_recovery_attempt,
2825
2923
  model_override=LEGACY_CODEX_MODEL_FALLBACK,
@@ -1029,6 +1029,159 @@ class OpenAICodexRuntimeConfigTests(unittest.TestCase):
1029
1029
  self.assertNotIn("broad/noisy", str(result.get("summary") or ""))
1030
1030
  self.assertNotIn("too broad/noisy", str(result.get("stderr") or ""))
1031
1031
 
1032
+ def test_run_codex_task_retries_once_when_codex_stalls_before_first_response(self) -> None:
1033
+ with tempfile.TemporaryDirectory(prefix="pushpals-codex-startup-stall-") as temp_dir:
1034
+ repo = Path(temp_dir) / "repo"
1035
+ repo.mkdir(parents=True, exist_ok=True)
1036
+ (repo / "README.md").write_text("# startup stall repo\n", encoding="utf-8")
1037
+ subprocess.run(["git", "init"], cwd=repo, check=True, capture_output=True, text=True)
1038
+ subprocess.run(
1039
+ ["git", "config", "user.name", "PushPals Test"],
1040
+ cwd=repo,
1041
+ check=True,
1042
+ capture_output=True,
1043
+ text=True,
1044
+ )
1045
+ subprocess.run(
1046
+ ["git", "config", "user.email", "pushpals-tests@example.com"],
1047
+ cwd=repo,
1048
+ check=True,
1049
+ capture_output=True,
1050
+ text=True,
1051
+ )
1052
+ subprocess.run(["git", "add", "README.md"], cwd=repo, check=True, capture_output=True, text=True)
1053
+ subprocess.run(
1054
+ ["git", "commit", "-m", "chore: seed startup stall repo"],
1055
+ cwd=repo,
1056
+ check=True,
1057
+ capture_output=True,
1058
+ text=True,
1059
+ )
1060
+
1061
+ stub_path = Path(temp_dir) / "fake_codex_startup_stall.py"
1062
+ stub_path.write_text(
1063
+ "\n".join(
1064
+ [
1065
+ "from pathlib import Path",
1066
+ "import json",
1067
+ "import sys",
1068
+ "import time",
1069
+ "",
1070
+ "argv = sys.argv[1:]",
1071
+ "last_message_path = None",
1072
+ "for index, arg in enumerate(argv):",
1073
+ " if arg == '--output-last-message' and index + 1 < len(argv):",
1074
+ " last_message_path = argv[index + 1]",
1075
+ " break",
1076
+ "",
1077
+ "prompt = sys.stdin.read()",
1078
+ "if 'Codex startup-stall recovery' in prompt:",
1079
+ " Path('src').mkdir(exist_ok=True)",
1080
+ " Path('src/startup-stall-recovered.txt').write_text('patched after restart\\n', encoding='utf-8')",
1081
+ " if last_message_path:",
1082
+ " Path(last_message_path).write_text('Patched after Codex startup-stall recovery.', encoding='utf-8')",
1083
+ " print(json.dumps({'type': 'item.completed', 'message': 'Patched after Codex startup-stall recovery.'}), flush=True)",
1084
+ " sys.exit(0)",
1085
+ "",
1086
+ "print(json.dumps({'type': 'thread.started'}), flush=True)",
1087
+ "print(json.dumps({'type': 'turn.started'}), flush=True)",
1088
+ "time.sleep(10)",
1089
+ ]
1090
+ ),
1091
+ encoding="utf-8",
1092
+ )
1093
+
1094
+ env_overrides = {
1095
+ "PUSHPALS_OPENAI_CODEX_BIN_JSON": json.dumps([sys.executable, str(stub_path)]),
1096
+ "PUSHPALS_OPENAI_CODEX_AUTH_MODE": "api_key",
1097
+ "OPENAI_API_KEY": "pushpals-startup-stall-test-key",
1098
+ "WORKERPALS_OPENAI_CODEX_JSON": "true",
1099
+ "WORKERPALS_OPENAI_CODEX_TIMEOUT_S": "20",
1100
+ "WORKERPALS_OPENAI_CODEX_NO_EDIT_WATCHDOG_S": "1",
1101
+ "WORKERPALS_OPENAI_CODEX_PROGRESS_LOG_INTERVAL_S": "1",
1102
+ }
1103
+ with mock.patch.dict(os.environ, env_overrides, clear=False):
1104
+ result = _run_codex_task(
1105
+ str(repo),
1106
+ "Rename one misleading test fixture constant and update the related assertions.",
1107
+ [],
1108
+ )
1109
+
1110
+ self.assertTrue(result.get("ok"), result)
1111
+ self.assertEqual(result.get("exitCode"), 0)
1112
+ stdout = str(result.get("stdout") or "")
1113
+ self.assertIn("Recovered after the first Codex subprocess stalled", stdout)
1114
+ self.assertIn("Patched after Codex startup-stall recovery", stdout)
1115
+ self.assertIn("src/", stdout)
1116
+
1117
+ def test_run_codex_task_reports_startup_stall_when_restart_also_never_responds(self) -> None:
1118
+ with tempfile.TemporaryDirectory(prefix="pushpals-codex-startup-stall-fail-") as temp_dir:
1119
+ repo = Path(temp_dir) / "repo"
1120
+ repo.mkdir(parents=True, exist_ok=True)
1121
+ (repo / "README.md").write_text("# startup stall failure repo\n", encoding="utf-8")
1122
+ subprocess.run(["git", "init"], cwd=repo, check=True, capture_output=True, text=True)
1123
+ subprocess.run(
1124
+ ["git", "config", "user.name", "PushPals Test"],
1125
+ cwd=repo,
1126
+ check=True,
1127
+ capture_output=True,
1128
+ text=True,
1129
+ )
1130
+ subprocess.run(
1131
+ ["git", "config", "user.email", "pushpals-tests@example.com"],
1132
+ cwd=repo,
1133
+ check=True,
1134
+ capture_output=True,
1135
+ text=True,
1136
+ )
1137
+ subprocess.run(["git", "add", "README.md"], cwd=repo, check=True, capture_output=True, text=True)
1138
+ subprocess.run(
1139
+ ["git", "commit", "-m", "chore: seed startup stall failure repo"],
1140
+ cwd=repo,
1141
+ check=True,
1142
+ capture_output=True,
1143
+ text=True,
1144
+ )
1145
+
1146
+ stub_path = Path(temp_dir) / "fake_codex_startup_stall_fail.py"
1147
+ stub_path.write_text(
1148
+ "\n".join(
1149
+ [
1150
+ "import json",
1151
+ "import sys",
1152
+ "import time",
1153
+ "",
1154
+ "sys.stdin.read()",
1155
+ "print(json.dumps({'type': 'thread.started'}), flush=True)",
1156
+ "print(json.dumps({'type': 'turn.started'}), flush=True)",
1157
+ "time.sleep(10)",
1158
+ ]
1159
+ ),
1160
+ encoding="utf-8",
1161
+ )
1162
+
1163
+ env_overrides = {
1164
+ "PUSHPALS_OPENAI_CODEX_BIN_JSON": json.dumps([sys.executable, str(stub_path)]),
1165
+ "PUSHPALS_OPENAI_CODEX_AUTH_MODE": "api_key",
1166
+ "OPENAI_API_KEY": "pushpals-startup-stall-fail-test-key",
1167
+ "WORKERPALS_OPENAI_CODEX_JSON": "true",
1168
+ "WORKERPALS_OPENAI_CODEX_TIMEOUT_S": "20",
1169
+ "WORKERPALS_OPENAI_CODEX_NO_EDIT_WATCHDOG_S": "1",
1170
+ "WORKERPALS_OPENAI_CODEX_PROGRESS_LOG_INTERVAL_S": "1",
1171
+ }
1172
+ with mock.patch.dict(os.environ, env_overrides, clear=False):
1173
+ result = _run_codex_task(
1174
+ str(repo),
1175
+ "Rename one misleading test fixture constant and update the related assertions.",
1176
+ [],
1177
+ )
1178
+
1179
+ self.assertFalse(result.get("ok"), result)
1180
+ self.assertEqual(result.get("exitCode"), 124)
1181
+ self.assertEqual(result.get("summary"), "openai_codex stalled before first response")
1182
+ self.assertNotIn("no publishable", str(result.get("summary") or "").lower())
1183
+ self.assertEqual(result.get("cooldownMs"), 600000)
1184
+
1032
1185
  def test_run_codex_task_retries_once_when_no_edit_watchdog_fires(self) -> None:
1033
1186
  with tempfile.TemporaryDirectory(prefix="pushpals-codex-no-edit-watchdog-") as temp_dir:
1034
1187
  repo = Path(temp_dir) / "repo"
@@ -92,6 +92,7 @@ export interface JobResult {
92
92
  stdout?: string;
93
93
  stderr?: string;
94
94
  exitCode?: number;
95
+ cooldownMs?: number;
95
96
  usage?: JobTokenUsage;
96
97
  publishBlocked?: JobPublishBlockedInfo;
97
98
  diagnostics?: JobDiagnostics;
@@ -249,7 +249,9 @@ export interface Job {
249
249
  }
250
250
 
251
251
  function compactDockerDiagnosticText(value: unknown, maxChars = 1000): string | null {
252
- const text = String(value ?? "").replace(/\s+$/g, "").trim();
252
+ const text = String(value ?? "")
253
+ .replace(/\s+$/g, "")
254
+ .trim();
253
255
  if (!text) return null;
254
256
  return text.length <= maxChars ? text : text.slice(0, maxChars);
255
257
  }
@@ -482,11 +484,11 @@ export class DockerExecutor {
482
484
  if (
483
485
  retryableFailure &&
484
486
  attempt >= this.jobRetryMaxAttempts &&
485
- this.failureCooldownMs > 0
487
+ this.retryExhaustionCooldownMs(result) > 0
486
488
  ) {
487
489
  return {
488
490
  ...result,
489
- cooldownMs: this.failureCooldownMs,
491
+ cooldownMs: this.retryExhaustionCooldownMs(result),
490
492
  };
491
493
  }
492
494
  return result;
@@ -1279,9 +1281,8 @@ export class DockerExecutor {
1279
1281
  onLog?.("stdout", note);
1280
1282
  }
1281
1283
 
1282
- const { leadMs: warningLeadMs, delayMs: warningDelayMs } = computeTimeoutWarningWindow(
1283
- timeoutMs,
1284
- );
1284
+ const { leadMs: warningLeadMs, delayMs: warningDelayMs } =
1285
+ computeTimeoutWarningWindow(timeoutMs);
1285
1286
  const warningTimer = setTimeout(() => {
1286
1287
  const warning = `[DockerExecutor] Job nearing timeout in warm container (${Math.round(
1287
1288
  warningLeadMs / 1000,
@@ -1424,13 +1425,13 @@ export class DockerExecutor {
1424
1425
  const worktreePrefix = shellSingleQuote(`${containerWorktreePath}/`);
1425
1426
  const command = [
1426
1427
  "set -eu",
1427
- "linked=\"\"",
1428
+ 'linked=""',
1428
1429
  "for name in node_modules; do",
1429
- " src=\"/repo/$name\"",
1430
+ ' src="/repo/$name"',
1430
1431
  ` dest=${worktreePrefix}$name`,
1431
- " if { [ -e \"$src\" ] || [ -L \"$src\" ]; } && [ ! -e \"$dest\" ] && [ ! -L \"$dest\" ]; then",
1432
- " ln -s \"$src\" \"$dest\"",
1433
- " linked=\"$linked $name\"",
1432
+ ' if { [ -e "$src" ] || [ -L "$src" ]; } && [ ! -e "$dest" ] && [ ! -L "$dest" ]; then',
1433
+ ' ln -s "$src" "$dest"',
1434
+ ' linked="$linked $name"',
1434
1435
  " fi",
1435
1436
  "done",
1436
1437
  "printf '%s' \"$linked\"",
@@ -1454,9 +1455,7 @@ export class DockerExecutor {
1454
1455
  .filter(Boolean);
1455
1456
  if (linked.length === 0) return;
1456
1457
 
1457
- const note = `[DockerExecutor] Linked worktree dependency artifact(s): ${linked.join(
1458
- ", ",
1459
- )}`;
1458
+ const note = `[DockerExecutor] Linked worktree dependency artifact(s): ${linked.join(", ")}`;
1460
1459
  console.log(note);
1461
1460
  onLog?.("stdout", note);
1462
1461
  }
@@ -1701,9 +1700,15 @@ export class DockerExecutor {
1701
1700
  stdout,
1702
1701
  stderr: details.join("\n"),
1703
1702
  exitCode,
1704
- diagnostics: dockerFallbackDiagnostics(summary, context, exitCode, "malformed_structured_result", {
1705
- sentinelParseError,
1706
- }),
1703
+ diagnostics: dockerFallbackDiagnostics(
1704
+ summary,
1705
+ context,
1706
+ exitCode,
1707
+ "malformed_structured_result",
1708
+ {
1709
+ sentinelParseError,
1710
+ },
1711
+ ),
1707
1712
  };
1708
1713
  }
1709
1714
 
@@ -1906,8 +1911,15 @@ export class DockerExecutor {
1906
1911
  return this.matchesRetryablePattern(text);
1907
1912
  }
1908
1913
 
1914
+ private retryExhaustionCooldownMs(result: DockerJobResult): number {
1915
+ const resultCooldownMs = readPositiveNumber(result.cooldownMs) ?? 0;
1916
+ return Math.max(this.failureCooldownMs, resultCooldownMs);
1917
+ }
1918
+
1909
1919
  private matchesRetryablePattern(text: string): boolean {
1910
1920
  const transientPatterns: RegExp[] = [
1921
+ /\bstalled before first response\b/i,
1922
+ /\bstartup stall\b/i,
1911
1923
  /warm .*runtime/i,
1912
1924
  /failed to start warm container/i,
1913
1925
  /docker execution error/i,
@@ -688,6 +688,9 @@ function inferTerminalFailureClass(result: JobResult, changedPaths: string[]): s
688
688
  if (result.ok) return "success";
689
689
  const text = `${result.summary ?? ""}\n${result.stderr ?? ""}\n${result.stdout ?? ""}`.toLowerCase();
690
690
  const publishableCount = publishableChangedPaths(changedPaths).length;
691
+ if (text.includes("stalled before first response") || text.includes("startup stall")) {
692
+ return "codex_startup_stall";
693
+ }
691
694
  if (changedPaths.length > 0 && publishableCount === 0) return "artifact_only_no_publishable_patch";
692
695
  if (result.exitCode === 124 || text.includes("timed out") || text.includes("timeout")) return "timeout";
693
696
  if (text.includes("validationgate") || text.includes("validation")) return "validation";
@@ -700,6 +703,9 @@ function inferTerminalFailureClass(result: JobResult, changedPaths: string[]): s
700
703
 
701
704
  function inferTerminalStage(result: JobResult, fallback: string): string {
702
705
  const text = `${result.summary ?? ""}\n${result.stderr ?? ""}`.toLowerCase();
706
+ if (text.includes("stalled before first response") || text.includes("startup stall")) {
707
+ return "executor_startup";
708
+ }
703
709
  if (text.includes("validationgate") || text.includes("validation")) return "validation";
704
710
  if (text.includes("scopegate") || text.includes("scope")) return "scope";
705
711
  if (text.includes("criticgate") || text.includes("critic")) return "critic";
@@ -748,7 +754,7 @@ function buildTerminalDiagnostics(args: {
748
754
  terminalStage: inferTerminalStage(args.result, args.terminalStage),
749
755
  executorBackend: args.executor,
750
756
  summary: compactDiagnosticText(args.result.summary, 1_000),
751
- watchdogFired: /watchdog|rollout coach/i.test(text),
757
+ watchdogFired: /watchdog|rollout coach|stalled before first response|startup stall/i.test(text),
752
758
  timeoutMs: args.timeoutMs ?? null,
753
759
  publishableFileCount: publishable.length,
754
760
  artifactOnlyPathCount: artifactOnly.length,
@@ -44,6 +44,7 @@ interface JobResult {
44
44
  stdout?: string;
45
45
  stderr?: string;
46
46
  exitCode?: number;
47
+ cooldownMs?: number;
47
48
  commit?: {
48
49
  branch: string;
49
50
  sha: string;
@@ -115,6 +116,23 @@ echo "password=${token}"
115
116
  }
116
117
  }
117
118
 
119
+ export function buildJobRunnerResult(
120
+ result: Pick<
121
+ Awaited<ReturnType<typeof executeJob>>,
122
+ "ok" | "summary" | "stdout" | "stderr" | "exitCode" | "cooldownMs" | "diagnostics"
123
+ >,
124
+ ): JobResult {
125
+ return {
126
+ ok: result.ok,
127
+ summary: result.summary,
128
+ stdout: result.stdout,
129
+ stderr: result.stderr,
130
+ exitCode: result.exitCode,
131
+ cooldownMs: result.cooldownMs,
132
+ diagnostics: result.diagnostics,
133
+ };
134
+ }
135
+
118
136
  // ─── Main ───────────────────────────────────────────────────────────────────
119
137
 
120
138
  async function main(): Promise<void> {
@@ -127,8 +145,7 @@ async function main(): Promise<void> {
127
145
  process.exit(1);
128
146
  }
129
147
 
130
- const base64Spec =
131
- rawSpecArg === "--spec-stdin" ? (await Bun.stdin.text()).trim() : rawSpecArg;
148
+ const base64Spec = rawSpecArg === "--spec-stdin" ? (await Bun.stdin.text()).trim() : rawSpecArg;
132
149
  if (!base64Spec) {
133
150
  // eslint-disable-next-line no-console
134
151
  console.error("Job spec was empty");
@@ -179,14 +196,7 @@ async function main(): Promise<void> {
179
196
  CONFIG,
180
197
  );
181
198
  // Build result object
182
- const jobResult: JobResult = {
183
- ok: result.ok,
184
- summary: result.summary,
185
- stdout: result.stdout,
186
- stderr: result.stderr,
187
- exitCode: result.exitCode,
188
- diagnostics: result.diagnostics,
189
- };
199
+ const jobResult = buildJobRunnerResult(result);
190
200
  // Create commit for file-modifying jobs
191
201
  if (result.ok && shouldCommit(spec.kind, CONFIG)) {
192
202
  log("stdout", `[JobRunner] Job modified files, creating commit...`);
@@ -224,7 +234,8 @@ async function main(): Promise<void> {
224
234
  if (commitResult.publishBlocked) {
225
235
  jobResult.publishBlocked = commitResult.publishBlocked;
226
236
  }
227
- jobResult.exitCode = jobResult.exitCode && jobResult.exitCode !== 0 ? jobResult.exitCode : 1;
237
+ jobResult.exitCode =
238
+ jobResult.exitCode && jobResult.exitCode !== 0 ? jobResult.exitCode : 1;
228
239
  log(
229
240
  "stderr",
230
241
  commitResult.publishBlocked
@@ -246,8 +257,10 @@ async function main(): Promise<void> {
246
257
  }
247
258
  }
248
259
 
249
- main().catch((err) => {
250
- // eslint-disable-next-line no-console
251
- console.error(`[JobRunner] Fatal error: ${err}`);
252
- process.exit(1);
253
- });
260
+ if (import.meta.main) {
261
+ main().catch((err) => {
262
+ // eslint-disable-next-line no-console
263
+ console.error(`[JobRunner] Fatal error: ${err}`);
264
+ process.exit(1);
265
+ });
266
+ }
@@ -67,6 +67,7 @@ type WorkerJobResult = JobResult & {
67
67
 
68
68
  const DEFAULT_LLM_MODEL = "local-model";
69
69
  const CODEX_UNAVAILABLE_WORKER_EXIT_CODE = 86;
70
+ const CODEX_STARTUP_STALL_WORKER_EXIT_CODE = 87;
70
71
  const CODEX_UNAVAILABLE_DOCKER_SHUTDOWN_GRACE_MS = 5_000;
71
72
  const CODEX_UNAVAILABLE_WORKER_FORCE_EXIT_MS = 4_000;
72
73
  const DEFAULT_JOB_PROGRESS_LOG_EVERY_MS = 60_000;
@@ -391,7 +392,9 @@ function inferWorkerJobPhaseFromLogLine(line: string): WorkerJobPhase | null {
391
392
  ) {
392
393
  return "full validation";
393
394
  }
394
- if (/creating commit|Publish blocked|publish-blocked|completion ref|enqueueCompletion/i.test(text)) {
395
+ if (
396
+ /creating commit|Publish blocked|publish-blocked|completion ref|enqueueCompletion/i.test(text)
397
+ ) {
395
398
  return "publishing";
396
399
  }
397
400
  if (
@@ -447,11 +450,20 @@ function mergeWorkerDiagnostics(
447
450
  };
448
451
  }
449
452
 
450
- function inferWorkerTerminalFailureClass(result: JobResult): string {
453
+ function isCodexStartupStallResult(result: JobResult): boolean {
454
+ const text =
455
+ `${result.summary ?? ""}\n${result.stderr ?? ""}\n${result.stdout ?? ""}`.toLowerCase();
456
+ return /stalled before first response|startup stall/.test(text);
457
+ }
458
+
459
+ export function inferWorkerTerminalFailureClass(result: JobResult): string {
451
460
  if (result.ok) return "success";
452
- const text = `${result.summary ?? ""}\n${result.stderr ?? ""}\n${result.stdout ?? ""}`.toLowerCase();
461
+ const text =
462
+ `${result.summary ?? ""}\n${result.stderr ?? ""}\n${result.stdout ?? ""}`.toLowerCase();
463
+ if (isCodexStartupStallResult(result)) return "codex_startup_stall";
453
464
  if (/timed out|timeout|signal 15|terminated|exit 143|exit 137/.test(text)) return "timeout";
454
- if (/no publishable|non-publishable|node_modules/.test(text)) return "artifact_only_no_publishable_patch";
465
+ if (/no publishable|non-publishable|node_modules/.test(text))
466
+ return "artifact_only_no_publishable_patch";
455
467
  if (/validationgate|validation/.test(text)) return "validation";
456
468
  if (/scopegate|scope/.test(text)) return "scope";
457
469
  if (/criticgate|critic/.test(text)) return "critic";
@@ -497,11 +509,12 @@ export function shouldRecycleWorkerForHeartbeatDegradation(options: {
497
509
  return options.transportStale;
498
510
  }
499
511
 
500
- function shouldRecycleWorkerForCodexUnavailableFailure(
512
+ export function shouldRecycleWorkerForCodexUnavailableFailure(
501
513
  summary: string,
502
514
  stderr?: string | null,
503
515
  ): boolean {
504
516
  const text = `${summary}\n${stderr ?? ""}`.toLowerCase();
517
+ if (/stalled before first response|startup stall/.test(text)) return true;
505
518
  return [
506
519
  "openai_codex cli is not installed",
507
520
  "openai_codex chatgpt auth is not ready",
@@ -512,6 +525,12 @@ function shouldRecycleWorkerForCodexUnavailableFailure(
512
525
  ].some((needle) => text.includes(needle));
513
526
  }
514
527
 
528
+ export function workerRecycleExitCodeForResult(result: WorkerJobResult): number {
529
+ return isCodexStartupStallResult(result)
530
+ ? CODEX_STARTUP_STALL_WORKER_EXIT_CODE
531
+ : CODEX_UNAVAILABLE_WORKER_EXIT_CODE;
532
+ }
533
+
515
534
  async function shutdownDockerExecutorBeforeCodexRecycle(
516
535
  dockerExecutor: DockerExecutor | null,
517
536
  ): Promise<void> {
@@ -754,14 +773,9 @@ async function createIsolatedWorktree(
754
773
  .toLowerCase()
755
774
  .replace(/[^a-z0-9]+/g, "")
756
775
  .slice(0, 8);
757
- const nonce = `${Date.now().toString(36).slice(-6)}-${Math.random()
758
- .toString(36)
759
- .slice(2, 6)}`;
776
+ const nonce = `${Date.now().toString(36).slice(-6)}-${Math.random().toString(36).slice(2, 6)}`;
760
777
 
761
- const worktreePath = resolve(
762
- worktreeRoot,
763
- `job-${safeJobId || "host"}-${nonce}`,
764
- );
778
+ const worktreePath = resolve(worktreeRoot, `job-${safeJobId || "host"}-${nonce}`);
765
779
 
766
780
  const addResult = await git(repo, ["worktree", "add", "--detach", worktreePath, baseRef]);
767
781
  if (!addResult.ok) {
@@ -1752,6 +1766,7 @@ async function workerLoop(
1752
1766
  const jobAttempt =
1753
1767
  Number.isFinite(jobAttemptRaw) && jobAttemptRaw > 0 ? Math.floor(jobAttemptRaw) : 1;
1754
1768
  const llm = workerLlmConfig(CONFIG);
1769
+ const terminalFailureClass = inferWorkerTerminalFailureClass(result);
1755
1770
  result = {
1756
1771
  ...result,
1757
1772
  diagnostics: mergeWorkerDiagnostics(result.diagnostics, {
@@ -1781,12 +1796,15 @@ async function workerLoop(
1781
1796
  result.ok ? "completed" : result.publishBlocked ? "publish_blocked" : "failed",
1782
1797
  ),
1783
1798
  terminal: {
1784
- failureClass: inferWorkerTerminalFailureClass(result),
1785
- terminalStage: currentJobPhase ?? (result.ok ? "completed" : "worker"),
1799
+ failureClass: terminalFailureClass,
1800
+ terminalStage:
1801
+ terminalFailureClass === "codex_startup_stall"
1802
+ ? "executor_startup"
1803
+ : (currentJobPhase ?? (result.ok ? "completed" : "worker")),
1786
1804
  executorBackend: resolveExecutor(CONFIG),
1787
1805
  summary: result.summary,
1788
1806
  watchdogFired:
1789
- /watchdog|rollout coach|timed out|timeout|signal 15|terminated|exit 143|exit 137/i.test(
1807
+ /watchdog|rollout coach|stalled before first response|startup stall|timed out|timeout|signal 15|terminated|exit 143|exit 137/i.test(
1790
1808
  `${result.summary}\n${result.stderr ?? ""}\n${result.stdout ?? ""}`,
1791
1809
  ),
1792
1810
  metadata: {
@@ -1961,12 +1979,15 @@ async function workerLoop(
1961
1979
  clearInterval(busyHeartbeat);
1962
1980
  if (jobProgressTimer) clearInterval(jobProgressTimer);
1963
1981
  if (recycleWorkerAfterJob) {
1982
+ const recycleExitCode = result
1983
+ ? workerRecycleExitCodeForResult(result)
1984
+ : CODEX_UNAVAILABLE_WORKER_EXIT_CODE;
1964
1985
  runtimeState.shutdownRequested = true;
1965
1986
  const forceExitTimer = setTimeout(() => {
1966
1987
  console.warn(
1967
1988
  `[WorkerPals] Forcing worker recycle ${CODEX_UNAVAILABLE_WORKER_FORCE_EXIT_MS}ms after Codex backend failure.`,
1968
1989
  );
1969
- process.exit(CODEX_UNAVAILABLE_WORKER_EXIT_CODE);
1990
+ process.exit(recycleExitCode);
1970
1991
  }, CODEX_UNAVAILABLE_WORKER_FORCE_EXIT_MS);
1971
1992
  try {
1972
1993
  await maybeHeartbeat("offline", null, true);
@@ -1983,7 +2004,7 @@ async function workerLoop(
1983
2004
  await shutdownDockerExecutorBeforeCodexRecycle(dockerExecutor);
1984
2005
  } finally {
1985
2006
  clearTimeout(forceExitTimer);
1986
- process.exit(CODEX_UNAVAILABLE_WORKER_EXIT_CODE);
2007
+ process.exit(recycleExitCode);
1987
2008
  }
1988
2009
  }
1989
2010
  if (job.sessionId && result?.cooldownMs && result.cooldownMs > 0) {