@pushpalsdev/cli 1.1.12 → 1.1.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5308,22 +5308,31 @@ function shouldSuppressCliSessionJobLogLine(line) {
5308
5308
  return true;
5309
5309
  if (/^(___RESULT___|__PUSHPALS_OH_RESULT__)\b/.test(text))
5310
5310
  return true;
5311
- if (/^\[DockerExecutor\]\s+Linked worktree dependency artifact/i.test(text))
5311
+ if (/^\[DockerExecutor\]\s+(?:Linked worktree dependency artifact|Capped job timeout|Extended job timeout)/i.test(text)) {
5312
5312
  return true;
5313
- if (/^\[OpenAICodexExecutor\]\s+(?:Planner guidance|Codex auth mode|ChatGPT auth mode|Starting codex exec|codex exec finished|Codex JSON stream captured|Codex stdout captured|No reasoning-like|Reasoning-like event|Usage observed|Temporarily masked repo-local)/i.test(text)) {
5313
+ }
5314
+ if (/^\[JobRunner\]\s+Starting job\b/i.test(text))
5315
+ return true;
5316
+ if (/^\[QualityGate\]\s+(?:Policy:|Gates:)/i.test(text))
5317
+ return true;
5318
+ if (/^\[Openai_codexExecutor\]\s+Spawning openai_codex executor/i.test(text))
5319
+ return true;
5320
+ if (/^\[OpenAICodexExecutor\]\s+(?:Planner guidance|Codex auth mode|ChatGPT auth mode|Starting codex exec|codex exec finished|Codex JSON stream captured|Codex stdout captured|No reasoning-like|Reasoning-like event|Usage observed|Temporarily masked repo-local|Timeout reached after|Process did not exit after graceful timeout termination)/i.test(text)) {
5314
5321
  return true;
5315
5322
  }
5316
5323
  if (/^\[OpenAICodexExecutor\]\s+codex exec still running\b/i.test(text))
5317
5324
  return true;
5325
+ if (/^\[OpenAICodexExecutor\]\s+\[codex\]\s+(?:No reasoning-like|Reasoning-like|turn\.failed|turn\.completed|error\s+\|)/i.test(text)) {
5326
+ return true;
5327
+ }
5318
5328
  if (/^\[OpenAICodexExecutor\]\s+\[codex\]\s+(?:thread|turn)\.started\b/i.test(text)) {
5319
5329
  return true;
5320
5330
  }
5321
5331
  if (/^\[OpenAICodexExecutor\]\s+\[codex\]\s+item\.started\b/i.test(text))
5322
5332
  return true;
5323
- if (/^\[OpenAICodexExecutor\]\s+\[codex\]\s+item\.completed\s*$/i.test(text))
5324
- return true;
5325
- if (/^\[OpenAICodexExecutor\]\s+\[codex\]\s+item\.updated\s*$/i.test(text))
5333
+ if (/^\[OpenAICodexExecutor\]\s+\[codex\]\s+item\.(?:completed|updated)\b/i.test(text)) {
5326
5334
  return true;
5335
+ }
5327
5336
  if (/^\[OpenAICodexExecutor\]\s+\[stderr\].*codex_core::tools::router: error=exec_command failed/i.test(text)) {
5328
5337
  return true;
5329
5338
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@pushpalsdev/cli",
3
- "version": "1.1.12",
3
+ "version": "1.1.13",
4
4
  "description": "PushPals terminal CLI for LocalBuddy -> RemoteBuddy orchestration",
5
5
  "license": "MIT",
6
6
  "repository": {
@@ -103,6 +103,9 @@ _VALID_REASONING_EFFORTS = {"low", "medium", "high", "xhigh"}
103
103
  _MAX_WRAPPER_RECOVERY_ATTEMPTS = 2
104
104
  _MAX_WRAPPER_BOOTSTRAP_OUTPUT_CHARS = 1_200
105
105
  _MAX_WRAPPER_BOOTSTRAP_TOTAL_CHARS = 5_000
106
+ _MAX_NO_EDIT_RECOVERY_ATTEMPTS = 1
107
+ _DEFAULT_NO_EDIT_WATCHDOG_S = 480
108
+ _SMALL_TASK_NO_EDIT_WATCHDOG_S = 360
106
109
 
107
110
 
108
111
  def _model_supports_xhigh_reasoning(model: str) -> bool:
@@ -559,12 +562,99 @@ def _resolve_reasoning_effort(config: OpenAICodexRuntimeConfig, model: str = DEF
559
562
  return default_effort
560
563
 
561
564
 
565
+ def _looks_like_small_task_prompt(prompt: str) -> bool:
566
+ text = str(prompt or "").lower()
567
+ small_markers = (
568
+ "risk=low",
569
+ "small scoped",
570
+ "small or medium repo tasks",
571
+ "compact",
572
+ "low-risk",
573
+ "low risk",
574
+ "route-entry",
575
+ "first-entry",
576
+ "home shell",
577
+ "startup shell",
578
+ "shell polish",
579
+ "visual/affordance",
580
+ )
581
+ heavy_markers = (
582
+ "merge-conflict",
583
+ "merge conflict",
584
+ "rebase",
585
+ "broad refactor",
586
+ "migration",
587
+ "security",
588
+ "architecture",
589
+ "deep debug",
590
+ )
591
+ return any(marker in text for marker in small_markers) and not any(
592
+ marker in text for marker in heavy_markers
593
+ )
594
+
595
+
596
def _resolve_task_reasoning_effort(
    configured_effort: str,
    prompt: str,
    model: str = DEFAULT_CODEX_MODEL,
) -> str:
    """Pick the reasoning effort for one task, capping compact tasks at "high".

    Args:
        configured_effort: Effort resolved from configuration. Values outside
            ``_VALID_REASONING_EFFORTS`` fall back to ``"high"``.
        prompt: Full task prompt, inspected by ``_looks_like_small_task_prompt``.
        model: Model name; only used in the log message.

    Returns:
        ``"high"`` when the (validated) effort is ``"xhigh"`` and the prompt
        looks like a small task; otherwise the validated effort unchanged.
    """
    baseline = "high" if configured_effort not in _VALID_REASONING_EFFORTS else configured_effort

    # Only an "xhigh" request on a small-looking task is ever rewritten.
    should_downgrade = baseline == "xhigh" and _looks_like_small_task_prompt(prompt)
    if not should_downgrade:
        return baseline

    log.info(
        f"Routing compact task on model {model!r} from reasoning_effort='xhigh' to 'high' for faster convergence."
    )
    return "high"
610
+
611
+
562
612
  def _resolve_progress_log_interval_seconds(config: OpenAICodexRuntimeConfig) -> int:
563
613
  interval = to_int(config.progress_log_interval_s, 30)
564
614
  # Avoid noisy logs (<30s) and stale logs (>120s).
565
615
  return max(30, min(120, interval))
566
616
 
567
617
 
618
def _resolve_no_edit_watchdog_seconds(
    prompt: str,
    communicate_timeout_s: Optional[int],
) -> Optional[int]:
    """Compute the no-edit watchdog delay in seconds, or ``None`` to disable it.

    Resolution order:

    1. No (or zero) communicate timeout -> watchdog disabled.
    2. ``WORKERPALS_OPENAI_CODEX_NO_EDIT_WATCHDOG_S`` set to ``"0"`` -> disabled.
    3. A value accepted by ``_to_positive_int`` -> that value, clamped to
       ``[1, communicate_timeout_s - 1]`` (never below 1).
    4. An unparsable override is logged and ignored.
    5. Without a usable override, timeouts under 600s disable the watchdog;
       otherwise the small-task or default budget applies, clamped to
       ``[120, communicate_timeout_s - 60]``.

    Args:
        prompt: Task prompt, used to choose the small-task budget.
        communicate_timeout_s: Overall execution timeout in seconds.

    Returns:
        Watchdog delay in seconds, or ``None`` when the watchdog is disabled.
    """
    if not communicate_timeout_s:
        return None

    override = os.environ.get("WORKERPALS_OPENAI_CODEX_NO_EDIT_WATCHDOG_S", "").strip()
    if override == "0":
        # Explicit opt-out.
        return None
    if override:
        seconds = _to_positive_int(override)
        if seconds is not None:
            ceiling = max(1, communicate_timeout_s - 1)
            return max(1, min(seconds, ceiling))
        log.info(
            f"Invalid WORKERPALS_OPENAI_CODEX_NO_EDIT_WATCHDOG_S={override!r}; using default no-edit watchdog."
        )

    if communicate_timeout_s < 600:
        return None

    if _looks_like_small_task_prompt(prompt):
        budget_s = _SMALL_TASK_NO_EDIT_WATCHDOG_S
    else:
        budget_s = _DEFAULT_NO_EDIT_WATCHDOG_S
    upper_bound = max(120, communicate_timeout_s - 60)
    return max(120, min(budget_s, upper_bound))
642
+
643
+
644
+ def _build_no_edit_recovery_guidance(trace_excerpt: str) -> str:
645
+ lines = [
646
+ "No-edit watchdog recovery: the previous Codex attempt spent too much of the execution budget without producing publishable file changes.",
647
+ "Start from the already inspected context. Do not re-read broad repo topology, route wrappers, or missing test infrastructure unless that is the blocker.",
648
+ "Within the first response/action, edit the smallest behavior-owning file that satisfies the task. If the hinted file is a thin wrapper, patch the owner you already identified.",
649
+ "Use existing tests or a narrow helper/style assertion; do not create broad React Native mocks or a new full render harness for a compact shell/visual polish task.",
650
+ "Run at most one focused fast validation check before final diff review; let PushPals ValidationGate own long required/browser validation.",
651
+ ]
652
+ if trace_excerpt:
653
+ lines.append("Previous Codex event trace excerpt:")
654
+ lines.append(trace_excerpt)
655
+ return "\n".join(lines)
656
+
657
+
568
658
  def _normalize_auth_mode(raw: str) -> str:
569
659
  lowered = (raw or "").strip().lower()
570
660
  aliases = {
@@ -1506,6 +1596,7 @@ def _run_codex_task(
1506
1596
  *,
1507
1597
  wrapper_recovery_attempt: int = 0,
1508
1598
  model_compatibility_recovery_attempt: int = 0,
1599
+ no_edit_recovery_attempt: int = 0,
1509
1600
  model_override: Optional[str] = None,
1510
1601
  baseline_changes: Optional[List[str]] = None,
1511
1602
  ) -> Dict[str, Any]:
@@ -1567,10 +1658,14 @@ def _run_codex_task(
1567
1658
  )
1568
1659
  # JSON event output is noisy by default; prefer plain text + output-last-message.
1569
1660
  use_json = runtime_config.json_output
1570
- reasoning_effort = _resolve_reasoning_effort(runtime_config, model)
1571
1661
  communicate_timeout_s = _resolve_communicate_timeout_seconds(runtime_config)
1572
1662
  effective_supplemental_guidance = _augment_supplemental_guidance(supplemental_guidance)
1573
1663
  prompt = _build_instruction(instruction, effective_supplemental_guidance)
1664
+ reasoning_effort = _resolve_task_reasoning_effort(
1665
+ _resolve_reasoning_effort(runtime_config, model),
1666
+ prompt,
1667
+ model,
1668
+ )
1574
1669
  baseline_snapshot = list(baseline_changes) if baseline_changes is not None else summarize_git_changes(repo)
1575
1670
 
1576
1671
  with tempfile.TemporaryDirectory(prefix="pushpals-codex-") as tmp_dir:
@@ -1793,7 +1888,18 @@ def _run_codex_task(
1793
1888
  )
1794
1889
  next_progress_at = started_at + float(progress_interval_s)
1795
1890
  timed_out = False
1891
+ no_edit_watchdog_fired = False
1796
1892
  command_policy_rejection_loop = False
1893
+ no_edit_watchdog_s = (
1894
+ _resolve_no_edit_watchdog_seconds(prompt, communicate_timeout_s)
1895
+ if no_edit_recovery_attempt < _MAX_NO_EDIT_RECOVERY_ATTEMPTS
1896
+ else None
1897
+ )
1898
+ no_edit_deadline = (
1899
+ started_at + float(no_edit_watchdog_s)
1900
+ if no_edit_watchdog_s is not None
1901
+ else None
1902
+ )
1797
1903
 
1798
1904
  while proc.poll() is None:
1799
1905
  now = time.monotonic()
@@ -1802,6 +1908,17 @@ def _run_codex_task(
1802
1908
  _terminate_active_child()
1803
1909
  break
1804
1910
 
1911
+ if no_edit_deadline is not None and now >= no_edit_deadline:
1912
+ _, _, effective_paths = _codex_changed_paths(repo, baseline_snapshot)
1913
+ if not effective_paths:
1914
+ no_edit_watchdog_fired = True
1915
+ log.info(
1916
+ f"No-edit watchdog fired after {int(no_edit_watchdog_s or 0)}s with no publishable file changes; retrying with patch-first guidance."
1917
+ )
1918
+ _terminate_active_child()
1919
+ break
1920
+ no_edit_deadline = None
1921
+
1805
1922
  with trace_lock:
1806
1923
  wrapper_rejections = to_int(wrapper_rejection_state.get("count"), 0)
1807
1924
  if wrapper_rejections >= 3:
@@ -1869,6 +1986,34 @@ def _run_codex_task(
1869
1986
  continue
1870
1987
  rejected_shell_wrappers.append(text)
1871
1988
 
1989
+ if no_edit_watchdog_fired:
1990
+ if no_edit_recovery_attempt < _MAX_NO_EDIT_RECOVERY_ATTEMPTS:
1991
+ retry_guidance = [
1992
+ *supplemental_guidance,
1993
+ _build_no_edit_recovery_guidance(trace_excerpt),
1994
+ ]
1995
+ return _run_codex_task(
1996
+ repo,
1997
+ instruction,
1998
+ retry_guidance,
1999
+ wrapper_recovery_attempt=wrapper_recovery_attempt,
2000
+ model_compatibility_recovery_attempt=model_compatibility_recovery_attempt,
2001
+ no_edit_recovery_attempt=no_edit_recovery_attempt + 1,
2002
+ model_override=model_override,
2003
+ baseline_changes=baseline_snapshot,
2004
+ )
2005
+ detail = "Codex spent too much of the execution budget without producing publishable file changes."
2006
+ if trace_excerpt:
2007
+ detail = f"{detail}\n{trace_excerpt}"
2008
+ return {
2009
+ "ok": False,
2010
+ "summary": "openai_codex made no publishable changes before the no-edit watchdog",
2011
+ "stdout": _truncate(stdout),
2012
+ "stderr": _truncate(f"{detail}\n{stderr}".strip()),
2013
+ "exitCode": 124,
2014
+ "usage": usage,
2015
+ }
2016
+
1872
2017
  if timed_out:
1873
2018
  detail = (
1874
2019
  f"codex exec timed out after {communicate_timeout_s}s"
@@ -1877,6 +2022,34 @@ def _run_codex_task(
1877
2022
  )
1878
2023
  if trace_excerpt:
1879
2024
  detail = f"{detail}\n{trace_excerpt}"
2025
+ _, _, effective_paths = _codex_changed_paths(repo, baseline_snapshot)
2026
+ if effective_paths:
2027
+ last_message = _read_text_if_exists(last_message_path)
2028
+ log_git_status(repo, log)
2029
+ prefix = (
2030
+ "Codex reached the execution timeout after producing publishable file "
2031
+ "changes. Returning the partial patch to QualityGate/ValidationGate "
2032
+ "instead of discarding it; any incomplete edit will be caught by the "
2033
+ "normal gates or revision loop."
2034
+ )
2035
+ return {
2036
+ "ok": True,
2037
+ "summary": (
2038
+ f"openai_codex timed out after modifying {len(effective_paths)} "
2039
+ "publishable file(s)"
2040
+ ),
2041
+ "stdout": _truncate(
2042
+ _build_success_stdout(
2043
+ effective_paths=effective_paths,
2044
+ last_message=last_message,
2045
+ trace_excerpt=trace_excerpt,
2046
+ prefix=prefix,
2047
+ )
2048
+ ),
2049
+ "stderr": _truncate(f"{detail}\n{stderr}".strip()),
2050
+ "exitCode": 0,
2051
+ "usage": usage,
2052
+ }
1880
2053
  return {
1881
2054
  "ok": False,
1882
2055
  "summary": "openai_codex execution timed out",
@@ -1975,6 +2148,7 @@ def _run_codex_task(
1975
2148
  ],
1976
2149
  wrapper_recovery_attempt=wrapper_recovery_attempt + 1,
1977
2150
  model_compatibility_recovery_attempt=model_compatibility_recovery_attempt,
2151
+ no_edit_recovery_attempt=no_edit_recovery_attempt,
1978
2152
  model_override=model_override,
1979
2153
  baseline_changes=baseline_snapshot,
1980
2154
  )
@@ -2057,6 +2231,7 @@ def _run_codex_task(
2057
2231
  effective_supplemental_guidance,
2058
2232
  wrapper_recovery_attempt=wrapper_recovery_attempt,
2059
2233
  model_compatibility_recovery_attempt=model_compatibility_recovery_attempt + 1,
2234
+ no_edit_recovery_attempt=no_edit_recovery_attempt,
2060
2235
  model_override=LEGACY_CODEX_MODEL_FALLBACK,
2061
2236
  baseline_changes=baseline_snapshot,
2062
2237
  )
@@ -30,6 +30,7 @@ from openai_codex_executor import (
30
30
  _build_wrapper_recovery_guidance,
31
31
  _run_codex_task,
32
32
  _resolve_reasoning_effort,
33
+ _resolve_task_reasoning_effort,
33
34
  _build_instruction,
34
35
  _collect_disallowed_shell_wrapper_rejections,
35
36
  _codex_changed_paths,
@@ -202,6 +203,24 @@ class OpenAICodexRuntimeConfigTests(unittest.TestCase):
202
203
  )
203
204
  self.assertEqual(_resolve_reasoning_effort(cfg, model="gpt-6-preview"), "xhigh")
204
205
 
206
+ def test_task_reasoning_effort_routes_compact_shell_tasks_to_high(self) -> None:
207
+ prompt = (
208
+ "Task planning contract from PushPals:\n"
209
+ "- Planning summary: intent=code_change, risk=low, priority=normal\n"
210
+ "- Route-entry/shell task rule: inspect the hinted route wrapper, then patch the owner.\n"
211
+ )
212
+
213
+ self.assertEqual(_resolve_task_reasoning_effort("xhigh", prompt, "gpt-5.5"), "high")
214
+ self.assertEqual(_resolve_task_reasoning_effort("high", prompt, "gpt-5.5"), "high")
215
+ self.assertEqual(
216
+ _resolve_task_reasoning_effort(
217
+ "xhigh",
218
+ "Merge-conflict rebase task with risk=low wording in reviewer text.",
219
+ "gpt-5.5",
220
+ ),
221
+ "xhigh",
222
+ )
223
+
205
224
  def test_runtime_config_prefers_explicit_config_dir_override(self) -> None:
206
225
  import executor_base
207
226
 
@@ -344,6 +363,43 @@ class OpenAICodexRuntimeConfigTests(unittest.TestCase):
344
363
  self.assertIn("prefer pure helper/state/style-prop tests", guidance)
345
364
  self.assertIn("full React Native/component render regression", guidance)
346
365
 
366
+ def test_parse_payload_adds_route_shell_convergence_guidance(self) -> None:
367
+ with tempfile.TemporaryDirectory(prefix="pushpals-shell-guidance-") as temp_dir:
368
+ repo = Path(temp_dir) / "repo"
369
+ repo.mkdir(parents=True, exist_ok=True)
370
+ payload = {
371
+ "kind": "task.execute",
372
+ "repo": str(repo),
373
+ "params": {
374
+ "instruction": (
375
+ "Polish the first-entry shell. Start with app/_layout.tsx and "
376
+ "app/index.tsx, then tighten the home/settings route-entry affordance."
377
+ ),
378
+ "schemaVersion": 2,
379
+ "planning": {
380
+ "intent": "code_change",
381
+ "riskLevel": "low",
382
+ "queuePriority": "normal",
383
+ "queueWaitBudgetMs": 90_000,
384
+ "executionBudgetMs": 1_200_000,
385
+ "finalizationBudgetMs": 120_000,
386
+ "scope": {"readAnywhere": True, "writeAllowed": True},
387
+ "targetPaths": ["app/_layout.tsx", "app/index.tsx"],
388
+ "acceptanceCriteria": ["Home shell feels coherent with the match UI"],
389
+ },
390
+ },
391
+ }
392
+ encoded = base64.b64encode(json.dumps(payload).encode("utf-8")).decode("ascii")
393
+
394
+ task = parse_task_execute_payload(["executor", encoded], logger=Logger("[test]"))
395
+ guidance = "\n".join(task.supplemental_guidance)
396
+
397
+ self.assertIn("Route-entry/shell task rule", guidance)
398
+ self.assertIn("route is thin", guidance)
399
+ self.assertIn("Do not keep re-reading navigation topology", guidance)
400
+ self.assertIn("missing test infrastructure", guidance)
401
+ self.assertIn("make one small visual/affordance patch", guidance)
402
+
347
403
  def test_detects_codex_workaround_signals(self) -> None:
348
404
  signal = _detect_codex_workaround_signal(
349
405
  "Adapting test to avoid external Codex calls because Codex CLI isn't available in this environment.",
@@ -610,6 +666,163 @@ class OpenAICodexRuntimeConfigTests(unittest.TestCase):
610
666
  self.assertIn("src/", str(result.get("stdout") or ""))
611
667
  self.assertNotIn("Recovered after Codex attempts", str(result.get("stdout") or ""))
612
668
 
669
+ def test_run_codex_task_hands_changed_worktree_to_gates_after_timeout(self) -> None:
670
+ with tempfile.TemporaryDirectory(prefix="pushpals-codex-timeout-changed-") as temp_dir:
671
+ repo = Path(temp_dir) / "repo"
672
+ repo.mkdir(parents=True, exist_ok=True)
673
+ (repo / "README.md").write_text("# timeout changed repo\n", encoding="utf-8")
674
+ subprocess.run(["git", "init"], cwd=repo, check=True, capture_output=True, text=True)
675
+ subprocess.run(
676
+ ["git", "config", "user.name", "PushPals Test"],
677
+ cwd=repo,
678
+ check=True,
679
+ capture_output=True,
680
+ text=True,
681
+ )
682
+ subprocess.run(
683
+ ["git", "config", "user.email", "pushpals-tests@example.com"],
684
+ cwd=repo,
685
+ check=True,
686
+ capture_output=True,
687
+ text=True,
688
+ )
689
+ subprocess.run(["git", "add", "README.md"], cwd=repo, check=True, capture_output=True, text=True)
690
+ subprocess.run(
691
+ ["git", "commit", "-m", "chore: seed timeout changed repo"],
692
+ cwd=repo,
693
+ check=True,
694
+ capture_output=True,
695
+ text=True,
696
+ )
697
+
698
+ stub_path = Path(temp_dir) / "fake_codex_timeout_changed.py"
699
+ stub_path.write_text(
700
+ "\n".join(
701
+ [
702
+ "from pathlib import Path",
703
+ "import sys",
704
+ "import time",
705
+ "",
706
+ "argv = sys.argv[1:]",
707
+ "last_message_path = None",
708
+ "for index, arg in enumerate(argv):",
709
+ " if arg == '--output-last-message' and index + 1 < len(argv):",
710
+ " last_message_path = argv[index + 1]",
711
+ " break",
712
+ "",
713
+ "sys.stdin.read()",
714
+ "Path('src').mkdir(exist_ok=True)",
715
+ "Path('src/timeout-patch.txt').write_text('changed before timeout\\n', encoding='utf-8')",
716
+ "if last_message_path:",
717
+ " Path(last_message_path).write_text('Made a small patch before timeout.', encoding='utf-8')",
718
+ "print('item.completed | Made a small patch before timeout.', flush=True)",
719
+ "time.sleep(5)",
720
+ ]
721
+ ),
722
+ encoding="utf-8",
723
+ )
724
+
725
+ env_overrides = {
726
+ "PUSHPALS_OPENAI_CODEX_BIN_JSON": json.dumps([sys.executable, str(stub_path)]),
727
+ "PUSHPALS_OPENAI_CODEX_AUTH_MODE": "api_key",
728
+ "OPENAI_API_KEY": "pushpals-timeout-changed-test-key",
729
+ "WORKERPALS_OPENAI_CODEX_TIMEOUT_S": "1",
730
+ "WORKERPALS_OPENAI_CODEX_PROGRESS_LOG_INTERVAL_S": "1",
731
+ }
732
+ with mock.patch.dict(os.environ, env_overrides, clear=False):
733
+ result = _run_codex_task(
734
+ str(repo),
735
+ "Create a small file, then continue thinking too long.",
736
+ [],
737
+ )
738
+
739
+ self.assertTrue(result.get("ok"), result)
740
+ self.assertEqual(result.get("exitCode"), 0)
741
+ self.assertIn("timed out after modifying", str(result.get("summary") or ""))
742
+ self.assertIn("partial patch", str(result.get("stdout") or "").lower())
743
+ self.assertIn("src/", str(result.get("stdout") or ""))
744
+ self.assertIn("Made a small patch before timeout", str(result.get("stdout") or ""))
745
+
746
+ def test_run_codex_task_retries_once_when_no_edit_watchdog_fires(self) -> None:
747
+ with tempfile.TemporaryDirectory(prefix="pushpals-codex-no-edit-watchdog-") as temp_dir:
748
+ repo = Path(temp_dir) / "repo"
749
+ repo.mkdir(parents=True, exist_ok=True)
750
+ (repo / "README.md").write_text("# no edit watchdog repo\n", encoding="utf-8")
751
+ subprocess.run(["git", "init"], cwd=repo, check=True, capture_output=True, text=True)
752
+ subprocess.run(
753
+ ["git", "config", "user.name", "PushPals Test"],
754
+ cwd=repo,
755
+ check=True,
756
+ capture_output=True,
757
+ text=True,
758
+ )
759
+ subprocess.run(
760
+ ["git", "config", "user.email", "pushpals-tests@example.com"],
761
+ cwd=repo,
762
+ check=True,
763
+ capture_output=True,
764
+ text=True,
765
+ )
766
+ subprocess.run(["git", "add", "README.md"], cwd=repo, check=True, capture_output=True, text=True)
767
+ subprocess.run(
768
+ ["git", "commit", "-m", "chore: seed no-edit watchdog repo"],
769
+ cwd=repo,
770
+ check=True,
771
+ capture_output=True,
772
+ text=True,
773
+ )
774
+
775
+ stub_path = Path(temp_dir) / "fake_codex_no_edit_watchdog.py"
776
+ stub_path.write_text(
777
+ "\n".join(
778
+ [
779
+ "from pathlib import Path",
780
+ "import sys",
781
+ "import time",
782
+ "",
783
+ "argv = sys.argv[1:]",
784
+ "last_message_path = None",
785
+ "for index, arg in enumerate(argv):",
786
+ " if arg == '--output-last-message' and index + 1 < len(argv):",
787
+ " last_message_path = argv[index + 1]",
788
+ " break",
789
+ "",
790
+ "prompt = sys.stdin.read()",
791
+ "if 'No-edit watchdog recovery' in prompt:",
792
+ " Path('src').mkdir(exist_ok=True)",
793
+ " Path('src/no-edit-retry.txt').write_text('patched on retry\\n', encoding='utf-8')",
794
+ " if last_message_path:",
795
+ " Path(last_message_path).write_text('Patched immediately after no-edit recovery.', encoding='utf-8')",
796
+ " print('item.completed | Patched immediately after no-edit recovery.', flush=True)",
797
+ " sys.exit(0)",
798
+ "",
799
+ "print('item.completed | Still inspecting route wrappers.', flush=True)",
800
+ "time.sleep(10)",
801
+ ]
802
+ ),
803
+ encoding="utf-8",
804
+ )
805
+
806
+ env_overrides = {
807
+ "PUSHPALS_OPENAI_CODEX_BIN_JSON": json.dumps([sys.executable, str(stub_path)]),
808
+ "PUSHPALS_OPENAI_CODEX_AUTH_MODE": "api_key",
809
+ "OPENAI_API_KEY": "pushpals-no-edit-watchdog-test-key",
810
+ "WORKERPALS_OPENAI_CODEX_TIMEOUT_S": "20",
811
+ "WORKERPALS_OPENAI_CODEX_NO_EDIT_WATCHDOG_S": "1",
812
+ "WORKERPALS_OPENAI_CODEX_PROGRESS_LOG_INTERVAL_S": "1",
813
+ }
814
+ with mock.patch.dict(os.environ, env_overrides, clear=False):
815
+ result = _run_codex_task(
816
+ str(repo),
817
+ "Polish the first-entry home shell with a compact visual patch.",
818
+ [],
819
+ )
820
+
821
+ self.assertTrue(result.get("ok"), result)
822
+ self.assertEqual(result.get("exitCode"), 0)
823
+ self.assertIn("Patched immediately after no-edit recovery", str(result.get("stdout") or ""))
824
+ self.assertIn("src/", str(result.get("stdout") or ""))
825
+
613
826
  def test_codex_changed_paths_filters_dependency_artifacts_from_publishable_delta(self) -> None:
614
827
  with tempfile.TemporaryDirectory(prefix="pushpals-codex-artifact-delta-") as temp_dir:
615
828
  repo = Path(temp_dir) / "repo"
@@ -787,14 +787,58 @@ def _looks_like_visual_derivation_task(params: Dict[str, Any]) -> bool:
787
787
  return any(marker in text for marker in visual_markers)
788
788
 
789
789
 
790
def _looks_like_route_shell_task(params: Dict[str, Any]) -> bool:
    """Heuristically detect tasks aimed at route-entry / app-shell surfaces.

    The text produced by ``_joined_task_text(params)`` is scanned for shell and
    screen markers. Multi-character phrase markers are matched as substrings.
    The generic word "help" is matched only as a standalone token, so ordinary
    wording such as "helper", "helpful" or "helps" no longer flags an unrelated
    task as a route/shell task.

    Args:
        params: Task parameters forwarded to ``_joined_task_text``.

    Returns:
        ``True`` when any shell marker is present in the task text.
    """
    # Local import keeps this block self-contained regardless of module imports.
    import re

    # NOTE(review): markers are lower-case, so this presumably relies on
    # _joined_task_text returning lower-cased text -- confirm.
    text = _joined_task_text(params)
    phrase_markers = (
        "route-entry",
        "route entry",
        "first-entry",
        "first entry",
        "startup shell",
        "home shell",
        "entry route",
        "shell/navigation",
        "app/_layout",
        "app/index",
        "homescreen",
        "home screen",
        "settingsscreen",
        "settings screen",
        "shopscreen",
        "shop screen",
        # Explicit marker so a fused "HelpScreen" identifier still matches now
        # that bare "help" requires token boundaries.
        "helpscreen",
        "game-over",
        "game over",
        "match-start",
        "match start",
        "return affordance",
    )
    if any(marker in text for marker in phrase_markers):
        return True

    # "help" on its own ("help screen", "/help", "help-modal") but not as part
    # of a longer word such as "helper".
    return re.search(r"(?<![a-z0-9])help(?![a-z0-9])", text) is not None
817
+
818
+
790
819
  def _build_efficiency_guidance(params: Dict[str, Any]) -> str:
791
820
  lines: List[str] = [
792
821
  "Worker speed/convergence contract from PushPals:",
793
822
  "- Target useful completion in roughly 20 minutes for small or medium repo tasks; optimize for the smallest coherent patch over exhaustive exploration.",
794
- "- Phase soft budgets: discovery <= 5m, editing <= 10m, focused validation <= 5m, final diff review <= 2m. If a phase runs long, narrow scope rather than expanding the harness.",
795
- "- Test-harness soft budget: if setting up a focused test requires multiple new shared mocks, broad React Native shims, or repeated import fixes, stop building that harness and switch to smaller pure helper/state coverage.",
823
+ "- Phase soft budgets: discovery <= 3m for small scoped tasks and <= 5m otherwise, editing <= 10m, focused validation <= 5m, final diff review <= 2m. If a phase runs long, narrow scope rather than expanding the harness.",
824
+ "- No-edit checkpoint: if you have not made a patch after identifying the behavior-owning file, stop discovering and edit that file now. Do not spend the execution budget proving every adjacent assumption first.",
825
+ "- Discovery command budget: for compact tasks, use at most 5-8 targeted read/search commands before editing. If that is not enough, state the blocker and patch the best behavior owner rather than widening discovery.",
796
826
  ]
797
- if _looks_like_visual_derivation_task(params):
827
+ route_shell_task = _looks_like_route_shell_task(params)
828
+ visual_task = _looks_like_visual_derivation_task(params)
829
+ if route_shell_task or visual_task:
830
+ lines.append(
831
+ "- Test-harness soft budget: if setting up a focused test requires multiple new shared mocks, broad React Native shims, or repeated import fixes, stop building that harness and switch to smaller pure helper/state/style coverage.",
832
+ )
833
+ if route_shell_task:
834
+ lines.extend(
835
+ [
836
+ "- Route-entry/shell task rule: inspect the hinted route wrapper, then move immediately to the behavior-owning shell component when the route is thin. Do not keep re-reading navigation topology once the owner is found.",
837
+ "- Compact shell polish rule: make one small visual/affordance patch before chasing missing test infrastructure. If a referenced React Native mock or app/__tests__ path is absent, use existing nearby tests or a focused style/helper assertion instead of creating a broad render harness.",
838
+ "- Shell task deadline: by the first clear owner hypothesis, choose the home/settings/shop/help/game-over surface and patch it; ValidationGate can run long browser checks after your focused validation.",
839
+ ]
840
+ )
841
+ if visual_task:
798
842
  lines.extend(
799
843
  [
800
844
  "- Visual/rendering task rule: prefer pure helper/state/style-prop tests for derived visual cues. Use a full React Native/component render regression only if the repo already has a stable harness for that exact surface.",
@@ -809,6 +853,7 @@ def _build_planning_guidance(params: Dict[str, Any]) -> str:
809
853
  if not isinstance(planning, dict):
810
854
  return ""
811
855
 
856
+ compact_task = _looks_like_route_shell_task(params) or _looks_like_visual_derivation_task(params)
812
857
  lines: List[str] = ["Task planning contract from PushPals:"]
813
858
  intent = to_single_line(planning.get("intent"), 80)
814
859
  risk = to_single_line(planning.get("riskLevel"), 80)
@@ -856,17 +901,29 @@ def _build_planning_guidance(params: Dict[str, Any]) -> str:
856
901
  forbidden = _string_list(scope.get("forbiddenGlobs"), limit=8)
857
902
  _append_list_guidance(lines, "Forbidden path hints", forbidden)
858
903
 
859
- _append_list_guidance(lines, "Target path hints", _string_list(planning.get("targetPaths"), limit=12))
904
+ _append_list_guidance(
905
+ lines,
906
+ "Target path hints",
907
+ _string_list(planning.get("targetPaths"), limit=6 if compact_task else 12),
908
+ )
860
909
 
861
910
  discovery = planning.get("discovery")
862
911
  if isinstance(discovery, dict):
863
912
  _append_list_guidance(
864
913
  lines,
865
914
  "Suggested discovery commands",
866
- _string_list(discovery.get("ripgrepQueries"), limit=8),
915
+ _string_list(discovery.get("ripgrepQueries"), limit=4 if compact_task else 8),
916
+ )
917
+ _append_list_guidance(
918
+ lines,
919
+ "Likely directories",
920
+ _string_list(discovery.get("likelyDirs"), limit=4 if compact_task else 8),
921
+ )
922
+ _append_list_guidance(
923
+ lines,
924
+ "Search keywords",
925
+ _string_list(discovery.get("keywords"), limit=8 if compact_task else 12),
867
926
  )
868
- _append_list_guidance(lines, "Likely directories", _string_list(discovery.get("likelyDirs"), limit=8))
869
- _append_list_guidance(lines, "Search keywords", _string_list(discovery.get("keywords"), limit=12))
870
927
 
871
928
  _append_list_guidance(
872
929
  lines,
@@ -1145,6 +1145,21 @@ export function isLongRunningBrowserValidationCommand(command: string): boolean
1145
1145
  );
1146
1146
  }
1147
1147
 
1148
/**
 * Decides whether a validation command may run in a parallel "fast" batch.
 *
 * A command is rejected when `isLongRunningBrowserValidationCommand` or
 * `shouldEnsurePlaywrightBrowserRuntime` flags it, when it cannot be
 * tokenized, or when its first token is not `bun`. Among `bun` commands only
 * these forms are accepted (compared case-insensitively):
 * `bun test`, `bun x tsc`, and `bun run <lint|typecheck|test|test:unit>`.
 *
 * @param repo - Repository path, forwarded to the Playwright runtime check.
 * @param command - Validation command line to classify.
 * @returns `true` when the command matches one of the accepted fast forms.
 */
export function isParallelSafeFastValidationCommand(repo: string, command: string): boolean {
  const needsBrowserHandling =
    isLongRunningBrowserValidationCommand(command) ||
    shouldEnsurePlaywrightBrowserRuntime(repo, command);
  if (needsBrowserHandling) return false;

  const argv = tokenizeValidationCommandArgv(command);
  if (!argv || argv.length === 0) return false;

  const [runner, subcommand, target] = argv.map((part) => part.toLowerCase());
  if (runner !== "bun") return false;

  switch (subcommand) {
    case "test":
      return true;
    case "x":
      return target === "tsc";
    case "run":
      return ["lint", "typecheck", "test", "test:unit"].includes(target ?? "");
    default:
      return false;
  }
}
1162
+
1148
1163
  function readPackageJson(repo: string): {
1149
1164
  scripts?: Record<string, unknown>;
1150
1165
  dependencies?: Record<string, unknown>;
@@ -3116,7 +3131,71 @@ async function runDeterministicQualityGate(
3116
3131
  );
3117
3132
  }
3118
3133
  const playwrightBrowserRuntimeReadyTargets = new Set<string>();
3119
- for (const command of commandsToRun) {
3134
+ for (let commandIndex = 0; commandIndex < commandsToRun.length; ) {
3135
+ const parallelBatch: string[] = [];
3136
+ while (
3137
+ commandIndex + parallelBatch.length < commandsToRun.length &&
3138
+ parallelBatch.length < 3
3139
+ ) {
3140
+ const candidate = commandsToRun[commandIndex + parallelBatch.length];
3141
+ if (!isParallelSafeFastValidationCommand(repo, candidate)) break;
3142
+ parallelBatch.push(candidate);
3143
+ }
3144
+ if (parallelBatch.length > 1) {
3145
+ onLog?.(
3146
+ "stdout",
3147
+ `[ValidationGate] Running fast validation batch in parallel: ${parallelBatch.join(" | ")}`,
3148
+ );
3149
+ const batchRuns = await Promise.all(
3150
+ parallelBatch.map(async (command) => {
3151
+ const commandMissingTools = requirementsForValidationCommand(
3152
+ toolchainPlan,
3153
+ command,
3154
+ ).filter((requirement) =>
3155
+ missingToolRequirements.some((missing) => missing.tool === requirement.tool),
3156
+ );
3157
+ if (commandMissingTools.length > 0) {
3158
+ const stderr = `Validation skipped before execution because required tool(s) are missing: ${formatMissingToolRequirements(
3159
+ commandMissingTools,
3160
+ )}.`;
3161
+ return {
3162
+ run: {
3163
+ step: command,
3164
+ command,
3165
+ ok: false,
3166
+ exitCode: 127,
3167
+ stdout: "",
3168
+ stderr,
3169
+ elapsedMs: 1,
3170
+ } satisfies ValidationExecutionResult,
3171
+ stream: "stderr" as const,
3172
+ summary: `[ValidationGate] Validation skipped (missing toolchain): ${command}`,
3173
+ };
3174
+ }
3175
+ const run = await runValidationCommand(
3176
+ repo,
3177
+ command,
3178
+ resolveValidationCommandTimeoutMs(command, qualityValidationStepTimeoutMs),
3179
+ outputPolicy,
3180
+ );
3181
+ const digest = run.ok ? "" : extractValidationFailureDigest(run);
3182
+ return {
3183
+ run,
3184
+ stream: (run.ok ? "stdout" : "stderr") as "stdout" | "stderr",
3185
+ summary: `[ValidationGate] ${run.ok ? "Passed" : "Failed"} (${run.elapsedMs}ms, exit ${run.exitCode}): ${command}${digest ? ` - ${digest}` : ""}`,
3186
+ };
3187
+ }),
3188
+ );
3189
+ for (const { run, stream, summary } of batchRuns) {
3190
+ validationRuns.push(run);
3191
+ onLog?.(stream, summary);
3192
+ }
3193
+ commandIndex += parallelBatch.length;
3194
+ continue;
3195
+ }
3196
+
3197
+ const command = commandsToRun[commandIndex];
3198
+ commandIndex += 1;
3120
3199
  const commandMissingTools = requirementsForValidationCommand(toolchainPlan, command).filter(
3121
3200
  (requirement) =>
3122
3201
  missingToolRequirements.some((missing) => missing.tool === requirement.tool),
@@ -6665,6 +6744,7 @@ export async function executeJob(
6665
6744
  const previousValidationFailureDigests = new Map<string, string>();
6666
6745
  const failureJobFamily = buildTaskFailureJobFamily(normalizedParams);
6667
6746
  while (revisionAttempt <= qualityRevisionLoopMax) {
6747
+ const attemptStartedAt = Date.now();
6668
6748
  const attemptParams: Record<string, unknown> = { ...normalizedParams };
6669
6749
  if (revisionHint) {
6670
6750
  attemptParams.qualityRevisionHint = revisionHint;
@@ -6683,6 +6763,7 @@ export async function executeJob(
6683
6763
  }
6684
6764
  let result: Awaited<ReturnType<typeof runExecutor>> | null = null;
6685
6765
  let mergeConflictPass = 0;
6766
+ let executorElapsedMs = 0;
6686
6767
  while (true) {
6687
6768
  const currentResult = await runExecutor(
6688
6769
  kind,
@@ -6751,6 +6832,7 @@ export async function executeJob(
6751
6832
  exitCode: 4,
6752
6833
  };
6753
6834
  }
6835
+ executorElapsedMs = Date.now() - attemptStartedAt;
6754
6836
 
6755
6837
  const preQualityStatus = await git(repo, ["status", "--porcelain"]);
6756
6838
  const preQualityChangedPaths = preQualityStatus.ok
@@ -6799,6 +6881,7 @@ export async function executeJob(
6799
6881
  };
6800
6882
  }
6801
6883
 
6884
+ const qualityStartedAt = Date.now();
6802
6885
  const quality = await runDeterministicQualityGate(
6803
6886
  repo,
6804
6887
  attemptParams,
@@ -6810,6 +6893,15 @@ export async function executeJob(
6810
6893
  revisionAttempt,
6811
6894
  },
6812
6895
  );
6896
+ const qualityElapsedMs = Date.now() - qualityStartedAt;
6897
+ const validationCommandElapsedMs = quality.validationRuns.reduce(
6898
+ (total, run) => total + Math.max(0, Number(run.elapsedMs) || 0),
6899
+ 0,
6900
+ );
6901
+ onLog?.(
6902
+ "stdout",
6903
+ `[JobRunner] Performance summary: attempt=${revisionAttempt}, executor=${executorElapsedMs}ms, quality=${qualityElapsedMs}ms, validation_commands=${quality.validationRuns.length}, validation_command_time=${validationCommandElapsedMs}ms, changed_files=${quality.changedPaths.length}`,
6904
+ );
6813
6905
  let browserRepairPacket = buildBrowserValidationRepairPacket(
6814
6906
  quality.validationRuns,
6815
6907
  previousValidationFailureDigests,