@pushpalsdev/cli 1.1.36 → 1.1.38
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/runtime/sandbox/.pushpals-remotebuddy-fallback.js +30 -3
- package/runtime/sandbox/apps/workerpals/src/backends/openai_codex/openai_codex_executor.py +336 -16
- package/runtime/sandbox/apps/workerpals/src/backends/openai_codex/test_openai_codex_runtime_config.py +442 -2
- package/runtime/sandbox/apps/workerpals/src/common/generic_python_executor.ts +4 -4
- package/runtime/sandbox/apps/workerpals/src/execute_job.ts +44 -2
package/package.json
CHANGED
|
@@ -6545,6 +6545,33 @@ function sanitizeForGitRef(value) {
|
|
|
6545
6545
|
const text = value.trim().replace(/[^A-Za-z0-9._-]/g, "-");
|
|
6546
6546
|
return text || "default";
|
|
6547
6547
|
}
|
|
6548
|
+
/**
 * Reports whether `value` can be passed to git as a branch name without
 * being misread as an option or rejected as a malformed ref.
 *
 * The value is coerced to a string and trimmed before checking. The rules
 * follow `git check-ref-format`: no leading "-", no leading/trailing "/",
 * no trailing ".", no "..", "//" or "@{", no whitespace, control characters
 * or the characters `\ ~ ^ : ? * [ ]`, no bare "@", and no slash-separated
 * component that starts with "." or ends with ".lock".
 *
 * @param value Candidate branch name (any type; null/undefined are unsafe).
 * @returns `true` when the trimmed name passes every check, else `false`.
 */
function isSafeGitBranchName(value) {
  const text = String(value ?? "").trim();
  // Empty names are meaningless; the 200-character cap bounds ref length.
  if (!text || text.length > 200)
    return false;
  // A leading "-" would be parsed by git as a command-line option.
  if (text.startsWith("-") || text.startsWith("/") || text.endsWith("/"))
    return false;
  // "@" on its own is rejected by git check-ref-format.
  if (text === "@" || text.endsWith("."))
    return false;
  if (text.includes("..") || text.includes("//") || text.includes("@{"))
    return false;
  if (/[\\\s~^:?*\[\]\x00-\x1F\x7F]/.test(text))
    return false;
  // Per-component rules: git rejects any path component that begins with
  // "." or ends with ".lock", not only the final one.
  return text
    .split("/")
    .every((part) => !part.startsWith(".") && !part.endsWith(".lock"));
}
|
|
6560
|
+
/**
 * Returns a configured branch name if it is safe, otherwise a safe fallback.
 *
 * Both the configured value and the fallback are coerced to strings and
 * trimmed, so the returned name is exactly the text that was validated by
 * `isSafeGitBranchName`. When the fallback is itself unsafe, "main" is used.
 * A warning is logged whenever the configured value is rejected.
 *
 * @param value Configured branch name.
 * @param fallback Branch name to use when `value` is unsafe.
 * @param label Human-readable role of the branch, used in the warning.
 * @returns The trimmed configured name, the trimmed fallback, or "main".
 */
function normalizeConfiguredGitBranchName(value, fallback, label = "branch") {
  const candidate = String(value ?? "").trim();
  if (isSafeGitBranchName(candidate))
    return candidate;
  // Normalize the fallback the same way the validator does; returning the
  // raw fallback could leak surrounding whitespace or a non-string value.
  const trimmedFallback = String(fallback ?? "").trim();
  const safeFallback = isSafeGitBranchName(trimmedFallback) ? trimmedFallback : "main";
  console.warn(`[RemoteBuddyAutonomousEngine] Ignoring unsafe ${label} ref ${JSON.stringify(candidate)}; using ${safeFallback}.`);
  return safeFallback;
}
|
|
6568
|
+
/**
 * Returns a configured git remote name if it is safe, otherwise `fallback`.
 *
 * A safe name consists only of letters, digits, ".", "_" and "-", and must
 * start with a letter, digit or "_". Rejecting a leading "-" keeps the name
 * from being parsed as an option; rejecting a leading "." (which covers "."
 * and "..") keeps git from treating it as a relative repository path.
 * Names containing ".." or ending in ".lock" are rejected because they
 * cannot form a valid remote-tracking ref. A warning is logged on rejection.
 *
 * @param value Configured remote name.
 * @param fallback Remote name returned when `value` is unsafe.
 * @returns The trimmed remote name, or `fallback` unchanged.
 */
function normalizeConfiguredGitRemoteName(value, fallback = "origin") {
  const candidate = String(value ?? "").trim();
  const wellFormed =
    /^[A-Za-z0-9_][A-Za-z0-9._-]*$/.test(candidate) &&
    !candidate.includes("..") &&
    !candidate.endsWith(".lock");
  if (wellFormed)
    return candidate;
  console.warn(`[RemoteBuddyAutonomousEngine] Ignoring unsafe git remote ${JSON.stringify(candidate)}; using ${fallback}.`);
  return fallback;
}
|
|
6548
6575
|
async function repoPreflight(repo) {
|
|
6549
6576
|
const porcelain = await gitOutput(repo, ["status", "--porcelain"]);
|
|
6550
6577
|
const mergeHead = await gitOutput(repo, ["rev-parse", "-q", "--verify", "MERGE_HEAD"]);
|
|
@@ -6594,9 +6621,9 @@ class RemoteBuddyAutonomousEngine {
|
|
|
6594
6621
|
const safeSession = sanitizeForGitRef(this.sessionId).slice(0, 40);
|
|
6595
6622
|
this.autonomyRepo = resolve4(this.repoRoot, ".worktrees", `remotebuddy-autonomy-${safeSession}`);
|
|
6596
6623
|
this.autonomyBranch = `_remotebuddy/autonomy-${safeSession}`;
|
|
6597
|
-
this.gitRemote = String(opts.config.sourceControlManager.remote || "origin")
|
|
6598
|
-
this.integrationBranch = String(opts.config.sourceControlManager.mainBranch || "main_agents")
|
|
6599
|
-
this.baseBranch = String(opts.config.sourceControlManager.baseBranch || "main")
|
|
6624
|
+
this.gitRemote = normalizeConfiguredGitRemoteName(String(opts.config.sourceControlManager.remote || "origin"), "origin");
|
|
6625
|
+
this.integrationBranch = normalizeConfiguredGitBranchName(String(opts.config.sourceControlManager.mainBranch || "main_agents"), "main_agents", "integration branch");
|
|
6626
|
+
this.baseBranch = normalizeConfiguredGitBranchName(String(opts.config.sourceControlManager.baseBranch || "main"), "main", "base branch");
|
|
6600
6627
|
this.llm = opts.llm;
|
|
6601
6628
|
this.comm = opts.comm;
|
|
6602
6629
|
this.llmCfg = opts.config.remotebuddy.llm;
|
|
@@ -116,7 +116,10 @@ _WEB_REVIEW_NO_EDIT_WATCHDOG_S = 240
|
|
|
116
116
|
_BACKGROUND_NO_EDIT_WATCHDOG_S = 120
|
|
117
117
|
_NO_EDIT_RECOVERY_WATCHDOG_S = 90
|
|
118
118
|
_DEFAULT_NO_EDIT_RECHECK_S = 120
|
|
119
|
+
_NO_EDIT_RECOVERY_RECHECK_S = 30
|
|
119
120
|
_DEFAULT_NO_EDIT_COMMAND_GRACE_S = 240
|
|
121
|
+
_DEFAULT_NO_EDIT_COMMAND_PROGRESS_CAP_S = 360
|
|
122
|
+
_NO_EDIT_RECOVERY_COMMAND_PROGRESS_CAP_S = 120
|
|
120
123
|
_DEFAULT_STARTUP_STALL_WATCHDOG_S = 210
|
|
121
124
|
_RECOVERY_STARTUP_STALL_WATCHDOG_S = 150
|
|
122
125
|
_DEFAULT_ROLLOUT_WATCHDOG_S = 300
|
|
@@ -124,6 +127,7 @@ _SMALL_TASK_ROLLOUT_WATCHDOG_S = 240
|
|
|
124
127
|
_NARROW_TEST_TASK_ROLLOUT_WATCHDOG_S = 150
|
|
125
128
|
_WEB_REVIEW_ROLLOUT_WATCHDOG_S = 180
|
|
126
129
|
_BACKGROUND_ROLLOUT_WATCHDOG_S = 90
|
|
130
|
+
_MIN_CODEX_RECOVERY_ATTEMPT_S = 120
|
|
127
131
|
_NO_PUBLISHABLE_FAILURE_COOLDOWN_MS = 10 * 60 * 1000
|
|
128
132
|
_CODEX_STARTUP_ONLY_EVENT_TYPES = {"thread.started", "turn.started"}
|
|
129
133
|
|
|
@@ -609,11 +613,19 @@ def _looks_like_small_task_prompt(prompt: str) -> bool:
|
|
|
609
613
|
"contract-level tests",
|
|
610
614
|
"contract around",
|
|
611
615
|
"contract coverage",
|
|
616
|
+
"focused contract coverage",
|
|
612
617
|
"ranking contract",
|
|
613
618
|
"regression coverage",
|
|
619
|
+
"focused coverage",
|
|
614
620
|
"focused regression",
|
|
615
621
|
"focused scenario",
|
|
616
622
|
"targeted test",
|
|
623
|
+
"small deterministic",
|
|
624
|
+
"review-fix",
|
|
625
|
+
"review fix",
|
|
626
|
+
"rejected pr",
|
|
627
|
+
"must-fix",
|
|
628
|
+
"cleanup harness",
|
|
617
629
|
"one-file",
|
|
618
630
|
"one file",
|
|
619
631
|
"single-file",
|
|
@@ -648,13 +660,16 @@ def _looks_like_narrow_test_task_prompt(prompt: str) -> bool:
|
|
|
648
660
|
"contract-level tests",
|
|
649
661
|
"contract around",
|
|
650
662
|
"contract coverage",
|
|
663
|
+
"focused contract coverage",
|
|
651
664
|
"ranking contract",
|
|
652
665
|
"regression coverage",
|
|
666
|
+
"focused coverage",
|
|
653
667
|
"focused regression",
|
|
654
668
|
"test-only",
|
|
655
669
|
"test only",
|
|
656
670
|
"targeted test",
|
|
657
671
|
"focused scenario",
|
|
672
|
+
"cleanup harness",
|
|
658
673
|
)
|
|
659
674
|
if not any(marker in text for marker in narrow_markers):
|
|
660
675
|
return False
|
|
@@ -668,6 +683,13 @@ def _looks_like_narrow_test_task_prompt(prompt: str) -> bool:
|
|
|
668
683
|
return not any(marker in text for marker in broad_markers)
|
|
669
684
|
|
|
670
685
|
|
|
686
|
+
def _minimum_recovery_attempt_seconds(requested_timeout_s: Optional[int]) -> int:
    """Return the smallest remaining budget (seconds) worth spending on a recovery retry.

    With no positive requested timeout the module-level minimum is returned
    as-is. Otherwise the result is a quarter of the requested timeout,
    capped at the module-level minimum and never below one second.
    """
    has_timeout = bool(requested_timeout_s) and requested_timeout_s > 0
    if not has_timeout:
        return _MIN_CODEX_RECOVERY_ATTEMPT_S

    # Quarter of the requested timeout, truncated, with a one-second floor.
    quarter_s = int(requested_timeout_s * 0.25)
    if quarter_s < 1:
        quarter_s = 1

    capped_s = min(_MIN_CODEX_RECOVERY_ATTEMPT_S, quarter_s)
    return max(1, capped_s)
|
|
691
|
+
|
|
692
|
+
|
|
671
693
|
def _resolve_task_reasoning_effort(
|
|
672
694
|
configured_effort: str,
|
|
673
695
|
prompt: str,
|
|
@@ -743,7 +765,10 @@ def _resolve_no_edit_watchdog_seconds(
|
|
|
743
765
|
return max(floor_s, min(default_s, max(floor_s, communicate_timeout_s - 60)))
|
|
744
766
|
|
|
745
767
|
|
|
746
|
-
def _resolve_no_edit_recheck_seconds(
|
|
768
|
+
def _resolve_no_edit_recheck_seconds(
|
|
769
|
+
communicate_timeout_s: Optional[int],
|
|
770
|
+
recovery_attempt: int = 0,
|
|
771
|
+
) -> int:
|
|
747
772
|
raw = os.environ.get("WORKERPALS_OPENAI_CODEX_NO_EDIT_RECHECK_S", "").strip()
|
|
748
773
|
if raw:
|
|
749
774
|
parsed = _to_positive_int(raw)
|
|
@@ -754,8 +779,13 @@ def _resolve_no_edit_recheck_seconds(communicate_timeout_s: Optional[int]) -> in
|
|
|
754
779
|
else:
|
|
755
780
|
upper = max(1, (communicate_timeout_s or parsed + 1) - 1)
|
|
756
781
|
return max(1, min(parsed, upper))
|
|
757
|
-
|
|
758
|
-
|
|
782
|
+
default_s = (
|
|
783
|
+
_NO_EDIT_RECOVERY_RECHECK_S
|
|
784
|
+
if recovery_attempt > 0
|
|
785
|
+
else _DEFAULT_NO_EDIT_RECHECK_S
|
|
786
|
+
)
|
|
787
|
+
upper = max(1, (communicate_timeout_s or default_s + 1) - 1)
|
|
788
|
+
return max(1, min(default_s, upper))
|
|
759
789
|
|
|
760
790
|
|
|
761
791
|
def _resolve_no_edit_command_grace_seconds(communicate_timeout_s: Optional[int]) -> Optional[int]:
|
|
@@ -779,6 +809,36 @@ def _resolve_no_edit_command_grace_seconds(communicate_timeout_s: Optional[int])
|
|
|
779
809
|
return max(1, min(_DEFAULT_NO_EDIT_COMMAND_GRACE_S, upper))
|
|
780
810
|
|
|
781
811
|
|
|
812
|
+
def _resolve_no_edit_command_progress_cap_seconds(
    communicate_timeout_s: Optional[int],
    no_edit_command_grace_s: Optional[int],
    recovery_attempt: int = 0,
) -> Optional[int]:
    """Resolve the command-progress cap (seconds) used by the no-edit watchdog.

    Returns None when there is no communicate timeout, when no command grace
    is configured, or when WORKERPALS_OPENAI_CODEX_NO_EDIT_COMMAND_PROGRESS_CAP_S
    is set to "0". A valid positive override from that variable wins; an
    invalid one is logged and ignored. Otherwise a default is chosen based on
    whether this is a recovery attempt. Every returned value is clamped to
    the range [1, communicate_timeout_s - 1] (with a floor of 1).
    """
    if no_edit_command_grace_s is None or not communicate_timeout_s:
        return None

    override = os.environ.get(
        "WORKERPALS_OPENAI_CODEX_NO_EDIT_COMMAND_PROGRESS_CAP_S", ""
    ).strip()
    if override == "0":
        # Explicit opt-out of the cap.
        return None
    if override:
        configured_s = _to_positive_int(override)
        if configured_s is not None:
            ceiling_s = max(1, communicate_timeout_s - 1)
            return max(1, min(configured_s, ceiling_s))
        log.info(
            "Invalid WORKERPALS_OPENAI_CODEX_NO_EDIT_COMMAND_PROGRESS_CAP_S="
            f"{override!r}; using default command-progress cap."
        )

    # Recovery attempts get the tighter default cap.
    if recovery_attempt > 0:
        baseline_s = _NO_EDIT_RECOVERY_COMMAND_PROGRESS_CAP_S
    else:
        baseline_s = _DEFAULT_NO_EDIT_COMMAND_PROGRESS_CAP_S
    ceiling_s = max(1, communicate_timeout_s - 1)
    return max(1, min(baseline_s, ceiling_s))
|
|
840
|
+
|
|
841
|
+
|
|
782
842
|
def _resolve_startup_stall_watchdog_seconds(
|
|
783
843
|
communicate_timeout_s: Optional[int],
|
|
784
844
|
recovery_attempt: int = 0,
|
|
@@ -2091,6 +2151,118 @@ def _codex_changed_paths(repo: str, baseline_snapshot: Any) -> Tuple[List[str],
|
|
|
2091
2151
|
return changed_paths, delta, effective
|
|
2092
2152
|
|
|
2093
2153
|
|
|
2154
|
+
def _safe_repo_relative_path(repo: str, path: str) -> Optional[Path]:
|
|
2155
|
+
raw = str(path or "").replace("\\", "/").strip()
|
|
2156
|
+
if not raw or raw.startswith("/") or re.match(r"^[A-Za-z]:", raw):
|
|
2157
|
+
return None
|
|
2158
|
+
parts = [part for part in raw.split("/") if part]
|
|
2159
|
+
if not parts or any(part in ("..", ".") for part in parts):
|
|
2160
|
+
return None
|
|
2161
|
+
try:
|
|
2162
|
+
repo_path = Path(repo).resolve()
|
|
2163
|
+
candidate = (repo_path / Path(*parts)).resolve()
|
|
2164
|
+
candidate.relative_to(repo_path)
|
|
2165
|
+
return candidate
|
|
2166
|
+
except Exception:
|
|
2167
|
+
return None
|
|
2168
|
+
|
|
2169
|
+
|
|
2170
|
+
def _git_status_entries(repo: str) -> List[Tuple[str, str]]:
|
|
2171
|
+
try:
|
|
2172
|
+
proc = subprocess.run(
|
|
2173
|
+
["git", "status", "--porcelain"],
|
|
2174
|
+
cwd=repo,
|
|
2175
|
+
capture_output=True,
|
|
2176
|
+
text=True,
|
|
2177
|
+
timeout=20,
|
|
2178
|
+
check=False,
|
|
2179
|
+
)
|
|
2180
|
+
except Exception:
|
|
2181
|
+
return []
|
|
2182
|
+
if proc.returncode != 0:
|
|
2183
|
+
return []
|
|
2184
|
+
entries: List[Tuple[str, str]] = []
|
|
2185
|
+
for raw_line in proc.stdout.splitlines():
|
|
2186
|
+
line = str(raw_line or "").rstrip("\r\n")
|
|
2187
|
+
if len(line) < 4:
|
|
2188
|
+
continue
|
|
2189
|
+
status = line[:2]
|
|
2190
|
+
path = line[3:].strip()
|
|
2191
|
+
if " -> " in path:
|
|
2192
|
+
path = path.split(" -> ", 1)[1].strip()
|
|
2193
|
+
if path:
|
|
2194
|
+
entries.append((status, path))
|
|
2195
|
+
return entries
|
|
2196
|
+
|
|
2197
|
+
|
|
2198
|
+
def _restore_retry_baseline(repo: str, baseline_snapshot: Any, reason: str = "") -> bool:
    """Undo changes made since `baseline_snapshot` so a recovery retry starts clean.

    Returns True when nothing changed or when every changed path was reverted,
    and False when the restore is refused or incomplete. The restore is
    refused if any changed path fails `_safe_repo_relative_path` or was
    already present in the baseline snapshot's paths. Tracked changes are
    reverted with `git restore --staged --worktree`; untracked paths that
    belong to the delta are deleted from disk. Success is confirmed by
    recomputing the delta against the baseline.

    `reason` is only used in the log message.
    """
    _changed_paths, delta_paths, _effective_paths = _codex_changed_paths(repo, baseline_snapshot)
    if not delta_paths:
        return True
    baseline_paths = set(_baseline_snapshot_paths(baseline_snapshot))
    unsafe_delta = [path for path in delta_paths if _safe_repo_relative_path(repo, path) is None]
    if unsafe_delta:
        log.info(
            "Rollout recovery cannot safely restore worker sandbox baseline; unsafe changed paths: "
            f"{_describe_publishable_paths(unsafe_delta)}"
        )
        return False
    mutated_baseline_paths = [path for path in delta_paths if path in baseline_paths]
    if mutated_baseline_paths:
        log.info(
            "Rollout recovery will not reset paths that were already dirty at baseline: "
            f"{_describe_publishable_paths(mutated_baseline_paths)}"
        )
        return False

    log.info(
        "Restoring worker sandbox baseline before rollout recovery retry"
        f"{f' ({reason})' if reason else ''}: {_describe_publishable_paths(delta_paths)}"
    )

    # git restore aborts the whole command when any pathspec matches nothing
    # it knows about, so untracked delta paths must not be passed to it; they
    # are removed from disk further down instead. If the status query fails
    # this list is empty and every delta path is passed, as before.
    untracked_now = [path for status, path in _git_status_entries(repo) if status == "??"]

    def _is_untracked(path: str) -> bool:
        bare = path.rstrip("/")
        for entry in untracked_now:
            entry_bare = entry.rstrip("/")
            # Either the entry itself, or a path inside an untracked directory.
            if bare == entry_bare or bare.startswith(f"{entry_bare}/"):
                return True
        return False

    tracked_delta = [path for path in delta_paths if not _is_untracked(path)]
    if tracked_delta:
        try:
            subprocess.run(
                ["git", "restore", "--staged", "--worktree", "--", *tracked_delta],
                cwd=repo,
                capture_output=True,
                text=True,
                timeout=30,
                check=False,
            )
        except Exception as exc:
            log.info(f"Failed to run git restore for rollout recovery baseline: {exc}")
            return False

    delta_set = set(delta_paths)
    for status, path in _git_status_entries(repo):
        if status != "??":
            continue
        # Only delete untracked paths that are in the delta or nested under one.
        if path not in delta_set and not any(path.startswith(f"{delta.rstrip('/')}/") for delta in delta_set):
            continue
        candidate = _safe_repo_relative_path(repo, path)
        if candidate is None:
            return False
        try:
            # `candidate` is fully resolved, so for an untracked symlink it
            # points at the link's target. Remove the link itself rather than
            # deleting whatever it points to.
            link = Path(repo) / path
            if link.is_symlink():
                link.unlink()
            elif candidate.is_dir():
                rmtree(candidate)
            elif candidate.exists():
                candidate.unlink()
        except Exception as exc:
            log.info(f"Failed to remove untracked rollout recovery path {path}: {exc}")
            return False

    # Verify: anything still differing from the baseline means the reset failed.
    _remaining_changed, remaining_delta, remaining_effective = _codex_changed_paths(
        repo,
        baseline_snapshot,
    )
    if remaining_delta:
        log.info(
            "Rollout recovery baseline restore left changed paths after cleanup: "
            f"{_describe_publishable_paths(remaining_effective or remaining_delta)}"
        )
        return False
    return True
|
|
2264
|
+
|
|
2265
|
+
|
|
2094
2266
|
def _changed_path_top_level(path: str) -> str:
|
|
2095
2267
|
raw = str(path or "").replace("\\", "/").strip()
|
|
2096
2268
|
is_top_level_directory = raw.endswith("/")
|
|
@@ -2183,6 +2355,7 @@ def _run_codex_task(
|
|
|
2183
2355
|
rollout_recovery_attempt: int = 0,
|
|
2184
2356
|
model_override: Optional[str] = None,
|
|
2185
2357
|
baseline_changes: Optional[List[str]] = None,
|
|
2358
|
+
execution_deadline_monotonic: Optional[float] = None,
|
|
2186
2359
|
) -> Dict[str, Any]:
|
|
2187
2360
|
global _ACTIVE_CHILD, _INTERRUPTED_SIGNAL
|
|
2188
2361
|
_INTERRUPTED_SIGNAL = None
|
|
@@ -2242,7 +2415,39 @@ def _run_codex_task(
|
|
|
2242
2415
|
)
|
|
2243
2416
|
# JSON event output is noisy by default; prefer plain text + output-last-message.
|
|
2244
2417
|
use_json = runtime_config.json_output
|
|
2245
|
-
|
|
2418
|
+
requested_communicate_timeout_s = _resolve_communicate_timeout_seconds(runtime_config)
|
|
2419
|
+
recovery_depth = (
|
|
2420
|
+
wrapper_recovery_attempt
|
|
2421
|
+
+ model_compatibility_recovery_attempt
|
|
2422
|
+
+ startup_stall_recovery_attempt
|
|
2423
|
+
+ no_edit_recovery_attempt
|
|
2424
|
+
+ rollout_recovery_attempt
|
|
2425
|
+
)
|
|
2426
|
+
communicate_timeout_s = requested_communicate_timeout_s
|
|
2427
|
+
overall_deadline = execution_deadline_monotonic
|
|
2428
|
+
if requested_communicate_timeout_s and requested_communicate_timeout_s > 0:
|
|
2429
|
+
if overall_deadline is None:
|
|
2430
|
+
overall_deadline = time.monotonic() + float(requested_communicate_timeout_s)
|
|
2431
|
+
else:
|
|
2432
|
+
remaining_s = int(max(0.0, overall_deadline - time.monotonic()))
|
|
2433
|
+
min_attempt_s = (
|
|
2434
|
+
_minimum_recovery_attempt_seconds(requested_communicate_timeout_s)
|
|
2435
|
+
if recovery_depth > 0
|
|
2436
|
+
else 1
|
|
2437
|
+
)
|
|
2438
|
+
if remaining_s < min_attempt_s:
|
|
2439
|
+
return {
|
|
2440
|
+
"ok": False,
|
|
2441
|
+
"summary": "openai_codex recovery budget exhausted before retry",
|
|
2442
|
+
"stderr": (
|
|
2443
|
+
"Codex recovery was requested, but the shared executor budget had only "
|
|
2444
|
+
f"{remaining_s}s remaining (< {min_attempt_s}s). Stopping before a low-odds "
|
|
2445
|
+
"retry so ValidationGate/QualityGate can return a structured result."
|
|
2446
|
+
),
|
|
2447
|
+
"exitCode": 124,
|
|
2448
|
+
"cooldownMs": _NO_PUBLISHABLE_FAILURE_COOLDOWN_MS,
|
|
2449
|
+
}
|
|
2450
|
+
communicate_timeout_s = max(1, min(requested_communicate_timeout_s, remaining_s))
|
|
2246
2451
|
effective_supplemental_guidance = _augment_supplemental_guidance(supplemental_guidance)
|
|
2247
2452
|
prompt = _build_instruction(instruction, effective_supplemental_guidance)
|
|
2248
2453
|
reasoning_effort = _resolve_task_reasoning_effort(
|
|
@@ -2484,6 +2689,7 @@ def _run_codex_task(
|
|
|
2484
2689
|
rollout_watchdog_reason = ""
|
|
2485
2690
|
rollout_artifact_only_paths = ""
|
|
2486
2691
|
rollout_watchdog_retryable = True
|
|
2692
|
+
rollout_restore_before_retry = False
|
|
2487
2693
|
command_policy_rejection_loop = False
|
|
2488
2694
|
no_edit_watchdog_s = (
|
|
2489
2695
|
_resolve_no_edit_watchdog_seconds(
|
|
@@ -2494,8 +2700,16 @@ def _run_codex_task(
|
|
|
2494
2700
|
if no_edit_recovery_attempt <= _MAX_NO_EDIT_RECOVERY_ATTEMPTS
|
|
2495
2701
|
else None
|
|
2496
2702
|
)
|
|
2497
|
-
no_edit_recheck_s = _resolve_no_edit_recheck_seconds(
|
|
2703
|
+
no_edit_recheck_s = _resolve_no_edit_recheck_seconds(
|
|
2704
|
+
communicate_timeout_s,
|
|
2705
|
+
recovery_attempt=recovery_depth,
|
|
2706
|
+
)
|
|
2498
2707
|
no_edit_command_grace_s = _resolve_no_edit_command_grace_seconds(communicate_timeout_s)
|
|
2708
|
+
no_edit_command_progress_cap_s = _resolve_no_edit_command_progress_cap_seconds(
|
|
2709
|
+
communicate_timeout_s,
|
|
2710
|
+
no_edit_command_grace_s,
|
|
2711
|
+
recovery_attempt=recovery_depth,
|
|
2712
|
+
)
|
|
2499
2713
|
startup_stall_watchdog_s = _resolve_startup_stall_watchdog_seconds(
|
|
2500
2714
|
communicate_timeout_s,
|
|
2501
2715
|
recovery_attempt=startup_stall_recovery_attempt,
|
|
@@ -2519,16 +2733,15 @@ def _run_codex_task(
|
|
|
2519
2733
|
if no_edit_watchdog_s is not None
|
|
2520
2734
|
else None
|
|
2521
2735
|
)
|
|
2522
|
-
no_edit_command_grace_cap_deadline = (
|
|
2523
|
-
started_at + float(no_edit_watchdog_s + no_edit_command_grace_s)
|
|
2524
|
-
if no_edit_watchdog_s is not None and no_edit_command_grace_s is not None
|
|
2525
|
-
else None
|
|
2526
|
-
)
|
|
2527
2736
|
rollout_deadline = (
|
|
2528
2737
|
started_at + float(rollout_watchdog_s)
|
|
2529
2738
|
if rollout_watchdog_s is not None
|
|
2530
2739
|
else None
|
|
2531
2740
|
)
|
|
2741
|
+
publishable_progress_seen_at: Optional[float] = None
|
|
2742
|
+
publishable_progress_finalized = False
|
|
2743
|
+
publishable_progress_paths: List[str] = []
|
|
2744
|
+
first_no_edit_command_progress_at: Optional[float] = None
|
|
2532
2745
|
|
|
2533
2746
|
while proc.poll() is None:
|
|
2534
2747
|
now = time.monotonic()
|
|
@@ -2595,22 +2808,50 @@ def _run_codex_task(
|
|
|
2595
2808
|
)
|
|
2596
2809
|
except Exception:
|
|
2597
2810
|
last_command_activity_at = 0.0
|
|
2811
|
+
command_progress_cap_reached = False
|
|
2812
|
+
command_progress_elapsed_s = 0
|
|
2598
2813
|
if command_event_count > 0 and no_edit_command_grace_s is not None:
|
|
2814
|
+
observed_command_progress_at = (
|
|
2815
|
+
last_command_activity_at if last_command_activity_at > 0 else now
|
|
2816
|
+
)
|
|
2817
|
+
if first_no_edit_command_progress_at is None:
|
|
2818
|
+
first_no_edit_command_progress_at = observed_command_progress_at
|
|
2819
|
+
if no_edit_command_progress_cap_s is not None:
|
|
2820
|
+
command_progress_cap_deadline = (
|
|
2821
|
+
first_no_edit_command_progress_at
|
|
2822
|
+
+ float(no_edit_command_progress_cap_s)
|
|
2823
|
+
)
|
|
2824
|
+
command_progress_elapsed_s = int(
|
|
2825
|
+
max(0.0, now - first_no_edit_command_progress_at)
|
|
2826
|
+
)
|
|
2827
|
+
if now >= command_progress_cap_deadline:
|
|
2828
|
+
command_progress_cap_reached = True
|
|
2599
2829
|
command_grace_deadline = 0.0
|
|
2600
2830
|
if active_command_count > 0:
|
|
2601
2831
|
# Do not kill while Codex is actively running a tool command; poll
|
|
2602
|
-
# again soon, but keep
|
|
2832
|
+
# again soon, but keep endless read-only discovery bounded by the
|
|
2833
|
+
# command-progress cap above.
|
|
2603
2834
|
command_grace_deadline = now + min(60.0, float(no_edit_command_grace_s))
|
|
2604
2835
|
elif last_command_activity_at > 0:
|
|
2605
2836
|
command_grace_deadline = last_command_activity_at + float(
|
|
2606
2837
|
no_edit_command_grace_s
|
|
2607
2838
|
)
|
|
2608
|
-
if
|
|
2839
|
+
if (
|
|
2840
|
+
no_edit_command_progress_cap_s is not None
|
|
2841
|
+
and first_no_edit_command_progress_at is not None
|
|
2842
|
+
):
|
|
2609
2843
|
command_grace_deadline = min(
|
|
2610
2844
|
command_grace_deadline,
|
|
2611
|
-
|
|
2845
|
+
first_no_edit_command_progress_at
|
|
2846
|
+
+ float(no_edit_command_progress_cap_s),
|
|
2612
2847
|
)
|
|
2613
|
-
if
|
|
2848
|
+
if command_progress_cap_reached:
|
|
2849
|
+
log.info(
|
|
2850
|
+
"No-edit watchdog observed Codex tool progress for "
|
|
2851
|
+
f"{command_progress_elapsed_s}s without a publishable patch; "
|
|
2852
|
+
"forcing patch-first recovery instead of waiting for the child timeout."
|
|
2853
|
+
)
|
|
2854
|
+
elif command_grace_deadline > now:
|
|
2614
2855
|
no_edit_deadline = command_grace_deadline
|
|
2615
2856
|
remaining_s = int(max(1.0, command_grace_deadline - now))
|
|
2616
2857
|
command_detail = (
|
|
@@ -2645,6 +2886,22 @@ def _run_codex_task(
|
|
|
2645
2886
|
)
|
|
2646
2887
|
_terminate_active_child()
|
|
2647
2888
|
break
|
|
2889
|
+
if publishable_progress_seen_at is None:
|
|
2890
|
+
publishable_progress_seen_at = now
|
|
2891
|
+
publishable_progress_paths = list(effective_paths)
|
|
2892
|
+
elif _has_credible_shell_wrapper_progress(effective_paths):
|
|
2893
|
+
publishable_progress_paths = list(effective_paths)
|
|
2894
|
+
publishable_age_s = now - publishable_progress_seen_at
|
|
2895
|
+
if publishable_age_s >= float(no_edit_recheck_s):
|
|
2896
|
+
publishable_progress_finalized = True
|
|
2897
|
+
log.info(
|
|
2898
|
+
"No-edit watchdog observed durable publishable file changes "
|
|
2899
|
+
f"({_describe_publishable_paths(effective_paths)}) for "
|
|
2900
|
+
f"{int(publishable_age_s)}s; stopping Codex early so "
|
|
2901
|
+
"QualityGate/ValidationGate can use the remaining budget."
|
|
2902
|
+
)
|
|
2903
|
+
_terminate_active_child()
|
|
2904
|
+
break
|
|
2648
2905
|
no_edit_deadline = now + float(no_edit_recheck_s)
|
|
2649
2906
|
log.info(
|
|
2650
2907
|
"No-edit watchdog observed publishable-looking file changes "
|
|
@@ -2671,7 +2928,8 @@ def _run_codex_task(
|
|
|
2671
2928
|
"publishable-looking changed paths are broad/noisy for a small task: "
|
|
2672
2929
|
f"{_describe_publishable_paths(effective_paths)}"
|
|
2673
2930
|
)
|
|
2674
|
-
rollout_watchdog_retryable =
|
|
2931
|
+
rollout_watchdog_retryable = True
|
|
2932
|
+
rollout_restore_before_retry = True
|
|
2675
2933
|
else:
|
|
2676
2934
|
rollout_deadline = None
|
|
2677
2935
|
else:
|
|
@@ -2690,9 +2948,16 @@ def _run_codex_task(
|
|
|
2690
2948
|
if rollout_artifact_only_paths
|
|
2691
2949
|
else ""
|
|
2692
2950
|
)
|
|
2951
|
+
can_retry_rollout = (
|
|
2952
|
+
rollout_watchdog_retryable
|
|
2953
|
+
and rollout_recovery_attempt < _MAX_ROLLOUT_RECOVERY_ATTEMPTS
|
|
2954
|
+
)
|
|
2693
2955
|
action = (
|
|
2956
|
+
"Restoring worker sandbox baseline and retrying with stricter guidance."
|
|
2957
|
+
if rollout_restore_before_retry and can_retry_rollout
|
|
2958
|
+
else
|
|
2694
2959
|
"Retrying with course-correction guidance."
|
|
2695
|
-
if
|
|
2960
|
+
if can_retry_rollout
|
|
2696
2961
|
else "Failing fast instead of retrying on top of a broad/noisy diff."
|
|
2697
2962
|
)
|
|
2698
2963
|
log.info(
|
|
@@ -2770,6 +3035,27 @@ def _run_codex_task(
|
|
|
2770
3035
|
|
|
2771
3036
|
if rollout_watchdog_fired:
|
|
2772
3037
|
if rollout_watchdog_retryable and rollout_recovery_attempt < _MAX_ROLLOUT_RECOVERY_ATTEMPTS:
|
|
3038
|
+
if rollout_restore_before_retry and not _restore_retry_baseline(
|
|
3039
|
+
repo,
|
|
3040
|
+
baseline_snapshot,
|
|
3041
|
+
rollout_watchdog_reason,
|
|
3042
|
+
):
|
|
3043
|
+
detail = (
|
|
3044
|
+
"Codex trajectory drifted into broad/noisy changes and the worker sandbox "
|
|
3045
|
+
"could not be restored safely for a clean recovery retry: "
|
|
3046
|
+
f"{rollout_watchdog_reason or 'broad/noisy changes'}."
|
|
3047
|
+
)
|
|
3048
|
+
if trace_excerpt:
|
|
3049
|
+
detail = f"{detail}\n{trace_excerpt}"
|
|
3050
|
+
return {
|
|
3051
|
+
"ok": False,
|
|
3052
|
+
"summary": "openai_codex rollout coach could not safely reset broad changes",
|
|
3053
|
+
"stdout": _truncate(stdout),
|
|
3054
|
+
"stderr": _truncate(f"{detail}\n{stderr}".strip()),
|
|
3055
|
+
"exitCode": 124,
|
|
3056
|
+
"usage": usage,
|
|
3057
|
+
"cooldownMs": _NO_PUBLISHABLE_FAILURE_COOLDOWN_MS,
|
|
3058
|
+
}
|
|
2773
3059
|
retry_guidance = [
|
|
2774
3060
|
*supplemental_guidance,
|
|
2775
3061
|
_build_rollout_recovery_guidance(
|
|
@@ -2789,6 +3075,7 @@ def _run_codex_task(
|
|
|
2789
3075
|
rollout_recovery_attempt=rollout_recovery_attempt + 1,
|
|
2790
3076
|
model_override=model_override,
|
|
2791
3077
|
baseline_changes=baseline_snapshot,
|
|
3078
|
+
execution_deadline_monotonic=overall_deadline,
|
|
2792
3079
|
)
|
|
2793
3080
|
detail = (
|
|
2794
3081
|
"Codex trajectory remained off-track or too broad for safe recovery: "
|
|
@@ -2806,6 +3093,35 @@ def _run_codex_task(
|
|
|
2806
3093
|
"cooldownMs": _NO_PUBLISHABLE_FAILURE_COOLDOWN_MS,
|
|
2807
3094
|
}
|
|
2808
3095
|
|
|
3096
|
+
if publishable_progress_finalized:
|
|
3097
|
+
changed_paths, _, effective_paths = _codex_changed_paths(repo, baseline_snapshot)
|
|
3098
|
+
effective_paths = effective_paths or publishable_progress_paths
|
|
3099
|
+
last_message = _read_text_if_exists(last_message_path)
|
|
3100
|
+
log_git_status(repo, log)
|
|
3101
|
+
prefix = (
|
|
3102
|
+
"Codex produced durable publishable file changes. PushPals stopped the "
|
|
3103
|
+
"Codex child early to preserve validation and revision budget; the normal "
|
|
3104
|
+
"QualityGate/ValidationGate will catch any incomplete edit."
|
|
3105
|
+
)
|
|
3106
|
+
return {
|
|
3107
|
+
"ok": True,
|
|
3108
|
+
"summary": (
|
|
3109
|
+
"openai_codex stopped after durable publishable progress "
|
|
3110
|
+
f"({len(effective_paths)} file(s))"
|
|
3111
|
+
),
|
|
3112
|
+
"stdout": _truncate(
|
|
3113
|
+
_build_success_stdout(
|
|
3114
|
+
effective_paths=effective_paths,
|
|
3115
|
+
last_message=last_message,
|
|
3116
|
+
trace_excerpt=trace_excerpt,
|
|
3117
|
+
prefix=prefix,
|
|
3118
|
+
)
|
|
3119
|
+
),
|
|
3120
|
+
"stderr": _truncate(stderr),
|
|
3121
|
+
"exitCode": 0,
|
|
3122
|
+
"usage": usage,
|
|
3123
|
+
}
|
|
3124
|
+
|
|
2809
3125
|
if no_edit_watchdog_fired:
|
|
2810
3126
|
startup_stall = _codex_trace_is_startup_stall(stdout_trace)
|
|
2811
3127
|
if startup_stall and startup_stall_recovery_attempt < _MAX_STARTUP_STALL_RECOVERY_ATTEMPTS:
|
|
@@ -2834,6 +3150,7 @@ def _run_codex_task(
|
|
|
2834
3150
|
rollout_recovery_attempt=rollout_recovery_attempt,
|
|
2835
3151
|
model_override=recovery_model or model_override,
|
|
2836
3152
|
baseline_changes=baseline_snapshot,
|
|
3153
|
+
execution_deadline_monotonic=overall_deadline,
|
|
2837
3154
|
)
|
|
2838
3155
|
retry_result["usage"] = _merge_usage_records(usage, retry_result.get("usage"))
|
|
2839
3156
|
if retry_result.get("ok"):
|
|
@@ -2880,6 +3197,7 @@ def _run_codex_task(
|
|
|
2880
3197
|
rollout_recovery_attempt=rollout_recovery_attempt,
|
|
2881
3198
|
model_override=model_override,
|
|
2882
3199
|
baseline_changes=baseline_snapshot,
|
|
3200
|
+
execution_deadline_monotonic=overall_deadline,
|
|
2883
3201
|
)
|
|
2884
3202
|
detail = "Codex spent too much of the execution budget without producing publishable file changes."
|
|
2885
3203
|
if trace_excerpt:
|
|
@@ -3076,6 +3394,7 @@ def _run_codex_task(
|
|
|
3076
3394
|
rollout_recovery_attempt=rollout_recovery_attempt,
|
|
3077
3395
|
model_override=model_override,
|
|
3078
3396
|
baseline_changes=baseline_snapshot,
|
|
3397
|
+
execution_deadline_monotonic=overall_deadline,
|
|
3079
3398
|
)
|
|
3080
3399
|
retry_result["usage"] = _merge_usage_records(usage, retry_result.get("usage"))
|
|
3081
3400
|
if wrapper_recovery_attempt == 0 and retry_result.get("ok"):
|
|
@@ -3191,6 +3510,7 @@ def _run_codex_task(
|
|
|
3191
3510
|
rollout_recovery_attempt=rollout_recovery_attempt,
|
|
3192
3511
|
model_override=LEGACY_CODEX_MODEL_FALLBACK,
|
|
3193
3512
|
baseline_changes=baseline_snapshot,
|
|
3513
|
+
execution_deadline_monotonic=overall_deadline,
|
|
3194
3514
|
)
|
|
3195
3515
|
retry_result["usage"] = _merge_usage_records(usage, retry_result.get("usage"))
|
|
3196
3516
|
if retry_result.get("ok"):
|
|
@@ -4,6 +4,7 @@ import re
|
|
|
4
4
|
import json
|
|
5
5
|
import subprocess
|
|
6
6
|
import sys
|
|
7
|
+
import time
|
|
7
8
|
import unittest
|
|
8
9
|
import tempfile
|
|
9
10
|
from unittest import mock
|
|
@@ -47,6 +48,9 @@ from openai_codex_executor import (
|
|
|
47
48
|
_repo_root_for_prompt_loading,
|
|
48
49
|
_restore_repo_local_codex_files,
|
|
49
50
|
_resolve_codex_command_prefix,
|
|
51
|
+
_resolve_no_edit_command_grace_seconds,
|
|
52
|
+
_resolve_no_edit_command_progress_cap_seconds,
|
|
53
|
+
_resolve_no_edit_recheck_seconds,
|
|
50
54
|
_resolve_no_edit_watchdog_seconds,
|
|
51
55
|
_resolve_rollout_watchdog_seconds,
|
|
52
56
|
_resolve_startup_stall_watchdog_seconds,
|
|
@@ -1431,6 +1435,265 @@ class OpenAICodexRuntimeConfigTests(unittest.TestCase):
|
|
|
1431
1435
|
self.assertIn("Patched after command-backed discovery", str(result.get("stdout") or ""))
|
|
1432
1436
|
self.assertIn("src/", str(result.get("stdout") or ""))
|
|
1433
1437
|
|
|
1438
|
+
def test_run_codex_task_no_edit_watchdog_extends_after_later_command_progress(self) -> None:
|
|
1439
|
+
with tempfile.TemporaryDirectory(prefix="pushpals-codex-no-edit-late-command-") as temp_dir:
|
|
1440
|
+
repo = Path(temp_dir) / "repo"
|
|
1441
|
+
repo.mkdir(parents=True, exist_ok=True)
|
|
1442
|
+
(repo / "README.md").write_text("# late command grace repo\n", encoding="utf-8")
|
|
1443
|
+
subprocess.run(["git", "init"], cwd=repo, check=True, capture_output=True, text=True)
|
|
1444
|
+
subprocess.run(
|
|
1445
|
+
["git", "config", "user.name", "PushPals Test"],
|
|
1446
|
+
cwd=repo,
|
|
1447
|
+
check=True,
|
|
1448
|
+
capture_output=True,
|
|
1449
|
+
text=True,
|
|
1450
|
+
)
|
|
1451
|
+
subprocess.run(
|
|
1452
|
+
["git", "config", "user.email", "pushpals-tests@example.com"],
|
|
1453
|
+
cwd=repo,
|
|
1454
|
+
check=True,
|
|
1455
|
+
capture_output=True,
|
|
1456
|
+
text=True,
|
|
1457
|
+
)
|
|
1458
|
+
subprocess.run(["git", "add", "README.md"], cwd=repo, check=True, capture_output=True, text=True)
|
|
1459
|
+
subprocess.run(
|
|
1460
|
+
["git", "commit", "-m", "chore: seed late command repo"],
|
|
1461
|
+
cwd=repo,
|
|
1462
|
+
check=True,
|
|
1463
|
+
capture_output=True,
|
|
1464
|
+
text=True,
|
|
1465
|
+
)
|
|
1466
|
+
|
|
1467
|
+
stub_path = Path(temp_dir) / "fake_codex_late_command_grace.py"
|
|
1468
|
+
stub_path.write_text(
|
|
1469
|
+
"\n".join(
|
|
1470
|
+
[
|
|
1471
|
+
"from pathlib import Path",
|
|
1472
|
+
"import json",
|
|
1473
|
+
"import sys",
|
|
1474
|
+
"import time",
|
|
1475
|
+
"",
|
|
1476
|
+
"argv = sys.argv[1:]",
|
|
1477
|
+
"last_message_path = None",
|
|
1478
|
+
"for index, arg in enumerate(argv):",
|
|
1479
|
+
" if arg == '--output-last-message' and index + 1 < len(argv):",
|
|
1480
|
+
" last_message_path = argv[index + 1]",
|
|
1481
|
+
" break",
|
|
1482
|
+
"",
|
|
1483
|
+
"sys.stdin.read()",
|
|
1484
|
+
"print(json.dumps({'type': 'thread.started'}), flush=True)",
|
|
1485
|
+
"print(json.dumps({'type': 'turn.started'}), flush=True)",
|
|
1486
|
+
"print(json.dumps({'type': 'item.started', 'item': {'id': 'cmd-one', 'type': 'command_execution', 'command': 'cat README.md', 'status': 'in_progress'}}), flush=True)",
|
|
1487
|
+
"time.sleep(0.2)",
|
|
1488
|
+
"print(json.dumps({'type': 'item.completed', 'item': {'id': 'cmd-one', 'type': 'command_execution', 'command': 'cat README.md', 'status': 'completed', 'exit_code': 0}}), flush=True)",
|
|
1489
|
+
"time.sleep(2.2)",
|
|
1490
|
+
"print(json.dumps({'type': 'item.started', 'item': {'id': 'cmd-two', 'type': 'command_execution', 'command': 'ls', 'status': 'in_progress'}}), flush=True)",
|
|
1491
|
+
"time.sleep(0.2)",
|
|
1492
|
+
"print(json.dumps({'type': 'item.completed', 'item': {'id': 'cmd-two', 'type': 'command_execution', 'command': 'ls', 'status': 'completed', 'exit_code': 0}}), flush=True)",
|
|
1493
|
+
"time.sleep(2.0)",
|
|
1494
|
+
"Path('src').mkdir(exist_ok=True)",
|
|
1495
|
+
"Path('src/late-command-grace.txt').write_text('patched after later command progress\\n', encoding='utf-8')",
|
|
1496
|
+
"if last_message_path:",
|
|
1497
|
+
" Path(last_message_path).write_text('Patched after later command progress.', encoding='utf-8')",
|
|
1498
|
+
"print(json.dumps({'type': 'item.completed', 'item': {'type': 'message', 'text': 'Patched after later command progress.'}}), flush=True)",
|
|
1499
|
+
]
|
|
1500
|
+
),
|
|
1501
|
+
encoding="utf-8",
|
|
1502
|
+
)
|
|
1503
|
+
|
|
1504
|
+
env_overrides = {
|
|
1505
|
+
"PUSHPALS_OPENAI_CODEX_BIN_JSON": json.dumps([sys.executable, str(stub_path)]),
|
|
1506
|
+
"PUSHPALS_OPENAI_CODEX_AUTH_MODE": "api_key",
|
|
1507
|
+
"OPENAI_API_KEY": "pushpals-no-edit-late-command-test-key",
|
|
1508
|
+
"WORKERPALS_OPENAI_CODEX_JSON": "true",
|
|
1509
|
+
"WORKERPALS_OPENAI_CODEX_TIMEOUT_S": "20",
|
|
1510
|
+
"WORKERPALS_OPENAI_CODEX_NO_EDIT_WATCHDOG_S": "1",
|
|
1511
|
+
"WORKERPALS_OPENAI_CODEX_NO_EDIT_COMMAND_GRACE_S": "3",
|
|
1512
|
+
"WORKERPALS_OPENAI_CODEX_PROGRESS_LOG_INTERVAL_S": "1",
|
|
1513
|
+
}
|
|
1514
|
+
with mock.patch.dict(os.environ, env_overrides, clear=False):
|
|
1515
|
+
result = _run_codex_task(
|
|
1516
|
+
str(repo),
|
|
1517
|
+
"Add one focused contract assertion after a later targeted read.",
|
|
1518
|
+
[],
|
|
1519
|
+
)
|
|
1520
|
+
|
|
1521
|
+
self.assertTrue(result.get("ok"), result)
|
|
1522
|
+
self.assertEqual(result.get("exitCode"), 0)
|
|
1523
|
+
self.assertIn("Patched after later command progress", str(result.get("stdout") or ""))
|
|
1524
|
+
self.assertIn("src/", str(result.get("stdout") or ""))
|
|
1525
|
+
|
|
1526
|
+
def test_run_codex_task_command_progress_cap_forces_patch_first_recovery(self) -> None:
|
|
1527
|
+
with tempfile.TemporaryDirectory(prefix="pushpals-codex-command-progress-cap-") as temp_dir:
|
|
1528
|
+
repo = Path(temp_dir) / "repo"
|
|
1529
|
+
repo.mkdir(parents=True, exist_ok=True)
|
|
1530
|
+
(repo / "README.md").write_text("# command progress cap repo\n", encoding="utf-8")
|
|
1531
|
+
subprocess.run(["git", "init"], cwd=repo, check=True, capture_output=True, text=True)
|
|
1532
|
+
subprocess.run(
|
|
1533
|
+
["git", "config", "user.name", "PushPals Test"],
|
|
1534
|
+
cwd=repo,
|
|
1535
|
+
check=True,
|
|
1536
|
+
capture_output=True,
|
|
1537
|
+
text=True,
|
|
1538
|
+
)
|
|
1539
|
+
subprocess.run(
|
|
1540
|
+
["git", "config", "user.email", "pushpals-tests@example.com"],
|
|
1541
|
+
cwd=repo,
|
|
1542
|
+
check=True,
|
|
1543
|
+
capture_output=True,
|
|
1544
|
+
text=True,
|
|
1545
|
+
)
|
|
1546
|
+
subprocess.run(["git", "add", "README.md"], cwd=repo, check=True, capture_output=True, text=True)
|
|
1547
|
+
subprocess.run(
|
|
1548
|
+
["git", "commit", "-m", "chore: seed command progress cap repo"],
|
|
1549
|
+
cwd=repo,
|
|
1550
|
+
check=True,
|
|
1551
|
+
capture_output=True,
|
|
1552
|
+
text=True,
|
|
1553
|
+
)
|
|
1554
|
+
|
|
1555
|
+
stub_path = Path(temp_dir) / "fake_codex_command_progress_cap.py"
|
|
1556
|
+
stub_path.write_text(
|
|
1557
|
+
"\n".join(
|
|
1558
|
+
[
|
|
1559
|
+
"from pathlib import Path",
|
|
1560
|
+
"import json",
|
|
1561
|
+
"import sys",
|
|
1562
|
+
"import time",
|
|
1563
|
+
"",
|
|
1564
|
+
"argv = sys.argv[1:]",
|
|
1565
|
+
"last_message_path = None",
|
|
1566
|
+
"for index, arg in enumerate(argv):",
|
|
1567
|
+
" if arg == '--output-last-message' and index + 1 < len(argv):",
|
|
1568
|
+
" last_message_path = argv[index + 1]",
|
|
1569
|
+
" break",
|
|
1570
|
+
"",
|
|
1571
|
+
"prompt = sys.stdin.read()",
|
|
1572
|
+
"if 'No-edit watchdog recovery' in prompt:",
|
|
1573
|
+
" Path('src').mkdir(exist_ok=True)",
|
|
1574
|
+
" Path('src/capped-command-recovery.txt').write_text('patched after capped command progress\\n', encoding='utf-8')",
|
|
1575
|
+
" if last_message_path:",
|
|
1576
|
+
" Path(last_message_path).write_text('Patched after capped command progress.', encoding='utf-8')",
|
|
1577
|
+
" print(json.dumps({'type': 'item.completed', 'item': {'type': 'message', 'text': 'Patched after capped command progress.'}}), flush=True)",
|
|
1578
|
+
" raise SystemExit(0)",
|
|
1579
|
+
"",
|
|
1580
|
+
"print(json.dumps({'type': 'thread.started'}), flush=True)",
|
|
1581
|
+
"print(json.dumps({'type': 'turn.started'}), flush=True)",
|
|
1582
|
+
"for index in range(8):",
|
|
1583
|
+
" command_id = f'cmd-{index}'",
|
|
1584
|
+
" print(json.dumps({'type': 'item.started', 'item': {'id': command_id, 'type': 'command_execution', 'command': 'cat README.md', 'status': 'in_progress'}}), flush=True)",
|
|
1585
|
+
" time.sleep(0.2)",
|
|
1586
|
+
" print(json.dumps({'type': 'item.completed', 'item': {'id': command_id, 'type': 'command_execution', 'command': 'cat README.md', 'status': 'completed', 'exit_code': 0}}), flush=True)",
|
|
1587
|
+
" time.sleep(0.8)",
|
|
1588
|
+
]
|
|
1589
|
+
),
|
|
1590
|
+
encoding="utf-8",
|
|
1591
|
+
)
|
|
1592
|
+
|
|
1593
|
+
env_overrides = {
|
|
1594
|
+
"PUSHPALS_OPENAI_CODEX_BIN_JSON": json.dumps([sys.executable, str(stub_path)]),
|
|
1595
|
+
"PUSHPALS_OPENAI_CODEX_AUTH_MODE": "api_key",
|
|
1596
|
+
"OPENAI_API_KEY": "pushpals-command-progress-cap-test-key",
|
|
1597
|
+
"WORKERPALS_OPENAI_CODEX_JSON": "true",
|
|
1598
|
+
"WORKERPALS_OPENAI_CODEX_TIMEOUT_S": "12",
|
|
1599
|
+
"WORKERPALS_OPENAI_CODEX_NO_EDIT_WATCHDOG_S": "1",
|
|
1600
|
+
"WORKERPALS_OPENAI_CODEX_NO_EDIT_COMMAND_GRACE_S": "3",
|
|
1601
|
+
"WORKERPALS_OPENAI_CODEX_NO_EDIT_COMMAND_PROGRESS_CAP_S": "3",
|
|
1602
|
+
"WORKERPALS_OPENAI_CODEX_PROGRESS_LOG_INTERVAL_S": "1",
|
|
1603
|
+
}
|
|
1604
|
+
with mock.patch.dict(os.environ, env_overrides, clear=False):
|
|
1605
|
+
result = _run_codex_task(
|
|
1606
|
+
str(repo),
|
|
1607
|
+
"Add one focused patch after bounded command-backed discovery.",
|
|
1608
|
+
[],
|
|
1609
|
+
)
|
|
1610
|
+
|
|
1611
|
+
self.assertTrue(result.get("ok"), result)
|
|
1612
|
+
self.assertEqual(result.get("exitCode"), 0)
|
|
1613
|
+
self.assertIn("Patched after capped command progress", str(result.get("stdout") or ""))
|
|
1614
|
+
self.assertIn("src/", str(result.get("stdout") or ""))
|
|
1615
|
+
|
|
1616
|
+
def test_run_codex_task_finalizes_after_durable_publishable_progress(self) -> None:
|
|
1617
|
+
with tempfile.TemporaryDirectory(prefix="pushpals-codex-durable-progress-") as temp_dir:
|
|
1618
|
+
repo = Path(temp_dir) / "repo"
|
|
1619
|
+
repo.mkdir(parents=True, exist_ok=True)
|
|
1620
|
+
(repo / "README.md").write_text("# durable progress repo\n", encoding="utf-8")
|
|
1621
|
+
subprocess.run(["git", "init"], cwd=repo, check=True, capture_output=True, text=True)
|
|
1622
|
+
subprocess.run(
|
|
1623
|
+
["git", "config", "user.name", "PushPals Test"],
|
|
1624
|
+
cwd=repo,
|
|
1625
|
+
check=True,
|
|
1626
|
+
capture_output=True,
|
|
1627
|
+
text=True,
|
|
1628
|
+
)
|
|
1629
|
+
subprocess.run(
|
|
1630
|
+
["git", "config", "user.email", "pushpals-tests@example.com"],
|
|
1631
|
+
cwd=repo,
|
|
1632
|
+
check=True,
|
|
1633
|
+
capture_output=True,
|
|
1634
|
+
text=True,
|
|
1635
|
+
)
|
|
1636
|
+
subprocess.run(["git", "add", "README.md"], cwd=repo, check=True, capture_output=True, text=True)
|
|
1637
|
+
subprocess.run(
|
|
1638
|
+
["git", "commit", "-m", "chore: seed durable progress repo"],
|
|
1639
|
+
cwd=repo,
|
|
1640
|
+
check=True,
|
|
1641
|
+
capture_output=True,
|
|
1642
|
+
text=True,
|
|
1643
|
+
)
|
|
1644
|
+
|
|
1645
|
+
stub_path = Path(temp_dir) / "fake_codex_durable_progress.py"
|
|
1646
|
+
stub_path.write_text(
|
|
1647
|
+
"\n".join(
|
|
1648
|
+
[
|
|
1649
|
+
"from pathlib import Path",
|
|
1650
|
+
"import json",
|
|
1651
|
+
"import sys",
|
|
1652
|
+
"import time",
|
|
1653
|
+
"",
|
|
1654
|
+
"argv = sys.argv[1:]",
|
|
1655
|
+
"last_message_path = None",
|
|
1656
|
+
"for index, arg in enumerate(argv):",
|
|
1657
|
+
" if arg == '--output-last-message' and index + 1 < len(argv):",
|
|
1658
|
+
" last_message_path = argv[index + 1]",
|
|
1659
|
+
" break",
|
|
1660
|
+
"",
|
|
1661
|
+
"sys.stdin.read()",
|
|
1662
|
+
"print(json.dumps({'type': 'thread.started'}), flush=True)",
|
|
1663
|
+
"print(json.dumps({'type': 'turn.started'}), flush=True)",
|
|
1664
|
+
"Path('src').mkdir(exist_ok=True)",
|
|
1665
|
+
"Path('src/durable-progress.txt').write_text('durable patch\\n', encoding='utf-8')",
|
|
1666
|
+
"if last_message_path:",
|
|
1667
|
+
" Path(last_message_path).write_text('Created durable patch and kept thinking.', encoding='utf-8')",
|
|
1668
|
+
"print(json.dumps({'type': 'item.completed', 'item': {'type': 'message', 'text': 'Created durable patch and kept thinking.'}}), flush=True)",
|
|
1669
|
+
"time.sleep(10)",
|
|
1670
|
+
]
|
|
1671
|
+
),
|
|
1672
|
+
encoding="utf-8",
|
|
1673
|
+
)
|
|
1674
|
+
|
|
1675
|
+
env_overrides = {
|
|
1676
|
+
"PUSHPALS_OPENAI_CODEX_BIN_JSON": json.dumps([sys.executable, str(stub_path)]),
|
|
1677
|
+
"PUSHPALS_OPENAI_CODEX_AUTH_MODE": "api_key",
|
|
1678
|
+
"OPENAI_API_KEY": "pushpals-durable-progress-test-key",
|
|
1679
|
+
"WORKERPALS_OPENAI_CODEX_JSON": "true",
|
|
1680
|
+
"WORKERPALS_OPENAI_CODEX_TIMEOUT_S": "20",
|
|
1681
|
+
"WORKERPALS_OPENAI_CODEX_NO_EDIT_WATCHDOG_S": "1",
|
|
1682
|
+
"WORKERPALS_OPENAI_CODEX_NO_EDIT_RECHECK_S": "1",
|
|
1683
|
+
"WORKERPALS_OPENAI_CODEX_PROGRESS_LOG_INTERVAL_S": "1",
|
|
1684
|
+
}
|
|
1685
|
+
with mock.patch.dict(os.environ, env_overrides, clear=False):
|
|
1686
|
+
result = _run_codex_task(
|
|
1687
|
+
str(repo),
|
|
1688
|
+
"Make a focused patch and stop once it is durable.",
|
|
1689
|
+
[],
|
|
1690
|
+
)
|
|
1691
|
+
|
|
1692
|
+
self.assertTrue(result.get("ok"), result)
|
|
1693
|
+
self.assertEqual(result.get("exitCode"), 0)
|
|
1694
|
+
self.assertIn("stopped after durable publishable progress", str(result.get("summary") or ""))
|
|
1695
|
+
self.assertIn("src/", str(result.get("stdout") or ""))
|
|
1696
|
+
|
|
1434
1697
|
def test_run_codex_task_recovery_attempt_is_still_guarded_by_no_edit_watchdog(self) -> None:
|
|
1435
1698
|
with tempfile.TemporaryDirectory(prefix="pushpals-codex-no-edit-watchdog-fail-") as temp_dir:
|
|
1436
1699
|
repo = Path(temp_dir) / "repo"
|
|
@@ -1793,6 +2056,86 @@ class OpenAICodexRuntimeConfigTests(unittest.TestCase):
|
|
|
1793
2056
|
|
|
1794
2057
|
self.assertEqual(watchdog_s, 300)
|
|
1795
2058
|
|
|
2059
|
+
def test_no_edit_recovery_attempt_uses_short_durable_recheck_and_command_cap(self) -> None:
|
|
2060
|
+
env = {
|
|
2061
|
+
"WORKERPALS_OPENAI_CODEX_NO_EDIT_RECHECK_S": "",
|
|
2062
|
+
"WORKERPALS_OPENAI_CODEX_NO_EDIT_COMMAND_GRACE_S": "",
|
|
2063
|
+
"WORKERPALS_OPENAI_CODEX_NO_EDIT_COMMAND_PROGRESS_CAP_S": "",
|
|
2064
|
+
}
|
|
2065
|
+
with mock.patch.dict(os.environ, env, clear=False):
|
|
2066
|
+
first_recheck_s = _resolve_no_edit_recheck_seconds(750)
|
|
2067
|
+
recovery_recheck_s = _resolve_no_edit_recheck_seconds(750, recovery_attempt=1)
|
|
2068
|
+
command_grace_s = _resolve_no_edit_command_grace_seconds(750)
|
|
2069
|
+
first_command_cap_s = _resolve_no_edit_command_progress_cap_seconds(
|
|
2070
|
+
750,
|
|
2071
|
+
command_grace_s,
|
|
2072
|
+
)
|
|
2073
|
+
recovery_command_cap_s = _resolve_no_edit_command_progress_cap_seconds(
|
|
2074
|
+
750,
|
|
2075
|
+
command_grace_s,
|
|
2076
|
+
recovery_attempt=1,
|
|
2077
|
+
)
|
|
2078
|
+
|
|
2079
|
+
self.assertEqual(first_recheck_s, 120)
|
|
2080
|
+
self.assertEqual(recovery_recheck_s, 30)
|
|
2081
|
+
self.assertEqual(first_command_cap_s, 360)
|
|
2082
|
+
self.assertEqual(recovery_command_cap_s, 120)
|
|
2083
|
+
|
|
2084
|
+
def test_codex_recovery_attempt_refuses_exhausted_shared_deadline(self) -> None:
|
|
2085
|
+
with tempfile.TemporaryDirectory(prefix="pushpals-codex-exhausted-recovery-") as temp_dir:
|
|
2086
|
+
repo = Path(temp_dir) / "repo"
|
|
2087
|
+
repo.mkdir(parents=True, exist_ok=True)
|
|
2088
|
+
(repo / "README.md").write_text("# exhausted recovery repo\n", encoding="utf-8")
|
|
2089
|
+
subprocess.run(["git", "init"], cwd=repo, check=True, capture_output=True, text=True)
|
|
2090
|
+
subprocess.run(
|
|
2091
|
+
["git", "config", "user.name", "PushPals Test"],
|
|
2092
|
+
cwd=repo,
|
|
2093
|
+
check=True,
|
|
2094
|
+
capture_output=True,
|
|
2095
|
+
text=True,
|
|
2096
|
+
)
|
|
2097
|
+
subprocess.run(
|
|
2098
|
+
["git", "config", "user.email", "pushpals-tests@example.com"],
|
|
2099
|
+
cwd=repo,
|
|
2100
|
+
check=True,
|
|
2101
|
+
capture_output=True,
|
|
2102
|
+
text=True,
|
|
2103
|
+
)
|
|
2104
|
+
subprocess.run(["git", "add", "README.md"], cwd=repo, check=True, capture_output=True, text=True)
|
|
2105
|
+
subprocess.run(
|
|
2106
|
+
["git", "commit", "-m", "chore: seed exhausted recovery repo"],
|
|
2107
|
+
cwd=repo,
|
|
2108
|
+
check=True,
|
|
2109
|
+
capture_output=True,
|
|
2110
|
+
text=True,
|
|
2111
|
+
)
|
|
2112
|
+
|
|
2113
|
+
stub_path = Path(temp_dir) / "fake_codex_should_not_run.py"
|
|
2114
|
+
stub_path.write_text(
|
|
2115
|
+
"raise SystemExit('fake codex should not run when recovery budget is exhausted')\n",
|
|
2116
|
+
encoding="utf-8",
|
|
2117
|
+
)
|
|
2118
|
+
|
|
2119
|
+
env_overrides = {
|
|
2120
|
+
"PUSHPALS_OPENAI_CODEX_BIN_JSON": json.dumps([sys.executable, str(stub_path)]),
|
|
2121
|
+
"PUSHPALS_OPENAI_CODEX_AUTH_MODE": "api_key",
|
|
2122
|
+
"OPENAI_API_KEY": "pushpals-exhausted-recovery-test-key",
|
|
2123
|
+
"WORKERPALS_OPENAI_CODEX_TIMEOUT_S": "750",
|
|
2124
|
+
}
|
|
2125
|
+
with mock.patch.dict(os.environ, env_overrides, clear=False):
|
|
2126
|
+
result = _run_codex_task(
|
|
2127
|
+
str(repo),
|
|
2128
|
+
"Apply patch-first recovery.",
|
|
2129
|
+
[],
|
|
2130
|
+
no_edit_recovery_attempt=1,
|
|
2131
|
+
execution_deadline_monotonic=time.monotonic() - 1.0,
|
|
2132
|
+
)
|
|
2133
|
+
|
|
2134
|
+
self.assertFalse(result.get("ok"), result)
|
|
2135
|
+
self.assertEqual(result.get("exitCode"), 124)
|
|
2136
|
+
self.assertIn("recovery budget exhausted", str(result.get("summary") or ""))
|
|
2137
|
+
self.assertIn("Stopping before a low-odds retry", str(result.get("stderr") or ""))
|
|
2138
|
+
|
|
1796
2139
|
def test_review_fix_contract_level_tests_use_fast_no_edit_watchdog(self) -> None:
|
|
1797
2140
|
prompt = (
|
|
1798
2141
|
"Restore exact score assertions for contract-level tests where score is part "
|
|
@@ -1803,6 +2146,16 @@ class OpenAICodexRuntimeConfigTests(unittest.TestCase):
|
|
|
1803
2146
|
|
|
1804
2147
|
self.assertEqual(watchdog_s, 180)
|
|
1805
2148
|
|
|
2149
|
+
def test_rejected_pr_review_fix_prompt_uses_compact_no_edit_watchdog(self) -> None:
|
|
2150
|
+
prompt = (
|
|
2151
|
+
"Rejected PR revision brief: Previous ReviewAgent score: 7.6 / 10. "
|
|
2152
|
+
"Address reviewer must-fix items in the cleanup harness with focused coverage."
|
|
2153
|
+
)
|
|
2154
|
+
with mock.patch.dict(os.environ, {"WORKERPALS_OPENAI_CODEX_NO_EDIT_WATCHDOG_S": ""}, clear=False):
|
|
2155
|
+
watchdog_s = _resolve_no_edit_watchdog_seconds(prompt, 1200)
|
|
2156
|
+
|
|
2157
|
+
self.assertEqual(watchdog_s, 180)
|
|
2158
|
+
|
|
1806
2159
|
def test_no_edit_recovery_guidance_warns_against_artifact_only_progress(self) -> None:
|
|
1807
2160
|
guidance = _build_no_edit_recovery_guidance(
|
|
1808
2161
|
"item.completed | still inspecting",
|
|
@@ -1943,7 +2296,7 @@ class OpenAICodexRuntimeConfigTests(unittest.TestCase):
|
|
|
1943
2296
|
self.assertIn("Patched after rollout coach guidance", str(result.get("stdout") or ""))
|
|
1944
2297
|
self.assertIn("scripts/", str(result.get("stdout") or ""))
|
|
1945
2298
|
|
|
1946
|
-
def
|
|
2299
|
+
def test_run_codex_task_rollout_coach_resets_broad_small_task_changes_before_retry(self) -> None:
|
|
1947
2300
|
with tempfile.TemporaryDirectory(prefix="pushpals-codex-rollout-noisy-") as temp_dir:
|
|
1948
2301
|
repo = Path(temp_dir) / "repo"
|
|
1949
2302
|
repo.mkdir(parents=True, exist_ok=True)
|
|
@@ -1980,7 +2333,22 @@ class OpenAICodexRuntimeConfigTests(unittest.TestCase):
|
|
|
1980
2333
|
"import sys",
|
|
1981
2334
|
"import time",
|
|
1982
2335
|
"",
|
|
1983
|
-
"sys.
|
|
2336
|
+
"argv = sys.argv[1:]",
|
|
2337
|
+
"last_message_path = None",
|
|
2338
|
+
"for index, arg in enumerate(argv):",
|
|
2339
|
+
" if arg == '--output-last-message' and index + 1 < len(argv):",
|
|
2340
|
+
" last_message_path = argv[index + 1]",
|
|
2341
|
+
" break",
|
|
2342
|
+
"",
|
|
2343
|
+
"prompt = sys.stdin.read()",
|
|
2344
|
+
"if 'Rollout coach recovery' in prompt:",
|
|
2345
|
+
" Path('src').mkdir(exist_ok=True)",
|
|
2346
|
+
" Path('src/narrow-rollout-recovery.txt').write_text('narrow recovery patch\\n', encoding='utf-8')",
|
|
2347
|
+
" if last_message_path:",
|
|
2348
|
+
" Path(last_message_path).write_text('Patched narrowly after broad rollout reset.', encoding='utf-8')",
|
|
2349
|
+
" print('item.completed | Patched narrowly after broad rollout reset.', flush=True)",
|
|
2350
|
+
" sys.exit(0)",
|
|
2351
|
+
"",
|
|
1984
2352
|
"for index in range(5):",
|
|
1985
2353
|
" root = Path(f'area{index}')",
|
|
1986
2354
|
" root.mkdir(exist_ok=True)",
|
|
@@ -2007,6 +2375,78 @@ class OpenAICodexRuntimeConfigTests(unittest.TestCase):
|
|
|
2007
2375
|
"Make a small low-risk repo-native patch.",
|
|
2008
2376
|
[],
|
|
2009
2377
|
)
|
|
2378
|
+
area0_exists_after_retry = (repo / "area0").exists()
|
|
2379
|
+
|
|
2380
|
+
self.assertTrue(result.get("ok"), result)
|
|
2381
|
+
self.assertEqual(result.get("exitCode"), 0)
|
|
2382
|
+
self.assertIn("Patched narrowly after broad rollout reset", str(result.get("stdout") or ""))
|
|
2383
|
+
self.assertIn("src/", str(result.get("stdout") or ""))
|
|
2384
|
+
self.assertFalse(area0_exists_after_retry)
|
|
2385
|
+
|
|
2386
|
+
def test_run_codex_task_rollout_coach_fails_after_repeated_broad_small_task_changes(self) -> None:
|
|
2387
|
+
with tempfile.TemporaryDirectory(prefix="pushpals-codex-rollout-repeat-noisy-") as temp_dir:
|
|
2388
|
+
repo = Path(temp_dir) / "repo"
|
|
2389
|
+
repo.mkdir(parents=True, exist_ok=True)
|
|
2390
|
+
(repo / "README.md").write_text("# repeated rollout noisy repo\n", encoding="utf-8")
|
|
2391
|
+
subprocess.run(["git", "init"], cwd=repo, check=True, capture_output=True, text=True)
|
|
2392
|
+
subprocess.run(
|
|
2393
|
+
["git", "config", "user.name", "PushPals Test"],
|
|
2394
|
+
cwd=repo,
|
|
2395
|
+
check=True,
|
|
2396
|
+
capture_output=True,
|
|
2397
|
+
text=True,
|
|
2398
|
+
)
|
|
2399
|
+
subprocess.run(
|
|
2400
|
+
["git", "config", "user.email", "pushpals-tests@example.com"],
|
|
2401
|
+
cwd=repo,
|
|
2402
|
+
check=True,
|
|
2403
|
+
capture_output=True,
|
|
2404
|
+
text=True,
|
|
2405
|
+
)
|
|
2406
|
+
subprocess.run(["git", "add", "README.md"], cwd=repo, check=True, capture_output=True, text=True)
|
|
2407
|
+
subprocess.run(
|
|
2408
|
+
["git", "commit", "-m", "chore: seed repeated rollout noisy repo"],
|
|
2409
|
+
cwd=repo,
|
|
2410
|
+
check=True,
|
|
2411
|
+
capture_output=True,
|
|
2412
|
+
text=True,
|
|
2413
|
+
)
|
|
2414
|
+
|
|
2415
|
+
stub_path = Path(temp_dir) / "fake_codex_rollout_repeat_noisy.py"
|
|
2416
|
+
stub_path.write_text(
|
|
2417
|
+
"\n".join(
|
|
2418
|
+
[
|
|
2419
|
+
"from pathlib import Path",
|
|
2420
|
+
"import sys",
|
|
2421
|
+
"import time",
|
|
2422
|
+
"",
|
|
2423
|
+
"sys.stdin.read()",
|
|
2424
|
+
"for index in range(5):",
|
|
2425
|
+
" root = Path(f'area{index}')",
|
|
2426
|
+
" root.mkdir(exist_ok=True)",
|
|
2427
|
+
" (root / 'changed.txt').write_text('broad rollout change\\n', encoding='utf-8')",
|
|
2428
|
+
"print('item.completed | Repeated broad edits for a small task.', flush=True)",
|
|
2429
|
+
"time.sleep(10)",
|
|
2430
|
+
]
|
|
2431
|
+
),
|
|
2432
|
+
encoding="utf-8",
|
|
2433
|
+
)
|
|
2434
|
+
|
|
2435
|
+
env_overrides = {
|
|
2436
|
+
"PUSHPALS_OPENAI_CODEX_BIN_JSON": json.dumps([sys.executable, str(stub_path)]),
|
|
2437
|
+
"PUSHPALS_OPENAI_CODEX_AUTH_MODE": "api_key",
|
|
2438
|
+
"OPENAI_API_KEY": "pushpals-rollout-repeat-noisy-test-key",
|
|
2439
|
+
"WORKERPALS_OPENAI_CODEX_TIMEOUT_S": "700",
|
|
2440
|
+
"WORKERPALS_OPENAI_CODEX_NO_EDIT_WATCHDOG_S": "10",
|
|
2441
|
+
"WORKERPALS_OPENAI_CODEX_ROLLOUT_WATCHDOG_S": "1",
|
|
2442
|
+
"WORKERPALS_OPENAI_CODEX_PROGRESS_LOG_INTERVAL_S": "1",
|
|
2443
|
+
}
|
|
2444
|
+
with mock.patch.dict(os.environ, env_overrides, clear=False):
|
|
2445
|
+
result = _run_codex_task(
|
|
2446
|
+
str(repo),
|
|
2447
|
+
"Make a small low-risk repo-native patch.",
|
|
2448
|
+
[],
|
|
2449
|
+
)
|
|
2010
2450
|
|
|
2011
2451
|
self.assertFalse(result.get("ok"), result)
|
|
2012
2452
|
self.assertEqual(result.get("exitCode"), 124)
|
|
@@ -34,9 +34,9 @@ interface GenericPythonExecutorConfig {
|
|
|
34
34
|
}
|
|
35
35
|
|
|
36
36
|
const BACKEND_TIMEOUT_RESULT_GRACE_MS = 30_000;
|
|
37
|
-
const OPENAI_CODEX_MIN_VALIDATION_RESERVE_MS =
|
|
38
|
-
const OPENAI_CODEX_MAX_VALIDATION_RESERVE_MS =
|
|
39
|
-
const OPENAI_CODEX_MIN_PRIMARY_TURN_BUDGET_MS =
|
|
37
|
+
const OPENAI_CODEX_MIN_VALIDATION_RESERVE_MS = 240_000;
|
|
38
|
+
const OPENAI_CODEX_MAX_VALIDATION_RESERVE_MS = 720_000;
|
|
39
|
+
const OPENAI_CODEX_MIN_PRIMARY_TURN_BUDGET_MS = 540_000;
|
|
40
40
|
|
|
41
41
|
function estimateTokensFromText(text: string): number {
|
|
42
42
|
return Math.max(0, Math.ceil(String(text ?? "").length / 3));
|
|
@@ -161,7 +161,7 @@ export function resolveOpenAICodexValidationReserveMs(
|
|
|
161
161
|
budgetMs,
|
|
162
162
|
Math.max(
|
|
163
163
|
OPENAI_CODEX_MIN_VALIDATION_RESERVE_MS,
|
|
164
|
-
Math.min(OPENAI_CODEX_MAX_VALIDATION_RESERVE_MS, budgetMs * 0.
|
|
164
|
+
Math.min(OPENAI_CODEX_MAX_VALIDATION_RESERVE_MS, budgetMs * 0.5),
|
|
165
165
|
),
|
|
166
166
|
),
|
|
167
167
|
);
|
|
@@ -201,6 +201,9 @@ const BROWSER_VALIDATION_MAX_AUTO_REVISIONS = 3;
|
|
|
201
201
|
const CRITIC_COMPACT_RETRY_MIN_REDUCTION_RATIO = 0.25;
|
|
202
202
|
const MAX_DIAGNOSTIC_PATH_SAMPLES = 50;
|
|
203
203
|
const MAX_DIAGNOSTIC_TEXT_CHARS = 8_000;
|
|
204
|
+
const QUALITY_MIN_REVISION_BUDGET_MS = 120_000;
|
|
205
|
+
const QUALITY_MAX_REVISION_BUDGET_MS = 420_000;
|
|
206
|
+
const QUALITY_REVISION_BUDGET_RATIO = 0.25;
|
|
204
207
|
|
|
205
208
|
export function qualityRevisionLoopUpperBound(policy: {
|
|
206
209
|
maxAutoRevisions: number;
|
|
@@ -234,7 +237,13 @@ export function qualityRevisionBudgetDecision(opts: {
|
|
|
234
237
|
const elapsedMs = Math.max(0, Number(opts.jobElapsedMs) || 0);
|
|
235
238
|
const remainingBudgetMs = Math.max(0, Math.floor(executionBudgetMs - elapsedMs));
|
|
236
239
|
const minimumRevisionBudgetMs = Math.floor(
|
|
237
|
-
Math.min(
|
|
240
|
+
Math.min(
|
|
241
|
+
executionBudgetMs,
|
|
242
|
+
Math.max(
|
|
243
|
+
QUALITY_MIN_REVISION_BUDGET_MS,
|
|
244
|
+
Math.min(QUALITY_MAX_REVISION_BUDGET_MS, executionBudgetMs * QUALITY_REVISION_BUDGET_RATIO),
|
|
245
|
+
),
|
|
246
|
+
),
|
|
238
247
|
);
|
|
239
248
|
return {
|
|
240
249
|
shouldStart: remainingBudgetMs >= minimumRevisionBudgetMs,
|
|
@@ -335,6 +344,22 @@ export function shouldSkipCriticForDeterministicValidationRevision(opts: {
|
|
|
335
344
|
return opts.validationRuns.some(isDeterministicFastValidationFailure);
|
|
336
345
|
}
|
|
337
346
|
|
|
347
|
+
export function shouldSkipCriticToPreserveRevisionBudget(opts: {
|
|
348
|
+
deterministicRequiresRevision: boolean;
|
|
349
|
+
remainingBudgetMs: number;
|
|
350
|
+
minimumRevisionBudgetMs: number;
|
|
351
|
+
criticTimeoutMs: number;
|
|
352
|
+
criticTimeoutBehavior: "skip" | "retry_once" | "block" | string;
|
|
353
|
+
}): boolean {
|
|
354
|
+
if (!opts.deterministicRequiresRevision) return false;
|
|
355
|
+
const remainingBudgetMs = Math.max(0, Math.floor(opts.remainingBudgetMs));
|
|
356
|
+
const minimumRevisionBudgetMs = Math.max(0, Math.floor(opts.minimumRevisionBudgetMs));
|
|
357
|
+
const criticTimeoutMs = Math.max(0, Math.floor(opts.criticTimeoutMs));
|
|
358
|
+
const criticAttempts = opts.criticTimeoutBehavior === "retry_once" ? 2 : 1;
|
|
359
|
+
const criticWorstCaseMs = criticTimeoutMs * criticAttempts;
|
|
360
|
+
return remainingBudgetMs < minimumRevisionBudgetMs + criticWorstCaseMs;
|
|
361
|
+
}
|
|
362
|
+
|
|
338
363
|
export function workerAttemptRolloutScore(params: {
|
|
339
364
|
executorElapsedMs: number;
|
|
340
365
|
qualityElapsedMs: number;
|
|
@@ -7975,11 +8000,23 @@ export async function executeJob(
|
|
|
7975
8000
|
validationOutsideTaskScope,
|
|
7976
8001
|
validationRuns: quality.validationRuns,
|
|
7977
8002
|
});
|
|
8003
|
+
const preCriticRevisionBudget = qualityRevisionBudgetDecision({
|
|
8004
|
+
jobElapsedMs: Date.now() - jobStartedAt,
|
|
8005
|
+
executionBudgetMs,
|
|
8006
|
+
});
|
|
8007
|
+
const skipCriticForRevisionBudget = shouldSkipCriticToPreserveRevisionBudget({
|
|
8008
|
+
deterministicRequiresRevision: preCriticDeterministicRequiresRevision,
|
|
8009
|
+
remainingBudgetMs: preCriticRevisionBudget.remainingBudgetMs,
|
|
8010
|
+
minimumRevisionBudgetMs: preCriticRevisionBudget.minimumRevisionBudgetMs,
|
|
8011
|
+
criticTimeoutMs: resolveQualityCriticTimeoutMs(runtimeConfig),
|
|
8012
|
+
criticTimeoutBehavior: resolveQualityCriticTimeoutBehavior(runtimeConfig),
|
|
8013
|
+
});
|
|
7978
8014
|
const critic =
|
|
7979
8015
|
quality.skipped ||
|
|
7980
8016
|
!qualityGatePolicy.criticGateEnabled ||
|
|
7981
8017
|
skipCriticAfterExecutorTimeout ||
|
|
7982
|
-
skipCriticForDeterministicValidationRevision
|
|
8018
|
+
skipCriticForDeterministicValidationRevision ||
|
|
8019
|
+
skipCriticForRevisionBudget
|
|
7983
8020
|
? null
|
|
7984
8021
|
: executor === "openai_codex"
|
|
7985
8022
|
? await runCodexCriticReview(repo, attemptParams, qualityForCritic, runtimeConfig, onLog)
|
|
@@ -8020,6 +8057,11 @@ export async function executeJob(
|
|
|
8020
8057
|
"stdout",
|
|
8021
8058
|
"[CriticGate] Skipping critic because deterministic fast validation already requires a quality revision.",
|
|
8022
8059
|
);
|
|
8060
|
+
} else if (skipCriticForRevisionBudget) {
|
|
8061
|
+
onLog?.(
|
|
8062
|
+
"stdout",
|
|
8063
|
+
`[CriticGate] Skipping critic because deterministic quality already requires revision and remaining budget (${preCriticRevisionBudget.remainingBudgetMs}ms) must be reserved for the next worker turn.`,
|
|
8064
|
+
);
|
|
8023
8065
|
}
|
|
8024
8066
|
const rolloutScore = workerAttemptRolloutScore({
|
|
8025
8067
|
executorElapsedMs,
|