@pushpalsdev/cli 1.1.35 → 1.1.37
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/runtime/sandbox/apps/workerpals/src/backends/openai_codex/openai_codex_executor.py +208 -3
- package/runtime/sandbox/apps/workerpals/src/backends/openai_codex/test_openai_codex_runtime_config.py +253 -0
- package/runtime/sandbox/apps/workerpals/src/execute_job.ts +34 -1
package/package.json
CHANGED
|
@@ -116,6 +116,7 @@ _WEB_REVIEW_NO_EDIT_WATCHDOG_S = 240
|
|
|
116
116
|
_BACKGROUND_NO_EDIT_WATCHDOG_S = 120
|
|
117
117
|
_NO_EDIT_RECOVERY_WATCHDOG_S = 90
|
|
118
118
|
_DEFAULT_NO_EDIT_RECHECK_S = 120
|
|
119
|
+
_DEFAULT_NO_EDIT_COMMAND_GRACE_S = 240
|
|
119
120
|
_DEFAULT_STARTUP_STALL_WATCHDOG_S = 210
|
|
120
121
|
_RECOVERY_STARTUP_STALL_WATCHDOG_S = 150
|
|
121
122
|
_DEFAULT_ROLLOUT_WATCHDOG_S = 300
|
|
@@ -757,6 +758,27 @@ def _resolve_no_edit_recheck_seconds(communicate_timeout_s: Optional[int]) -> in
|
|
|
757
758
|
return max(1, min(_DEFAULT_NO_EDIT_RECHECK_S, upper))
|
|
758
759
|
|
|
759
760
|
|
|
761
|
+
def _resolve_no_edit_command_grace_seconds(communicate_timeout_s: Optional[int]) -> Optional[int]:
|
|
762
|
+
if not communicate_timeout_s:
|
|
763
|
+
return None
|
|
764
|
+
|
|
765
|
+
raw = os.environ.get("WORKERPALS_OPENAI_CODEX_NO_EDIT_COMMAND_GRACE_S", "").strip()
|
|
766
|
+
if raw:
|
|
767
|
+
if raw == "0":
|
|
768
|
+
return None
|
|
769
|
+
parsed = _to_positive_int(raw)
|
|
770
|
+
if parsed is None:
|
|
771
|
+
log.info(
|
|
772
|
+
"Invalid WORKERPALS_OPENAI_CODEX_NO_EDIT_COMMAND_GRACE_S="
|
|
773
|
+
f"{raw!r}; using default command-progress grace."
|
|
774
|
+
)
|
|
775
|
+
else:
|
|
776
|
+
return max(1, min(parsed, max(1, communicate_timeout_s - 1)))
|
|
777
|
+
|
|
778
|
+
upper = max(1, communicate_timeout_s - 1)
|
|
779
|
+
return max(1, min(_DEFAULT_NO_EDIT_COMMAND_GRACE_S, upper))
|
|
780
|
+
|
|
781
|
+
|
|
760
782
|
def _resolve_startup_stall_watchdog_seconds(
|
|
761
783
|
communicate_timeout_s: Optional[int],
|
|
762
784
|
recovery_attempt: int = 0,
|
|
@@ -1339,10 +1361,94 @@ def _empty_codex_trace() -> Dict[str, Any]:
|
|
|
1339
1361
|
"prompt_tokens": 0,
|
|
1340
1362
|
"completion_tokens": 0,
|
|
1341
1363
|
"total_tokens": 0,
|
|
1364
|
+
"active_command_ids": [],
|
|
1365
|
+
"command_event_count": 0,
|
|
1366
|
+
"last_command_activity_at": None,
|
|
1367
|
+
"last_command_summary": "",
|
|
1342
1368
|
}
|
|
1343
1369
|
|
|
1344
1370
|
|
|
1345
|
-
def
|
|
1371
|
+
def _looks_like_codex_command_item(value: Any) -> bool:
|
|
1372
|
+
if not isinstance(value, dict):
|
|
1373
|
+
return False
|
|
1374
|
+
type_text = " ".join(
|
|
1375
|
+
str(value.get(key) or "")
|
|
1376
|
+
for key in ("type", "item_type", "kind", "name", "tool_name")
|
|
1377
|
+
).lower()
|
|
1378
|
+
if any(marker in type_text for marker in ("command_execution", "exec_command", "shell_command")):
|
|
1379
|
+
return True
|
|
1380
|
+
return any(key in value for key in ("command", "cmd", "exit_code", "aggregated_output"))
|
|
1381
|
+
|
|
1382
|
+
|
|
1383
|
+
def _record_codex_command_activity(
|
|
1384
|
+
parsed: Dict[str, Any],
|
|
1385
|
+
event_type: str,
|
|
1386
|
+
trace: Dict[str, Any],
|
|
1387
|
+
now: float,
|
|
1388
|
+
) -> None:
|
|
1389
|
+
item = parsed.get("item")
|
|
1390
|
+
command_source: Any = item if _looks_like_codex_command_item(item) else parsed
|
|
1391
|
+
if not _looks_like_codex_command_item(command_source):
|
|
1392
|
+
return
|
|
1393
|
+
|
|
1394
|
+
command_text = ""
|
|
1395
|
+
if isinstance(command_source, dict):
|
|
1396
|
+
for key in ("command", "cmd", "name"):
|
|
1397
|
+
raw = command_source.get(key)
|
|
1398
|
+
if isinstance(raw, str) and raw.strip():
|
|
1399
|
+
command_text = _truncate_inline(raw.strip(), 160)
|
|
1400
|
+
break
|
|
1401
|
+
command_id = ""
|
|
1402
|
+
if isinstance(command_source, dict):
|
|
1403
|
+
command_id = str(
|
|
1404
|
+
command_source.get("id")
|
|
1405
|
+
or command_source.get("call_id")
|
|
1406
|
+
or command_source.get("item_id")
|
|
1407
|
+
or command_text
|
|
1408
|
+
or "command"
|
|
1409
|
+
).strip()
|
|
1410
|
+
command_id = command_id or "command"
|
|
1411
|
+
|
|
1412
|
+
active = trace.setdefault("active_command_ids", [])
|
|
1413
|
+
if not isinstance(active, list):
|
|
1414
|
+
active = []
|
|
1415
|
+
trace["active_command_ids"] = active
|
|
1416
|
+
|
|
1417
|
+
status_text = ""
|
|
1418
|
+
if isinstance(command_source, dict):
|
|
1419
|
+
status_text = " ".join(
|
|
1420
|
+
str(command_source.get(key) or "")
|
|
1421
|
+
for key in ("status", "state", "outcome")
|
|
1422
|
+
).lower()
|
|
1423
|
+
event_lower = event_type.lower()
|
|
1424
|
+
completed = (
|
|
1425
|
+
"completed" in event_lower
|
|
1426
|
+
or "failed" in event_lower
|
|
1427
|
+
or "error" in event_lower
|
|
1428
|
+
or any(marker in status_text for marker in ("completed", "failed", "cancelled", "canceled", "exited"))
|
|
1429
|
+
)
|
|
1430
|
+
started = (
|
|
1431
|
+
"started" in event_lower
|
|
1432
|
+
or "updated" in event_lower
|
|
1433
|
+
or any(marker in status_text for marker in ("running", "in_progress", "started"))
|
|
1434
|
+
)
|
|
1435
|
+
|
|
1436
|
+
if completed:
|
|
1437
|
+
trace["active_command_ids"] = [item for item in active if str(item) != command_id]
|
|
1438
|
+
elif started and command_id not in active:
|
|
1439
|
+
active.append(command_id)
|
|
1440
|
+
|
|
1441
|
+
trace["command_event_count"] = to_int(trace.get("command_event_count"), 0) + 1
|
|
1442
|
+
trace["last_command_activity_at"] = float(now)
|
|
1443
|
+
trace["last_command_summary"] = command_text or event_type
|
|
1444
|
+
|
|
1445
|
+
|
|
1446
|
+
def _record_live_codex_stdout_line(
|
|
1447
|
+
line: str,
|
|
1448
|
+
use_json: bool,
|
|
1449
|
+
trace: Dict[str, Any],
|
|
1450
|
+
now: Optional[float] = None,
|
|
1451
|
+
) -> None:
|
|
1346
1452
|
stripped = line.strip()
|
|
1347
1453
|
if not stripped:
|
|
1348
1454
|
return
|
|
@@ -1369,6 +1475,7 @@ def _record_live_codex_stdout_line(line: str, use_json: bool, trace: Dict[str, A
|
|
|
1369
1475
|
return
|
|
1370
1476
|
|
|
1371
1477
|
if isinstance(parsed, dict):
|
|
1478
|
+
observed_at = float(now if now is not None else time.monotonic())
|
|
1372
1479
|
usage = _extract_usage_counts(parsed)
|
|
1373
1480
|
if usage is not None:
|
|
1374
1481
|
trace["prompt_tokens"] = max(
|
|
@@ -1385,6 +1492,7 @@ def _record_live_codex_stdout_line(line: str, use_json: bool, trace: Dict[str, A
|
|
|
1385
1492
|
.strip()
|
|
1386
1493
|
or "event"
|
|
1387
1494
|
)
|
|
1495
|
+
_record_codex_command_activity(parsed, event_type, trace, observed_at)
|
|
1388
1496
|
event_type_counts[event_type] = to_int(event_type_counts.get(event_type), 0) + 1
|
|
1389
1497
|
summary = _summarize_json_event(parsed)
|
|
1390
1498
|
# Reasoning can arrive under generic event types (for example item.updated).
|
|
@@ -1449,10 +1557,13 @@ def _finalize_codex_stdout_trace(trace: Dict[str, Any], use_json: bool) -> Dict[
|
|
|
1449
1557
|
prompt_tokens = to_int(trace.get("prompt_tokens"), 0)
|
|
1450
1558
|
completion_tokens = to_int(trace.get("completion_tokens"), 0)
|
|
1451
1559
|
total_tokens = to_int(trace.get("total_tokens"), 0)
|
|
1560
|
+
command_event_count = to_int(trace.get("command_event_count"), 0)
|
|
1452
1561
|
if reasoning_events > 0:
|
|
1453
1562
|
log.info(f"[codex] Reasoning-like event(s): {reasoning_events}")
|
|
1454
1563
|
elif use_json and valid_json > 0:
|
|
1455
1564
|
log.info("[codex] No reasoning-like events observed in this run.")
|
|
1565
|
+
if command_event_count > 0:
|
|
1566
|
+
log.info(f"[codex] Command execution event(s): {command_event_count}")
|
|
1456
1567
|
if total_tokens > 0:
|
|
1457
1568
|
log.info(
|
|
1458
1569
|
f"[codex] Usage observed: prompt={prompt_tokens} completion={completion_tokens} total={total_tokens}"
|
|
@@ -1473,6 +1584,7 @@ def _finalize_codex_stdout_trace(trace: Dict[str, Any], use_json: bool) -> Dict[
|
|
|
1473
1584
|
"prompt_tokens": prompt_tokens,
|
|
1474
1585
|
"completion_tokens": completion_tokens,
|
|
1475
1586
|
"total_tokens": total_tokens,
|
|
1587
|
+
"command_event_count": command_event_count,
|
|
1476
1588
|
}
|
|
1477
1589
|
|
|
1478
1590
|
|
|
@@ -2299,8 +2411,14 @@ def _run_codex_task(
|
|
|
2299
2411
|
if not line:
|
|
2300
2412
|
continue
|
|
2301
2413
|
with trace_lock:
|
|
2302
|
-
|
|
2303
|
-
|
|
2414
|
+
observed_at = time.monotonic()
|
|
2415
|
+
last_activity_at["ts"] = observed_at
|
|
2416
|
+
_record_live_codex_stdout_line(
|
|
2417
|
+
line,
|
|
2418
|
+
use_json,
|
|
2419
|
+
stdout_trace_state,
|
|
2420
|
+
observed_at,
|
|
2421
|
+
)
|
|
2304
2422
|
except Exception:
|
|
2305
2423
|
pass
|
|
2306
2424
|
finally:
|
|
@@ -2377,6 +2495,7 @@ def _run_codex_task(
|
|
|
2377
2495
|
else None
|
|
2378
2496
|
)
|
|
2379
2497
|
no_edit_recheck_s = _resolve_no_edit_recheck_seconds(communicate_timeout_s)
|
|
2498
|
+
no_edit_command_grace_s = _resolve_no_edit_command_grace_seconds(communicate_timeout_s)
|
|
2380
2499
|
startup_stall_watchdog_s = _resolve_startup_stall_watchdog_seconds(
|
|
2381
2500
|
communicate_timeout_s,
|
|
2382
2501
|
recovery_attempt=startup_stall_recovery_attempt,
|
|
@@ -2405,6 +2524,9 @@ def _run_codex_task(
|
|
|
2405
2524
|
if rollout_watchdog_s is not None
|
|
2406
2525
|
else None
|
|
2407
2526
|
)
|
|
2527
|
+
publishable_progress_seen_at: Optional[float] = None
|
|
2528
|
+
publishable_progress_finalized = False
|
|
2529
|
+
publishable_progress_paths: List[str] = []
|
|
2408
2530
|
|
|
2409
2531
|
while proc.poll() is None:
|
|
2410
2532
|
now = time.monotonic()
|
|
@@ -2457,6 +2579,44 @@ def _run_codex_task(
|
|
|
2457
2579
|
"before startup-stall recovery."
|
|
2458
2580
|
)
|
|
2459
2581
|
continue
|
|
2582
|
+
command_event_count = to_int(live_trace.get("command_event_count"), 0)
|
|
2583
|
+
active_commands_raw = live_trace.get("active_command_ids")
|
|
2584
|
+
active_command_count = (
|
|
2585
|
+
len(active_commands_raw)
|
|
2586
|
+
if isinstance(active_commands_raw, list)
|
|
2587
|
+
else 0
|
|
2588
|
+
)
|
|
2589
|
+
last_command_activity_at = 0.0
|
|
2590
|
+
try:
|
|
2591
|
+
last_command_activity_at = float(
|
|
2592
|
+
live_trace.get("last_command_activity_at") or 0.0
|
|
2593
|
+
)
|
|
2594
|
+
except Exception:
|
|
2595
|
+
last_command_activity_at = 0.0
|
|
2596
|
+
if command_event_count > 0 and no_edit_command_grace_s is not None:
|
|
2597
|
+
command_grace_deadline = 0.0
|
|
2598
|
+
if active_command_count > 0:
|
|
2599
|
+
# Do not kill while Codex is actively running a tool command; poll
|
|
2600
|
+
# again soon, but keep the total grace bounded by the hard cap below.
|
|
2601
|
+
command_grace_deadline = now + min(60.0, float(no_edit_command_grace_s))
|
|
2602
|
+
elif last_command_activity_at > 0:
|
|
2603
|
+
command_grace_deadline = last_command_activity_at + float(
|
|
2604
|
+
no_edit_command_grace_s
|
|
2605
|
+
)
|
|
2606
|
+
if command_grace_deadline > now:
|
|
2607
|
+
no_edit_deadline = command_grace_deadline
|
|
2608
|
+
remaining_s = int(max(1.0, command_grace_deadline - now))
|
|
2609
|
+
command_detail = (
|
|
2610
|
+
f"{active_command_count} active command(s)"
|
|
2611
|
+
if active_command_count > 0
|
|
2612
|
+
else "recent command completion"
|
|
2613
|
+
)
|
|
2614
|
+
log.info(
|
|
2615
|
+
"No-edit watchdog observed Codex tool progress "
|
|
2616
|
+
f"({command_detail}); allowing {remaining_s}s for a "
|
|
2617
|
+
"publishable patch before recovery."
|
|
2618
|
+
)
|
|
2619
|
+
continue
|
|
2460
2620
|
no_edit_artifact_only_paths = _describe_non_publishable_paths(
|
|
2461
2621
|
changed_paths,
|
|
2462
2622
|
baseline_snapshot,
|
|
@@ -2478,6 +2638,22 @@ def _run_codex_task(
|
|
|
2478
2638
|
)
|
|
2479
2639
|
_terminate_active_child()
|
|
2480
2640
|
break
|
|
2641
|
+
if publishable_progress_seen_at is None:
|
|
2642
|
+
publishable_progress_seen_at = now
|
|
2643
|
+
publishable_progress_paths = list(effective_paths)
|
|
2644
|
+
elif _has_credible_shell_wrapper_progress(effective_paths):
|
|
2645
|
+
publishable_progress_paths = list(effective_paths)
|
|
2646
|
+
publishable_age_s = now - publishable_progress_seen_at
|
|
2647
|
+
if publishable_age_s >= float(no_edit_recheck_s):
|
|
2648
|
+
publishable_progress_finalized = True
|
|
2649
|
+
log.info(
|
|
2650
|
+
"No-edit watchdog observed durable publishable file changes "
|
|
2651
|
+
f"({_describe_publishable_paths(effective_paths)}) for "
|
|
2652
|
+
f"{int(publishable_age_s)}s; stopping Codex early so "
|
|
2653
|
+
"QualityGate/ValidationGate can use the remaining budget."
|
|
2654
|
+
)
|
|
2655
|
+
_terminate_active_child()
|
|
2656
|
+
break
|
|
2481
2657
|
no_edit_deadline = now + float(no_edit_recheck_s)
|
|
2482
2658
|
log.info(
|
|
2483
2659
|
"No-edit watchdog observed publishable-looking file changes "
|
|
@@ -2639,6 +2815,35 @@ def _run_codex_task(
|
|
|
2639
2815
|
"cooldownMs": _NO_PUBLISHABLE_FAILURE_COOLDOWN_MS,
|
|
2640
2816
|
}
|
|
2641
2817
|
|
|
2818
|
+
if publishable_progress_finalized:
|
|
2819
|
+
changed_paths, _, effective_paths = _codex_changed_paths(repo, baseline_snapshot)
|
|
2820
|
+
effective_paths = effective_paths or publishable_progress_paths
|
|
2821
|
+
last_message = _read_text_if_exists(last_message_path)
|
|
2822
|
+
log_git_status(repo, log)
|
|
2823
|
+
prefix = (
|
|
2824
|
+
"Codex produced durable publishable file changes. PushPals stopped the "
|
|
2825
|
+
"Codex child early to preserve validation and revision budget; the normal "
|
|
2826
|
+
"QualityGate/ValidationGate will catch any incomplete edit."
|
|
2827
|
+
)
|
|
2828
|
+
return {
|
|
2829
|
+
"ok": True,
|
|
2830
|
+
"summary": (
|
|
2831
|
+
"openai_codex stopped after durable publishable progress "
|
|
2832
|
+
f"({len(effective_paths)} file(s))"
|
|
2833
|
+
),
|
|
2834
|
+
"stdout": _truncate(
|
|
2835
|
+
_build_success_stdout(
|
|
2836
|
+
effective_paths=effective_paths,
|
|
2837
|
+
last_message=last_message,
|
|
2838
|
+
trace_excerpt=trace_excerpt,
|
|
2839
|
+
prefix=prefix,
|
|
2840
|
+
)
|
|
2841
|
+
),
|
|
2842
|
+
"stderr": _truncate(stderr),
|
|
2843
|
+
"exitCode": 0,
|
|
2844
|
+
"usage": usage,
|
|
2845
|
+
}
|
|
2846
|
+
|
|
2642
2847
|
if no_edit_watchdog_fired:
|
|
2643
2848
|
startup_stall = _codex_trace_is_startup_stall(stdout_trace)
|
|
2644
2849
|
if startup_stall and startup_stall_recovery_attempt < _MAX_STARTUP_STALL_RECOVERY_ATTEMPTS:
|
|
@@ -1347,6 +1347,259 @@ class OpenAICodexRuntimeConfigTests(unittest.TestCase):
|
|
|
1347
1347
|
self.assertIn("Patched immediately after no-edit recovery", str(result.get("stdout") or ""))
|
|
1348
1348
|
self.assertIn("src/", str(result.get("stdout") or ""))
|
|
1349
1349
|
|
|
1350
|
+
def test_run_codex_task_no_edit_watchdog_allows_command_backed_discovery(self) -> None:
    """A command event emitted before any edit must not trip the no-edit watchdog.

    The fake Codex binary reports one ``command_execution`` item (started,
    then completed 1.4s later), waits another 1.6s, and only then writes
    ``src/command-grace.txt``. The no-edit watchdog is configured to 1s and
    the command grace to 5s, and the run is expected to finish successfully
    with the stub's final message and the ``src/`` path in stdout.
    """
    with tempfile.TemporaryDirectory(prefix="pushpals-codex-no-edit-command-grace-") as temp_dir:
        repo = Path(temp_dir) / "repo"
        repo.mkdir(parents=True, exist_ok=True)
        (repo / "README.md").write_text("# command grace repo\n", encoding="utf-8")

        # Seed a one-commit git repository for the task to run against.
        for git_command in (
            ["git", "init"],
            ["git", "config", "user.name", "PushPals Test"],
            ["git", "config", "user.email", "pushpals-tests@example.com"],
            ["git", "add", "README.md"],
            ["git", "commit", "-m", "chore: seed command grace repo"],
        ):
            subprocess.run(git_command, cwd=repo, check=True, capture_output=True, text=True)

        # Source of the fake Codex executable. Nested bodies use 4/8-space
        # indentation so the generated script is valid Python.
        stub_lines = [
            "from pathlib import Path",
            "import json",
            "import sys",
            "import time",
            "",
            "argv = sys.argv[1:]",
            "last_message_path = None",
            "for index, arg in enumerate(argv):",
            "    if arg == '--output-last-message' and index + 1 < len(argv):",
            "        last_message_path = argv[index + 1]",
            "        break",
            "",
            "sys.stdin.read()",
            "print(json.dumps({'type': 'thread.started'}), flush=True)",
            "print(json.dumps({'type': 'turn.started'}), flush=True)",
            "print(json.dumps({'type': 'item.started', 'item': {'id': 'cmd-read-target', 'type': 'command_execution', 'command': 'sed -n 1,120p README.md', 'status': 'in_progress'}}), flush=True)",
            "time.sleep(1.4)",
            "print(json.dumps({'type': 'item.completed', 'item': {'id': 'cmd-read-target', 'type': 'command_execution', 'command': 'sed -n 1,120p README.md', 'status': 'completed', 'exit_code': 0, 'aggregated_output': '# command grace repo'}}), flush=True)",
            "time.sleep(1.6)",
            "Path('src').mkdir(exist_ok=True)",
            "Path('src/command-grace.txt').write_text('patched after command-backed discovery\\n', encoding='utf-8')",
            "if last_message_path:",
            "    Path(last_message_path).write_text('Patched after command-backed discovery.', encoding='utf-8')",
            "print(json.dumps({'type': 'item.completed', 'item': {'type': 'message', 'text': 'Patched after command-backed discovery.'}}), flush=True)",
        ]
        stub_path = Path(temp_dir) / "fake_codex_no_edit_command_grace.py"
        stub_path.write_text("\n".join(stub_lines), encoding="utf-8")

        env_overrides = {
            "PUSHPALS_OPENAI_CODEX_BIN_JSON": json.dumps([sys.executable, str(stub_path)]),
            "PUSHPALS_OPENAI_CODEX_AUTH_MODE": "api_key",
            "OPENAI_API_KEY": "pushpals-no-edit-command-grace-test-key",
            "WORKERPALS_OPENAI_CODEX_JSON": "true",
            "WORKERPALS_OPENAI_CODEX_TIMEOUT_S": "20",
            # Watchdog (1s) is shorter than the stub's pre-edit delay; grace (5s) is longer.
            "WORKERPALS_OPENAI_CODEX_NO_EDIT_WATCHDOG_S": "1",
            "WORKERPALS_OPENAI_CODEX_NO_EDIT_COMMAND_GRACE_S": "5",
            "WORKERPALS_OPENAI_CODEX_PROGRESS_LOG_INTERVAL_S": "1",
        }
        with mock.patch.dict(os.environ, env_overrides, clear=False):
            result = _run_codex_task(
                str(repo),
                "Add one focused contract assertion after inspecting the hinted test.",
                [],
            )

        self.assertTrue(result.get("ok"), result)
        self.assertEqual(result.get("exitCode"), 0)
        self.assertIn("Patched after command-backed discovery", str(result.get("stdout") or ""))
        self.assertIn("src/", str(result.get("stdout") or ""))
|
|
1433
|
+
|
|
1434
|
+
def test_run_codex_task_no_edit_watchdog_extends_after_later_command_progress(self) -> None:
    """A second, later command event must extend the no-edit grace window again.

    The fake Codex binary runs two short ``command_execution`` items
    separated by a 2.2s pause, then waits 2.0s before writing
    ``src/late-command-grace.txt``. The no-edit watchdog is configured to 1s
    and the command grace to 3s, and the run is expected to finish
    successfully with the stub's final message and the ``src/`` path in stdout.
    """
    with tempfile.TemporaryDirectory(prefix="pushpals-codex-no-edit-late-command-") as temp_dir:
        repo = Path(temp_dir) / "repo"
        repo.mkdir(parents=True, exist_ok=True)
        (repo / "README.md").write_text("# late command grace repo\n", encoding="utf-8")

        # Seed a one-commit git repository for the task to run against.
        for git_command in (
            ["git", "init"],
            ["git", "config", "user.name", "PushPals Test"],
            ["git", "config", "user.email", "pushpals-tests@example.com"],
            ["git", "add", "README.md"],
            ["git", "commit", "-m", "chore: seed late command repo"],
        ):
            subprocess.run(git_command, cwd=repo, check=True, capture_output=True, text=True)

        # Source of the fake Codex executable. Nested bodies use 4/8-space
        # indentation so the generated script is valid Python.
        stub_lines = [
            "from pathlib import Path",
            "import json",
            "import sys",
            "import time",
            "",
            "argv = sys.argv[1:]",
            "last_message_path = None",
            "for index, arg in enumerate(argv):",
            "    if arg == '--output-last-message' and index + 1 < len(argv):",
            "        last_message_path = argv[index + 1]",
            "        break",
            "",
            "sys.stdin.read()",
            "print(json.dumps({'type': 'thread.started'}), flush=True)",
            "print(json.dumps({'type': 'turn.started'}), flush=True)",
            "print(json.dumps({'type': 'item.started', 'item': {'id': 'cmd-one', 'type': 'command_execution', 'command': 'cat README.md', 'status': 'in_progress'}}), flush=True)",
            "time.sleep(0.2)",
            "print(json.dumps({'type': 'item.completed', 'item': {'id': 'cmd-one', 'type': 'command_execution', 'command': 'cat README.md', 'status': 'completed', 'exit_code': 0}}), flush=True)",
            "time.sleep(2.2)",
            "print(json.dumps({'type': 'item.started', 'item': {'id': 'cmd-two', 'type': 'command_execution', 'command': 'ls', 'status': 'in_progress'}}), flush=True)",
            "time.sleep(0.2)",
            "print(json.dumps({'type': 'item.completed', 'item': {'id': 'cmd-two', 'type': 'command_execution', 'command': 'ls', 'status': 'completed', 'exit_code': 0}}), flush=True)",
            "time.sleep(2.0)",
            "Path('src').mkdir(exist_ok=True)",
            "Path('src/late-command-grace.txt').write_text('patched after later command progress\\n', encoding='utf-8')",
            "if last_message_path:",
            "    Path(last_message_path).write_text('Patched after later command progress.', encoding='utf-8')",
            "print(json.dumps({'type': 'item.completed', 'item': {'type': 'message', 'text': 'Patched after later command progress.'}}), flush=True)",
        ]
        stub_path = Path(temp_dir) / "fake_codex_late_command_grace.py"
        stub_path.write_text("\n".join(stub_lines), encoding="utf-8")

        env_overrides = {
            "PUSHPALS_OPENAI_CODEX_BIN_JSON": json.dumps([sys.executable, str(stub_path)]),
            "PUSHPALS_OPENAI_CODEX_AUTH_MODE": "api_key",
            "OPENAI_API_KEY": "pushpals-no-edit-late-command-test-key",
            "WORKERPALS_OPENAI_CODEX_JSON": "true",
            "WORKERPALS_OPENAI_CODEX_TIMEOUT_S": "20",
            # The 3s grace is shorter than the stub's total pre-edit time, so
            # the run only passes if the second command restarts the window.
            "WORKERPALS_OPENAI_CODEX_NO_EDIT_WATCHDOG_S": "1",
            "WORKERPALS_OPENAI_CODEX_NO_EDIT_COMMAND_GRACE_S": "3",
            "WORKERPALS_OPENAI_CODEX_PROGRESS_LOG_INTERVAL_S": "1",
        }
        with mock.patch.dict(os.environ, env_overrides, clear=False):
            result = _run_codex_task(
                str(repo),
                "Add one focused contract assertion after a later targeted read.",
                [],
            )

        self.assertTrue(result.get("ok"), result)
        self.assertEqual(result.get("exitCode"), 0)
        self.assertIn("Patched after later command progress", str(result.get("stdout") or ""))
        self.assertIn("src/", str(result.get("stdout") or ""))
|
|
1521
|
+
|
|
1522
|
+
def test_run_codex_task_finalizes_after_durable_publishable_progress(self) -> None:
    """A run whose patch exists but whose child keeps running is finalized early.

    The fake Codex binary writes ``src/durable-progress.txt`` right away and
    then sleeps for 10s. With the no-edit watchdog and recheck interval both
    set to 1s, the run is expected to return ``ok`` with exit code 0, a
    summary mentioning "stopped after durable publishable progress", and the
    ``src/`` path in stdout.
    """
    with tempfile.TemporaryDirectory(prefix="pushpals-codex-durable-progress-") as temp_dir:
        repo = Path(temp_dir) / "repo"
        repo.mkdir(parents=True, exist_ok=True)
        (repo / "README.md").write_text("# durable progress repo\n", encoding="utf-8")

        # Seed a one-commit git repository for the task to run against.
        for git_command in (
            ["git", "init"],
            ["git", "config", "user.name", "PushPals Test"],
            ["git", "config", "user.email", "pushpals-tests@example.com"],
            ["git", "add", "README.md"],
            ["git", "commit", "-m", "chore: seed durable progress repo"],
        ):
            subprocess.run(git_command, cwd=repo, check=True, capture_output=True, text=True)

        # Source of the fake Codex executable. Nested bodies use 4/8-space
        # indentation so the generated script is valid Python.
        stub_lines = [
            "from pathlib import Path",
            "import json",
            "import sys",
            "import time",
            "",
            "argv = sys.argv[1:]",
            "last_message_path = None",
            "for index, arg in enumerate(argv):",
            "    if arg == '--output-last-message' and index + 1 < len(argv):",
            "        last_message_path = argv[index + 1]",
            "        break",
            "",
            "sys.stdin.read()",
            "print(json.dumps({'type': 'thread.started'}), flush=True)",
            "print(json.dumps({'type': 'turn.started'}), flush=True)",
            "Path('src').mkdir(exist_ok=True)",
            "Path('src/durable-progress.txt').write_text('durable patch\\n', encoding='utf-8')",
            "if last_message_path:",
            "    Path(last_message_path).write_text('Created durable patch and kept thinking.', encoding='utf-8')",
            "print(json.dumps({'type': 'item.completed', 'item': {'type': 'message', 'text': 'Created durable patch and kept thinking.'}}), flush=True)",
            "time.sleep(10)",
        ]
        stub_path = Path(temp_dir) / "fake_codex_durable_progress.py"
        stub_path.write_text("\n".join(stub_lines), encoding="utf-8")

        env_overrides = {
            "PUSHPALS_OPENAI_CODEX_BIN_JSON": json.dumps([sys.executable, str(stub_path)]),
            "PUSHPALS_OPENAI_CODEX_AUTH_MODE": "api_key",
            "OPENAI_API_KEY": "pushpals-durable-progress-test-key",
            "WORKERPALS_OPENAI_CODEX_JSON": "true",
            "WORKERPALS_OPENAI_CODEX_TIMEOUT_S": "20",
            # Both intervals are far shorter than the stub's 10s trailing sleep.
            "WORKERPALS_OPENAI_CODEX_NO_EDIT_WATCHDOG_S": "1",
            "WORKERPALS_OPENAI_CODEX_NO_EDIT_RECHECK_S": "1",
            "WORKERPALS_OPENAI_CODEX_PROGRESS_LOG_INTERVAL_S": "1",
        }
        with mock.patch.dict(os.environ, env_overrides, clear=False):
            result = _run_codex_task(
                str(repo),
                "Make a focused patch and stop once it is durable.",
                [],
            )

        self.assertTrue(result.get("ok"), result)
        self.assertEqual(result.get("exitCode"), 0)
        self.assertIn("stopped after durable publishable progress", str(result.get("summary") or ""))
        self.assertIn("src/", str(result.get("stdout") or ""))
|
|
1602
|
+
|
|
1350
1603
|
def test_run_codex_task_recovery_attempt_is_still_guarded_by_no_edit_watchdog(self) -> None:
|
|
1351
1604
|
with tempfile.TemporaryDirectory(prefix="pushpals-codex-no-edit-watchdog-fail-") as temp_dir:
|
|
1352
1605
|
repo = Path(temp_dir) / "repo"
|
|
@@ -335,6 +335,22 @@ export function shouldSkipCriticForDeterministicValidationRevision(opts: {
|
|
|
335
335
|
return opts.validationRuns.some(isDeterministicFastValidationFailure);
|
|
336
336
|
}
|
|
337
337
|
|
|
338
|
+
/**
 * Decides whether the critic pass should be skipped so that enough budget
 * remains for a revision turn.
 *
 * The critic is only ever skipped when a revision is already required. In
 * that case it is skipped when the remaining budget is strictly less than the
 * minimum revision budget plus the critic's worst-case duration (its timeout,
 * doubled when the timeout behavior is `"retry_once"`). All millisecond
 * inputs are floored and clamped to be non-negative before comparison.
 *
 * @param opts.deterministicRequiresRevision Whether a revision is already required.
 * @param opts.remainingBudgetMs Budget still available, in milliseconds.
 * @param opts.minimumRevisionBudgetMs Budget to reserve for the revision, in milliseconds.
 * @param opts.criticTimeoutMs Timeout of a single critic attempt, in milliseconds.
 * @param opts.criticTimeoutBehavior Timeout policy; only `"retry_once"` changes the result.
 * @returns `true` when the critic should be skipped.
 */
export function shouldSkipCriticToPreserveRevisionBudget(opts: {
  deterministicRequiresRevision: boolean;
  remainingBudgetMs: number;
  minimumRevisionBudgetMs: number;
  criticTimeoutMs: number;
  criticTimeoutBehavior: "skip" | "retry_once" | "block" | string;
}): boolean {
  if (!opts.deterministicRequiresRevision) {
    return false;
  }

  const toWholeNonNegativeMs = (value: number): number => Math.max(0, Math.floor(value));

  const availableMs = toWholeNonNegativeMs(opts.remainingBudgetMs);
  const reservedForRevisionMs = toWholeNonNegativeMs(opts.minimumRevisionBudgetMs);

  // A single retry means the critic can consume its timeout twice.
  let worstCaseCriticMs = toWholeNonNegativeMs(opts.criticTimeoutMs);
  if (opts.criticTimeoutBehavior === "retry_once") {
    worstCaseCriticMs *= 2;
  }

  return availableMs < reservedForRevisionMs + worstCaseCriticMs;
}
|
|
353
|
+
|
|
338
354
|
export function workerAttemptRolloutScore(params: {
|
|
339
355
|
executorElapsedMs: number;
|
|
340
356
|
qualityElapsedMs: number;
|
|
@@ -7975,11 +7991,23 @@ export async function executeJob(
|
|
|
7975
7991
|
validationOutsideTaskScope,
|
|
7976
7992
|
validationRuns: quality.validationRuns,
|
|
7977
7993
|
});
|
|
7994
|
+
const preCriticRevisionBudget = qualityRevisionBudgetDecision({
|
|
7995
|
+
jobElapsedMs: Date.now() - jobStartedAt,
|
|
7996
|
+
executionBudgetMs,
|
|
7997
|
+
});
|
|
7998
|
+
const skipCriticForRevisionBudget = shouldSkipCriticToPreserveRevisionBudget({
|
|
7999
|
+
deterministicRequiresRevision: preCriticDeterministicRequiresRevision,
|
|
8000
|
+
remainingBudgetMs: preCriticRevisionBudget.remainingBudgetMs,
|
|
8001
|
+
minimumRevisionBudgetMs: preCriticRevisionBudget.minimumRevisionBudgetMs,
|
|
8002
|
+
criticTimeoutMs: resolveQualityCriticTimeoutMs(runtimeConfig),
|
|
8003
|
+
criticTimeoutBehavior: resolveQualityCriticTimeoutBehavior(runtimeConfig),
|
|
8004
|
+
});
|
|
7978
8005
|
const critic =
|
|
7979
8006
|
quality.skipped ||
|
|
7980
8007
|
!qualityGatePolicy.criticGateEnabled ||
|
|
7981
8008
|
skipCriticAfterExecutorTimeout ||
|
|
7982
|
-
skipCriticForDeterministicValidationRevision
|
|
8009
|
+
skipCriticForDeterministicValidationRevision ||
|
|
8010
|
+
skipCriticForRevisionBudget
|
|
7983
8011
|
? null
|
|
7984
8012
|
: executor === "openai_codex"
|
|
7985
8013
|
? await runCodexCriticReview(repo, attemptParams, qualityForCritic, runtimeConfig, onLog)
|
|
@@ -8020,6 +8048,11 @@ export async function executeJob(
|
|
|
8020
8048
|
"stdout",
|
|
8021
8049
|
"[CriticGate] Skipping critic because deterministic fast validation already requires a quality revision.",
|
|
8022
8050
|
);
|
|
8051
|
+
} else if (skipCriticForRevisionBudget) {
|
|
8052
|
+
onLog?.(
|
|
8053
|
+
"stdout",
|
|
8054
|
+
`[CriticGate] Skipping critic because deterministic quality already requires revision and remaining budget (${preCriticRevisionBudget.remainingBudgetMs}ms) must be reserved for the next worker turn.`,
|
|
8055
|
+
);
|
|
8023
8056
|
}
|
|
8024
8057
|
const rolloutScore = workerAttemptRolloutScore({
|
|
8025
8058
|
executorElapsedMs,
|