@pushpalsdev/cli 1.1.34 → 1.1.36
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/runtime/sandbox/apps/workerpals/src/backends/openai_codex/openai_codex_executor.py +277 -8
- package/runtime/sandbox/apps/workerpals/src/backends/openai_codex/test_openai_codex_runtime_config.py +174 -2
- package/runtime/sandbox/apps/workerpals/src/backends/shared/executor_base.py +35 -3
- package/runtime/sandbox/apps/workerpals/src/docker_executor.ts +0 -2
package/package.json
CHANGED
|
@@ -116,6 +116,9 @@ _WEB_REVIEW_NO_EDIT_WATCHDOG_S = 240
|
|
|
116
116
|
_BACKGROUND_NO_EDIT_WATCHDOG_S = 120
|
|
117
117
|
_NO_EDIT_RECOVERY_WATCHDOG_S = 90
|
|
118
118
|
_DEFAULT_NO_EDIT_RECHECK_S = 120
|
|
119
|
+
_DEFAULT_NO_EDIT_COMMAND_GRACE_S = 240
|
|
120
|
+
_DEFAULT_STARTUP_STALL_WATCHDOG_S = 210
|
|
121
|
+
_RECOVERY_STARTUP_STALL_WATCHDOG_S = 150
|
|
119
122
|
_DEFAULT_ROLLOUT_WATCHDOG_S = 300
|
|
120
123
|
_SMALL_TASK_ROLLOUT_WATCHDOG_S = 240
|
|
121
124
|
_NARROW_TEST_TASK_ROLLOUT_WATCHDOG_S = 150
|
|
@@ -755,6 +758,65 @@ def _resolve_no_edit_recheck_seconds(communicate_timeout_s: Optional[int]) -> in
|
|
|
755
758
|
return max(1, min(_DEFAULT_NO_EDIT_RECHECK_S, upper))
|
|
756
759
|
|
|
757
760
|
|
|
761
|
+
def _resolve_no_edit_command_grace_seconds(communicate_timeout_s: Optional[int]) -> Optional[int]:
    """Resolve the command-progress grace period, in seconds.

    Args:
        communicate_timeout_s: Overall communicate timeout. A falsy value
            (``None`` or ``0``) disables the grace period.

    Returns:
        ``None`` when the grace period is disabled (no timeout, or the
        ``WORKERPALS_OPENAI_CODEX_NO_EDIT_COMMAND_GRACE_S`` override is
        exactly ``"0"``). Otherwise the override, or the module default when
        the override is unset or rejected by ``_to_positive_int``, clamped to
        ``[1, max(1, communicate_timeout_s - 1)]``.
    """
    if not communicate_timeout_s:
        return None

    # Every non-None result is capped one second below the overall timeout.
    ceiling = max(1, communicate_timeout_s - 1)

    override = os.environ.get("WORKERPALS_OPENAI_CODEX_NO_EDIT_COMMAND_GRACE_S", "").strip()
    if override == "0":
        # Explicit opt-out.
        return None
    if override:
        requested = _to_positive_int(override)
        if requested is not None:
            return max(1, min(requested, ceiling))
        log.info(
            "Invalid WORKERPALS_OPENAI_CODEX_NO_EDIT_COMMAND_GRACE_S="
            f"{override!r}; using default command-progress grace."
        )

    return max(1, min(_DEFAULT_NO_EDIT_COMMAND_GRACE_S, ceiling))
|
|
780
|
+
|
|
781
|
+
|
|
782
|
+
def _resolve_startup_stall_watchdog_seconds(
    communicate_timeout_s: Optional[int],
    recovery_attempt: int = 0,
) -> Optional[int]:
    """Resolve the startup-stall watchdog interval, in seconds.

    Args:
        communicate_timeout_s: Overall communicate timeout. A falsy value
            disables the watchdog.
        recovery_attempt: When greater than zero, the recovery default is
            used instead of the first-attempt default.

    Returns:
        ``None`` when the watchdog is disabled (no timeout, or the
        ``WORKERPALS_OPENAI_CODEX_STARTUP_STALL_WATCHDOG_S`` override is
        exactly ``"0"``). A valid override is clamped to
        ``[1, max(1, communicate_timeout_s - 1)]``. Without a valid override
        the applicable default is capped at ``communicate_timeout_s - 1`` but
        never drops below 60 seconds.
    """
    if not communicate_timeout_s:
        return None

    override = os.environ.get("WORKERPALS_OPENAI_CODEX_STARTUP_STALL_WATCHDOG_S", "").strip()
    if override == "0":
        # Explicit opt-out.
        return None
    if override:
        requested = _to_positive_int(override)
        if requested is not None:
            return max(1, min(requested, max(1, communicate_timeout_s - 1)))
        log.info(
            "Invalid WORKERPALS_OPENAI_CODEX_STARTUP_STALL_WATCHDOG_S="
            f"{override!r}; using default startup-stall watchdog."
        )

    if recovery_attempt > 0:
        baseline = _RECOVERY_STARTUP_STALL_WATCHDOG_S
    else:
        baseline = _DEFAULT_STARTUP_STALL_WATCHDOG_S

    # The default path keeps a 60-second floor, even when that exceeds the
    # communicate timeout itself.
    minimum = 60
    capped = min(baseline, max(minimum, communicate_timeout_s - 1))
    return max(minimum, capped)
|
|
809
|
+
|
|
810
|
+
|
|
811
|
+
def _startup_stall_recovery_model(current_model: str) -> str:
    """Pick the model to use when restarting Codex after a startup stall.

    Returns ``LEGACY_CODEX_MODEL_FALLBACK`` unless the stripped current model
    already equals it ignoring case, in which case the caller's own spelling
    is returned unchanged.
    """
    candidate = str(current_model or "").strip()
    already_fallback = bool(candidate) and (
        candidate.lower() == LEGACY_CODEX_MODEL_FALLBACK.lower()
    )
    return candidate if already_fallback else LEGACY_CODEX_MODEL_FALLBACK
|
|
818
|
+
|
|
819
|
+
|
|
758
820
|
def _looks_like_web_review_prompt(prompt: str) -> bool:
|
|
759
821
|
text = str(prompt or "").lower()
|
|
760
822
|
return "repo-native web review" in text or "web review path" in text
|
|
@@ -1299,10 +1361,94 @@ def _empty_codex_trace() -> Dict[str, Any]:
|
|
|
1299
1361
|
"prompt_tokens": 0,
|
|
1300
1362
|
"completion_tokens": 0,
|
|
1301
1363
|
"total_tokens": 0,
|
|
1364
|
+
"active_command_ids": [],
|
|
1365
|
+
"command_event_count": 0,
|
|
1366
|
+
"last_command_activity_at": None,
|
|
1367
|
+
"last_command_summary": "",
|
|
1302
1368
|
}
|
|
1303
1369
|
|
|
1304
1370
|
|
|
1305
|
-
def
|
|
1371
|
+
def _looks_like_codex_command_item(value: Any) -> bool:
|
|
1372
|
+
if not isinstance(value, dict):
|
|
1373
|
+
return False
|
|
1374
|
+
type_text = " ".join(
|
|
1375
|
+
str(value.get(key) or "")
|
|
1376
|
+
for key in ("type", "item_type", "kind", "name", "tool_name")
|
|
1377
|
+
).lower()
|
|
1378
|
+
if any(marker in type_text for marker in ("command_execution", "exec_command", "shell_command")):
|
|
1379
|
+
return True
|
|
1380
|
+
return any(key in value for key in ("command", "cmd", "exit_code", "aggregated_output"))
|
|
1381
|
+
|
|
1382
|
+
|
|
1383
|
+
def _record_codex_command_activity(
    parsed: Dict[str, Any],
    event_type: str,
    trace: Dict[str, Any],
    now: float,
) -> None:
    """Fold one parsed Codex event into the trace's command-activity fields.

    Events that do not look like a command execution, on either the nested
    ``item`` or the event itself, are ignored. Otherwise the function
    maintains ``trace["active_command_ids"]``, increments
    ``trace["command_event_count"]``, and refreshes
    ``trace["last_command_activity_at"]`` and ``trace["last_command_summary"]``.

    Args:
        parsed: Decoded JSON event.
        event_type: Event type label; matched case-insensitively.
        trace: Mutable trace state, updated in place.
        now: Timestamp stored as the latest command activity time.
    """
    nested = parsed.get("item")
    source: Any = nested if _looks_like_codex_command_item(nested) else parsed
    if not _looks_like_codex_command_item(source):
        return
    # From here on ``source`` is a dict: the predicate rejects everything else.

    # Short human-readable label: first non-blank string among these fields.
    summary = ""
    for field in ("command", "cmd", "name"):
        candidate = source.get(field)
        if isinstance(candidate, str) and candidate.strip():
            summary = _truncate_inline(candidate.strip(), 160)
            break

    # Prefer an explicit identifier; fall back to the label, then a constant.
    identifier = (
        str(
            source.get("id")
            or source.get("call_id")
            or source.get("item_id")
            or summary
            or "command"
        ).strip()
        or "command"
    )

    running = trace.setdefault("active_command_ids", [])
    if not isinstance(running, list):
        running = []
        trace["active_command_ids"] = running

    status = " ".join(
        str(source.get(field) or "") for field in ("status", "state", "outcome")
    ).lower()
    event_name = event_type.lower()

    finished = any(word in event_name for word in ("completed", "failed", "error")) or any(
        marker in status
        for marker in ("completed", "failed", "cancelled", "canceled", "exited")
    )
    if finished:
        # Completion wins over start markers on the same event.
        trace["active_command_ids"] = [
            entry for entry in running if str(entry) != identifier
        ]
    else:
        began = any(word in event_name for word in ("started", "updated")) or any(
            marker in status for marker in ("running", "in_progress", "started")
        )
        if began and identifier not in running:
            running.append(identifier)

    trace["command_event_count"] = to_int(trace.get("command_event_count"), 0) + 1
    trace["last_command_activity_at"] = float(now)
    trace["last_command_summary"] = summary or event_type
|
|
1444
|
+
|
|
1445
|
+
|
|
1446
|
+
def _record_live_codex_stdout_line(
|
|
1447
|
+
line: str,
|
|
1448
|
+
use_json: bool,
|
|
1449
|
+
trace: Dict[str, Any],
|
|
1450
|
+
now: Optional[float] = None,
|
|
1451
|
+
) -> None:
|
|
1306
1452
|
stripped = line.strip()
|
|
1307
1453
|
if not stripped:
|
|
1308
1454
|
return
|
|
@@ -1329,6 +1475,7 @@ def _record_live_codex_stdout_line(line: str, use_json: bool, trace: Dict[str, A
|
|
|
1329
1475
|
return
|
|
1330
1476
|
|
|
1331
1477
|
if isinstance(parsed, dict):
|
|
1478
|
+
observed_at = float(now if now is not None else time.monotonic())
|
|
1332
1479
|
usage = _extract_usage_counts(parsed)
|
|
1333
1480
|
if usage is not None:
|
|
1334
1481
|
trace["prompt_tokens"] = max(
|
|
@@ -1345,6 +1492,7 @@ def _record_live_codex_stdout_line(line: str, use_json: bool, trace: Dict[str, A
|
|
|
1345
1492
|
.strip()
|
|
1346
1493
|
or "event"
|
|
1347
1494
|
)
|
|
1495
|
+
_record_codex_command_activity(parsed, event_type, trace, observed_at)
|
|
1348
1496
|
event_type_counts[event_type] = to_int(event_type_counts.get(event_type), 0) + 1
|
|
1349
1497
|
summary = _summarize_json_event(parsed)
|
|
1350
1498
|
# Reasoning can arrive under generic event types (for example item.updated).
|
|
@@ -1409,10 +1557,13 @@ def _finalize_codex_stdout_trace(trace: Dict[str, Any], use_json: bool) -> Dict[
|
|
|
1409
1557
|
prompt_tokens = to_int(trace.get("prompt_tokens"), 0)
|
|
1410
1558
|
completion_tokens = to_int(trace.get("completion_tokens"), 0)
|
|
1411
1559
|
total_tokens = to_int(trace.get("total_tokens"), 0)
|
|
1560
|
+
command_event_count = to_int(trace.get("command_event_count"), 0)
|
|
1412
1561
|
if reasoning_events > 0:
|
|
1413
1562
|
log.info(f"[codex] Reasoning-like event(s): {reasoning_events}")
|
|
1414
1563
|
elif use_json and valid_json > 0:
|
|
1415
1564
|
log.info("[codex] No reasoning-like events observed in this run.")
|
|
1565
|
+
if command_event_count > 0:
|
|
1566
|
+
log.info(f"[codex] Command execution event(s): {command_event_count}")
|
|
1416
1567
|
if total_tokens > 0:
|
|
1417
1568
|
log.info(
|
|
1418
1569
|
f"[codex] Usage observed: prompt={prompt_tokens} completion={completion_tokens} total={total_tokens}"
|
|
@@ -1433,6 +1584,7 @@ def _finalize_codex_stdout_trace(trace: Dict[str, Any], use_json: bool) -> Dict[
|
|
|
1433
1584
|
"prompt_tokens": prompt_tokens,
|
|
1434
1585
|
"completion_tokens": completion_tokens,
|
|
1435
1586
|
"total_tokens": total_tokens,
|
|
1587
|
+
"command_event_count": command_event_count,
|
|
1436
1588
|
}
|
|
1437
1589
|
|
|
1438
1590
|
|
|
@@ -2259,8 +2411,14 @@ def _run_codex_task(
|
|
|
2259
2411
|
if not line:
|
|
2260
2412
|
continue
|
|
2261
2413
|
with trace_lock:
|
|
2262
|
-
|
|
2263
|
-
|
|
2414
|
+
observed_at = time.monotonic()
|
|
2415
|
+
last_activity_at["ts"] = observed_at
|
|
2416
|
+
_record_live_codex_stdout_line(
|
|
2417
|
+
line,
|
|
2418
|
+
use_json,
|
|
2419
|
+
stdout_trace_state,
|
|
2420
|
+
observed_at,
|
|
2421
|
+
)
|
|
2264
2422
|
except Exception:
|
|
2265
2423
|
pass
|
|
2266
2424
|
finally:
|
|
@@ -2337,6 +2495,16 @@ def _run_codex_task(
|
|
|
2337
2495
|
else None
|
|
2338
2496
|
)
|
|
2339
2497
|
no_edit_recheck_s = _resolve_no_edit_recheck_seconds(communicate_timeout_s)
|
|
2498
|
+
no_edit_command_grace_s = _resolve_no_edit_command_grace_seconds(communicate_timeout_s)
|
|
2499
|
+
startup_stall_watchdog_s = _resolve_startup_stall_watchdog_seconds(
|
|
2500
|
+
communicate_timeout_s,
|
|
2501
|
+
recovery_attempt=startup_stall_recovery_attempt,
|
|
2502
|
+
)
|
|
2503
|
+
startup_stall_deadline = (
|
|
2504
|
+
started_at + float(startup_stall_watchdog_s)
|
|
2505
|
+
if startup_stall_watchdog_s is not None
|
|
2506
|
+
else None
|
|
2507
|
+
)
|
|
2340
2508
|
rollout_watchdog_s = (
|
|
2341
2509
|
_resolve_rollout_watchdog_seconds(
|
|
2342
2510
|
prompt,
|
|
@@ -2351,6 +2519,11 @@ def _run_codex_task(
|
|
|
2351
2519
|
if no_edit_watchdog_s is not None
|
|
2352
2520
|
else None
|
|
2353
2521
|
)
|
|
2522
|
+
no_edit_command_grace_cap_deadline = (
|
|
2523
|
+
started_at + float(no_edit_watchdog_s + no_edit_command_grace_s)
|
|
2524
|
+
if no_edit_watchdog_s is not None and no_edit_command_grace_s is not None
|
|
2525
|
+
else None
|
|
2526
|
+
)
|
|
2354
2527
|
rollout_deadline = (
|
|
2355
2528
|
started_at + float(rollout_watchdog_s)
|
|
2356
2529
|
if rollout_watchdog_s is not None
|
|
@@ -2364,9 +2537,93 @@ def _run_codex_task(
|
|
|
2364
2537
|
_terminate_active_child()
|
|
2365
2538
|
break
|
|
2366
2539
|
|
|
2540
|
+
if startup_stall_deadline is not None and now >= startup_stall_deadline:
|
|
2541
|
+
with trace_lock:
|
|
2542
|
+
live_trace = dict(stdout_trace_state)
|
|
2543
|
+
summaries = stdout_trace_state.get("summaries")
|
|
2544
|
+
if isinstance(summaries, list):
|
|
2545
|
+
live_trace["summaries"] = list(summaries)
|
|
2546
|
+
if _codex_trace_is_startup_stall(live_trace):
|
|
2547
|
+
changed_paths, _, effective_paths = _codex_changed_paths(repo, baseline_snapshot)
|
|
2548
|
+
if not effective_paths:
|
|
2549
|
+
no_edit_artifact_only_paths = _describe_non_publishable_paths(
|
|
2550
|
+
changed_paths,
|
|
2551
|
+
baseline_snapshot,
|
|
2552
|
+
)
|
|
2553
|
+
no_edit_watchdog_fired = True
|
|
2554
|
+
elapsed_s = int(max(0.0, now - started_at))
|
|
2555
|
+
log.info(
|
|
2556
|
+
f"Startup-stall watchdog fired after {elapsed_s}s with no assistant/tool progress."
|
|
2557
|
+
)
|
|
2558
|
+
_terminate_active_child()
|
|
2559
|
+
break
|
|
2560
|
+
startup_stall_deadline = None
|
|
2561
|
+
|
|
2367
2562
|
if no_edit_deadline is not None and now >= no_edit_deadline:
|
|
2368
2563
|
changed_paths, _, effective_paths = _codex_changed_paths(repo, baseline_snapshot)
|
|
2369
2564
|
if not effective_paths:
|
|
2565
|
+
with trace_lock:
|
|
2566
|
+
live_trace = dict(stdout_trace_state)
|
|
2567
|
+
summaries = stdout_trace_state.get("summaries")
|
|
2568
|
+
if isinstance(summaries, list):
|
|
2569
|
+
live_trace["summaries"] = list(summaries)
|
|
2570
|
+
startup_only = _codex_trace_is_startup_stall(live_trace)
|
|
2571
|
+
if (
|
|
2572
|
+
startup_only
|
|
2573
|
+
and startup_stall_deadline is not None
|
|
2574
|
+
and now < startup_stall_deadline
|
|
2575
|
+
):
|
|
2576
|
+
no_edit_deadline = startup_stall_deadline
|
|
2577
|
+
remaining_s = int(max(1.0, startup_stall_deadline - now))
|
|
2578
|
+
log.info(
|
|
2579
|
+
"No-edit watchdog observed only Codex startup events; "
|
|
2580
|
+
f"allowing {remaining_s}s for first assistant/tool progress "
|
|
2581
|
+
"before startup-stall recovery."
|
|
2582
|
+
)
|
|
2583
|
+
continue
|
|
2584
|
+
command_event_count = to_int(live_trace.get("command_event_count"), 0)
|
|
2585
|
+
active_commands_raw = live_trace.get("active_command_ids")
|
|
2586
|
+
active_command_count = (
|
|
2587
|
+
len(active_commands_raw)
|
|
2588
|
+
if isinstance(active_commands_raw, list)
|
|
2589
|
+
else 0
|
|
2590
|
+
)
|
|
2591
|
+
last_command_activity_at = 0.0
|
|
2592
|
+
try:
|
|
2593
|
+
last_command_activity_at = float(
|
|
2594
|
+
live_trace.get("last_command_activity_at") or 0.0
|
|
2595
|
+
)
|
|
2596
|
+
except Exception:
|
|
2597
|
+
last_command_activity_at = 0.0
|
|
2598
|
+
if command_event_count > 0 and no_edit_command_grace_s is not None:
|
|
2599
|
+
command_grace_deadline = 0.0
|
|
2600
|
+
if active_command_count > 0:
|
|
2601
|
+
# Do not kill while Codex is actively running a tool command; poll
|
|
2602
|
+
# again soon, but keep the total grace bounded by the hard cap below.
|
|
2603
|
+
command_grace_deadline = now + min(60.0, float(no_edit_command_grace_s))
|
|
2604
|
+
elif last_command_activity_at > 0:
|
|
2605
|
+
command_grace_deadline = last_command_activity_at + float(
|
|
2606
|
+
no_edit_command_grace_s
|
|
2607
|
+
)
|
|
2608
|
+
if no_edit_command_grace_cap_deadline is not None:
|
|
2609
|
+
command_grace_deadline = min(
|
|
2610
|
+
command_grace_deadline,
|
|
2611
|
+
no_edit_command_grace_cap_deadline,
|
|
2612
|
+
)
|
|
2613
|
+
if command_grace_deadline > now:
|
|
2614
|
+
no_edit_deadline = command_grace_deadline
|
|
2615
|
+
remaining_s = int(max(1.0, command_grace_deadline - now))
|
|
2616
|
+
command_detail = (
|
|
2617
|
+
f"{active_command_count} active command(s)"
|
|
2618
|
+
if active_command_count > 0
|
|
2619
|
+
else "recent command completion"
|
|
2620
|
+
)
|
|
2621
|
+
log.info(
|
|
2622
|
+
"No-edit watchdog observed Codex tool progress "
|
|
2623
|
+
f"({command_detail}); allowing {remaining_s}s for a "
|
|
2624
|
+
"publishable patch before recovery."
|
|
2625
|
+
)
|
|
2626
|
+
continue
|
|
2370
2627
|
no_edit_artifact_only_paths = _describe_non_publishable_paths(
|
|
2371
2628
|
changed_paths,
|
|
2372
2629
|
baseline_snapshot,
|
|
@@ -2377,9 +2634,15 @@ def _run_codex_task(
|
|
|
2377
2634
|
if no_edit_artifact_only_paths
|
|
2378
2635
|
else ""
|
|
2379
2636
|
)
|
|
2380
|
-
|
|
2381
|
-
|
|
2382
|
-
|
|
2637
|
+
if startup_only:
|
|
2638
|
+
elapsed_s = int(max(0.0, now - started_at))
|
|
2639
|
+
log.info(
|
|
2640
|
+
f"Startup-stall watchdog fired after {elapsed_s}s with no assistant/tool progress."
|
|
2641
|
+
)
|
|
2642
|
+
else:
|
|
2643
|
+
log.info(
|
|
2644
|
+
f"No-edit watchdog fired after {int(no_edit_watchdog_s or 0)}s with no publishable file changes.{artifact_detail} Retrying with patch-first guidance."
|
|
2645
|
+
)
|
|
2383
2646
|
_terminate_active_child()
|
|
2384
2647
|
break
|
|
2385
2648
|
no_edit_deadline = now + float(no_edit_recheck_s)
|
|
@@ -2550,9 +2813,15 @@ def _run_codex_task(
|
|
|
2550
2813
|
*supplemental_guidance,
|
|
2551
2814
|
_build_startup_stall_recovery_guidance(trace_excerpt),
|
|
2552
2815
|
]
|
|
2816
|
+
recovery_model = _startup_stall_recovery_model(model)
|
|
2817
|
+
recovery_detail = (
|
|
2818
|
+
f" using fallback model {recovery_model!r}"
|
|
2819
|
+
if recovery_model and recovery_model != model
|
|
2820
|
+
else ""
|
|
2821
|
+
)
|
|
2553
2822
|
log.warning(
|
|
2554
2823
|
"Codex emitted only startup events before the no-edit watchdog; "
|
|
2555
|
-
"restarting Codex once before classifying the job terminally."
|
|
2824
|
+
f"restarting Codex once{recovery_detail} before classifying the job terminally."
|
|
2556
2825
|
)
|
|
2557
2826
|
retry_result = _run_codex_task(
|
|
2558
2827
|
repo,
|
|
@@ -2563,7 +2832,7 @@ def _run_codex_task(
|
|
|
2563
2832
|
startup_stall_recovery_attempt=startup_stall_recovery_attempt + 1,
|
|
2564
2833
|
no_edit_recovery_attempt=no_edit_recovery_attempt,
|
|
2565
2834
|
rollout_recovery_attempt=rollout_recovery_attempt,
|
|
2566
|
-
model_override=model_override,
|
|
2835
|
+
model_override=recovery_model or model_override,
|
|
2567
2836
|
baseline_changes=baseline_snapshot,
|
|
2568
2837
|
)
|
|
2569
2838
|
retry_result["usage"] = _merge_usage_records(usage, retry_result.get("usage"))
|
|
@@ -49,6 +49,7 @@ from openai_codex_executor import (
|
|
|
49
49
|
_resolve_codex_command_prefix,
|
|
50
50
|
_resolve_no_edit_watchdog_seconds,
|
|
51
51
|
_resolve_rollout_watchdog_seconds,
|
|
52
|
+
_resolve_startup_stall_watchdog_seconds,
|
|
52
53
|
_unwrap_shell_wrapper_command,
|
|
53
54
|
_usage_from_trace_or_estimate,
|
|
54
55
|
)
|
|
@@ -372,6 +373,63 @@ class OpenAICodexRuntimeConfigTests(unittest.TestCase):
|
|
|
372
373
|
self.assertEqual(task.repo, str(repo.resolve()))
|
|
373
374
|
self.assertEqual(task.instruction, "Make one small publishable change")
|
|
374
375
|
|
|
376
|
+
def test_parse_payload_accepts_positional_payload_file_path(self) -> None:
    """A bare positional argument naming a file is read as the encoded payload."""
    instruction = "Recover from a direct-worker payload handoff"
    with tempfile.TemporaryDirectory(prefix="pushpals-payload-file-positional-") as temp_dir:
        workdir = Path(temp_dir)
        repo = workdir / "repo"
        repo.mkdir(parents=True, exist_ok=True)

        body = json.dumps(
            {
                "kind": "task.execute",
                "repo": str(repo),
                "params": {"instruction": instruction},
            }
        )
        payload_file = workdir / "payload.b64"
        payload_file.write_text(
            base64.b64encode(body.encode("utf-8")).decode("ascii"),
            encoding="utf-8",
        )

        # No --payload-file flag: the path is passed positionally.
        argv = ["executor", str(payload_file)]
        task = parse_task_execute_payload(argv, logger=Logger("[test]"))

        self.assertEqual(task.kind, "task.execute")
        self.assertEqual(task.repo, str(repo.resolve()))
        self.assertEqual(task.instruction, instruction)
|
|
397
|
+
|
|
398
|
+
def test_parse_payload_accepts_unpadded_base64_payload(self) -> None:
    """A base64 payload with its ``=`` padding stripped still parses."""
    instruction = "Accept wrapper-normalized payload padding"
    with tempfile.TemporaryDirectory(prefix="pushpals-payload-unpadded-") as temp_dir:
        repo = Path(temp_dir) / "repo"
        repo.mkdir(parents=True, exist_ok=True)

        body = json.dumps(
            {
                "kind": "task.execute",
                "repo": str(repo),
                "params": {"instruction": instruction},
            }
        )
        # NOTE(review): if the encoded length needs no padding, rstrip is a
        # no-op and this only exercises the ordinary base64 path.
        unpadded = base64.b64encode(body.encode("utf-8")).decode("ascii").rstrip("=")

        task = parse_task_execute_payload(["executor", unpadded], logger=Logger("[test]"))

        self.assertEqual(task.kind, "task.execute")
        self.assertEqual(task.repo, str(repo.resolve()))
        self.assertEqual(task.instruction, instruction)
|
|
415
|
+
|
|
416
|
+
def test_parse_payload_accepts_raw_json_payload(self) -> None:
    """A payload passed as raw JSON text (no base64 layer) still parses."""
    instruction = "Accept raw JSON from a recovery wrapper"
    with tempfile.TemporaryDirectory(prefix="pushpals-payload-raw-json-") as temp_dir:
        repo = Path(temp_dir) / "repo"
        repo.mkdir(parents=True, exist_ok=True)

        raw_json = json.dumps(
            {
                "kind": "task.execute",
                "repo": str(repo),
                "params": {"instruction": instruction},
            }
        )

        task = parse_task_execute_payload(["executor", raw_json], logger=Logger("[test]"))

        self.assertEqual(task.kind, "task.execute")
        self.assertEqual(task.repo, str(repo.resolve()))
        self.assertEqual(task.instruction, instruction)
|
|
432
|
+
|
|
375
433
|
def test_parse_payload_prefers_helper_tests_for_visual_derivation_tasks(self) -> None:
|
|
376
434
|
with tempfile.TemporaryDirectory(prefix="pushpals-visual-guidance-") as temp_dir:
|
|
377
435
|
repo = Path(temp_dir) / "repo"
|
|
@@ -1091,13 +1149,16 @@ class OpenAICodexRuntimeConfigTests(unittest.TestCase):
|
|
|
1091
1149
|
"",
|
|
1092
1150
|
"argv = sys.argv[1:]",
|
|
1093
1151
|
"last_message_path = None",
|
|
1152
|
+
"model = ''",
|
|
1094
1153
|
"for index, arg in enumerate(argv):",
|
|
1095
1154
|
" if arg == '--output-last-message' and index + 1 < len(argv):",
|
|
1096
1155
|
" last_message_path = argv[index + 1]",
|
|
1156
|
+
" if arg == '-m' and index + 1 < len(argv):",
|
|
1157
|
+
" model = argv[index + 1]",
|
|
1097
1158
|
" break",
|
|
1098
1159
|
"",
|
|
1099
1160
|
"prompt = sys.stdin.read()",
|
|
1100
|
-
"if 'Codex startup-stall recovery' in prompt:",
|
|
1161
|
+
"if 'Codex startup-stall recovery' in prompt and model == 'gpt-5.4':",
|
|
1101
1162
|
" Path('src').mkdir(exist_ok=True)",
|
|
1102
1163
|
" Path('src/startup-stall-recovered.txt').write_text('patched after restart\\n', encoding='utf-8')",
|
|
1103
1164
|
" if last_message_path:",
|
|
@@ -1119,7 +1180,8 @@ class OpenAICodexRuntimeConfigTests(unittest.TestCase):
|
|
|
1119
1180
|
"OPENAI_API_KEY": "pushpals-startup-stall-test-key",
|
|
1120
1181
|
"WORKERPALS_OPENAI_CODEX_JSON": "true",
|
|
1121
1182
|
"WORKERPALS_OPENAI_CODEX_TIMEOUT_S": "20",
|
|
1122
|
-
"WORKERPALS_OPENAI_CODEX_NO_EDIT_WATCHDOG_S": "
|
|
1183
|
+
"WORKERPALS_OPENAI_CODEX_NO_EDIT_WATCHDOG_S": "0",
|
|
1184
|
+
"WORKERPALS_OPENAI_CODEX_STARTUP_STALL_WATCHDOG_S": "1",
|
|
1123
1185
|
"WORKERPALS_OPENAI_CODEX_PROGRESS_LOG_INTERVAL_S": "1",
|
|
1124
1186
|
}
|
|
1125
1187
|
with mock.patch.dict(os.environ, env_overrides, clear=False):
|
|
@@ -1189,6 +1251,7 @@ class OpenAICodexRuntimeConfigTests(unittest.TestCase):
|
|
|
1189
1251
|
"WORKERPALS_OPENAI_CODEX_JSON": "true",
|
|
1190
1252
|
"WORKERPALS_OPENAI_CODEX_TIMEOUT_S": "20",
|
|
1191
1253
|
"WORKERPALS_OPENAI_CODEX_NO_EDIT_WATCHDOG_S": "1",
|
|
1254
|
+
"WORKERPALS_OPENAI_CODEX_STARTUP_STALL_WATCHDOG_S": "1",
|
|
1192
1255
|
"WORKERPALS_OPENAI_CODEX_PROGRESS_LOG_INTERVAL_S": "1",
|
|
1193
1256
|
}
|
|
1194
1257
|
with mock.patch.dict(os.environ, env_overrides, clear=False):
|
|
@@ -1284,6 +1347,90 @@ class OpenAICodexRuntimeConfigTests(unittest.TestCase):
|
|
|
1284
1347
|
self.assertIn("Patched immediately after no-edit recovery", str(result.get("stdout") or ""))
|
|
1285
1348
|
self.assertIn("src/", str(result.get("stdout") or ""))
|
|
1286
1349
|
|
|
1350
|
+
def test_run_codex_task_no_edit_watchdog_allows_command_backed_discovery(self) -> None:
    """The no-edit watchdog must not kill a run that is making command progress.

    The fake Codex stub emits a command start, finishes it about 1.4s later,
    and only writes a file after a further 1.6s. The no-edit watchdog is set
    to 1s, so the run can only succeed if the 5s command-progress grace
    postpones the watchdog.
    """
    with tempfile.TemporaryDirectory(prefix="pushpals-codex-no-edit-command-grace-") as temp_dir:
        repo = Path(temp_dir) / "repo"
        repo.mkdir(parents=True, exist_ok=True)
        (repo / "README.md").write_text("# command grace repo\n", encoding="utf-8")

        # Seed a git repository with one commit as the baseline.
        subprocess.run(["git", "init"], cwd=repo, check=True, capture_output=True, text=True)
        subprocess.run(
            ["git", "config", "user.name", "PushPals Test"],
            cwd=repo,
            check=True,
            capture_output=True,
            text=True,
        )
        subprocess.run(
            ["git", "config", "user.email", "pushpals-tests@example.com"],
            cwd=repo,
            check=True,
            capture_output=True,
            text=True,
        )
        subprocess.run(["git", "add", "README.md"], cwd=repo, check=True, capture_output=True, text=True)
        subprocess.run(
            ["git", "commit", "-m", "chore: seed command grace repo"],
            cwd=repo,
            check=True,
            capture_output=True,
            text=True,
        )

        # The stub is written out as a standalone Python script, so the
        # nested indentation inside these literals must be valid Python.
        stub_path = Path(temp_dir) / "fake_codex_no_edit_command_grace.py"
        stub_path.write_text(
            "\n".join(
                [
                    "from pathlib import Path",
                    "import json",
                    "import sys",
                    "import time",
                    "",
                    "argv = sys.argv[1:]",
                    "last_message_path = None",
                    "for index, arg in enumerate(argv):",
                    "    if arg == '--output-last-message' and index + 1 < len(argv):",
                    "        last_message_path = argv[index + 1]",
                    "        break",
                    "",
                    "sys.stdin.read()",
                    "print(json.dumps({'type': 'thread.started'}), flush=True)",
                    "print(json.dumps({'type': 'turn.started'}), flush=True)",
                    "print(json.dumps({'type': 'item.started', 'item': {'id': 'cmd-read-target', 'type': 'command_execution', 'command': 'sed -n 1,120p README.md', 'status': 'in_progress'}}), flush=True)",
                    "time.sleep(1.4)",
                    "print(json.dumps({'type': 'item.completed', 'item': {'id': 'cmd-read-target', 'type': 'command_execution', 'command': 'sed -n 1,120p README.md', 'status': 'completed', 'exit_code': 0, 'aggregated_output': '# command grace repo'}}), flush=True)",
                    "time.sleep(1.6)",
                    "Path('src').mkdir(exist_ok=True)",
                    "Path('src/command-grace.txt').write_text('patched after command-backed discovery\\n', encoding='utf-8')",
                    "if last_message_path:",
                    "    Path(last_message_path).write_text('Patched after command-backed discovery.', encoding='utf-8')",
                    "print(json.dumps({'type': 'item.completed', 'item': {'type': 'message', 'text': 'Patched after command-backed discovery.'}}), flush=True)",
                ]
            ),
            encoding="utf-8",
        )

        env_overrides = {
            "PUSHPALS_OPENAI_CODEX_BIN_JSON": json.dumps([sys.executable, str(stub_path)]),
            "PUSHPALS_OPENAI_CODEX_AUTH_MODE": "api_key",
            "OPENAI_API_KEY": "pushpals-no-edit-command-grace-test-key",
            "WORKERPALS_OPENAI_CODEX_JSON": "true",
            "WORKERPALS_OPENAI_CODEX_TIMEOUT_S": "20",
            # A 1s watchdog would fire mid-command without the 5s grace.
            "WORKERPALS_OPENAI_CODEX_NO_EDIT_WATCHDOG_S": "1",
            "WORKERPALS_OPENAI_CODEX_NO_EDIT_COMMAND_GRACE_S": "5",
            "WORKERPALS_OPENAI_CODEX_PROGRESS_LOG_INTERVAL_S": "1",
        }
        with mock.patch.dict(os.environ, env_overrides, clear=False):
            result = _run_codex_task(
                str(repo),
                "Add one focused contract assertion after inspecting the hinted test.",
                [],
            )

        self.assertTrue(result.get("ok"), result)
        self.assertEqual(result.get("exitCode"), 0)
        self.assertIn("Patched after command-backed discovery", str(result.get("stdout") or ""))
        self.assertIn("src/", str(result.get("stdout") or ""))
|
|
1433
|
+
|
|
1287
1434
|
def test_run_codex_task_recovery_attempt_is_still_guarded_by_no_edit_watchdog(self) -> None:
|
|
1288
1435
|
with tempfile.TemporaryDirectory(prefix="pushpals-codex-no-edit-watchdog-fail-") as temp_dir:
|
|
1289
1436
|
repo = Path(temp_dir) / "repo"
|
|
@@ -1587,6 +1734,31 @@ class OpenAICodexRuntimeConfigTests(unittest.TestCase):
|
|
|
1587
1734
|
|
|
1588
1735
|
self.assertEqual(watchdog_s, 180)
|
|
1589
1736
|
|
|
1737
|
+
def test_startup_stall_watchdog_allows_slower_first_response_than_no_edit_watchdog(self) -> None:
    """With no override, the resolver yields 210s on first attempt and 150s on recovery."""
    blank_override = {"WORKERPALS_OPENAI_CODEX_STARTUP_STALL_WATCHDOG_S": ""}
    with mock.patch.dict(os.environ, blank_override, clear=False):
        first_attempt_s = _resolve_startup_stall_watchdog_seconds(1200)
        recovery_s = _resolve_startup_stall_watchdog_seconds(1200, recovery_attempt=1)

    self.assertEqual(first_attempt_s, 210)
    self.assertEqual(recovery_s, 150)
|
|
1751
|
+
|
|
1752
|
+
def test_explicit_startup_stall_watchdog_override_is_bounded(self) -> None:
    """An override above the communicate timeout is clamped to timeout - 1."""
    oversized_override = {"WORKERPALS_OPENAI_CODEX_STARTUP_STALL_WATCHDOG_S": "500"}
    with mock.patch.dict(os.environ, oversized_override, clear=False):
        resolved_s = _resolve_startup_stall_watchdog_seconds(120)

    self.assertEqual(resolved_s, 119)
|
|
1761
|
+
|
|
1590
1762
|
def test_narrow_contract_regression_with_required_e2e_uses_fast_no_edit_watchdog(self) -> None:
|
|
1591
1763
|
prompt = (
|
|
1592
1764
|
"Harden the opportunity graph contract around autonomous delivery-loop failure signals. "
|
|
@@ -155,14 +155,39 @@ def fail(summary: str, stderr: Optional[str] = None, exit_code: int = 1) -> int:
|
|
|
155
155
|
return exit_code
|
|
156
156
|
|
|
157
157
|
|
|
158
|
-
def
|
|
159
|
-
|
|
160
|
-
payload = json.loads(decoded)
|
|
158
|
+
def _parse_payload_json(raw: str) -> Dict[str, Any]:
|
|
159
|
+
payload = json.loads(raw)
|
|
161
160
|
if not isinstance(payload, dict):
|
|
162
161
|
raise ValueError("payload must be a JSON object")
|
|
163
162
|
return payload
|
|
164
163
|
|
|
165
164
|
|
|
165
|
+
def decode_payload(raw: str) -> Dict[str, Any]:
    """Decode a job payload given as raw JSON or as base64-encoded JSON.

    Text starting with ``{`` is parsed as JSON directly. Anything else has
    its whitespace removed, is re-padded to a multiple of four characters,
    and is tried with the standard and then the URL-safe base64 decoder.

    Raises:
        ValueError: If the payload is empty, is not a JSON object, or cannot
            be decoded by either base64 variant.
    """
    text = str(raw or "").strip()
    if not text:
        raise ValueError("empty job payload")

    # Raw JSON handed through without the base64 layer.
    if text.startswith("{"):
        return _parse_payload_json(text)

    # Tolerate embedded whitespace and stripped "=" padding before decoding.
    squeezed = "".join(text.split())
    candidate = squeezed + "=" * (-len(squeezed) % 4)

    failures: List[str] = []
    for decode in (base64.b64decode, base64.urlsafe_b64decode):
        try:
            return _parse_payload_json(decode(candidate).decode("utf-8"))
        except Exception as exc:
            # Keep each decoder's failure so the final error explains both.
            failures.append(str(exc))

    detail = "; ".join(filter(None, failures)) or "unknown decode error"
    raise ValueError(f"invalid base64/JSON job payload: {detail}")
|
|
189
|
+
|
|
190
|
+
|
|
166
191
|
def read_encoded_payload_arg(argv: List[str]) -> str:
|
|
167
192
|
if len(argv) < 2:
|
|
168
193
|
raise ValueError("missing base64 job payload")
|
|
@@ -174,6 +199,13 @@ def read_encoded_payload_arg(argv: List[str]) -> str:
|
|
|
174
199
|
return path.read_text(encoding="utf-8").strip()
|
|
175
200
|
if mode == "--payload-stdin":
|
|
176
201
|
return sys.stdin.read().strip()
|
|
202
|
+
if len(mode) < 4096:
|
|
203
|
+
try:
|
|
204
|
+
path = Path(mode).expanduser()
|
|
205
|
+
if path.is_file():
|
|
206
|
+
return path.read_text(encoding="utf-8").strip()
|
|
207
|
+
except OSError:
|
|
208
|
+
pass
|
|
177
209
|
return mode
|
|
178
210
|
|
|
179
211
|
|
|
@@ -1918,8 +1918,6 @@ export class DockerExecutor {
|
|
|
1918
1918
|
|
|
1919
1919
|
private matchesRetryablePattern(text: string): boolean {
|
|
1920
1920
|
const transientPatterns: RegExp[] = [
|
|
1921
|
-
/\bstalled before first response\b/i,
|
|
1922
|
-
/\bstartup stall\b/i,
|
|
1923
1921
|
/warm .*runtime/i,
|
|
1924
1922
|
/failed to start warm container/i,
|
|
1925
1923
|
/docker execution error/i,
|