@pushpalsdev/cli 1.1.36 → 1.1.37
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/runtime/sandbox/apps/workerpals/src/backends/openai_codex/openai_codex_executor.py +48 -10
- package/runtime/sandbox/apps/workerpals/src/backends/openai_codex/test_openai_codex_runtime_config.py +169 -0
- package/runtime/sandbox/apps/workerpals/src/execute_job.ts +34 -1
package/package.json
CHANGED
package/runtime/sandbox/apps/workerpals/src/backends/openai_codex/openai_codex_executor.py
CHANGED
|
@@ -2519,16 +2519,14 @@ def _run_codex_task(
|
|
|
2519
2519
|
if no_edit_watchdog_s is not None
|
|
2520
2520
|
else None
|
|
2521
2521
|
)
|
|
2522
|
-
no_edit_command_grace_cap_deadline = (
|
|
2523
|
-
started_at + float(no_edit_watchdog_s + no_edit_command_grace_s)
|
|
2524
|
-
if no_edit_watchdog_s is not None and no_edit_command_grace_s is not None
|
|
2525
|
-
else None
|
|
2526
|
-
)
|
|
2527
2522
|
rollout_deadline = (
|
|
2528
2523
|
started_at + float(rollout_watchdog_s)
|
|
2529
2524
|
if rollout_watchdog_s is not None
|
|
2530
2525
|
else None
|
|
2531
2526
|
)
|
|
2527
|
+
publishable_progress_seen_at: Optional[float] = None
|
|
2528
|
+
publishable_progress_finalized = False
|
|
2529
|
+
publishable_progress_paths: List[str] = []
|
|
2532
2530
|
|
|
2533
2531
|
while proc.poll() is None:
|
|
2534
2532
|
now = time.monotonic()
|
|
@@ -2605,11 +2603,6 @@ def _run_codex_task(
|
|
|
2605
2603
|
command_grace_deadline = last_command_activity_at + float(
|
|
2606
2604
|
no_edit_command_grace_s
|
|
2607
2605
|
)
|
|
2608
|
-
if no_edit_command_grace_cap_deadline is not None:
|
|
2609
|
-
command_grace_deadline = min(
|
|
2610
|
-
command_grace_deadline,
|
|
2611
|
-
no_edit_command_grace_cap_deadline,
|
|
2612
|
-
)
|
|
2613
2606
|
if command_grace_deadline > now:
|
|
2614
2607
|
no_edit_deadline = command_grace_deadline
|
|
2615
2608
|
remaining_s = int(max(1.0, command_grace_deadline - now))
|
|
@@ -2645,6 +2638,22 @@ def _run_codex_task(
|
|
|
2645
2638
|
)
|
|
2646
2639
|
_terminate_active_child()
|
|
2647
2640
|
break
|
|
2641
|
+
if publishable_progress_seen_at is None:
|
|
2642
|
+
publishable_progress_seen_at = now
|
|
2643
|
+
publishable_progress_paths = list(effective_paths)
|
|
2644
|
+
elif _has_credible_shell_wrapper_progress(effective_paths):
|
|
2645
|
+
publishable_progress_paths = list(effective_paths)
|
|
2646
|
+
publishable_age_s = now - publishable_progress_seen_at
|
|
2647
|
+
if publishable_age_s >= float(no_edit_recheck_s):
|
|
2648
|
+
publishable_progress_finalized = True
|
|
2649
|
+
log.info(
|
|
2650
|
+
"No-edit watchdog observed durable publishable file changes "
|
|
2651
|
+
f"({_describe_publishable_paths(effective_paths)}) for "
|
|
2652
|
+
f"{int(publishable_age_s)}s; stopping Codex early so "
|
|
2653
|
+
"QualityGate/ValidationGate can use the remaining budget."
|
|
2654
|
+
)
|
|
2655
|
+
_terminate_active_child()
|
|
2656
|
+
break
|
|
2648
2657
|
no_edit_deadline = now + float(no_edit_recheck_s)
|
|
2649
2658
|
log.info(
|
|
2650
2659
|
"No-edit watchdog observed publishable-looking file changes "
|
|
@@ -2806,6 +2815,35 @@ def _run_codex_task(
|
|
|
2806
2815
|
"cooldownMs": _NO_PUBLISHABLE_FAILURE_COOLDOWN_MS,
|
|
2807
2816
|
}
|
|
2808
2817
|
|
|
2818
|
+
if publishable_progress_finalized:
|
|
2819
|
+
changed_paths, _, effective_paths = _codex_changed_paths(repo, baseline_snapshot)
|
|
2820
|
+
effective_paths = effective_paths or publishable_progress_paths
|
|
2821
|
+
last_message = _read_text_if_exists(last_message_path)
|
|
2822
|
+
log_git_status(repo, log)
|
|
2823
|
+
prefix = (
|
|
2824
|
+
"Codex produced durable publishable file changes. PushPals stopped the "
|
|
2825
|
+
"Codex child early to preserve validation and revision budget; the normal "
|
|
2826
|
+
"QualityGate/ValidationGate will catch any incomplete edit."
|
|
2827
|
+
)
|
|
2828
|
+
return {
|
|
2829
|
+
"ok": True,
|
|
2830
|
+
"summary": (
|
|
2831
|
+
"openai_codex stopped after durable publishable progress "
|
|
2832
|
+
f"({len(effective_paths)} file(s))"
|
|
2833
|
+
),
|
|
2834
|
+
"stdout": _truncate(
|
|
2835
|
+
_build_success_stdout(
|
|
2836
|
+
effective_paths=effective_paths,
|
|
2837
|
+
last_message=last_message,
|
|
2838
|
+
trace_excerpt=trace_excerpt,
|
|
2839
|
+
prefix=prefix,
|
|
2840
|
+
)
|
|
2841
|
+
),
|
|
2842
|
+
"stderr": _truncate(stderr),
|
|
2843
|
+
"exitCode": 0,
|
|
2844
|
+
"usage": usage,
|
|
2845
|
+
}
|
|
2846
|
+
|
|
2809
2847
|
if no_edit_watchdog_fired:
|
|
2810
2848
|
startup_stall = _codex_trace_is_startup_stall(stdout_trace)
|
|
2811
2849
|
if startup_stall and startup_stall_recovery_attempt < _MAX_STARTUP_STALL_RECOVERY_ATTEMPTS:
|
|
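package/runtime/sandbox/apps/workerpals/src/backends/openai_codex/test_openai_codex_runtime_config.py
CHANGED
@@ -1431,6 +1431,175 @@ class OpenAICodexRuntimeConfigTests(unittest.TestCase):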
|
|
|
1431
1431
|
self.assertIn("Patched after command-backed discovery", str(result.get("stdout") or ""))
|
|
1432
1432
|
self.assertIn("src/", str(result.get("stdout") or ""))
|
|
1433
1433
|
|
|
1434
|
+
def test_run_codex_task_no_edit_watchdog_extends_after_later_command_progress(self) -> None:
|
|
1435
|
+
with tempfile.TemporaryDirectory(prefix="pushpals-codex-no-edit-late-command-") as temp_dir:
|
|
1436
|
+
repo = Path(temp_dir) / "repo"
|
|
1437
|
+
repo.mkdir(parents=True, exist_ok=True)
|
|
1438
|
+
(repo / "README.md").write_text("# late command grace repo\n", encoding="utf-8")
|
|
1439
|
+
subprocess.run(["git", "init"], cwd=repo, check=True, capture_output=True, text=True)
|
|
1440
|
+
subprocess.run(
|
|
1441
|
+
["git", "config", "user.name", "PushPals Test"],
|
|
1442
|
+
cwd=repo,
|
|
1443
|
+
check=True,
|
|
1444
|
+
capture_output=True,
|
|
1445
|
+
text=True,
|
|
1446
|
+
)
|
|
1447
|
+
subprocess.run(
|
|
1448
|
+
["git", "config", "user.email", "pushpals-tests@example.com"],
|
|
1449
|
+
cwd=repo,
|
|
1450
|
+
check=True,
|
|
1451
|
+
capture_output=True,
|
|
1452
|
+
text=True,
|
|
1453
|
+
)
|
|
1454
|
+
subprocess.run(["git", "add", "README.md"], cwd=repo, check=True, capture_output=True, text=True)
|
|
1455
|
+
subprocess.run(
|
|
1456
|
+
["git", "commit", "-m", "chore: seed late command repo"],
|
|
1457
|
+
cwd=repo,
|
|
1458
|
+
check=True,
|
|
1459
|
+
capture_output=True,
|
|
1460
|
+
text=True,
|
|
1461
|
+
)
|
|
1462
|
+
|
|
1463
|
+
stub_path = Path(temp_dir) / "fake_codex_late_command_grace.py"
|
|
1464
|
+
stub_path.write_text(
|
|
1465
|
+
"\n".join(
|
|
1466
|
+
[
|
|
1467
|
+
"from pathlib import Path",
|
|
1468
|
+
"import json",
|
|
1469
|
+
"import sys",
|
|
1470
|
+
"import time",
|
|
1471
|
+
"",
|
|
1472
|
+
"argv = sys.argv[1:]",
|
|
1473
|
+
"last_message_path = None",
|
|
1474
|
+
"for index, arg in enumerate(argv):",
|
|
1475
|
+
" if arg == '--output-last-message' and index + 1 < len(argv):",
|
|
1476
|
+
" last_message_path = argv[index + 1]",
|
|
1477
|
+
" break",
|
|
1478
|
+
"",
|
|
1479
|
+
"sys.stdin.read()",
|
|
1480
|
+
"print(json.dumps({'type': 'thread.started'}), flush=True)",
|
|
1481
|
+
"print(json.dumps({'type': 'turn.started'}), flush=True)",
|
|
1482
|
+
"print(json.dumps({'type': 'item.started', 'item': {'id': 'cmd-one', 'type': 'command_execution', 'command': 'cat README.md', 'status': 'in_progress'}}), flush=True)",
|
|
1483
|
+
"time.sleep(0.2)",
|
|
1484
|
+
"print(json.dumps({'type': 'item.completed', 'item': {'id': 'cmd-one', 'type': 'command_execution', 'command': 'cat README.md', 'status': 'completed', 'exit_code': 0}}), flush=True)",
|
|
1485
|
+
"time.sleep(2.2)",
|
|
1486
|
+
"print(json.dumps({'type': 'item.started', 'item': {'id': 'cmd-two', 'type': 'command_execution', 'command': 'ls', 'status': 'in_progress'}}), flush=True)",
|
|
1487
|
+
"time.sleep(0.2)",
|
|
1488
|
+
"print(json.dumps({'type': 'item.completed', 'item': {'id': 'cmd-two', 'type': 'command_execution', 'command': 'ls', 'status': 'completed', 'exit_code': 0}}), flush=True)",
|
|
1489
|
+
"time.sleep(2.0)",
|
|
1490
|
+
"Path('src').mkdir(exist_ok=True)",
|
|
1491
|
+
"Path('src/late-command-grace.txt').write_text('patched after later command progress\\n', encoding='utf-8')",
|
|
1492
|
+
"if last_message_path:",
|
|
1493
|
+
" Path(last_message_path).write_text('Patched after later command progress.', encoding='utf-8')",
|
|
1494
|
+
"print(json.dumps({'type': 'item.completed', 'item': {'type': 'message', 'text': 'Patched after later command progress.'}}), flush=True)",
|
|
1495
|
+
]
|
|
1496
|
+
),
|
|
1497
|
+
encoding="utf-8",
|
|
1498
|
+
)
|
|
1499
|
+
|
|
1500
|
+
env_overrides = {
|
|
1501
|
+
"PUSHPALS_OPENAI_CODEX_BIN_JSON": json.dumps([sys.executable, str(stub_path)]),
|
|
1502
|
+
"PUSHPALS_OPENAI_CODEX_AUTH_MODE": "api_key",
|
|
1503
|
+
"OPENAI_API_KEY": "pushpals-no-edit-late-command-test-key",
|
|
1504
|
+
"WORKERPALS_OPENAI_CODEX_JSON": "true",
|
|
1505
|
+
"WORKERPALS_OPENAI_CODEX_TIMEOUT_S": "20",
|
|
1506
|
+
"WORKERPALS_OPENAI_CODEX_NO_EDIT_WATCHDOG_S": "1",
|
|
1507
|
+
"WORKERPALS_OPENAI_CODEX_NO_EDIT_COMMAND_GRACE_S": "3",
|
|
1508
|
+
"WORKERPALS_OPENAI_CODEX_PROGRESS_LOG_INTERVAL_S": "1",
|
|
1509
|
+
}
|
|
1510
|
+
with mock.patch.dict(os.environ, env_overrides, clear=False):
|
|
1511
|
+
result = _run_codex_task(
|
|
1512
|
+
str(repo),
|
|
1513
|
+
"Add one focused contract assertion after a later targeted read.",
|
|
1514
|
+
[],
|
|
1515
|
+
)
|
|
1516
|
+
|
|
1517
|
+
self.assertTrue(result.get("ok"), result)
|
|
1518
|
+
self.assertEqual(result.get("exitCode"), 0)
|
|
1519
|
+
self.assertIn("Patched after later command progress", str(result.get("stdout") or ""))
|
|
1520
|
+
self.assertIn("src/", str(result.get("stdout") or ""))
|
|
1521
|
+
|
|
1522
|
+
def test_run_codex_task_finalizes_after_durable_publishable_progress(self) -> None:
|
|
1523
|
+
with tempfile.TemporaryDirectory(prefix="pushpals-codex-durable-progress-") as temp_dir:
|
|
1524
|
+
repo = Path(temp_dir) / "repo"
|
|
1525
|
+
repo.mkdir(parents=True, exist_ok=True)
|
|
1526
|
+
(repo / "README.md").write_text("# durable progress repo\n", encoding="utf-8")
|
|
1527
|
+
subprocess.run(["git", "init"], cwd=repo, check=True, capture_output=True, text=True)
|
|
1528
|
+
subprocess.run(
|
|
1529
|
+
["git", "config", "user.name", "PushPals Test"],
|
|
1530
|
+
cwd=repo,
|
|
1531
|
+
check=True,
|
|
1532
|
+
capture_output=True,
|
|
1533
|
+
text=True,
|
|
1534
|
+
)
|
|
1535
|
+
subprocess.run(
|
|
1536
|
+
["git", "config", "user.email", "pushpals-tests@example.com"],
|
|
1537
|
+
cwd=repo,
|
|
1538
|
+
check=True,
|
|
1539
|
+
capture_output=True,
|
|
1540
|
+
text=True,
|
|
1541
|
+
)
|
|
1542
|
+
subprocess.run(["git", "add", "README.md"], cwd=repo, check=True, capture_output=True, text=True)
|
|
1543
|
+
subprocess.run(
|
|
1544
|
+
["git", "commit", "-m", "chore: seed durable progress repo"],
|
|
1545
|
+
cwd=repo,
|
|
1546
|
+
check=True,
|
|
1547
|
+
capture_output=True,
|
|
1548
|
+
text=True,
|
|
1549
|
+
)
|
|
1550
|
+
|
|
1551
|
+
stub_path = Path(temp_dir) / "fake_codex_durable_progress.py"
|
|
1552
|
+
stub_path.write_text(
|
|
1553
|
+
"\n".join(
|
|
1554
|
+
[
|
|
1555
|
+
"from pathlib import Path",
|
|
1556
|
+
"import json",
|
|
1557
|
+
"import sys",
|
|
1558
|
+
"import time",
|
|
1559
|
+
"",
|
|
1560
|
+
"argv = sys.argv[1:]",
|
|
1561
|
+
"last_message_path = None",
|
|
1562
|
+
"for index, arg in enumerate(argv):",
|
|
1563
|
+
" if arg == '--output-last-message' and index + 1 < len(argv):",
|
|
1564
|
+
" last_message_path = argv[index + 1]",
|
|
1565
|
+
" break",
|
|
1566
|
+
"",
|
|
1567
|
+
"sys.stdin.read()",
|
|
1568
|
+
"print(json.dumps({'type': 'thread.started'}), flush=True)",
|
|
1569
|
+
"print(json.dumps({'type': 'turn.started'}), flush=True)",
|
|
1570
|
+
"Path('src').mkdir(exist_ok=True)",
|
|
1571
|
+
"Path('src/durable-progress.txt').write_text('durable patch\\n', encoding='utf-8')",
|
|
1572
|
+
"if last_message_path:",
|
|
1573
|
+
" Path(last_message_path).write_text('Created durable patch and kept thinking.', encoding='utf-8')",
|
|
1574
|
+
"print(json.dumps({'type': 'item.completed', 'item': {'type': 'message', 'text': 'Created durable patch and kept thinking.'}}), flush=True)",
|
|
1575
|
+
"time.sleep(10)",
|
|
1576
|
+
]
|
|
1577
|
+
),
|
|
1578
|
+
encoding="utf-8",
|
|
1579
|
+
)
|
|
1580
|
+
|
|
1581
|
+
env_overrides = {
|
|
1582
|
+
"PUSHPALS_OPENAI_CODEX_BIN_JSON": json.dumps([sys.executable, str(stub_path)]),
|
|
1583
|
+
"PUSHPALS_OPENAI_CODEX_AUTH_MODE": "api_key",
|
|
1584
|
+
"OPENAI_API_KEY": "pushpals-durable-progress-test-key",
|
|
1585
|
+
"WORKERPALS_OPENAI_CODEX_JSON": "true",
|
|
1586
|
+
"WORKERPALS_OPENAI_CODEX_TIMEOUT_S": "20",
|
|
1587
|
+
"WORKERPALS_OPENAI_CODEX_NO_EDIT_WATCHDOG_S": "1",
|
|
1588
|
+
"WORKERPALS_OPENAI_CODEX_NO_EDIT_RECHECK_S": "1",
|
|
1589
|
+
"WORKERPALS_OPENAI_CODEX_PROGRESS_LOG_INTERVAL_S": "1",
|
|
1590
|
+
}
|
|
1591
|
+
with mock.patch.dict(os.environ, env_overrides, clear=False):
|
|
1592
|
+
result = _run_codex_task(
|
|
1593
|
+
str(repo),
|
|
1594
|
+
"Make a focused patch and stop once it is durable.",
|
|
1595
|
+
[],
|
|
1596
|
+
)
|
|
1597
|
+
|
|
1598
|
+
self.assertTrue(result.get("ok"), result)
|
|
1599
|
+
self.assertEqual(result.get("exitCode"), 0)
|
|
1600
|
+
self.assertIn("stopped after durable publishable progress", str(result.get("summary") or ""))
|
|
1601
|
+
self.assertIn("src/", str(result.get("stdout") or ""))
|
|
1602
|
+
|
|
1434
1603
|
def test_run_codex_task_recovery_attempt_is_still_guarded_by_no_edit_watchdog(self) -> None:
|
|
1435
1604
|
with tempfile.TemporaryDirectory(prefix="pushpals-codex-no-edit-watchdog-fail-") as temp_dir:
|
|
1436
1605
|
repo = Path(temp_dir) / "repo"
|
|
package/runtime/sandbox/apps/workerpals/src/execute_job.ts
CHANGED
@@ -335,6 +335,22 @@ export function shouldSkipCriticForDeterministicValidationRevision(opts: {
|
|
|
335
335
|
return opts.validationRuns.some(isDeterministicFastValidationFailure);
|
|
336
336
|
}
|
|
337
337
|
|
|
338
|
+
export function shouldSkipCriticToPreserveRevisionBudget(opts: {
|
|
339
|
+
deterministicRequiresRevision: boolean;
|
|
340
|
+
remainingBudgetMs: number;
|
|
341
|
+
minimumRevisionBudgetMs: number;
|
|
342
|
+
criticTimeoutMs: number;
|
|
343
|
+
criticTimeoutBehavior: "skip" | "retry_once" | "block" | string;
|
|
344
|
+
}): boolean {
|
|
345
|
+
if (!opts.deterministicRequiresRevision) return false;
|
|
346
|
+
const remainingBudgetMs = Math.max(0, Math.floor(opts.remainingBudgetMs));
|
|
347
|
+
const minimumRevisionBudgetMs = Math.max(0, Math.floor(opts.minimumRevisionBudgetMs));
|
|
348
|
+
const criticTimeoutMs = Math.max(0, Math.floor(opts.criticTimeoutMs));
|
|
349
|
+
const criticAttempts = opts.criticTimeoutBehavior === "retry_once" ? 2 : 1;
|
|
350
|
+
const criticWorstCaseMs = criticTimeoutMs * criticAttempts;
|
|
351
|
+
return remainingBudgetMs < minimumRevisionBudgetMs + criticWorstCaseMs;
|
|
352
|
+
}
|
|
353
|
+
|
|
338
354
|
export function workerAttemptRolloutScore(params: {
|
|
339
355
|
executorElapsedMs: number;
|
|
340
356
|
qualityElapsedMs: number;
|
|
@@ -7975,11 +7991,23 @@ export async function executeJob(
|
|
|
7975
7991
|
validationOutsideTaskScope,
|
|
7976
7992
|
validationRuns: quality.validationRuns,
|
|
7977
7993
|
});
|
|
7994
|
+
const preCriticRevisionBudget = qualityRevisionBudgetDecision({
|
|
7995
|
+
jobElapsedMs: Date.now() - jobStartedAt,
|
|
7996
|
+
executionBudgetMs,
|
|
7997
|
+
});
|
|
7998
|
+
const skipCriticForRevisionBudget = shouldSkipCriticToPreserveRevisionBudget({
|
|
7999
|
+
deterministicRequiresRevision: preCriticDeterministicRequiresRevision,
|
|
8000
|
+
remainingBudgetMs: preCriticRevisionBudget.remainingBudgetMs,
|
|
8001
|
+
minimumRevisionBudgetMs: preCriticRevisionBudget.minimumRevisionBudgetMs,
|
|
8002
|
+
criticTimeoutMs: resolveQualityCriticTimeoutMs(runtimeConfig),
|
|
8003
|
+
criticTimeoutBehavior: resolveQualityCriticTimeoutBehavior(runtimeConfig),
|
|
8004
|
+
});
|
|
7978
8005
|
const critic =
|
|
7979
8006
|
quality.skipped ||
|
|
7980
8007
|
!qualityGatePolicy.criticGateEnabled ||
|
|
7981
8008
|
skipCriticAfterExecutorTimeout ||
|
|
7982
|
-
skipCriticForDeterministicValidationRevision
|
|
8009
|
+
skipCriticForDeterministicValidationRevision ||
|
|
8010
|
+
skipCriticForRevisionBudget
|
|
7983
8011
|
? null
|
|
7984
8012
|
: executor === "openai_codex"
|
|
7985
8013
|
? await runCodexCriticReview(repo, attemptParams, qualityForCritic, runtimeConfig, onLog)
|
|
@@ -8020,6 +8048,11 @@ export async function executeJob(
|
|
|
8020
8048
|
"stdout",
|
|
8021
8049
|
"[CriticGate] Skipping critic because deterministic fast validation already requires a quality revision.",
|
|
8022
8050
|
);
|
|
8051
|
+
} else if (skipCriticForRevisionBudget) {
|
|
8052
|
+
onLog?.(
|
|
8053
|
+
"stdout",
|
|
8054
|
+
`[CriticGate] Skipping critic because deterministic quality already requires revision and remaining budget (${preCriticRevisionBudget.remainingBudgetMs}ms) must be reserved for the next worker turn.`,
|
|
8055
|
+
);
|
|
8023
8056
|
}
|
|
8024
8057
|
const rolloutScore = workerAttemptRolloutScore({
|
|
8025
8058
|
executorElapsedMs,
|