forgexa-cli 1.6.1__tar.gz → 1.7.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {forgexa_cli-1.6.1 → forgexa_cli-1.7.5}/PKG-INFO +1 -1
- {forgexa_cli-1.6.1 → forgexa_cli-1.7.5}/forgexa_cli/__init__.py +1 -1
- {forgexa_cli-1.6.1 → forgexa_cli-1.7.5}/forgexa_cli/daemon.py +570 -34
- {forgexa_cli-1.6.1 → forgexa_cli-1.7.5}/forgexa_cli.egg-info/PKG-INFO +1 -1
- {forgexa_cli-1.6.1 → forgexa_cli-1.7.5}/pyproject.toml +1 -1
- {forgexa_cli-1.6.1 → forgexa_cli-1.7.5}/README.md +0 -0
- {forgexa_cli-1.6.1 → forgexa_cli-1.7.5}/forgexa_cli/_build_config.py +0 -0
- {forgexa_cli-1.6.1 → forgexa_cli-1.7.5}/forgexa_cli/main.py +0 -0
- {forgexa_cli-1.6.1 → forgexa_cli-1.7.5}/forgexa_cli/py.typed +0 -0
- {forgexa_cli-1.6.1 → forgexa_cli-1.7.5}/forgexa_cli.egg-info/SOURCES.txt +0 -0
- {forgexa_cli-1.6.1 → forgexa_cli-1.7.5}/forgexa_cli.egg-info/dependency_links.txt +0 -0
- {forgexa_cli-1.6.1 → forgexa_cli-1.7.5}/forgexa_cli.egg-info/entry_points.txt +0 -0
- {forgexa_cli-1.6.1 → forgexa_cli-1.7.5}/forgexa_cli.egg-info/requires.txt +0 -0
- {forgexa_cli-1.6.1 → forgexa_cli-1.7.5}/forgexa_cli.egg-info/top_level.txt +0 -0
- {forgexa_cli-1.6.1 → forgexa_cli-1.7.5}/setup.cfg +0 -0
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
"""forgexa-cli — Forgexa command-line client."""
|
|
2
|
-
__version__ = "1.
|
|
2
|
+
__version__ = "1.7.5"
|
|
@@ -10,11 +10,26 @@ Usage:
|
|
|
10
10
|
|
|
11
11
|
from __future__ import annotations
|
|
12
12
|
|
|
13
|
+
import sys
|
|
14
|
+
|
|
15
|
+
# ── Python version gate — must run before any other imports ──────────────────
|
|
16
|
+
# Emit a machine-readable DAEMON_ERROR so the desktop app shows a clear
|
|
17
|
+
# message instead of a cryptic traceback.
|
|
18
|
+
if sys.version_info < (3, 9):
|
|
19
|
+
_ver = f"{sys.version_info.major}.{sys.version_info.minor}"
|
|
20
|
+
print(
|
|
21
|
+
f"DAEMON_ERROR: Python {_ver} is too old. Forgexa Daemon requires Python 3.9 or "
|
|
22
|
+
f"newer. Please upgrade Python from https://www.python.org/downloads/",
|
|
23
|
+
file=sys.stderr,
|
|
24
|
+
)
|
|
25
|
+
sys.exit(1)
|
|
26
|
+
|
|
13
27
|
import asyncio
|
|
14
28
|
import base64
|
|
15
29
|
import hashlib
|
|
16
30
|
import json
|
|
17
31
|
import logging
|
|
32
|
+
from logging.handlers import RotatingFileHandler
|
|
18
33
|
import os
|
|
19
34
|
import platform
|
|
20
35
|
import re
|
|
@@ -287,6 +302,16 @@ except (ImportError, ModuleNotFoundError):
|
|
|
287
302
|
def AGENT_MAX_OUTPUT_SIZE(self) -> int:
|
|
288
303
|
return int(os.environ.get("AGENT_MAX_OUTPUT_SIZE", "100000"))
|
|
289
304
|
|
|
305
|
+
@property
|
|
306
|
+
def FACTORY_CODEX_SANDBOX(self) -> str:
|
|
307
|
+
"""Codex sandbox mode: 'bypass' (default, safe) or 'bwrap' (Linux only).
|
|
308
|
+
|
|
309
|
+
'bypass' uses --dangerously-bypass-approvals-and-sandbox which works
|
|
310
|
+
in all environments including Docker without CAP_NET_ADMIN.
|
|
311
|
+
'bwrap' uses --full-auto (bubblewrap) which requires CAP_NET_ADMIN.
|
|
312
|
+
"""
|
|
313
|
+
return os.environ.get("FACTORY_CODEX_SANDBOX", "bypass").strip().lower()
|
|
314
|
+
|
|
290
315
|
def get_daemon_workspaces_root(self) -> str:
|
|
291
316
|
root = self.DAEMON_WORKSPACES_ROOT
|
|
292
317
|
if not root:
|
|
@@ -307,7 +332,7 @@ except (ImportError, ModuleNotFoundError):
|
|
|
307
332
|
# DAEMON_VERSION is the protocol/logic version of the daemon code.
|
|
308
333
|
# Kept in sync with pyproject.toml version via bump-version.sh.
|
|
309
334
|
# CLIENT_TYPE identifies which packaging/distribution this daemon runs in.
|
|
310
|
-
DAEMON_VERSION = "1.
|
|
335
|
+
DAEMON_VERSION = "1.7.5"
|
|
311
336
|
|
|
312
337
|
|
|
313
338
|
def _detect_client_type() -> str:
|
|
@@ -344,7 +369,11 @@ _log_dir.mkdir(parents=True, exist_ok=True)
|
|
|
344
369
|
DAEMON_LOG_PATH = _log_dir / "daemon.log"
|
|
345
370
|
|
|
346
371
|
_log_handlers: list[logging.Handler] = [
|
|
347
|
-
|
|
372
|
+
RotatingFileHandler(
|
|
373
|
+
DAEMON_LOG_PATH, mode="a", encoding="utf-8",
|
|
374
|
+
maxBytes=50 * 1024 * 1024, # 50 MB per file
|
|
375
|
+
backupCount=5,
|
|
376
|
+
),
|
|
348
377
|
]
|
|
349
378
|
if sys.stderr.isatty():
|
|
350
379
|
_log_handlers.append(logging.StreamHandler(sys.stderr))
|
|
@@ -699,6 +728,9 @@ class AgentDiscovery:
|
|
|
699
728
|
|
|
700
729
|
async def discover(self) -> list[DiscoveredAgent]:
|
|
701
730
|
self._expand_path()
|
|
731
|
+
# Probe bwrap support once at discovery time and log a clear warning
|
|
732
|
+
# if it is broken. This surfaces the error early rather than mid-task.
|
|
733
|
+
await self._probe_bwrap_support()
|
|
702
734
|
available = []
|
|
703
735
|
for agent_id, spec in self.AGENT_REGISTRY.items():
|
|
704
736
|
custom_path = os.environ.get(spec.get("env_path_override", ""))
|
|
@@ -718,8 +750,9 @@ class AgentDiscovery:
|
|
|
718
750
|
|
|
719
751
|
async def _get_version(self, detect_cmd: str) -> str:
|
|
720
752
|
try:
|
|
721
|
-
|
|
722
|
-
|
|
753
|
+
parts = detect_cmd.split()
|
|
754
|
+
proc = await asyncio.create_subprocess_exec(
|
|
755
|
+
*parts,
|
|
723
756
|
stdout=asyncio.subprocess.PIPE,
|
|
724
757
|
stderr=asyncio.subprocess.PIPE,
|
|
725
758
|
)
|
|
@@ -728,8 +761,60 @@ class AgentDiscovery:
|
|
|
728
761
|
except Exception:
|
|
729
762
|
return "unknown"
|
|
730
763
|
|
|
764
|
+
@staticmethod
|
|
765
|
+
async def _probe_bwrap_support() -> None:
|
|
766
|
+
"""Probe whether bubblewrap (bwrap) works in this environment.
|
|
731
767
|
|
|
732
|
-
|
|
768
|
+
codex exec --full-auto internally creates a bubblewrap sandbox that
|
|
769
|
+
requires a network namespace (CAP_NET_ADMIN). Inside Docker containers
|
|
770
|
+
or other restricted Linux environments this fails immediately with:
|
|
771
|
+
bwrap: loopback: Failed RTM_NEWADDR: Operation not permitted
|
|
772
|
+
|
|
773
|
+
We probe at startup so the operator gets an actionable warning rather
|
|
774
|
+
than a cryptic mid-task failure. The probe is skipped on macOS/Windows
|
|
775
|
+
because Codex uses a different sandbox mechanism on those platforms.
|
|
776
|
+
"""
|
|
777
|
+
if sys.platform != "linux":
|
|
778
|
+
return
|
|
779
|
+
sandbox_mode = os.environ.get("FACTORY_CODEX_SANDBOX", "bypass").strip().lower()
|
|
780
|
+
if sandbox_mode != "bwrap":
|
|
781
|
+
# Default mode bypasses sandbox — no bwrap needed, skip probe.
|
|
782
|
+
return
|
|
783
|
+
bwrap_bin = shutil.which("bwrap")
|
|
784
|
+
if not bwrap_bin:
|
|
785
|
+
logger.warning(
|
|
786
|
+
"FACTORY_CODEX_SANDBOX=bwrap but bwrap binary not found. "
|
|
787
|
+
"Codex sandbox will fail. Either install bwrap or unset "
|
|
788
|
+
"FACTORY_CODEX_SANDBOX to use bypass mode (default)."
|
|
789
|
+
)
|
|
790
|
+
return
|
|
791
|
+
try:
|
|
792
|
+
proc = await asyncio.create_subprocess_exec(
|
|
793
|
+
bwrap_bin,
|
|
794
|
+
"--dev", "/dev",
|
|
795
|
+
"--proc", "/proc",
|
|
796
|
+
"--ro-bind", "/usr", "/usr",
|
|
797
|
+
"--unshare-net",
|
|
798
|
+
"true",
|
|
799
|
+
stdout=asyncio.subprocess.DEVNULL,
|
|
800
|
+
stderr=asyncio.subprocess.PIPE,
|
|
801
|
+
)
|
|
802
|
+
_, stderr = await asyncio.wait_for(proc.communicate(), timeout=5)
|
|
803
|
+
if proc.returncode != 0:
|
|
804
|
+
err = (stderr or b"").decode(errors="replace").strip()
|
|
805
|
+
logger.warning(
|
|
806
|
+
"bwrap probe failed (exit=%d): %s. "
|
|
807
|
+
"codex exec --full-auto will fail in this environment. "
|
|
808
|
+
"Unset FACTORY_CODEX_SANDBOX to use bypass mode (default), "
|
|
809
|
+
"or grant CAP_NET_ADMIN / run privileged.",
|
|
810
|
+
proc.returncode, err,
|
|
811
|
+
)
|
|
812
|
+
else:
|
|
813
|
+
logger.info("bwrap probe: network namespaces work in this environment")
|
|
814
|
+
except asyncio.TimeoutError:
|
|
815
|
+
logger.warning("bwrap probe timed out — treating as unsupported")
|
|
816
|
+
except Exception as exc:
|
|
817
|
+
logger.warning("bwrap probe error: %s", exc)
|
|
733
818
|
|
|
734
819
|
|
|
735
820
|
class WorkspaceManager:
|
|
@@ -972,6 +1057,71 @@ class WorkspaceManager:
|
|
|
972
1057
|
# Remove the broken worktree directory
|
|
973
1058
|
shutil.rmtree(ws_path, ignore_errors=True)
|
|
974
1059
|
|
|
1060
|
+
async def _detect_unrelated_histories(self, repo_path: Path, project_key: str) -> bool:
|
|
1061
|
+
"""Detect whether local clone has diverged from remote due to history rewrite.
|
|
1062
|
+
|
|
1063
|
+
When a remote repo is rewritten (e.g. via BFG or git filter-repo to
|
|
1064
|
+
remove large files), all commit SHAs change. The local clone retains
|
|
1065
|
+
the old SHAs in its object store, making fetch/reset/merge fail in
|
|
1066
|
+
cryptic ways.
|
|
1067
|
+
|
|
1068
|
+
Strategy: ask git whether the local HEAD commit object is reachable in
|
|
1069
|
+
the remote graph. We use `git ls-remote` to get the remote HEAD SHA,
|
|
1070
|
+
then check if that SHA exists locally. If the remote HEAD does NOT
|
|
1071
|
+
exist locally, histories are definitely unrelated.
|
|
1072
|
+
|
|
1073
|
+
Additionally, if the repo has a shallow marker but the remote default
|
|
1074
|
+
branch has diverged past the shallow grafts, `git fetch` itself will
|
|
1075
|
+
indicate problems.
|
|
1076
|
+
"""
|
|
1077
|
+
try:
|
|
1078
|
+
# Get the local HEAD SHA
|
|
1079
|
+
local_proc = await asyncio.create_subprocess_exec(
|
|
1080
|
+
"git", "rev-parse", "HEAD",
|
|
1081
|
+
cwd=str(repo_path),
|
|
1082
|
+
stdout=asyncio.subprocess.PIPE,
|
|
1083
|
+
stderr=asyncio.subprocess.PIPE,
|
|
1084
|
+
)
|
|
1085
|
+
local_out, _ = await asyncio.wait_for(local_proc.communicate(), timeout=10)
|
|
1086
|
+
if local_proc.returncode != 0:
|
|
1087
|
+
return False
|
|
1088
|
+
local_head = local_out.decode().strip()
|
|
1089
|
+
if not local_head:
|
|
1090
|
+
return False
|
|
1091
|
+
|
|
1092
|
+
# Get the remote HEAD SHA via ls-remote (no network for local check)
|
|
1093
|
+
# Try to see if the remote HEAD is in local object store
|
|
1094
|
+
# If git cat-file -e <remote_sha> succeeds, remote HEAD is known locally
|
|
1095
|
+
# (histories still share commits). Otherwise, fully diverged.
|
|
1096
|
+
#
|
|
1097
|
+
# However, after a history rewrite the remote HEAD is a brand-new SHA,
|
|
1098
|
+
# and the local object store only has old SHAs. So we check the other
|
|
1099
|
+
# direction: does the local HEAD exist on the remote at all?
|
|
1100
|
+
# We use `git branch -r --contains <local_head>` which lists remote
|
|
1101
|
+
# tracking branches that contain that commit. If none, it's unrelated.
|
|
1102
|
+
check_proc = await asyncio.create_subprocess_exec(
|
|
1103
|
+
"git", "branch", "-r", "--contains", local_head,
|
|
1104
|
+
cwd=str(repo_path),
|
|
1105
|
+
stdout=asyncio.subprocess.PIPE,
|
|
1106
|
+
stderr=asyncio.subprocess.PIPE,
|
|
1107
|
+
)
|
|
1108
|
+
out, _ = await asyncio.wait_for(check_proc.communicate(), timeout=10)
|
|
1109
|
+
if check_proc.returncode != 0:
|
|
1110
|
+
# Command failed (e.g. invalid object) — history is broken
|
|
1111
|
+
return True
|
|
1112
|
+
remote_branches = out.decode().strip()
|
|
1113
|
+
if not remote_branches:
|
|
1114
|
+
# Local HEAD is not reachable from any remote branch — unrelated
|
|
1115
|
+
logger.info(
|
|
1116
|
+
"Local HEAD %s not found in any remote branch at %s — "
|
|
1117
|
+
"histories appear unrelated (remote may have been rewritten).",
|
|
1118
|
+
local_head[:12], repo_path,
|
|
1119
|
+
)
|
|
1120
|
+
return True
|
|
1121
|
+
except Exception:
|
|
1122
|
+
pass
|
|
1123
|
+
return False
|
|
1124
|
+
|
|
975
1125
|
async def _create_worktree(
|
|
976
1126
|
self, project_dir: Path, repo_url: str, default_branch: str,
|
|
977
1127
|
workspace_key: str, branch_name: str, *, fresh_start: bool = False,
|
|
@@ -1129,6 +1279,25 @@ class WorkspaceManager:
|
|
|
1129
1279
|
)
|
|
1130
1280
|
if not sync_success:
|
|
1131
1281
|
if expect_branch:
|
|
1282
|
+
# Before giving up, check for history-rewrite: if the remote
|
|
1283
|
+
# history was rewritten (all SHAs changed), local objects are
|
|
1284
|
+
# stale and no amount of retries will fix sync. Detect this
|
|
1285
|
+
# and destroy the workspace + _main so they get recloned.
|
|
1286
|
+
is_unrelated = await self._detect_unrelated_histories(ws_path, project_key)
|
|
1287
|
+
if is_unrelated:
|
|
1288
|
+
logger.warning(
|
|
1289
|
+
"Detected repository history mismatch for worktree %s "
|
|
1290
|
+
"(remote history likely rewritten). Discarding stale "
|
|
1291
|
+
"worktree and _main clone for a full re-clone on retry.",
|
|
1292
|
+
ws_path,
|
|
1293
|
+
)
|
|
1294
|
+
await self._remove_broken_worktree(main_repo, ws_path, workspace_key)
|
|
1295
|
+
shutil.rmtree(main_repo, ignore_errors=True)
|
|
1296
|
+
raise RuntimeError(
|
|
1297
|
+
f"Repository history was rewritten (e.g. large-file cleanup). "
|
|
1298
|
+
f"Stale local clone discarded. "
|
|
1299
|
+
f"The task will be retried with a fresh clone."
|
|
1300
|
+
)
|
|
1132
1301
|
raise RuntimeError(
|
|
1133
1302
|
f"Failed to sync branch '{branch_name}' from remote after 3 attempts. "
|
|
1134
1303
|
f"The branch should exist (pushed by prior analysis/design phase). "
|
|
@@ -1149,7 +1318,36 @@ class WorkspaceManager:
|
|
|
1149
1318
|
repo_url, str(main_repo), timeout=settings.GIT_CLONE_TIMEOUT, project_key=project_key,
|
|
1150
1319
|
)
|
|
1151
1320
|
else:
|
|
1152
|
-
|
|
1321
|
+
# Use targeted fetch instead of --all to avoid pulling every branch/tag
|
|
1322
|
+
# from potentially large repos (avoids 300s timeout on big repos).
|
|
1323
|
+
# Fetch default branch only; the feature branch is explicitly fetched below.
|
|
1324
|
+
try:
|
|
1325
|
+
await self._git(
|
|
1326
|
+
"fetch", "origin", default_branch,
|
|
1327
|
+
cwd=main_repo, timeout=settings.GIT_CLONE_TIMEOUT, project_key=project_key,
|
|
1328
|
+
)
|
|
1329
|
+
except RuntimeError as _fetch_err:
|
|
1330
|
+
err_str = str(_fetch_err)
|
|
1331
|
+
# Detect "unrelated histories" / history-rewrite scenarios:
|
|
1332
|
+
# If the remote history was rewritten (e.g. BFG large-file removal),
|
|
1333
|
+
# all commit SHAs change. The local clone becomes incompatible —
|
|
1334
|
+
# fetch may succeed but the local refs are orphaned and unusable.
|
|
1335
|
+
# Detection: check whether local HEAD exists in the remote graph.
|
|
1336
|
+
is_unrelated = await self._detect_unrelated_histories(main_repo, project_key)
|
|
1337
|
+
if is_unrelated or "not our ref" in err_str or "shallow" in err_str:
|
|
1338
|
+
logger.warning(
|
|
1339
|
+
"Detected repository history mismatch for %s (remote history likely "
|
|
1340
|
+
"rewritten). Discarding stale local clone and re-cloning from scratch.",
|
|
1341
|
+
main_repo,
|
|
1342
|
+
)
|
|
1343
|
+
shutil.rmtree(main_repo, ignore_errors=True)
|
|
1344
|
+
await self._git(
|
|
1345
|
+
"clone", "--single-branch", "--no-tags",
|
|
1346
|
+
repo_url, str(main_repo), timeout=settings.GIT_CLONE_TIMEOUT,
|
|
1347
|
+
project_key=project_key,
|
|
1348
|
+
)
|
|
1349
|
+
else:
|
|
1350
|
+
raise
|
|
1153
1351
|
|
|
1154
1352
|
# --single-branch clone only fetches the default branch.
|
|
1155
1353
|
# Explicitly fetch the feature branch so origin/{branch_name}
|
|
@@ -1463,7 +1661,12 @@ class ProcessManager:
|
|
|
1463
1661
|
"name or service not known",
|
|
1464
1662
|
"no such host",
|
|
1465
1663
|
"network is unreachable",
|
|
1466
|
-
"api error"
|
|
1664
|
+
# "api error" removed: too broad — matches agent-generated code/output
|
|
1665
|
+
# discussing API errors. Real API transport errors are covered by the
|
|
1666
|
+
# connection patterns above (refused, reset, timed out, etc.).
|
|
1667
|
+
"apiexception:",
|
|
1668
|
+
"api error: 5", # 5xx errors like "API error: 503", "API error: 502"
|
|
1669
|
+
"api error: connection",
|
|
1467
1670
|
]
|
|
1468
1671
|
|
|
1469
1672
|
def __init__(self):
|
|
@@ -1918,7 +2121,30 @@ class ProcessManager:
|
|
|
1918
2121
|
timeout=timeout,
|
|
1919
2122
|
)
|
|
1920
2123
|
except asyncio.TimeoutError:
|
|
1921
|
-
|
|
2124
|
+
# Kill the entire process group so that child processes (npm, yarn,
|
|
2125
|
+
# ssh, git, etc.) spawned by the agent are also terminated. A plain
|
|
2126
|
+
# proc.kill() only kills the direct subprocess; any grandchildren
|
|
2127
|
+
# become orphaned, keep pipes open, and can exhaust system resources.
|
|
2128
|
+
try:
|
|
2129
|
+
if sys.platform != "win32":
|
|
2130
|
+
import signal as _signal
|
|
2131
|
+
try:
|
|
2132
|
+
os.killpg(os.getpgid(proc.pid), _signal.SIGKILL)
|
|
2133
|
+
except (ProcessLookupError, PermissionError, OSError):
|
|
2134
|
+
pass
|
|
2135
|
+
else:
|
|
2136
|
+
import subprocess as _subprocess
|
|
2137
|
+
_subprocess.run(
|
|
2138
|
+
["taskkill", "/F", "/T", "/PID", str(proc.pid)],
|
|
2139
|
+
capture_output=True,
|
|
2140
|
+
)
|
|
2141
|
+
except Exception:
|
|
2142
|
+
pass
|
|
2143
|
+
finally:
|
|
2144
|
+
try:
|
|
2145
|
+
proc.kill()
|
|
2146
|
+
except Exception:
|
|
2147
|
+
pass
|
|
1922
2148
|
# Drain any remaining output after kill
|
|
1923
2149
|
try:
|
|
1924
2150
|
remaining, _ = await asyncio.wait_for(proc.communicate(), timeout=5)
|
|
@@ -1985,6 +2211,7 @@ class ProcessManager:
|
|
|
1985
2211
|
cwd=str(cwd),
|
|
1986
2212
|
env=env,
|
|
1987
2213
|
limit=100 * 1024 * 1024, # 100MB line buffer for large JSON output from long sessions
|
|
2214
|
+
start_new_session=True, # own process group → killpg on timeout kills all children
|
|
1988
2215
|
)
|
|
1989
2216
|
self.active_processes[task_id] = proc
|
|
1990
2217
|
stdout, stderr, returncode = await self._stream_process(
|
|
@@ -2054,9 +2281,57 @@ class ProcessManager:
|
|
|
2054
2281
|
self, agent: DiscoveredAgent, prompt: str, cwd: Path, timeout: int, task_id: str,
|
|
2055
2282
|
on_chunk: Any = None,
|
|
2056
2283
|
) -> TaskResult:
|
|
2057
|
-
"""Run Codex CLI in exec mode (non-interactive).
|
|
2058
|
-
|
|
2284
|
+
"""Run Codex CLI in exec mode (non-interactive).
|
|
2285
|
+
|
|
2286
|
+
Sandbox mode selection (FACTORY_CODEX_SANDBOX env var):
|
|
2287
|
+
- "bypass" (default): --dangerously-bypass-approvals-and-sandbox
|
|
2288
|
+
Safe for daemon context: the daemon already runs on a controlled
|
|
2289
|
+
machine and the workspace path is pre-scoped to the project.
|
|
2290
|
+
Required when running inside Docker or any environment that lacks
|
|
2291
|
+
CAP_NET_ADMIN, because codex --full-auto internally uses bubblewrap
|
|
2292
|
+
(bwrap) which tries to set up a loopback network interface and fails
|
|
2293
|
+
with "bwrap: loopback: Failed RTM_NEWADDR: Operation not permitted".
|
|
2294
|
+
- "bwrap": --full-auto (uses bubblewrap Linux sandbox). Only works
|
|
2295
|
+
when bwrap can create user+network namespaces (bare-metal Linux,
|
|
2296
|
+
not inside most Docker containers).
|
|
2297
|
+
"""
|
|
2298
|
+
sandbox_mode = os.environ.get("FACTORY_CODEX_SANDBOX", "bypass").strip().lower()
|
|
2299
|
+
if sandbox_mode == "bwrap":
|
|
2300
|
+
sandbox_flag = "--full-auto"
|
|
2301
|
+
else:
|
|
2302
|
+
# Default: bypass sandbox entirely — no bwrap, no approval prompts.
|
|
2303
|
+
# Equivalent to Kimi's --yolo and OpenCode's --dangerously-skip-permissions.
|
|
2304
|
+
sandbox_flag = "--dangerously-bypass-approvals-and-sandbox"
|
|
2305
|
+
|
|
2306
|
+
cmd = [agent.command, "exec", sandbox_flag, "--json", "-"]
|
|
2059
2307
|
result = await self._run_cli(cmd, cwd, timeout, task_id, stdin_input=prompt, on_chunk=on_chunk)
|
|
2308
|
+
|
|
2309
|
+
# Detect the bwrap loopback error and surface a clear, actionable message.
|
|
2310
|
+
# This happens when FACTORY_CODEX_SANDBOX=bwrap (or any future codex version
|
|
2311
|
+
# that enables bwrap by default) is used inside Docker/container environments
|
|
2312
|
+
# that lack CAP_NET_ADMIN.
|
|
2313
|
+
if result.status == "failed" and "RTM_NEWADDR" in (result.stderr or ""):
|
|
2314
|
+
logger.error(
|
|
2315
|
+
"Codex sandbox (bwrap) failed for task %s with network namespace error. "
|
|
2316
|
+
"Set FACTORY_CODEX_SANDBOX=bypass (default) to disable bwrap sandboxing. "
|
|
2317
|
+
"Original error: %s",
|
|
2318
|
+
task_id, (result.stderr or "").strip()[:500],
|
|
2319
|
+
)
|
|
2320
|
+
result = TaskResult(
|
|
2321
|
+
status="failed",
|
|
2322
|
+
exit_code=result.exit_code,
|
|
2323
|
+
stdout=result.stdout,
|
|
2324
|
+
stderr=result.stderr,
|
|
2325
|
+
error=(
|
|
2326
|
+
"codex_sandbox_error: bubblewrap (bwrap) failed to create a network "
|
|
2327
|
+
"namespace (RTM_NEWADDR: Operation not permitted). This environment "
|
|
2328
|
+
"does not support bwrap sandboxing (e.g. Docker without CAP_NET_ADMIN). "
|
|
2329
|
+
"Fix: set FACTORY_CODEX_SANDBOX=bypass in the daemon environment "
|
|
2330
|
+
"(this is already the default — check that no override is set)."
|
|
2331
|
+
),
|
|
2332
|
+
metrics=result.metrics,
|
|
2333
|
+
)
|
|
2334
|
+
|
|
2060
2335
|
parsed_metrics = self._parse_agent_jsonl_output(result.stdout)
|
|
2061
2336
|
result.metrics.update(parsed_metrics)
|
|
2062
2337
|
return result
|
|
@@ -2065,14 +2340,23 @@ class ProcessManager:
|
|
|
2065
2340
|
self, agent: DiscoveredAgent, prompt: str, cwd: Path, timeout: int, task_id: str,
|
|
2066
2341
|
on_chunk: Any = None,
|
|
2067
2342
|
) -> TaskResult:
|
|
2068
|
-
"""Run OpenCode CLI in non-interactive mode.
|
|
2343
|
+
"""Run OpenCode CLI in non-interactive mode.
|
|
2344
|
+
|
|
2345
|
+
Uses `opencode run --format json --dir <cwd>` for headless execution.
|
|
2346
|
+
The message is passed as a positional argument.
|
|
2347
|
+
NOTE: `--dir` is the correct flag (not `--cwd` which is invalid).
|
|
2348
|
+
"""
|
|
2069
2349
|
cmd = [
|
|
2070
2350
|
agent.command, "run",
|
|
2071
2351
|
"--format", "json",
|
|
2072
2352
|
"--dangerously-skip-permissions",
|
|
2073
|
-
"--
|
|
2074
|
-
prompt,
|
|
2353
|
+
"--dir", str(cwd),
|
|
2075
2354
|
]
|
|
2355
|
+
# Apply model override if configured (e.g. FACTORY_OPENCODE_MODEL=copilot/gpt-4.1)
|
|
2356
|
+
model_override = os.environ.get("FACTORY_OPENCODE_MODEL")
|
|
2357
|
+
if model_override:
|
|
2358
|
+
cmd += ["--model", model_override]
|
|
2359
|
+
cmd.append(prompt)
|
|
2076
2360
|
result = await self._run_cli(cmd, cwd, timeout, task_id, on_chunk=on_chunk)
|
|
2077
2361
|
parsed_metrics = self._parse_agent_jsonl_output(result.stdout)
|
|
2078
2362
|
result.metrics.update(parsed_metrics)
|
|
@@ -2121,6 +2405,7 @@ class ProcessManager:
|
|
|
2121
2405
|
stdin=asyncio.subprocess.PIPE if stdin_input else None,
|
|
2122
2406
|
cwd=str(cwd),
|
|
2123
2407
|
limit=100 * 1024 * 1024, # 100MB line buffer for large agent output
|
|
2408
|
+
start_new_session=True, # own process group → killpg on timeout kills all children
|
|
2124
2409
|
)
|
|
2125
2410
|
self.active_processes[task_id] = proc
|
|
2126
2411
|
stdin_bytes = stdin_input.encode() if stdin_input else None
|
|
@@ -2136,8 +2421,28 @@ class ProcessManager:
|
|
|
2136
2421
|
error="" if status == "success" else f"Exited with code {returncode}",
|
|
2137
2422
|
)
|
|
2138
2423
|
except asyncio.TimeoutError:
|
|
2139
|
-
|
|
2140
|
-
|
|
2424
|
+
proc = self.active_processes.pop(task_id, None)
|
|
2425
|
+
if proc:
|
|
2426
|
+
try:
|
|
2427
|
+
if sys.platform != "win32":
|
|
2428
|
+
import signal as _signal
|
|
2429
|
+
try:
|
|
2430
|
+
os.killpg(os.getpgid(proc.pid), _signal.SIGKILL)
|
|
2431
|
+
except (ProcessLookupError, PermissionError, OSError):
|
|
2432
|
+
pass
|
|
2433
|
+
else:
|
|
2434
|
+
import subprocess as _subprocess
|
|
2435
|
+
_subprocess.run(
|
|
2436
|
+
["taskkill", "/F", "/T", "/PID", str(proc.pid)],
|
|
2437
|
+
capture_output=True,
|
|
2438
|
+
)
|
|
2439
|
+
except Exception:
|
|
2440
|
+
pass
|
|
2441
|
+
finally:
|
|
2442
|
+
try:
|
|
2443
|
+
proc.kill()
|
|
2444
|
+
except Exception:
|
|
2445
|
+
pass
|
|
2141
2446
|
return TaskResult(
|
|
2142
2447
|
status="failed", exit_code=-1, stdout="", stderr="",
|
|
2143
2448
|
error=f"Timed out after {timeout}s",
|
|
@@ -2542,10 +2847,28 @@ class ProcessManager:
|
|
|
2542
2847
|
return info
|
|
2543
2848
|
|
|
2544
2849
|
async def cancel(self, task_id: str):
|
|
2545
|
-
proc = self.active_processes.
|
|
2850
|
+
proc = self.active_processes.pop(task_id, None)
|
|
2546
2851
|
if proc:
|
|
2547
|
-
|
|
2548
|
-
|
|
2852
|
+
try:
|
|
2853
|
+
if sys.platform != "win32":
|
|
2854
|
+
import signal as _signal
|
|
2855
|
+
try:
|
|
2856
|
+
os.killpg(os.getpgid(proc.pid), _signal.SIGKILL)
|
|
2857
|
+
except (ProcessLookupError, PermissionError, OSError):
|
|
2858
|
+
pass
|
|
2859
|
+
else:
|
|
2860
|
+
import subprocess as _subprocess
|
|
2861
|
+
_subprocess.run(
|
|
2862
|
+
["taskkill", "/F", "/T", "/PID", str(proc.pid)],
|
|
2863
|
+
capture_output=True,
|
|
2864
|
+
)
|
|
2865
|
+
except Exception:
|
|
2866
|
+
pass
|
|
2867
|
+
finally:
|
|
2868
|
+
try:
|
|
2869
|
+
proc.kill()
|
|
2870
|
+
except Exception:
|
|
2871
|
+
pass
|
|
2549
2872
|
|
|
2550
2873
|
|
|
2551
2874
|
# ── Progress Reporter ──
|
|
@@ -2834,6 +3157,23 @@ class TaskPoller:
|
|
|
2834
3157
|
logger.warning("Task poll error: %s", e)
|
|
2835
3158
|
return []
|
|
2836
3159
|
|
|
3160
|
+
async def poll_ai_jobs(self) -> list[dict]:
|
|
3161
|
+
"""Poll for AIJobs dispatched to this daemon (workspace-mode)."""
|
|
3162
|
+
try:
|
|
3163
|
+
resp = await self.client.get(
|
|
3164
|
+
f"{self.server_url}/api/v1/runtimes/{self.runtime_id}/ai-jobs/poll",
|
|
3165
|
+
timeout=10,
|
|
3166
|
+
)
|
|
3167
|
+
if resp.status_code == 200:
|
|
3168
|
+
self._on_success()
|
|
3169
|
+
return resp.json().get("ai_jobs", [])
|
|
3170
|
+
elif resp.status_code in (401, 403):
|
|
3171
|
+
self._on_auth_failure()
|
|
3172
|
+
return []
|
|
3173
|
+
except Exception as e:
|
|
3174
|
+
logger.debug("AIJob poll error: %s", e)
|
|
3175
|
+
return []
|
|
3176
|
+
|
|
2837
3177
|
|
|
2838
3178
|
# ── Server Connection ──
|
|
2839
3179
|
|
|
@@ -3214,6 +3554,11 @@ class RuntimeDaemon:
|
|
|
3214
3554
|
|
|
3215
3555
|
if not acquired:
|
|
3216
3556
|
logger.error("Cannot acquire daemon lock — another instance may still be running")
|
|
3557
|
+
print(
|
|
3558
|
+
"DAEMON_ERROR: Cannot acquire daemon lock — another daemon instance may "
|
|
3559
|
+
"still be running. Stop the existing daemon first or restart the machine.",
|
|
3560
|
+
file=sys.stderr,
|
|
3561
|
+
)
|
|
3217
3562
|
raise SystemExit(1)
|
|
3218
3563
|
|
|
3219
3564
|
# Write PID to lock file (for reference, though unreadable while locked)
|
|
@@ -3269,6 +3614,11 @@ class RuntimeDaemon:
|
|
|
3269
3614
|
fcntl.flock(self._lock_file.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
|
|
3270
3615
|
except (IOError, OSError):
|
|
3271
3616
|
logger.error("Cannot acquire daemon lock — another instance may still be running")
|
|
3617
|
+
print(
|
|
3618
|
+
"DAEMON_ERROR: Cannot acquire daemon lock — another daemon instance may "
|
|
3619
|
+
"still be running. Stop the existing daemon first or restart the machine.",
|
|
3620
|
+
file=sys.stderr,
|
|
3621
|
+
)
|
|
3272
3622
|
raise SystemExit(1)
|
|
3273
3623
|
|
|
3274
3624
|
# Write our PID to the lock file for reference
|
|
@@ -3411,6 +3761,23 @@ class RuntimeDaemon:
|
|
|
3411
3761
|
self._execute_task(task, conn)
|
|
3412
3762
|
)
|
|
3413
3763
|
|
|
3764
|
+
# Poll for AIJobs (workspace-mode tasks)
|
|
3765
|
+
if len(self.active_tasks) < self.max_concurrent:
|
|
3766
|
+
ai_jobs = await conn.poller.poll_ai_jobs()
|
|
3767
|
+
for aj in ai_jobs:
|
|
3768
|
+
job_id = aj.get("job_id", "")
|
|
3769
|
+
ai_task_key = f"aijob_{job_id}"
|
|
3770
|
+
if ai_task_key in self.active_tasks:
|
|
3771
|
+
continue
|
|
3772
|
+
if len(self.active_tasks) >= self.max_concurrent:
|
|
3773
|
+
break
|
|
3774
|
+
logger.info("[%s] Starting AIJob %s (type=%s)",
|
|
3775
|
+
conn.label, job_id, aj.get("task_type"))
|
|
3776
|
+
self._task_connections[ai_task_key] = conn
|
|
3777
|
+
self.active_tasks[ai_task_key] = asyncio.create_task(
|
|
3778
|
+
self._execute_ai_job(aj, conn)
|
|
3779
|
+
)
|
|
3780
|
+
|
|
3414
3781
|
async def _execute_task(self, task: TaskInfo, conn: ServerConnection):
|
|
3415
3782
|
"""Execute a single task, reporting to the originating server connection."""
|
|
3416
3783
|
reporter = conn.reporter
|
|
@@ -3908,12 +4275,27 @@ class RuntimeDaemon:
|
|
|
3908
4275
|
|
|
3909
4276
|
# Testing-specific: validate structured test assets
|
|
3910
4277
|
if node_type == "testing":
|
|
3911
|
-
#
|
|
4278
|
+
# Determine which checks to run for this requirement type.
|
|
4279
|
+
#
|
|
4280
|
+
# _skip_test_artifacts = True → skip ALL artifact checks
|
|
4281
|
+
# (set for types that explicitly list "test_coverage" in
|
|
4282
|
+
# skip_dimensions, e.g. "task", "documentation", "spike")
|
|
4283
|
+
#
|
|
4284
|
+
# _requires_structured_artifacts = True → test-cases.json and
|
|
4285
|
+
# coverage-matrix.json are *required* deliverables.
|
|
4286
|
+
# Set only for "feature" and "improvement" — types whose
|
|
4287
|
+
# testing phase is a full QA suite rather than regression
|
|
4288
|
+
# verification. For "bugfix", "refactor", etc. these files
|
|
4289
|
+
# are *optional*: if they exist they are validated, but their
|
|
4290
|
+
# absence is not an error (the agent only writes regression
|
|
4291
|
+
# tests + test-report.md).
|
|
3912
4292
|
_skip_test_artifacts = False
|
|
4293
|
+
_requires_structured_artifacts = False
|
|
3913
4294
|
try:
|
|
3914
4295
|
from app.services.type_workflow_profiles import get_profile
|
|
3915
4296
|
_profile = get_profile(req_type)
|
|
3916
4297
|
_skip_test_artifacts = "test_coverage" in _profile.skip_dimensions
|
|
4298
|
+
_requires_structured_artifacts = req_type in ("feature", "improvement")
|
|
3917
4299
|
except Exception:
|
|
3918
4300
|
pass
|
|
3919
4301
|
|
|
@@ -3930,6 +4312,8 @@ class RuntimeDaemon:
|
|
|
3930
4312
|
base = workspace_path
|
|
3931
4313
|
|
|
3932
4314
|
# --- test-cases.json validation ---
|
|
4315
|
+
# Required for feature/improvement; optional (but validated
|
|
4316
|
+
# if present) for all other testing node types.
|
|
3933
4317
|
tc_path = base / "test-cases.json"
|
|
3934
4318
|
if tc_path.exists():
|
|
3935
4319
|
try:
|
|
@@ -3938,19 +4322,24 @@ class RuntimeDaemon:
|
|
|
3938
4322
|
if not cases:
|
|
3939
4323
|
issues.append("test-cases.json exists but contains no test cases")
|
|
3940
4324
|
else:
|
|
4325
|
+
# Collect ALL malformed test cases in one pass so
|
|
4326
|
+
# the retry prompt can fix everything at once.
|
|
4327
|
+
# (Previously a `break` was used here which caused
|
|
4328
|
+
# a one-issue-per-retry cascade, burning through
|
|
4329
|
+
# max_retries before the file was fully corrected.)
|
|
3941
4330
|
for tc in cases[:20]:
|
|
3942
4331
|
if not tc.get("id") or not tc.get("title"):
|
|
3943
|
-
issues.append(
|
|
3944
|
-
|
|
3945
|
-
|
|
4332
|
+
issues.append(
|
|
4333
|
+
f"Test case missing 'id' or 'title': {tc.get('id', '?')}"
|
|
4334
|
+
)
|
|
4335
|
+
elif not tc.get("steps"):
|
|
3946
4336
|
issues.append(f"Test case {tc['id']} has no 'steps'")
|
|
3947
|
-
break
|
|
3948
4337
|
p0_cases = [c for c in cases if c.get("priority") == "P0"]
|
|
3949
4338
|
if not p0_cases:
|
|
3950
4339
|
issues.append("No P0 priority test cases found in test-cases.json")
|
|
3951
4340
|
except (_json.JSONDecodeError, UnicodeDecodeError) as e:
|
|
3952
4341
|
issues.append(f"test-cases.json is not valid JSON: {e}")
|
|
3953
|
-
|
|
4342
|
+
elif _requires_structured_artifacts:
|
|
3954
4343
|
issues.append(f"test-cases.json not found in {doc_dir or 'workspace root'}")
|
|
3955
4344
|
|
|
3956
4345
|
# --- coverage-matrix.json validation ---
|
|
@@ -3965,7 +4354,7 @@ class RuntimeDaemon:
|
|
|
3965
4354
|
issues.append(f"Uncovered acceptance criteria in coverage-matrix.json: {ids}")
|
|
3966
4355
|
except (_json.JSONDecodeError, UnicodeDecodeError) as e:
|
|
3967
4356
|
issues.append(f"coverage-matrix.json is not valid JSON: {e}")
|
|
3968
|
-
|
|
4357
|
+
elif _requires_structured_artifacts:
|
|
3969
4358
|
issues.append(f"coverage-matrix.json not found in {doc_dir or 'workspace root'}")
|
|
3970
4359
|
|
|
3971
4360
|
# --- test-report.md validation ---
|
|
@@ -3975,6 +4364,139 @@ class RuntimeDaemon:
|
|
|
3975
4364
|
|
|
3976
4365
|
return issues
|
|
3977
4366
|
|
|
4367
|
+
async def _execute_ai_job(self, aj: dict, conn: "ServerConnection"):
    """Execute an AIJob in the daemon workspace and report the result back.

    Steps, in order: post a "preparing" progress update, select the agent
    CLI, prepare the workspace through ``self.workspace_manager``, run the
    agent with the job's prompt, auto-commit when the run succeeded and
    changed files, then POST the outcome to the job's ``/complete``
    endpoint.

    Args:
        aj: Job payload. Keys read here: ``job_id``, ``project``,
            ``requirement_key``, ``agent_override``, ``system_prompt``,
            ``user_prompt`` and ``input_context["scenario_ids"]``.
        conn: Server connection; ``server_url``, ``runtime_id`` and
            ``client`` are used to build and send the report requests.

    Returns:
        None. Any exception raised while executing is logged and reported
        to the server as a failed job with
        ``failure_code="daemon_exception"``; it is not re-raised.
    """
    job_id = aj.get("job_id", "")
    # ``or`` instead of a ``.get`` default: a key that is present but null
    # would otherwise come through as None and break the code below.
    project_info = aj.get("project") or {}
    input_context = aj.get("input_context") or {}
    requirement_key = aj.get("requirement_key")
    agent_override = aj.get("agent_override")
    system_prompt = aj.get("system_prompt") or ""
    user_prompt = aj.get("user_prompt") or ""

    reporter_url = f"{conn.server_url.rstrip('/')}/api/v1/runtimes/{conn.runtime_id}/ai-jobs/{job_id}"

    try:
        # Report progress: starting
        await conn.client.post(
            f"{reporter_url}/progress",
            json={"current_phase": "preparing", "current_step": "Preparing workspace...", "progress_pct": 5},
            timeout=10,
        )

        # 1. Select agent
        agent_type = agent_override or "claude-code"
        agent = self._select_agent(agent_type, [])
        if not agent:
            await conn.client.post(
                f"{reporter_url}/complete",
                json={"status": "failed", "error": f"No agent CLI for '{agent_type}'", "failure_code": "no_agent"},
                timeout=10,
            )
            return

        # 2. Prepare workspace (using project info + requirement branch).
        # The job is wrapped in a TaskInfo so the task-oriented workspace,
        # process and commit helpers can be reused unchanged.
        full_prompt = f"{system_prompt}\n\n{user_prompt}" if system_prompt else user_prompt
        fake_task = TaskInfo(
            task_id=job_id,
            graph_id="",
            node_type="ai_job",
            agent_type=agent_type,
            input_prompt=full_prompt,
            input_data={},
            timeout_seconds=settings.AGENT_TIMEOUT,
            max_retries=0,
            retry_count=0,
            project=project_info,
            work_item={},
            fallback_chain=[],
            requirement_workflow_id=None,
            requirement_key=requirement_key,
            graph_type="ai_job",
        )
        workspace_path = await self.workspace_manager.prepare_workspace(
            project_info, fake_task,
        )

        await conn.client.post(
            f"{reporter_url}/progress",
            json={"current_phase": "running", "current_step": "Running agent...", "progress_pct": 15},
            timeout=10,
        )

        # 3. Run agent with prompt
        async def on_chunk(lines: list[str]) -> None:
            # Streamed lines are intentionally discarded: the report below
            # is built from ``result.stdout``, and buffering every chunk
            # here only grew memory for the whole duration of the run.
            return None

        result = await self.process_manager.run_agent(
            agent, fake_task, workspace_path, on_chunk=on_chunk,
        )

        # 4. Auto-commit if successful
        git_info = {}
        if result.status == "success" and result.files_changed:
            git_info = await self._auto_commit(workspace_path, fake_task)

        # 5. Report completion (only the last 20000 characters of stdout)
        output_content = result.stdout[-20000:] if result.stdout else ""
        scripts: dict = {}

        # Try to extract per-scenario scripts the agent wrote to the workspace
        scenario_ids = input_context.get("scenario_ids") or []
        if scenario_ids and output_content:
            scripts = self._collect_ai_job_scripts(workspace_path, scenario_ids)

        succeeded = result.status == "success"
        complete_payload = {
            "status": "success" if succeeded else "failed",
            "output_content": output_content,
            "output_result": {
                "scripts": scripts,
                "files_changed": result.files_changed,
                "lines_added": result.lines_added,
                "lines_removed": result.lines_removed,
            },
            "tier_used": "agent_cli",
            "resolved_agent": agent.agent_id,
            "git_info": git_info,
            "error": "" if succeeded else result.error,
            "failure_code": "" if succeeded else "agent_error",
        }

        await conn.client.post(
            f"{reporter_url}/complete",
            json=complete_payload,
            timeout=30,
        )
        logger.info("AIJob %s completed: %s", job_id, result.status)

    except Exception as e:
        logger.exception("AIJob %s execution error", job_id)
        try:
            await conn.client.post(
                f"{reporter_url}/complete",
                json={"status": "failed", "error": str(e)[:2000], "failure_code": "daemon_exception"},
                timeout=10,
            )
        except Exception:
            # Last-resort report: nothing more can be done if the server is
            # unreachable, but leave a trace instead of failing silently.
            logger.warning(
                "AIJob %s: could not report failure to server", job_id, exc_info=True,
            )

@staticmethod
def _collect_ai_job_scripts(workspace_path, scenario_ids) -> dict:
    """Map scenario ids to the text of a matching file under ``tests/``.

    A file matches a scenario when its name contains the first 8
    characters of the scenario id. When several files match, the first one
    in sorted path order is used so the result is deterministic.

    Args:
        workspace_path: Workspace root; ``tests`` is searched recursively.
        scenario_ids: Iterable of scenario identifiers.

    Returns:
        Dict of ``scenario id -> file content`` for every scenario with a
        readable matching file. Scenarios without one are omitted.
    """
    import glob
    import os

    # Escape the directory part too: a workspace path containing "[" or "*"
    # must be taken literally rather than as a glob pattern.
    tests_root = glob.escape(os.path.join(str(workspace_path), "tests"))
    scripts: dict = {}
    for sid in scenario_ids:
        sid_text = "" if sid is None else str(sid)
        prefix = sid_text[:8]
        if not prefix:
            # An empty id would produce the pattern "**", which matches
            # every file and would attach an unrelated script.
            continue
        pattern = os.path.join(tests_root, "**", f"*{glob.escape(prefix)}*")
        # Directories can match the pattern as well; only files are readable.
        candidates = sorted(
            path for path in glob.glob(pattern, recursive=True) if os.path.isfile(path)
        )
        if not candidates:
            continue
        try:
            with open(candidates[0], "r", encoding="utf-8", errors="replace") as f:
                scripts[sid_text] = f.read()
        except OSError as exc:
            # Best effort: a missing script must not fail the whole job.
            logger.warning(
                "Could not read script %s for scenario %s: %s",
                candidates[0], sid_text, exc,
            )
    return scripts
|
|
4499
|
+
|
|
3978
4500
|
async def _validate_and_retry(
|
|
3979
4501
|
self,
|
|
3980
4502
|
agent: "DiscoveredAgent",
|
|
@@ -4845,15 +5367,29 @@ class RuntimeDaemon:
|
|
|
4845
5367
|
)
|
|
4846
5368
|
|
|
4847
5369
|
logger.info("Found unpushed commits on %s, pushing...", branch)
|
|
4848
|
-
|
|
4849
|
-
|
|
4850
|
-
|
|
4851
|
-
|
|
4852
|
-
|
|
4853
|
-
|
|
4854
|
-
|
|
4855
|
-
|
|
4856
|
-
|
|
5370
|
+
last_push_exc: Exception | None = None
|
|
5371
|
+
for attempt in range(1, 4): # retry up to 3 times
|
|
5372
|
+
try:
|
|
5373
|
+
await git(
|
|
5374
|
+
"push", "-u", "origin", branch,
|
|
5375
|
+
cwd=workspace_path, project_key=project_key,
|
|
5376
|
+
)
|
|
5377
|
+
logger.info("Pushed branch %s to origin (attempt %d)", branch, attempt)
|
|
5378
|
+
last_push_exc = None
|
|
5379
|
+
break
|
|
5380
|
+
except RuntimeError as exc:
|
|
5381
|
+
last_push_exc = exc
|
|
5382
|
+
if attempt < 3:
|
|
5383
|
+
wait = attempt * 10 # 10s, 20s
|
|
5384
|
+
logger.warning(
|
|
5385
|
+
"Push attempt %d failed for branch %s: %s — retrying in %ds",
|
|
5386
|
+
attempt, branch, exc, wait,
|
|
5387
|
+
)
|
|
5388
|
+
await asyncio.sleep(wait)
|
|
5389
|
+
else:
|
|
5390
|
+
logger.error("Push failed for branch %s after 3 attempts: %s", branch, exc)
|
|
5391
|
+
if last_push_exc is not None:
|
|
5392
|
+
return f"Push failed: {last_push_exc}"
|
|
4857
5393
|
else:
|
|
4858
5394
|
logger.info("No unpushed commits on %s", branch)
|
|
4859
5395
|
return None
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|