forgexa-cli 1.6.1__tar.gz → 1.7.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: forgexa-cli
3
- Version: 1.6.1
3
+ Version: 1.7.5
4
4
  Summary: Forgexa CLI — command-line client and AI agent runtime for the Forgexa platform
5
5
  Author-email: Jason Sun <dev.winds@gmail.com>
6
6
  License: MIT
@@ -1,2 +1,2 @@
1
1
  """forgexa-cli — Forgexa command-line client."""
2
- __version__ = "1.6.1"
2
+ __version__ = "1.7.5"
@@ -10,11 +10,26 @@ Usage:
10
10
 
11
11
  from __future__ import annotations
12
12
 
13
+ import sys
14
+
15
+ # ── Python version gate — must run before any other imports ──────────────────
16
+ # Emit a machine-readable DAEMON_ERROR so the desktop app shows a clear
17
+ # message instead of a cryptic traceback.
18
+ if sys.version_info < (3, 9):
19
+ _ver = f"{sys.version_info.major}.{sys.version_info.minor}"
20
+ print(
21
+ f"DAEMON_ERROR: Python {_ver} is too old. Forgexa Daemon requires Python 3.9 or "
22
+ f"newer. Please upgrade Python from https://www.python.org/downloads/",
23
+ file=sys.stderr,
24
+ )
25
+ sys.exit(1)
26
+
13
27
  import asyncio
14
28
  import base64
15
29
  import hashlib
16
30
  import json
17
31
  import logging
32
+ from logging.handlers import RotatingFileHandler
18
33
  import os
19
34
  import platform
20
35
  import re
@@ -287,6 +302,16 @@ except (ImportError, ModuleNotFoundError):
287
302
  def AGENT_MAX_OUTPUT_SIZE(self) -> int:
288
303
  return int(os.environ.get("AGENT_MAX_OUTPUT_SIZE", "100000"))
289
304
 
305
+ @property
306
+ def FACTORY_CODEX_SANDBOX(self) -> str:
307
+ """Codex sandbox mode: 'bypass' (default, safe) or 'bwrap' (Linux only).
308
+
309
+ 'bypass' uses --dangerously-bypass-approvals-and-sandbox which works
310
+ in all environments including Docker without CAP_NET_ADMIN.
311
+ 'bwrap' uses --full-auto (bubblewrap) which requires CAP_NET_ADMIN.
312
+ """
313
+ return os.environ.get("FACTORY_CODEX_SANDBOX", "bypass").strip().lower()
314
+
290
315
  def get_daemon_workspaces_root(self) -> str:
291
316
  root = self.DAEMON_WORKSPACES_ROOT
292
317
  if not root:
@@ -307,7 +332,7 @@ except (ImportError, ModuleNotFoundError):
307
332
  # DAEMON_VERSION is the protocol/logic version of the daemon code.
308
333
  # Kept in sync with pyproject.toml version via bump-version.sh.
309
334
  # CLIENT_TYPE identifies which packaging/distribution this daemon runs in.
310
- DAEMON_VERSION = "1.6.1"
335
+ DAEMON_VERSION = "1.7.5"
311
336
 
312
337
 
313
338
  def _detect_client_type() -> str:
@@ -344,7 +369,11 @@ _log_dir.mkdir(parents=True, exist_ok=True)
344
369
  DAEMON_LOG_PATH = _log_dir / "daemon.log"
345
370
 
346
371
  _log_handlers: list[logging.Handler] = [
347
- logging.FileHandler(DAEMON_LOG_PATH, mode="a", encoding="utf-8"),
372
+ RotatingFileHandler(
373
+ DAEMON_LOG_PATH, mode="a", encoding="utf-8",
374
+ maxBytes=50 * 1024 * 1024, # 50 MB per file
375
+ backupCount=5,
376
+ ),
348
377
  ]
349
378
  if sys.stderr.isatty():
350
379
  _log_handlers.append(logging.StreamHandler(sys.stderr))
@@ -699,6 +728,9 @@ class AgentDiscovery:
699
728
 
700
729
  async def discover(self) -> list[DiscoveredAgent]:
701
730
  self._expand_path()
731
+ # Probe bwrap support once at discovery time and log a clear warning
732
+ # if it is broken. This surfaces the error early rather than mid-task.
733
+ await self._probe_bwrap_support()
702
734
  available = []
703
735
  for agent_id, spec in self.AGENT_REGISTRY.items():
704
736
  custom_path = os.environ.get(spec.get("env_path_override", ""))
@@ -718,8 +750,9 @@ class AgentDiscovery:
718
750
 
719
751
  async def _get_version(self, detect_cmd: str) -> str:
720
752
  try:
721
- proc = await asyncio.create_subprocess_shell(
722
- detect_cmd,
753
+ parts = detect_cmd.split()
754
+ proc = await asyncio.create_subprocess_exec(
755
+ *parts,
723
756
  stdout=asyncio.subprocess.PIPE,
724
757
  stderr=asyncio.subprocess.PIPE,
725
758
  )
@@ -728,8 +761,60 @@ class AgentDiscovery:
728
761
  except Exception:
729
762
  return "unknown"
730
763
 
764
+ @staticmethod
765
+ async def _probe_bwrap_support() -> None:
766
+ """Probe whether bubblewrap (bwrap) works in this environment.
731
767
 
732
- # ── Workspace Manager ──
768
+ codex exec --full-auto internally creates a bubblewrap sandbox that
769
+ requires a network namespace (CAP_NET_ADMIN). Inside Docker containers
770
+ or other restricted Linux environments this fails immediately with:
771
+ bwrap: loopback: Failed RTM_NEWADDR: Operation not permitted
772
+
773
+ We probe at startup so the operator gets an actionable warning rather
774
+ than a cryptic mid-task failure. The probe is skipped on macOS/Windows
775
+ because Codex uses a different sandbox mechanism on those platforms.
776
+ """
777
+ if sys.platform != "linux":
778
+ return
779
+ sandbox_mode = os.environ.get("FACTORY_CODEX_SANDBOX", "bypass").strip().lower()
780
+ if sandbox_mode != "bwrap":
781
+ # Default mode bypasses sandbox — no bwrap needed, skip probe.
782
+ return
783
+ bwrap_bin = shutil.which("bwrap")
784
+ if not bwrap_bin:
785
+ logger.warning(
786
+ "FACTORY_CODEX_SANDBOX=bwrap but bwrap binary not found. "
787
+ "Codex sandbox will fail. Either install bwrap or unset "
788
+ "FACTORY_CODEX_SANDBOX to use bypass mode (default)."
789
+ )
790
+ return
791
+ try:
792
+ proc = await asyncio.create_subprocess_exec(
793
+ bwrap_bin,
794
+ "--dev", "/dev",
795
+ "--proc", "/proc",
796
+ "--ro-bind", "/usr", "/usr",
797
+ "--unshare-net",
798
+ "true",
799
+ stdout=asyncio.subprocess.DEVNULL,
800
+ stderr=asyncio.subprocess.PIPE,
801
+ )
802
+ _, stderr = await asyncio.wait_for(proc.communicate(), timeout=5)
803
+ if proc.returncode != 0:
804
+ err = (stderr or b"").decode(errors="replace").strip()
805
+ logger.warning(
806
+ "bwrap probe failed (exit=%d): %s. "
807
+ "codex exec --full-auto will fail in this environment. "
808
+ "Unset FACTORY_CODEX_SANDBOX to use bypass mode (default), "
809
+ "or grant CAP_NET_ADMIN / run privileged.",
810
+ proc.returncode, err,
811
+ )
812
+ else:
813
+ logger.info("bwrap probe: network namespaces work in this environment")
814
+ except asyncio.TimeoutError:
815
+ logger.warning("bwrap probe timed out — treating as unsupported")
816
+ except Exception as exc:
817
+ logger.warning("bwrap probe error: %s", exc)
733
818
 
734
819
 
735
820
  class WorkspaceManager:
@@ -972,6 +1057,71 @@ class WorkspaceManager:
972
1057
  # Remove the broken worktree directory
973
1058
  shutil.rmtree(ws_path, ignore_errors=True)
974
1059
 
1060
+ async def _detect_unrelated_histories(self, repo_path: Path, project_key: str) -> bool:
1061
+ """Detect whether local clone has diverged from remote due to history rewrite.
1062
+
1063
+ When a remote repo is rewritten (e.g. via BFG or git filter-repo to
1064
+ remove large files), all commit SHAs change. The local clone retains
1065
+ the old SHAs in its object store, making fetch/reset/merge fail in
1066
+ cryptic ways.
1067
+
1068
+ Strategy: ask git whether the local HEAD commit object is reachable in
1069
+ the remote graph. We use `git ls-remote` to get the remote HEAD SHA,
1070
+ then check if that SHA exists locally. If the remote HEAD does NOT
1071
+ exist locally, histories are definitely unrelated.
1072
+
1073
+ Additionally, if the repo has a shallow marker but the remote default
1074
+ branch has diverged past the shallow grafts, `git fetch` itself will
1075
+ indicate problems.
1076
+ """
1077
+ try:
1078
+ # Get the local HEAD SHA
1079
+ local_proc = await asyncio.create_subprocess_exec(
1080
+ "git", "rev-parse", "HEAD",
1081
+ cwd=str(repo_path),
1082
+ stdout=asyncio.subprocess.PIPE,
1083
+ stderr=asyncio.subprocess.PIPE,
1084
+ )
1085
+ local_out, _ = await asyncio.wait_for(local_proc.communicate(), timeout=10)
1086
+ if local_proc.returncode != 0:
1087
+ return False
1088
+ local_head = local_out.decode().strip()
1089
+ if not local_head:
1090
+ return False
1091
+
1092
+ # Get the remote HEAD SHA via ls-remote (no network for local check)
1093
+ # Try to see if the remote HEAD is in local object store
1094
+ # If git cat-file -e <remote_sha> succeeds, remote HEAD is known locally
1095
+ # (histories still share commits). Otherwise, fully diverged.
1096
+ #
1097
+ # However, after a history rewrite the remote HEAD is a brand-new SHA,
1098
+ # and the local object store only has old SHAs. So we check the other
1099
+ # direction: does the local HEAD exist on the remote at all?
1100
+ # We use `git branch -r --contains <local_head>` which lists remote
1101
+ # tracking branches that contain that commit. If none, it's unrelated.
1102
+ check_proc = await asyncio.create_subprocess_exec(
1103
+ "git", "branch", "-r", "--contains", local_head,
1104
+ cwd=str(repo_path),
1105
+ stdout=asyncio.subprocess.PIPE,
1106
+ stderr=asyncio.subprocess.PIPE,
1107
+ )
1108
+ out, _ = await asyncio.wait_for(check_proc.communicate(), timeout=10)
1109
+ if check_proc.returncode != 0:
1110
+ # Command failed (e.g. invalid object) — history is broken
1111
+ return True
1112
+ remote_branches = out.decode().strip()
1113
+ if not remote_branches:
1114
+ # Local HEAD is not reachable from any remote branch — unrelated
1115
+ logger.info(
1116
+ "Local HEAD %s not found in any remote branch at %s — "
1117
+ "histories appear unrelated (remote may have been rewritten).",
1118
+ local_head[:12], repo_path,
1119
+ )
1120
+ return True
1121
+ except Exception:
1122
+ pass
1123
+ return False
1124
+
975
1125
  async def _create_worktree(
976
1126
  self, project_dir: Path, repo_url: str, default_branch: str,
977
1127
  workspace_key: str, branch_name: str, *, fresh_start: bool = False,
@@ -1129,6 +1279,25 @@ class WorkspaceManager:
1129
1279
  )
1130
1280
  if not sync_success:
1131
1281
  if expect_branch:
1282
+ # Before giving up, check for history-rewrite: if the remote
1283
+ # history was rewritten (all SHAs changed), local objects are
1284
+ # stale and no amount of retries will fix sync. Detect this
1285
+ # and destroy the workspace + _main so they get recloned.
1286
+ is_unrelated = await self._detect_unrelated_histories(ws_path, project_key)
1287
+ if is_unrelated:
1288
+ logger.warning(
1289
+ "Detected repository history mismatch for worktree %s "
1290
+ "(remote history likely rewritten). Discarding stale "
1291
+ "worktree and _main clone for a full re-clone on retry.",
1292
+ ws_path,
1293
+ )
1294
+ await self._remove_broken_worktree(main_repo, ws_path, workspace_key)
1295
+ shutil.rmtree(main_repo, ignore_errors=True)
1296
+ raise RuntimeError(
1297
+ f"Repository history was rewritten (e.g. large-file cleanup). "
1298
+ f"Stale local clone discarded. "
1299
+ f"The task will be retried with a fresh clone."
1300
+ )
1132
1301
  raise RuntimeError(
1133
1302
  f"Failed to sync branch '{branch_name}' from remote after 3 attempts. "
1134
1303
  f"The branch should exist (pushed by prior analysis/design phase). "
@@ -1149,7 +1318,36 @@ class WorkspaceManager:
1149
1318
  repo_url, str(main_repo), timeout=settings.GIT_CLONE_TIMEOUT, project_key=project_key,
1150
1319
  )
1151
1320
  else:
1152
- await self._git("fetch", "--all", cwd=main_repo, timeout=300, project_key=project_key)
1321
+ # Use targeted fetch instead of --all to avoid pulling every branch/tag
1322
+ # from potentially large repos (avoids 300s timeout on big repos).
1323
+ # Fetch default branch only; the feature branch is explicitly fetched below.
1324
+ try:
1325
+ await self._git(
1326
+ "fetch", "origin", default_branch,
1327
+ cwd=main_repo, timeout=settings.GIT_CLONE_TIMEOUT, project_key=project_key,
1328
+ )
1329
+ except RuntimeError as _fetch_err:
1330
+ err_str = str(_fetch_err)
1331
+ # Detect "unrelated histories" / history-rewrite scenarios:
1332
+ # If the remote history was rewritten (e.g. BFG large-file removal),
1333
+ # all commit SHAs change. The local clone becomes incompatible —
1334
+ # fetch may succeed but the local refs are orphaned and unusable.
1335
+ # Detection: check whether local HEAD exists in the remote graph.
1336
+ is_unrelated = await self._detect_unrelated_histories(main_repo, project_key)
1337
+ if is_unrelated or "not our ref" in err_str or "shallow" in err_str:
1338
+ logger.warning(
1339
+ "Detected repository history mismatch for %s (remote history likely "
1340
+ "rewritten). Discarding stale local clone and re-cloning from scratch.",
1341
+ main_repo,
1342
+ )
1343
+ shutil.rmtree(main_repo, ignore_errors=True)
1344
+ await self._git(
1345
+ "clone", "--single-branch", "--no-tags",
1346
+ repo_url, str(main_repo), timeout=settings.GIT_CLONE_TIMEOUT,
1347
+ project_key=project_key,
1348
+ )
1349
+ else:
1350
+ raise
1153
1351
 
1154
1352
  # --single-branch clone only fetches the default branch.
1155
1353
  # Explicitly fetch the feature branch so origin/{branch_name}
@@ -1463,7 +1661,12 @@ class ProcessManager:
1463
1661
  "name or service not known",
1464
1662
  "no such host",
1465
1663
  "network is unreachable",
1466
- "api error",
1664
+ # "api error" removed: too broad — matches agent-generated code/output
1665
+ # discussing API errors. Real API transport errors are covered by the
1666
+ # connection patterns above (refused, reset, timed out, etc.).
1667
+ "apiexception:",
1668
+ "api error: 5", # 5xx errors like "API error: 503", "API error: 502"
1669
+ "api error: connection",
1467
1670
  ]
1468
1671
 
1469
1672
  def __init__(self):
@@ -1918,7 +2121,30 @@ class ProcessManager:
1918
2121
  timeout=timeout,
1919
2122
  )
1920
2123
  except asyncio.TimeoutError:
1921
- proc.kill()
2124
+ # Kill the entire process group so that child processes (npm, yarn,
2125
+ # ssh, git, etc.) spawned by the agent are also terminated. A plain
2126
+ # proc.kill() only kills the direct subprocess; any grandchildren
2127
+ # become orphaned, keep pipes open, and can exhaust system resources.
2128
+ try:
2129
+ if sys.platform != "win32":
2130
+ import signal as _signal
2131
+ try:
2132
+ os.killpg(os.getpgid(proc.pid), _signal.SIGKILL)
2133
+ except (ProcessLookupError, PermissionError, OSError):
2134
+ pass
2135
+ else:
2136
+ import subprocess as _subprocess
2137
+ _subprocess.run(
2138
+ ["taskkill", "/F", "/T", "/PID", str(proc.pid)],
2139
+ capture_output=True,
2140
+ )
2141
+ except Exception:
2142
+ pass
2143
+ finally:
2144
+ try:
2145
+ proc.kill()
2146
+ except Exception:
2147
+ pass
1922
2148
  # Drain any remaining output after kill
1923
2149
  try:
1924
2150
  remaining, _ = await asyncio.wait_for(proc.communicate(), timeout=5)
@@ -1985,6 +2211,7 @@ class ProcessManager:
1985
2211
  cwd=str(cwd),
1986
2212
  env=env,
1987
2213
  limit=100 * 1024 * 1024, # 100MB line buffer for large JSON output from long sessions
2214
+ start_new_session=True, # own process group → killpg on timeout kills all children
1988
2215
  )
1989
2216
  self.active_processes[task_id] = proc
1990
2217
  stdout, stderr, returncode = await self._stream_process(
@@ -2054,9 +2281,57 @@ class ProcessManager:
2054
2281
  self, agent: DiscoveredAgent, prompt: str, cwd: Path, timeout: int, task_id: str,
2055
2282
  on_chunk: Any = None,
2056
2283
  ) -> TaskResult:
2057
- """Run Codex CLI in exec mode (non-interactive)."""
2058
- cmd = [agent.command, "exec", "--full-auto", "--json", "-"]
2284
+ """Run Codex CLI in exec mode (non-interactive).
2285
+
2286
+ Sandbox mode selection (FACTORY_CODEX_SANDBOX env var):
2287
+ - "bypass" (default): --dangerously-bypass-approvals-and-sandbox
2288
+ Safe for daemon context: the daemon already runs on a controlled
2289
+ machine and the workspace path is pre-scoped to the project.
2290
+ Required when running inside Docker or any environment that lacks
2291
+ CAP_NET_ADMIN, because codex --full-auto internally uses bubblewrap
2292
+ (bwrap) which tries to set up a loopback network interface and fails
2293
+ with "bwrap: loopback: Failed RTM_NEWADDR: Operation not permitted".
2294
+ - "bwrap": --full-auto (uses bubblewrap Linux sandbox). Only works
2295
+ when bwrap can create user+network namespaces (bare-metal Linux,
2296
+ not inside most Docker containers).
2297
+ """
2298
+ sandbox_mode = os.environ.get("FACTORY_CODEX_SANDBOX", "bypass").strip().lower()
2299
+ if sandbox_mode == "bwrap":
2300
+ sandbox_flag = "--full-auto"
2301
+ else:
2302
+ # Default: bypass sandbox entirely — no bwrap, no approval prompts.
2303
+ # Equivalent to Kimi's --yolo and OpenCode's --dangerously-skip-permissions.
2304
+ sandbox_flag = "--dangerously-bypass-approvals-and-sandbox"
2305
+
2306
+ cmd = [agent.command, "exec", sandbox_flag, "--json", "-"]
2059
2307
  result = await self._run_cli(cmd, cwd, timeout, task_id, stdin_input=prompt, on_chunk=on_chunk)
2308
+
2309
+ # Detect the bwrap loopback error and surface a clear, actionable message.
2310
+ # This happens when FACTORY_CODEX_SANDBOX=bwrap (or any future codex version
2311
+ # that enables bwrap by default) is used inside Docker/container environments
2312
+ # that lack CAP_NET_ADMIN.
2313
+ if result.status == "failed" and "RTM_NEWADDR" in (result.stderr or ""):
2314
+ logger.error(
2315
+ "Codex sandbox (bwrap) failed for task %s with network namespace error. "
2316
+ "Set FACTORY_CODEX_SANDBOX=bypass (default) to disable bwrap sandboxing. "
2317
+ "Original error: %s",
2318
+ task_id, (result.stderr or "").strip()[:500],
2319
+ )
2320
+ result = TaskResult(
2321
+ status="failed",
2322
+ exit_code=result.exit_code,
2323
+ stdout=result.stdout,
2324
+ stderr=result.stderr,
2325
+ error=(
2326
+ "codex_sandbox_error: bubblewrap (bwrap) failed to create a network "
2327
+ "namespace (RTM_NEWADDR: Operation not permitted). This environment "
2328
+ "does not support bwrap sandboxing (e.g. Docker without CAP_NET_ADMIN). "
2329
+ "Fix: set FACTORY_CODEX_SANDBOX=bypass in the daemon environment "
2330
+ "(this is already the default — check that no override is set)."
2331
+ ),
2332
+ metrics=result.metrics,
2333
+ )
2334
+
2060
2335
  parsed_metrics = self._parse_agent_jsonl_output(result.stdout)
2061
2336
  result.metrics.update(parsed_metrics)
2062
2337
  return result
@@ -2065,14 +2340,23 @@ class ProcessManager:
2065
2340
  self, agent: DiscoveredAgent, prompt: str, cwd: Path, timeout: int, task_id: str,
2066
2341
  on_chunk: Any = None,
2067
2342
  ) -> TaskResult:
2068
- """Run OpenCode CLI in non-interactive mode."""
2343
+ """Run OpenCode CLI in non-interactive mode.
2344
+
2345
+ Uses `opencode run --format json --dir <cwd>` for headless execution.
2346
+ The message is passed as a positional argument.
2347
+ NOTE: `--dir` is the correct flag (not `--cwd` which is invalid).
2348
+ """
2069
2349
  cmd = [
2070
2350
  agent.command, "run",
2071
2351
  "--format", "json",
2072
2352
  "--dangerously-skip-permissions",
2073
- "--cwd", str(cwd),
2074
- prompt,
2353
+ "--dir", str(cwd),
2075
2354
  ]
2355
+ # Apply model override if configured (e.g. FACTORY_OPENCODE_MODEL=copilot/gpt-4.1)
2356
+ model_override = os.environ.get("FACTORY_OPENCODE_MODEL")
2357
+ if model_override:
2358
+ cmd += ["--model", model_override]
2359
+ cmd.append(prompt)
2076
2360
  result = await self._run_cli(cmd, cwd, timeout, task_id, on_chunk=on_chunk)
2077
2361
  parsed_metrics = self._parse_agent_jsonl_output(result.stdout)
2078
2362
  result.metrics.update(parsed_metrics)
@@ -2121,6 +2405,7 @@ class ProcessManager:
2121
2405
  stdin=asyncio.subprocess.PIPE if stdin_input else None,
2122
2406
  cwd=str(cwd),
2123
2407
  limit=100 * 1024 * 1024, # 100MB line buffer for large agent output
2408
+ start_new_session=True, # own process group → killpg on timeout kills all children
2124
2409
  )
2125
2410
  self.active_processes[task_id] = proc
2126
2411
  stdin_bytes = stdin_input.encode() if stdin_input else None
@@ -2136,8 +2421,28 @@ class ProcessManager:
2136
2421
  error="" if status == "success" else f"Exited with code {returncode}",
2137
2422
  )
2138
2423
  except asyncio.TimeoutError:
2139
- if task_id in self.active_processes:
2140
- self.active_processes[task_id].kill()
2424
+ proc = self.active_processes.pop(task_id, None)
2425
+ if proc:
2426
+ try:
2427
+ if sys.platform != "win32":
2428
+ import signal as _signal
2429
+ try:
2430
+ os.killpg(os.getpgid(proc.pid), _signal.SIGKILL)
2431
+ except (ProcessLookupError, PermissionError, OSError):
2432
+ pass
2433
+ else:
2434
+ import subprocess as _subprocess
2435
+ _subprocess.run(
2436
+ ["taskkill", "/F", "/T", "/PID", str(proc.pid)],
2437
+ capture_output=True,
2438
+ )
2439
+ except Exception:
2440
+ pass
2441
+ finally:
2442
+ try:
2443
+ proc.kill()
2444
+ except Exception:
2445
+ pass
2141
2446
  return TaskResult(
2142
2447
  status="failed", exit_code=-1, stdout="", stderr="",
2143
2448
  error=f"Timed out after {timeout}s",
@@ -2542,10 +2847,28 @@ class ProcessManager:
2542
2847
  return info
2543
2848
 
2544
2849
  async def cancel(self, task_id: str):
2545
- proc = self.active_processes.get(task_id)
2850
+ proc = self.active_processes.pop(task_id, None)
2546
2851
  if proc:
2547
- proc.kill()
2548
- self.active_processes.pop(task_id, None)
2852
+ try:
2853
+ if sys.platform != "win32":
2854
+ import signal as _signal
2855
+ try:
2856
+ os.killpg(os.getpgid(proc.pid), _signal.SIGKILL)
2857
+ except (ProcessLookupError, PermissionError, OSError):
2858
+ pass
2859
+ else:
2860
+ import subprocess as _subprocess
2861
+ _subprocess.run(
2862
+ ["taskkill", "/F", "/T", "/PID", str(proc.pid)],
2863
+ capture_output=True,
2864
+ )
2865
+ except Exception:
2866
+ pass
2867
+ finally:
2868
+ try:
2869
+ proc.kill()
2870
+ except Exception:
2871
+ pass
2549
2872
 
2550
2873
 
2551
2874
  # ── Progress Reporter ──
@@ -2834,6 +3157,23 @@ class TaskPoller:
2834
3157
  logger.warning("Task poll error: %s", e)
2835
3158
  return []
2836
3159
 
3160
+ async def poll_ai_jobs(self) -> list[dict]:
3161
+ """Poll for AIJobs dispatched to this daemon (workspace-mode)."""
3162
+ try:
3163
+ resp = await self.client.get(
3164
+ f"{self.server_url}/api/v1/runtimes/{self.runtime_id}/ai-jobs/poll",
3165
+ timeout=10,
3166
+ )
3167
+ if resp.status_code == 200:
3168
+ self._on_success()
3169
+ return resp.json().get("ai_jobs", [])
3170
+ elif resp.status_code in (401, 403):
3171
+ self._on_auth_failure()
3172
+ return []
3173
+ except Exception as e:
3174
+ logger.debug("AIJob poll error: %s", e)
3175
+ return []
3176
+
2837
3177
 
2838
3178
  # ── Server Connection ──
2839
3179
 
@@ -3214,6 +3554,11 @@ class RuntimeDaemon:
3214
3554
 
3215
3555
  if not acquired:
3216
3556
  logger.error("Cannot acquire daemon lock — another instance may still be running")
3557
+ print(
3558
+ "DAEMON_ERROR: Cannot acquire daemon lock — another daemon instance may "
3559
+ "still be running. Stop the existing daemon first or restart the machine.",
3560
+ file=sys.stderr,
3561
+ )
3217
3562
  raise SystemExit(1)
3218
3563
 
3219
3564
  # Write PID to lock file (for reference, though unreadable while locked)
@@ -3269,6 +3614,11 @@ class RuntimeDaemon:
3269
3614
  fcntl.flock(self._lock_file.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
3270
3615
  except (IOError, OSError):
3271
3616
  logger.error("Cannot acquire daemon lock — another instance may still be running")
3617
+ print(
3618
+ "DAEMON_ERROR: Cannot acquire daemon lock — another daemon instance may "
3619
+ "still be running. Stop the existing daemon first or restart the machine.",
3620
+ file=sys.stderr,
3621
+ )
3272
3622
  raise SystemExit(1)
3273
3623
 
3274
3624
  # Write our PID to the lock file for reference
@@ -3411,6 +3761,23 @@ class RuntimeDaemon:
3411
3761
  self._execute_task(task, conn)
3412
3762
  )
3413
3763
 
3764
+ # Poll for AIJobs (workspace-mode tasks)
3765
+ if len(self.active_tasks) < self.max_concurrent:
3766
+ ai_jobs = await conn.poller.poll_ai_jobs()
3767
+ for aj in ai_jobs:
3768
+ job_id = aj.get("job_id", "")
3769
+ ai_task_key = f"aijob_{job_id}"
3770
+ if ai_task_key in self.active_tasks:
3771
+ continue
3772
+ if len(self.active_tasks) >= self.max_concurrent:
3773
+ break
3774
+ logger.info("[%s] Starting AIJob %s (type=%s)",
3775
+ conn.label, job_id, aj.get("task_type"))
3776
+ self._task_connections[ai_task_key] = conn
3777
+ self.active_tasks[ai_task_key] = asyncio.create_task(
3778
+ self._execute_ai_job(aj, conn)
3779
+ )
3780
+
3414
3781
  async def _execute_task(self, task: TaskInfo, conn: ServerConnection):
3415
3782
  """Execute a single task, reporting to the originating server connection."""
3416
3783
  reporter = conn.reporter
@@ -3908,12 +4275,27 @@ class RuntimeDaemon:
3908
4275
 
3909
4276
  # Testing-specific: validate structured test assets
3910
4277
  if node_type == "testing":
3911
- # Check if this type requires full test artifacts
4278
+ # Determine which checks to run for this requirement type.
4279
+ #
4280
+ # _skip_test_artifacts = True → skip ALL artifact checks
4281
+ # (set for types that explicitly list "test_coverage" in
4282
+ # skip_dimensions, e.g. "task", "documentation", "spike")
4283
+ #
4284
+ # _requires_structured_artifacts = True → test-cases.json and
4285
+ # coverage-matrix.json are *required* deliverables.
4286
+ # Set only for "feature" and "improvement" — types whose
4287
+ # testing phase is a full QA suite rather than regression
4288
+ # verification. For "bugfix", "refactor", etc. these files
4289
+ # are *optional*: if they exist they are validated, but their
4290
+ # absence is not an error (the agent only writes regression
4291
+ # tests + test-report.md).
3912
4292
  _skip_test_artifacts = False
4293
+ _requires_structured_artifacts = False
3913
4294
  try:
3914
4295
  from app.services.type_workflow_profiles import get_profile
3915
4296
  _profile = get_profile(req_type)
3916
4297
  _skip_test_artifacts = "test_coverage" in _profile.skip_dimensions
4298
+ _requires_structured_artifacts = req_type in ("feature", "improvement")
3917
4299
  except Exception:
3918
4300
  pass
3919
4301
 
@@ -3930,6 +4312,8 @@ class RuntimeDaemon:
3930
4312
  base = workspace_path
3931
4313
 
3932
4314
  # --- test-cases.json validation ---
4315
+ # Required for feature/improvement; optional (but validated
4316
+ # if present) for all other testing node types.
3933
4317
  tc_path = base / "test-cases.json"
3934
4318
  if tc_path.exists():
3935
4319
  try:
@@ -3938,19 +4322,24 @@ class RuntimeDaemon:
3938
4322
  if not cases:
3939
4323
  issues.append("test-cases.json exists but contains no test cases")
3940
4324
  else:
4325
+ # Collect ALL malformed test cases in one pass so
4326
+ # the retry prompt can fix everything at once.
4327
+ # (Previously a `break` was used here which caused
4328
+ # a one-issue-per-retry cascade, burning through
4329
+ # max_retries before the file was fully corrected.)
3941
4330
  for tc in cases[:20]:
3942
4331
  if not tc.get("id") or not tc.get("title"):
3943
- issues.append(f"Test case missing 'id' or 'title': {tc.get('id', '?')}")
3944
- break
3945
- if not tc.get("steps"):
4332
+ issues.append(
4333
+ f"Test case missing 'id' or 'title': {tc.get('id', '?')}"
4334
+ )
4335
+ elif not tc.get("steps"):
3946
4336
  issues.append(f"Test case {tc['id']} has no 'steps'")
3947
- break
3948
4337
  p0_cases = [c for c in cases if c.get("priority") == "P0"]
3949
4338
  if not p0_cases:
3950
4339
  issues.append("No P0 priority test cases found in test-cases.json")
3951
4340
  except (_json.JSONDecodeError, UnicodeDecodeError) as e:
3952
4341
  issues.append(f"test-cases.json is not valid JSON: {e}")
3953
- else:
4342
+ elif _requires_structured_artifacts:
3954
4343
  issues.append(f"test-cases.json not found in {doc_dir or 'workspace root'}")
3955
4344
 
3956
4345
  # --- coverage-matrix.json validation ---
@@ -3965,7 +4354,7 @@ class RuntimeDaemon:
3965
4354
  issues.append(f"Uncovered acceptance criteria in coverage-matrix.json: {ids}")
3966
4355
  except (_json.JSONDecodeError, UnicodeDecodeError) as e:
3967
4356
  issues.append(f"coverage-matrix.json is not valid JSON: {e}")
3968
- else:
4357
+ elif _requires_structured_artifacts:
3969
4358
  issues.append(f"coverage-matrix.json not found in {doc_dir or 'workspace root'}")
3970
4359
 
3971
4360
  # --- test-report.md validation ---
@@ -3975,6 +4364,139 @@ class RuntimeDaemon:
3975
4364
 
3976
4365
  return issues
3977
4366
 
4367
+ async def _execute_ai_job(self, aj: dict, conn: "ServerConnection"):
4368
+ """Execute an AIJob in daemon workspace and report results back.
4369
+
4370
+ Uses WorkspaceManager for branch-based isolation, runs the agent CLI
4371
+ with the job's prompt, auto-commits results, and reports back.
4372
+ """
4373
+ job_id = aj.get("job_id", "")
4374
+ task_type = aj.get("task_type", "unknown")
4375
+ project_info = aj.get("project", {})
4376
+ requirement_key = aj.get("requirement_key")
4377
+ agent_override = aj.get("agent_override")
4378
+ system_prompt = aj.get("system_prompt", "")
4379
+ user_prompt = aj.get("user_prompt", "")
4380
+
4381
+ reporter_url = f"{conn.server_url.rstrip('/')}/api/v1/runtimes/{conn.runtime_id}/ai-jobs/{job_id}"
4382
+
4383
+ try:
4384
+ # Report progress: starting
4385
+ await conn.client.post(
4386
+ f"{reporter_url}/progress",
4387
+ json={"current_phase": "preparing", "current_step": "Preparing workspace...", "progress_pct": 5},
4388
+ timeout=10,
4389
+ )
4390
+
4391
+ # 1. Select agent
4392
+ agent_type = agent_override or "claude-code"
4393
+ agent = self._select_agent(agent_type, [])
4394
+ if not agent:
4395
+ await conn.client.post(
4396
+ f"{reporter_url}/complete",
4397
+ json={"status": "failed", "error": f"No agent CLI for '{agent_type}'", "failure_code": "no_agent"},
4398
+ timeout=10,
4399
+ )
4400
+ return
4401
+
4402
+ # 2. Prepare workspace (using project info + requirement branch)
4403
+ full_prompt = f"{system_prompt}\n\n{user_prompt}" if system_prompt else user_prompt
4404
+ fake_task = TaskInfo(
4405
+ task_id=job_id,
4406
+ graph_id="",
4407
+ node_type="ai_job",
4408
+ agent_type=agent_type,
4409
+ input_prompt=full_prompt,
4410
+ input_data={},
4411
+ timeout_seconds=settings.AGENT_TIMEOUT,
4412
+ max_retries=0,
4413
+ retry_count=0,
4414
+ project=project_info,
4415
+ work_item={},
4416
+ fallback_chain=[],
4417
+ requirement_workflow_id=None,
4418
+ requirement_key=requirement_key,
4419
+ graph_type="ai_job",
4420
+ )
4421
+ workspace_path = await self.workspace_manager.prepare_workspace(
4422
+ project_info, fake_task,
4423
+ )
4424
+
4425
+ await conn.client.post(
4426
+ f"{reporter_url}/progress",
4427
+ json={"current_phase": "running", "current_step": "Running agent...", "progress_pct": 15},
4428
+ timeout=10,
4429
+ )
4430
+
4431
+ # 3. Run agent with prompt
4432
+ _line_buffer: list[str] = []
4433
+
4434
+ async def on_chunk(lines: list[str]):
4435
+ _line_buffer.extend(lines)
4436
+
4437
+ result = await self.process_manager.run_agent(
4438
+ agent, fake_task, workspace_path, on_chunk=on_chunk,
4439
+ )
4440
+
4441
+ # 4. Auto-commit if successful
4442
+ git_info = {}
4443
+ if result.status == "success" and result.files_changed:
4444
+ git_info = await self._auto_commit(workspace_path, fake_task)
4445
+
4446
+ # 5. Report completion
4447
+ output_content = result.stdout[-20000:] if result.stdout else ""
4448
+ scripts: dict = {}
4449
+
4450
+ # Try to extract per-scenario scripts from output
4451
+ scenario_ids = aj.get("input_context", {}).get("scenario_ids", [])
4452
+ if scenario_ids and output_content:
4453
+ # Simple heuristic: if output is a single script, map it to first scenario
4454
+ # Daemon-generated scripts may be multiple files in workspace
4455
+ for sid in scenario_ids:
4456
+ # Check if daemon wrote test files to workspace
4457
+ import glob
4458
+ test_files = glob.glob(str(workspace_path / "tests" / "**" / f"*{sid[:8]}*"), recursive=True)
4459
+ if test_files:
4460
+ try:
4461
+ with open(test_files[0], "r") as f:
4462
+ scripts[sid] = f.read()
4463
+ except Exception:
4464
+ pass
4465
+
4466
+ complete_payload = {
4467
+ "status": "success" if result.status == "success" else "failed",
4468
+ "output_content": output_content,
4469
+ "output_result": {
4470
+ "scripts": scripts,
4471
+ "files_changed": result.files_changed,
4472
+ "lines_added": result.lines_added,
4473
+ "lines_removed": result.lines_removed,
4474
+ },
4475
+ "tier_used": "agent_cli",
4476
+ "resolved_agent": agent.agent_id,
4477
+ "git_info": git_info,
4478
+ "error": result.error if result.status != "success" else "",
4479
+ "failure_code": "agent_error" if result.status != "success" else "",
4480
+ }
4481
+
4482
+ await conn.client.post(
4483
+ f"{reporter_url}/complete",
4484
+ json=complete_payload,
4485
+ timeout=30,
4486
+ )
4487
+ logger.info("AIJob %s completed: %s", job_id, result.status)
4488
+
4489
+ except Exception as e:
4490
+ logger.exception("AIJob %s execution error", job_id)
4491
+ try:
4492
+ await conn.client.post(
4493
+ f"{reporter_url}/complete",
4494
+ json={"status": "failed", "error": str(e)[:2000], "failure_code": "daemon_exception"},
4495
+ timeout=10,
4496
+ )
4497
+ except Exception:
4498
+ pass
4499
+
3978
4500
  async def _validate_and_retry(
3979
4501
  self,
3980
4502
  agent: "DiscoveredAgent",
@@ -4845,15 +5367,29 @@ class RuntimeDaemon:
4845
5367
  )
4846
5368
 
4847
5369
  logger.info("Found unpushed commits on %s, pushing...", branch)
4848
- try:
4849
- await git(
4850
- "push", "-u", "origin", branch,
4851
- cwd=workspace_path, project_key=project_key,
4852
- )
4853
- logger.info("Pushed branch %s to origin", branch)
4854
- except RuntimeError as exc:
4855
- logger.error("Push failed for branch %s: %s", branch, exc)
4856
- return f"Push failed: {exc}"
5370
+ last_push_exc: Exception | None = None
5371
+ for attempt in range(1, 4): # retry up to 3 times
5372
+ try:
5373
+ await git(
5374
+ "push", "-u", "origin", branch,
5375
+ cwd=workspace_path, project_key=project_key,
5376
+ )
5377
+ logger.info("Pushed branch %s to origin (attempt %d)", branch, attempt)
5378
+ last_push_exc = None
5379
+ break
5380
+ except RuntimeError as exc:
5381
+ last_push_exc = exc
5382
+ if attempt < 3:
5383
+ wait = attempt * 10 # 10s, 20s
5384
+ logger.warning(
5385
+ "Push attempt %d failed for branch %s: %s — retrying in %ds",
5386
+ attempt, branch, exc, wait,
5387
+ )
5388
+ await asyncio.sleep(wait)
5389
+ else:
5390
+ logger.error("Push failed for branch %s after 3 attempts: %s", branch, exc)
5391
+ if last_push_exc is not None:
5392
+ return f"Push failed: {last_push_exc}"
4857
5393
  else:
4858
5394
  logger.info("No unpushed commits on %s", branch)
4859
5395
  return None
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: forgexa-cli
3
- Version: 1.6.1
3
+ Version: 1.7.5
4
4
  Summary: Forgexa CLI — command-line client and AI agent runtime for the Forgexa platform
5
5
  Author-email: Jason Sun <dev.winds@gmail.com>
6
6
  License: MIT
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "forgexa-cli"
3
- version = "1.6.1"
3
+ version = "1.7.5"
4
4
  description = "Forgexa CLI — command-line client and AI agent runtime for the Forgexa platform"
5
5
  requires-python = ">=3.9"
6
6
  license = { text = "MIT" }
File without changes
File without changes