PyPI - forgexa-cli - Versions diffs - 1.8.6__tar.gz → 1.8.8__tar.gz - Mend

forgexa-cli 1.8.6 (tar.gz) → 1.8.8 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

{forgexa_cli-1.8.6 → forgexa_cli-1.8.8}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: forgexa-cli
-Version: 1.8.6
+Version: 1.8.8
 Summary: Forgexa CLI — command-line client and AI agent runtime for the Forgexa platform
 Author-email: Jason Sun <dev.winds@gmail.com>
 License: MIT

{forgexa_cli-1.8.6 → forgexa_cli-1.8.8}/forgexa_cli/__init__.py RENAMED Viewed

@@ -1,2 +1,2 @@
 """forgexa-cli — Forgexa command-line client."""
-__version__ = "1.8.6"
+__version__ = "1.8.8"

{forgexa_cli-1.8.6 → forgexa_cli-1.8.8}/forgexa_cli/daemon.py RENAMED Viewed

@@ -392,7 +392,7 @@ except (ImportError, ModuleNotFoundError):
 # DAEMON_VERSION is the protocol/logic version of the daemon code.
 # Kept in sync with pyproject.toml version via bump-version.sh.
 # CLIENT_TYPE identifies which packaging/distribution this daemon runs in.
-DAEMON_VERSION = "1.8.6"
+DAEMON_VERSION = "1.8.8"
 def _detect_client_type() -> str:
@@ -648,6 +648,10 @@ _ANALYSIS_OUTPUTS_BY_TYPE: dict[str, list[str]] = {
     "documentation": ["outline.md", "analysis.json"],
     "improvement": ["improvement-spec.md", "TASKS.md", "analysis.json", "test-intent.json"],
     "task": ["task-plan.md", "analysis.json"],
+    # Research / feasibility study — no PRD/SDD/TASKS, only a research plan and metadata
+    "spike": ["research.md", "analysis.json"],
+    # Customer support Q&A — lightweight answer doc + metadata only
+    "faq": ["faq-answer.md", "analysis.json"],
 }
@@ -1123,17 +1127,34 @@ class WorkspaceManager:
         )
         if repo_url:
+            # For non-fresh (refine/continuation) nodes, expand expect_branch to
+            # cover any node that is part of a real requirement workflow AND is not
+            # the initial analysis.  This ensures a hard error (and workspace
+            # re-clone) when the branch sync fails, rather than silently proceeding
+            # with a stale workspace that will cause a non-fast-forward push later.
+            expect_branch = bool(task.analysis_branch) or (
+                bool(task.requirement_key) and not is_fresh_start and task.node_type != "analysis"
+            )
             ws_path = await self._create_worktree(
                 project_dir, repo_url, default_branch, workspace_key, branch_name,
                 fresh_start=is_fresh_start,
                 project_key=project_key,
-                expect_branch=bool(task.analysis_branch),
+                expect_branch=expect_branch,
             )
-            # Refine mode: ensure we're on the analysis branch with its history
-            # (not reset to default_branch)
-            if analysis_mode == "refine" and task.node_type == "analysis":
+            # After workspace creation, perform a final branch-specific fetch + reset
+            # to ensure the working tree is at the absolute latest remote state.
+            # This is critical in two scenarios:
+            #   1. Analysis refine mode: must be on the analysis branch history.
+            #   2. All continuation nodes (design/coding/testing): another runtime
+            #      may have pushed commits while this runtime's agent was executing.
+            #      A final sync here keeps the workspace current so the agent works
+            #      on the latest codebase and avoids non-fast-forward push failures.
+            if not is_fresh_start:
                 try:
-                    await self._git("fetch", "origin", cwd=ws_path, project_key=project_key)
+                    await self._git(
+                        "fetch", "origin", branch_name,
+                        cwd=ws_path, project_key=project_key,
+                    )
                 except RuntimeError:
                     pass
                 try:
@@ -1141,15 +1162,28 @@ class WorkspaceManager:
                     await self._git("checkout", branch_name, cwd=ws_path)
                 except RuntimeError:
                     pass
-                # Pull latest from remote branch if it exists (preserves prior commits)
+                # Use --ff-only to keep only fast-forward changes; if the branch has
+                # diverged (force-pushed by prior phase), reset --hard is used below.
+                pulled = False
                 try:
                     await self._git(
                         "pull", "--ff-only", "origin", branch_name,
                         cwd=ws_path, project_key=project_key,
                     )
+                    pulled = True
                 except RuntimeError:
-                    # Remote branch might not exist yet or has diverged; that's OK
                     pass
+                if not pulled:
+                    # ff-only failed (diverged or remote not yet created) — try
+                    # reset --hard to force-sync to whatever the remote has.
+                    try:
+                        await self._git(
+                            "reset", "--hard", f"origin/{branch_name}",
+                            cwd=ws_path,
+                        )
+                    except RuntimeError:
+                        # Remote branch may not exist yet (first analysis on fresh repo)
+                        pass
             return ws_path
         else:
             # No repo — create a directory with git init for change tracking
@@ -1292,18 +1326,29 @@ class WorkspaceManager:
                 # only fetches the default branch.  Explicitly fetch the
                 # feature branch with a full refspec so that
                 # origin/{branch_name} is available for checkout/reset.
+                _last_sync_err: str = ""
                 try:
                     await self._git("fetch", "origin", cwd=ws_path, project_key=project_key)
-                except RuntimeError:
-                    pass
+                except RuntimeError as _pre_fe:
+                    logger.warning(
+                        "fetch origin failed for worktree %s: %s "
+                        "(likely auth/SSH issue — will retry in sync loop)",
+                        ws_path, _pre_fe,
+                    )
+                    _last_sync_err = str(_pre_fe)[:300]
                 try:
                     await self._git(
                         "fetch", "origin",
                         f"{branch_name}:refs/remotes/origin/{branch_name}",
                         cwd=ws_path, project_key=project_key,
                     )
-                except RuntimeError:
-                    pass  # Branch may not exist on remote yet
+                except RuntimeError as _pre_fe2:
+                    logger.warning(
+                        "fetch branch %s failed for worktree %s: %s "
+                        "(likely auth/SSH issue — will retry in sync loop)",
+                        branch_name, ws_path, _pre_fe2,
+                    )
+                    _last_sync_err = str(_pre_fe2)[:300]
                 if fresh_start:
                     # Safety check: if the branch already exists on remote with
@@ -1392,8 +1437,12 @@ class WorkspaceManager:
                                             cwd=ws_path,
                                             project_key=project_key,
                                         )
-                                    except RuntimeError:
-                                        pass
+                                    except RuntimeError as _sf:
+                                        logger.warning(
+                                            "Re-fetch %s failed (attempt %d): %s",
+                                            branch_name, _sync_attempt + 1, _sf,
+                                        )
+                                        _last_sync_err = str(_sf)[:300]
                                     continue
                                 else:
                                     logger.warning("Failed to checkout %s after retries: %s", branch_name, exc)
@@ -1419,8 +1468,12 @@ class WorkspaceManager:
                                         cwd=ws_path,
                                         project_key=project_key,
                                     )
-                                except RuntimeError:
-                                    pass
+                                except RuntimeError as _sf2:
+                                    logger.warning(
+                                        "Re-fetch %s failed (attempt %d): %s",
+                                        branch_name, _sync_attempt + 1, _sf2,
+                                    )
+                                    _last_sync_err = str(_sf2)[:300]
                             else:
                                 logger.warning(
                                     "Could not reset to origin/%s after retries: %s — "
@@ -1448,10 +1501,28 @@ class WorkspaceManager:
                                     f"Stale local clone discarded. "
                                     f"The task will be retried with a fresh clone."
                                 )
+                            # Destroy the stale worktree before raising so the
+                            # next retry can re-create it fresh from origin.
+                            # Without this, every retry hits the same broken state.
+                            try:
+                                await self._remove_broken_worktree(
+                                    main_repo, ws_path, workspace_key
+                                )
+                                logger.info(
+                                    "Removed stale worktree %s — retry will re-clone from origin",
+                                    ws_path,
+                                )
+                            except Exception as _rm_exc:
+                                logger.warning("Could not remove stale worktree %s: %s", ws_path, _rm_exc)
+                            _err_detail = (
+                                f"Git error: {_last_sync_err}" if _last_sync_err
+                                else "fetch timed out or credentials missing/invalid"
+                            )
                             raise RuntimeError(
-                                f"Failed to sync branch '{branch_name}' from remote after 3 attempts. "
+                                f"Failed to sync branch '{branch_name}' from remote after 3 attempts "
+                                f"({_err_detail}). "
                                 f"The branch should exist (pushed by prior analysis/design phase). "
-                                f"This task will be retried by the orchestrator."
+                                f"Stale local workspace discarded — this task will be retried by the orchestrator."
                             )
                         else:
                             logger.warning(
@@ -1680,12 +1751,42 @@ class WorkspaceManager:
                     # interprets backslashes as escape sequences, corrupting the
                     # path (e.g. C:\Users → C:Users).
                     key_path_safe = key_path.replace("\\", "/") if sys.platform == "win32" else key_path
+                    # RC1 (Windows): os.chmod(S_IRUSR) does not set proper NTFS ACLs.
+                    # Windows OpenSSH rejects keys that aren't exclusively owner-readable
+                    # ("UNPROTECTED PRIVATE KEY FILE").  Fix with icacls to set the ACLs
+                    # correctly.  NOTE: StrictModes is an sshd_config option (server side);
+                    # passing -o StrictModes=no to the SSH client is invalid and causes:
+                    #   "command-line: line 0: Bad configuration option: strictmodes"
+                    # RC2 (Windows): /dev/null doesn't exist on Windows native OpenSSH
+                    # (C:\Windows\System32\OpenSSH\ssh.exe). Use NUL instead.
+                    if sys.platform == "win32":
+                        _known_hosts_null = "NUL"
+                        try:
+                            import subprocess as _subp
+                            _username = (
+                                os.environ.get("USERNAME")
+                                or os.environ.get("USER")
+                                or ""
+                            )
+                            if _username:
+                                _subp.run(
+                                    [
+                                        "icacls", key_path,
+                                        "/inheritance:r",
+                                        "/grant:r", f"{_username}:(R)",
+                                    ],
+                                    capture_output=True, check=False, timeout=10,
+                                )
+                        except Exception:
+                            pass
+                    else:
+                        _known_hosts_null = "/dev/null"
                     env = {
                         **os.environ,
                         "GIT_SSH_COMMAND": (
                             f'ssh -i "{key_path_safe}"'
                             f" -o StrictHostKeyChecking=accept-new"
-                            f" -o UserKnownHostsFile=/dev/null"
+                            f" -o UserKnownHostsFile={_known_hosts_null}"
                             f" -o IdentitiesOnly=yes"
                             # Detect a stalled TCP connection (server accepts but
                             # never sends the git protocol banner).  After 30 s of
@@ -1782,6 +1883,92 @@ class WorkspaceManager:
 # ── Process Manager ──
+def _kill_proc(proc: asyncio.subprocess.Process) -> None:
+    """Kill a subprocess and its entire process group.
+
+    A plain ``proc.kill()`` only terminates the direct child; grandchildren
+    (npm, yarn, ssh, git, etc.) spawned by the agent stay alive, keep pipes
+    open, and exhaust system resources.  ``os.killpg`` sends SIGKILL to the
+    whole process group, reliably cleaning up all descendants.
+
+    Best-effort: every failure is swallowed and nothing is returned, so
+    calling this on a process that has already exited does not raise.
+
+    Args:
+        proc: The asyncio subprocess to terminate; only ``proc.pid`` and
+            ``proc.kill()`` are used.
+    """
+    try:
+        if sys.platform != "win32":
+            import signal as _signal
+            try:
+                # NOTE(review): this signals the child's process *group*; it
+                # presumably relies on the child having been spawned in its
+                # own group/session (not visible here) — confirm at the spawn
+                # site, otherwise the group could include this daemon.
+                os.killpg(os.getpgid(proc.pid), _signal.SIGKILL)
+            except (ProcessLookupError, PermissionError, OSError):
+                # Process/group already gone or not ours to signal — ignore.
+                pass
+        else:
+            import subprocess as _subprocess
+            # Windows: /T terminates the whole process tree, /F forces it.
+            # This is a blocking call and no timeout is passed.
+            _subprocess.run(
+                ["taskkill", "/F", "/T", "/PID", str(proc.pid)],
+                capture_output=True,
+            )
+    except Exception:
+        pass
+    finally:
+        # Always fall back to killing the direct child, even if the
+        # group/tree kill above failed or raised.
+        try:
+            proc.kill()
+        except Exception:
+            pass
+class _IdleTimeoutError(asyncio.TimeoutError):
+    """Raised when an agent process produces no stdout for longer than AGENT_IDLE_TIMEOUT.
+
+    Subclasses asyncio.TimeoutError so existing ``except asyncio.TimeoutError``
+    handlers catch it, but callers can distinguish it from an absolute wall-clock
+    timeout via ``isinstance(exc, _IdleTimeoutError)`` or ``exc.idle_seconds``.
+    """
+    def __init__(self, idle_seconds: float) -> None:
+        # The exception message is "idle:<N>s" with the duration rounded to
+        # whole seconds; the unrounded value is kept on the instance.
+        super().__init__(f"idle:{idle_seconds:.0f}s")
+        # Length of the stdout silence, in seconds, as reported by the raiser.
+        self.idle_seconds = idle_seconds
+def _workspace_has_recent_activity(
+    workspace_path: "Path", since_monotonic: float, max_depth: int = 4
+) -> bool:
+    """Return True if any file under workspace_path was modified after since_monotonic.
+
+    Converts the monotonic timestamp to a wall-clock value for mtime comparison.
+    Scans the directory tree (up to max_depth levels) with os.scandir, skips
+    .git, and returns on the first matching file for speed.
+
+    This is the key secondary signal used by _stream_process to distinguish
+    "agent is silent but legitimately working" (e.g. running npm install,
+    compiling TypeScript, executing test suites) from "agent is truly hung".
+    All silent-but-busy operations (package installs, compilation, test runs,
+    git operations) write files to disk, so a positive result here means we
+    must NOT kill the process even if stdout has been idle for a long time.
+
+    Args:
+        workspace_path: Root directory to scan.
+        since_monotonic: A ``time.monotonic()`` reading; entries whose mtime
+            is later than the equivalent wall-clock instant count as activity.
+        max_depth: Deepest directory level that is listed.  The root is
+            depth 0, so directories at depths 0..max_depth are scanned.
+
+    Returns:
+        True on the first entry (file or directory, symlinks not followed)
+        with a newer mtime; False if none is found or if any error occurs.
+
+    Note:
+        Entries named ``.git`` are skipped at every level, so changes that
+        are confined to git metadata are not detected by this check.
+    """
+    # Age of the reference point (monotonic now - since) subtracted from the
+    # current wall clock gives the reference instant on the mtime time base.
+    since_wall = time.time() - (time.monotonic() - since_monotonic)
+    def _scan(path: "Path", depth: int) -> bool:
+        # Defensive guard; recursion below already stops at max_depth.
+        if depth > max_depth:
+            return False
+        try:
+            # NOTE(review): the os.scandir iterator is not closed explicitly
+            # (no with-block); on an early return its cleanup is left to
+            # garbage collection.
+            for entry in os.scandir(str(path)):
+                if entry.name == ".git":
+                    continue  # skip version-control metadata
+                try:
+                    # The entry's own mtime is checked first, so a directory
+                    # whose listing changed counts without descending into it.
+                    if entry.stat(follow_symlinks=False).st_mtime > since_wall:
+                        return True
+                    if entry.is_dir(follow_symlinks=False) and depth < max_depth:
+                        if _scan(Path(entry.path), depth + 1):
+                            return True
+                except (OSError, PermissionError):
+                    # Entry vanished or is unreadable — skip it, keep scanning.
+                    pass
+        except (OSError, PermissionError):
+            # Directory itself could not be listed — treat as no activity.
+            pass
+        return False
+    try:
+        return _scan(workspace_path, 0)
+    except Exception:
+        return False  # never let a filesystem check crash the agent run
 class ProcessManager:
     """Manages Agent CLI subprocess lifecycle."""
@@ -2229,6 +2416,7 @@ class ProcessManager:
         timeout: int,
         task_id: str,
         on_chunk: Any,
+        workspace_path: "Path | None" = None,
     ) -> tuple[str, str, int]:
         """Stream stdout line-by-line from a subprocess, flushing to on_chunk.
@@ -2240,7 +2428,22 @@ class ProcessManager:
           deadlock when the process fills the stderr buffer.
         - on_chunk(lines) is called with each decoded line so the caller can
           forward to the progress reporter without waiting for completion.
+        - Idle timeout: if the agent produces no stdout for AGENT_IDLE_TIMEOUT
+          seconds the code checks for filesystem activity in workspace_path
+          before deciding to kill.  If files were recently modified the agent
+          is doing silent work (npm install, compilation, test runs, etc.) and
+          the idle timer is reset.  Only when BOTH stdout AND the filesystem
+          are idle does the process get killed.  This eliminates false-positive
+          kills at the idle boundary.
+        - Absolute timeout (``timeout`` param): hard ceiling for zombie-process
+          prevention.  Always kills at this boundary (no extension), but logs
+          filesystem activity status for post-mortem observability.
         """
+        idle_timeout: int = getattr(settings, "AGENT_IDLE_TIMEOUT", 600)
+        _start_time = time.monotonic()
+        # Mutable state shared between nested coroutines (list avoids nonlocal)
+        _last_activity_at: list[float] = [time.monotonic()]
         # Write prompt and close stdin so the agent knows input is done.
         if stdin_input and proc.stdin:
             try:
@@ -2262,13 +2465,76 @@ class ProcessManager:
             if not proc.stdout:
                 return
             while True:
+                # ── Timeout checks ────────────────────────────────────────────
+                now = time.monotonic()
+                elapsed_abs = now - _start_time
+                if elapsed_abs >= timeout:
+                    # Absolute ceiling reached — hard zombie-process prevention.
+                    # Log filesystem status for observability but always kill;
+                    # never extend the absolute boundary.
+                    _fs_active = workspace_path and _workspace_has_recent_activity(
+                        workspace_path, _last_activity_at[0]
+                    )
+                    logger.warning(
+                        "Task %s absolute timeout %.0fs reached — killing "
+                        "(workspace filesystem %s)",
+                        task_id, elapsed_abs,
+                        "was active" if _fs_active else "was idle",
+                    )
+                    _kill_proc(proc)
+                    raise asyncio.TimeoutError(
+                        f"Timed out after {timeout}s (absolute limit)"
+                    )
+                idle_elapsed = now - _last_activity_at[0]
+                if idle_elapsed >= idle_timeout:
+                    # Before killing, check if the agent is doing silent
+                    # filesystem work (npm install, compilation, test runs, git
+                    # operations, etc.).  These produce no stdout but DO modify
+                    # files — killing at the idle boundary would be a false
+                    # positive.  Only kill when BOTH signals agree: no stdout
+                    # AND no filesystem activity.
+                    if workspace_path and _workspace_has_recent_activity(
+                        workspace_path, _last_activity_at[0]
+                    ):
+                        # Files modified since last stdout → agent is working
+                        # silently.  Reset idle timer and continue.
+                        _last_activity_at[0] = time.monotonic()
+                        logger.info(
+                            "Task %s: stdout idle %.0fs but workspace files "
+                            "modified — resetting idle timer (agent working "
+                            "silently)",
+                            task_id, idle_elapsed,
+                        )
+                    else:
+                        # No stdout AND no filesystem activity → truly hung.
+                        logger.warning(
+                            "Task %s agent idle %.0fs — no stdout, no "
+                            "filesystem activity; killing hung process",
+                            task_id, idle_elapsed,
+                        )
+                        _kill_proc(proc)
+                        raise _IdleTimeoutError(idle_elapsed)
+                # Check interval capped at 30s so the loop stays responsive
+                # even when both timeouts are far away.
+                check_interval = min(
+                    idle_timeout - idle_elapsed + 0.5,   # until idle fires
+                    timeout - elapsed_abs + 0.5,          # until absolute fires
+                    30.0,
+                )
+                # ── Read one line with a bounded wait ─────────────────────────
                 try:
-                    line_bytes = await proc.stdout.readline()
+                    line_bytes = await asyncio.wait_for(
+                        proc.stdout.readline(), timeout=check_interval
+                    )
+                except asyncio.TimeoutError:
+                    # readline timed out within check_interval — no new output
+                    # yet.  Loop back to re-evaluate idle/absolute conditions.
+                    continue
                 except (ValueError, asyncio.LimitOverrunError, Exception) as exc:
-                    # Line exceeded stream buffer limit (LimitOverrunError
-                    # converted to ValueError by readline(), but catch broadly
-                    # to handle edge cases in different Python versions).
-                    # Fall back to reading remaining data in bulk.
+                    # Line exceeded stream buffer limit — drain remaining bulk.
                     logger.warning(
                         "Stream read error for task %s (%s: %s), draining remaining output",
                         task_id, type(exc).__name__, exc,
@@ -2287,8 +2553,12 @@ class ProcessManager:
                                     except Exception:
                                         pass
                     break
                 if not line_bytes:
-                    break
+                    break  # EOF — process exited normally
+                # ── New output received — reset idle timer ────────────────────
+                _last_activity_at[0] = time.monotonic()
                 line = line_bytes.decode(errors="replace").rstrip("\n")
                 stdout_lines.append(line)
                 if on_chunk:
@@ -2298,35 +2568,17 @@ class ProcessManager:
                         pass  # never let on_chunk crash the agent run
         try:
+            # Outer wait_for uses timeout + idle_timeout + 60s as a generous
+            # safety net.  In practice _read_stdout handles both idle and
+            # absolute killing before this fires.
             await asyncio.wait_for(
                 asyncio.gather(_read_stdout(), _read_stderr()),
-                timeout=timeout,
+                timeout=timeout + idle_timeout + 60,
             )
-        except asyncio.TimeoutError:
-            # Kill the entire process group so that child processes (npm, yarn,
-            # ssh, git, etc.) spawned by the agent are also terminated.  A plain
-            # proc.kill() only kills the direct subprocess; any grandchildren
-            # become orphaned, keep pipes open, and can exhaust system resources.
-            try:
-                if sys.platform != "win32":
-                    import signal as _signal
-                    try:
-                        os.killpg(os.getpgid(proc.pid), _signal.SIGKILL)
-                    except (ProcessLookupError, PermissionError, OSError):
-                        pass
-                else:
-                    import subprocess as _subprocess
-                    _subprocess.run(
-                        ["taskkill", "/F", "/T", "/PID", str(proc.pid)],
-                        capture_output=True,
-                    )
-            except Exception:
-                pass
-            finally:
-                try:
-                    proc.kill()
-                except Exception:
-                    pass
+        except asyncio.TimeoutError as _exc:
+            # This branch fires if the outer safety net triggers (extremely
+            # rare) or if a timeout raised inside _read_stdout propagates here:
+            # _IdleTimeoutError (idle) or asyncio.TimeoutError (absolute limit).
+            _kill_proc(proc)
             # Drain any remaining output after kill
             try:
                 remaining, _ = await asyncio.wait_for(proc.communicate(), timeout=5)
@@ -2336,7 +2588,7 @@ class ProcessManager:
                             stdout_lines.append(line)
             except Exception:
                 pass
-            raise  # re-raise so callers can set result.error
+            raise  # re-raise (_IdleTimeoutError preserves subclass type)
         await proc.wait()
         stdout = "\n".join(stdout_lines)
@@ -2397,7 +2649,8 @@ class ProcessManager:
             )
             self.active_processes[task_id] = proc
             stdout, stderr, returncode = await self._stream_process(
-                proc, prompt.encode(), timeout, task_id, on_chunk
+                proc, prompt.encode(), timeout, task_id, on_chunk,
+                workspace_path=cwd,
             )
             # Parse Claude JSON output for metrics
@@ -2438,13 +2691,13 @@ class ProcessManager:
                     error=f"Claude exited with code {returncode}: {stderr[-500:]}",
                     metrics=metrics,
                 )
-        except asyncio.TimeoutError:
-            if task_id in self.active_processes:
-                self.active_processes[task_id].kill()
-            return TaskResult(
-                status="failed", exit_code=-1, stdout="", stderr="",
-                error=f"Timed out after {timeout}s",
-            )
+        except asyncio.TimeoutError as exc:
+            _kill_proc(self.active_processes.pop(task_id, None) or proc)
+            _err = (
+                f"Agent idle for {exc.idle_seconds:.0f}s without output — process terminated. "
+                "Task may require more context decomposition or a different agent."
+            ) if isinstance(exc, _IdleTimeoutError) else f"Timed out after {timeout}s (absolute limit)"
+            return TaskResult(status="failed", exit_code=-1, stdout="", stderr="", error=_err)
         except Exception as exc:
             logger.exception("Claude stream error for task %s", task_id)
             if task_id in self.active_processes:
@@ -2614,7 +2867,8 @@ class ProcessManager:
             )
             self.active_processes[task_id] = proc
             stdout, stderr, returncode = await self._stream_process(
-                proc, None, timeout, task_id, on_chunk
+                proc, None, timeout, task_id, on_chunk,
+                workspace_path=cwd,
             )
             # Parse copilot JSONL output for metrics
@@ -2642,13 +2896,13 @@ class ProcessManager:
                     error=f"Copilot exited with code {effective_rc}: {stderr[-500:]}",
                     metrics=metrics,
                 )
-        except asyncio.TimeoutError:
-            if task_id in self.active_processes:
-                self.active_processes[task_id].kill()
-            return TaskResult(
-                status="failed", exit_code=-1, stdout="", stderr="",
-                error=f"Timed out after {timeout}s",
-            )
+        except asyncio.TimeoutError as exc:
+            _kill_proc(self.active_processes.pop(task_id, None) or proc)
+            _err = (
+                f"Agent idle for {exc.idle_seconds:.0f}s without output — process terminated. "
+                "Task may require more context decomposition or a different agent."
+            ) if isinstance(exc, _IdleTimeoutError) else f"Timed out after {timeout}s (absolute limit)"
+            return TaskResult(status="failed", exit_code=-1, stdout="", stderr="", error=_err)
         except Exception as exc:
             logger.exception("Copilot stream error for task %s", task_id)
             if task_id in self.active_processes:
@@ -2689,7 +2943,8 @@ class ProcessManager:
             self.active_processes[task_id] = proc
             stdin_bytes = stdin_input.encode() if stdin_input else None
             stdout, stderr, returncode = await self._stream_process(
-                proc, stdin_bytes, timeout, task_id, on_chunk
+                proc, stdin_bytes, timeout, task_id, on_chunk,
+                workspace_path=cwd,
             )
             status = "success" if returncode == 0 else "failed"
             return TaskResult(
@@ -2699,33 +2954,13 @@ class ProcessManager:
                 stderr=stderr[-10000:],
                 error="" if status == "success" else f"Exited with code {returncode}",
             )
-        except asyncio.TimeoutError:
-            proc = self.active_processes.pop(task_id, None)
-            if proc:
-                try:
-                    if sys.platform != "win32":
-                        import signal as _signal
-                        try:
-                            os.killpg(os.getpgid(proc.pid), _signal.SIGKILL)
-                        except (ProcessLookupError, PermissionError, OSError):
-                            pass
-                    else:
-                        import subprocess as _subprocess
-                        _subprocess.run(
-                            ["taskkill", "/F", "/T", "/PID", str(proc.pid)],
-                            capture_output=True,
-                        )
-                except Exception:
-                    pass
-                finally:
-                    try:
-                        proc.kill()
-                    except Exception:
-                        pass
-            return TaskResult(
-                status="failed", exit_code=-1, stdout="", stderr="",
-                error=f"Timed out after {timeout}s",
-            )
+        except asyncio.TimeoutError as exc:
+            _kill_proc(self.active_processes.pop(task_id, None) or proc)
+            _err = (
+                f"Agent idle for {exc.idle_seconds:.0f}s without output — process terminated. "
+                "Task may require more context decomposition or a different agent."
+            ) if isinstance(exc, _IdleTimeoutError) else f"Timed out after {timeout}s (absolute limit)"
+            return TaskResult(status="failed", exit_code=-1, stdout="", stderr="", error=_err)
         except Exception as exc:
             logger.exception("CLI stream error for task %s", task_id)
             if task_id in self.active_processes:
@@ -4167,6 +4402,65 @@ class RuntimeDaemon:
             )
             logger.info("Workspace ready: %s", workspace_path)
+            # 2.5 Wipe the analysis output directory on fresh analysis so the new
+            # agent run starts from a completely clean slate.  This covers:
+            #   • Type change: removes old-type files (e.g. PRD.md/SDD.md) so they
+            #     don't coexist with the new type's files (e.g. diagnosis.md).
+            #   • Same-type fresh re-analysis: removes extra files the agent may
+            #     have written that fall outside the expected type profile.
+            # Using a whole-directory wipe is more reliable than the old
+            # cleanup_stale_docs approach (which only deleted known-profile files).
+            if task.node_type == "analysis" and (
+                task.input_data.get("wipe_analysis_dir")
+                # Backwards-compat: older server versions send cleanup_stale_docs
+                or task.input_data.get("cleanup_stale_docs")
+                # Also wipe if analysis_mode is explicitly "fresh" (belt+suspenders)
+                or task.input_data.get("analysis_mode") == "fresh"
+            ):
+                output_dir_raw = (
+                    task.input_data.get("analysis_output_dir")
+                    or task.input_data.get("output_dir")
+                    or ""
+                )
+                output_dir_norm = str(output_dir_raw).replace("\\", "/").lstrip("./").rstrip("/")
+                if output_dir_norm:
+                    dir_to_wipe = workspace_path / output_dir_norm
+                    if dir_to_wipe.is_dir():
+                        existing_files = [f for f in dir_to_wipe.iterdir() if f.is_file()]
+                        if existing_files:
+                            try:
+                                # Stage all deletions with git rm
+                                await self._git(
+                                    "rm", "-r", "--cached", "--ignore-unmatch",
+                                    output_dir_norm,
+                                    cwd=workspace_path,
+                                )
+                                # Remove physical files
+                                shutil.rmtree(str(dir_to_wipe), ignore_errors=True)
+                                # Commit the wipe so the branch diff is clean
+                                await self._git(
+                                    "-c", "user.name=Forgexa Agent",
+                                    "-c", "user.email=agent@forgexa.net",
+                                    "commit", "-m",
+                                    f"cleanup: wipe analysis docs in {output_dir_norm} before fresh re-analysis",
+                                    cwd=workspace_path,
+                                )
+                                logger.info(
+                                    "Wiped %d analysis doc(s) from %s for task %s (fresh analysis)",
+                                    len(existing_files), output_dir_norm, task.task_id,
+                                )
+                            except Exception:
+                                logger.warning(
+                                    "Could not wipe analysis dir %s for task %s "
+                                    "(proceeding anyway — agent will overwrite)",
+                                    output_dir_norm, task.task_id, exc_info=True,
+                                )
+                        else:
+                            logger.debug(
+                                "Analysis dir %s is already empty for task %s",
+                                output_dir_norm, task.task_id,
+                            )
             # 3. Run agent with real-time output streaming + periodic progress heartbeat
             await reporter.report_progress(task.task_id, 10, "running_agent")
@@ -4364,26 +4658,49 @@ class RuntimeDaemon:
                         f"(node_type={task.node_type}, agent={agent.agent_id})"
                     )
-            # 4.1 Recovery: agent exited non-zero but already committed code
-            # (e.g. OpenCode EBADF crash on exit after successful work)
-            if result.status == "failed" and result.exit_code not in (None, -1):
-                committed_git = await self.process_manager._collect_git_info_vs_parent(workspace_path)
-                has_committed_changes = bool(committed_git.get("files_changed"))
-                has_no_uncommitted = not pre_commit_git.get("files_changed")
-                has_tokens = (
-                    int(result.metrics.get("token_input", 0) or 0)
-                    + int(result.metrics.get("token_output", 0) or 0)
-                ) > 0
-                has_meaningful_output = self.process_manager.has_meaningful_agent_output(result)
-                if has_committed_changes and has_no_uncommitted and (has_tokens or has_meaningful_output):
-                    logger.warning(
-                        "Task %s agent exited with code %s but has committed changes — "
-                        "recovering as success (agent likely crashed during cleanup)",
-                        task.task_id, result.exit_code,
+            # 4.1 Recovery: agent exited non-zero but already committed code.
+            # Covers two scenarios:
+            #   A. Process crash (e.g. OpenCode EBADF on exit after successful work):
+            #      exit_code is a real exit status (anything other than None or -1).
+            #   B. Timeout with committed work: agent finished its task and committed
+            #      before the idle/absolute timeout fired.  exit_code=-1 but the commits
+            #      are real — don't discard them.
+            if result.status == "failed":
+                _error_lower = (result.error or "").lower()
+                is_timeout_failure = (
+                    "idle for" in _error_lower
+                    or "timed out" in _error_lower
+                    or "absolute limit" in _error_lower
+                )
+                can_attempt_recovery = (
+                    is_timeout_failure               # timeout: also allow exit_code=-1
+                    or result.exit_code not in (None, -1)  # crash: original guard
+                )
+                if can_attempt_recovery:
+                    committed_git = await self.process_manager._collect_git_info_vs_parent(workspace_path)
+                    has_committed_changes = bool(committed_git.get("files_changed"))
+                    has_no_uncommitted = not pre_commit_git.get("files_changed")
+                    has_tokens = (
+                        int(result.metrics.get("token_input", 0) or 0)
+                        + int(result.metrics.get("token_output", 0) or 0)
+                    ) > 0
+                    has_meaningful_output = self.process_manager.has_meaningful_agent_output(result)
+                    # Timeout recovery requires stronger evidence: committed + tokens AND meaningful output.
+                    # Crash recovery (original): committed + (tokens OR meaningful output).
+                    sufficient_evidence = (
+                        (has_committed_changes and has_no_uncommitted and has_tokens and has_meaningful_output)
+                        if is_timeout_failure
+                        else (has_committed_changes and has_no_uncommitted and (has_tokens or has_meaningful_output))
                     )
-                    result.status = "success"
-                    result.error = ""
-                    result.metrics["recovered_from_exit_code"] = result.exit_code
+                    if sufficient_evidence:
+                        _reason = "timed out but already committed changes" if is_timeout_failure else f"exited with code {result.exit_code}"
+                        logger.warning(
+                            "Task %s agent %s — recovering as success",
+                            task.task_id, _reason,
+                        )
+                        result.status = "success"
+                        result.error = ""
+                        result.metrics["recovered_from_exit_code"] = result.exit_code
             # 4.5 Layer 2: Validation gate — check outputs before committing
             if result.status == "success":
@@ -5784,6 +6101,24 @@ class RuntimeDaemon:
             branch = (await git("rev-parse", "--abbrev-ref", "HEAD", cwd=workspace_path)).strip()
             if branch and branch != "HEAD":
+                # Always refresh the remote tracking ref before any divergence
+                # checks.  Without this, origin/{branch} may be stale if another
+                # runtime pushed commits while our agent was executing, causing
+                # the remote_ahead check to return empty and the naive push to
+                # fail with "non-fast-forward".  This is the single most reliable
+                # guard for cross-runtime / cross-machine collaboration scenarios.
+                try:
+                    await git(
+                        "fetch", "origin", branch,
+                        cwd=workspace_path, project_key=project_key,
+                    )
+                except RuntimeError as _pre_push_fetch_exc:
+                    logger.warning(
+                        "Pre-push fetch of branch '%s' failed: %s — "
+                        "divergence check will use possibly stale tracking ref",
+                        branch, _pre_push_fetch_exc,
+                    )
                 # Check if there are unpushed commits
                 try:
                     unpushed = (await git(

{forgexa_cli-1.8.6 → forgexa_cli-1.8.8}/forgexa_cli.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: forgexa-cli
-Version: 1.8.6
+Version: 1.8.8
 Summary: Forgexa CLI — command-line client and AI agent runtime for the Forgexa platform
 Author-email: Jason Sun <dev.winds@gmail.com>
 License: MIT

{forgexa_cli-1.8.6 → forgexa_cli-1.8.8}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "forgexa-cli"
-version = "1.8.6"
+version = "1.8.8"
 description = "Forgexa CLI — command-line client and AI agent runtime for the Forgexa platform"
 requires-python = ">=3.9"
 license = { text = "MIT" }