forgexa-cli 1.8.7__tar.gz → 1.8.10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: forgexa-cli
3
- Version: 1.8.7
3
+ Version: 1.8.10
4
4
  Summary: Forgexa CLI — command-line client and AI agent runtime for the Forgexa platform
5
5
  Author-email: Jason Sun <dev.winds@gmail.com>
6
6
  License: MIT
@@ -1,2 +1,2 @@
1
1
  """forgexa-cli — Forgexa command-line client."""
2
- __version__ = "1.8.7"
2
+ __version__ = "1.8.10"
@@ -352,7 +352,11 @@ except (ImportError, ModuleNotFoundError):
352
352
 
353
353
  @property
354
354
  def AGENT_TIMEOUT(self) -> int:
355
- return int(os.environ.get("AGENT_TIMEOUT", "3600"))
355
+ return int(os.environ.get("AGENT_TIMEOUT", "14400")) # 4-hour absolute ceiling
356
+
357
+ @property
358
+ def AGENT_IDLE_TIMEOUT(self) -> int:
359
+ return int(os.environ.get("AGENT_IDLE_TIMEOUT", "600")) # 10-min idle (stdout+fs) = hung agent
356
360
 
357
361
  @property
358
362
  def GIT_CLONE_TIMEOUT(self) -> int:
@@ -392,7 +396,7 @@ except (ImportError, ModuleNotFoundError):
392
396
  # DAEMON_VERSION is the protocol/logic version of the daemon code.
393
397
  # Kept in sync with pyproject.toml version via bump-version.sh.
394
398
  # CLIENT_TYPE identifies which packaging/distribution this daemon runs in.
395
- DAEMON_VERSION = "1.8.7"
399
+ DAEMON_VERSION = "1.8.10"
396
400
 
397
401
 
398
402
  def _detect_client_type() -> str:
@@ -633,6 +637,11 @@ class TaskResult:
633
637
  lines_added: int = 0
634
638
  lines_removed: int = 0
635
639
  error: str = ""
640
+ # failure_code is forwarded to the server to drive retry policy.
641
+ # Key values:
642
+ # "all_agents_rate_limited" — daemon tried every installed agent, all
643
+ # hit rate/quota limits. Server must NOT retry on the same runtime.
644
+ failure_code: str = ""
636
645
  artifacts: list[dict] = field(default_factory=list)
637
646
  observations: list[dict] = field(default_factory=list)
638
647
  metrics: dict = field(default_factory=dict)
@@ -1753,15 +1762,34 @@ class WorkspaceManager:
1753
1762
  key_path_safe = key_path.replace("\\", "/") if sys.platform == "win32" else key_path
1754
1763
  # RC1 (Windows): os.chmod(S_IRUSR) does not set proper NTFS ACLs.
1755
1764
  # Windows OpenSSH rejects keys that aren't exclusively owner-readable
1756
- # ("UNPROTECTED PRIVATE KEY FILE"). StrictModes=no bypasses this.
1765
+ # ("UNPROTECTED PRIVATE KEY FILE"). Fix with icacls to set the ACLs
1766
+ # correctly. NOTE: StrictModes is an sshd_config option (server side);
1767
+ # passing -o StrictModes=no to the SSH client is invalid and causes:
1768
+ # "command-line: line 0: Bad configuration option: strictmodes"
1757
1769
  # RC2 (Windows): /dev/null doesn't exist on Windows native OpenSSH
1758
1770
  # (C:\Windows\System32\OpenSSH\ssh.exe). Use NUL instead.
1759
1771
  if sys.platform == "win32":
1760
1772
  _known_hosts_null = "NUL"
1761
- _strict_modes_opt = " -o StrictModes=no"
1773
+ try:
1774
+ import subprocess as _subp
1775
+ _username = (
1776
+ os.environ.get("USERNAME")
1777
+ or os.environ.get("USER")
1778
+ or ""
1779
+ )
1780
+ if _username:
1781
+ _subp.run(
1782
+ [
1783
+ "icacls", key_path,
1784
+ "/inheritance:r",
1785
+ "/grant:r", f"{_username}:(R)",
1786
+ ],
1787
+ capture_output=True, check=False, timeout=10,
1788
+ )
1789
+ except Exception:
1790
+ pass
1762
1791
  else:
1763
1792
  _known_hosts_null = "/dev/null"
1764
- _strict_modes_opt = ""
1765
1793
  env = {
1766
1794
  **os.environ,
1767
1795
  "GIT_SSH_COMMAND": (
@@ -1777,7 +1805,6 @@ class WorkspaceManager:
1777
1805
  f" -o ConnectTimeout=30"
1778
1806
  f" -o ServerAliveInterval=30"
1779
1807
  f" -o ServerAliveCountMax=3"
1780
- f"{_strict_modes_opt}"
1781
1808
  ),
1782
1809
  }
1783
1810
  except Exception:
@@ -1797,6 +1824,13 @@ class WorkspaceManager:
1797
1824
  if git_prefix_args:
1798
1825
  env = {**(env or os.environ), "GIT_TERMINAL_PROMPT": "0"}
1799
1826
 
1827
+ # Always enable long-path support. On Windows this removes git's own
1828
+ # 260-char path limit (Windows also needs HKLM LongPathsEnabled=1 or
1829
+ # the Win10 1607+ Group Policy, but at a minimum we ensure git won't
1830
+ # reject long paths on platforms where it is already enabled).
1831
+ # On Linux/macOS this is a no-op.
1832
+ longpath_args = ["-c", "core.longpaths=true"]
1833
+
1800
1834
  # start_new_session=True puts git in its own process group.
1801
1835
  # On timeout we send SIGKILL to the entire group, which includes
1802
1836
  # any ssh/gpg/credential-helper children that git forked — preventing
@@ -1804,7 +1838,7 @@ class WorkspaceManager:
1804
1838
  # Windows note: start_new_session creates a new console process group;
1805
1839
  # we use taskkill /T there instead of killpg.
1806
1840
  proc = await asyncio.create_subprocess_exec(
1807
- "git", *git_prefix_args, *args,
1841
+ "git", *longpath_args, *git_prefix_args, *args,
1808
1842
  stdout=asyncio.subprocess.PIPE,
1809
1843
  stderr=asyncio.subprocess.PIPE,
1810
1844
  cwd=str(cwd) if cwd else None,
@@ -1865,6 +1899,92 @@ class WorkspaceManager:
1865
1899
  # ── Process Manager ──
1866
1900
 
1867
1901
 
1902
def _kill_proc(proc: asyncio.subprocess.Process) -> None:
    """Kill a subprocess and its entire process group (best effort).

    A plain ``proc.kill()`` only terminates the direct child; grandchildren
    (npm, yarn, ssh, git, etc.) spawned by the agent stay alive, keep pipes
    open, and exhaust system resources.  On POSIX ``os.killpg`` sends SIGKILL
    to the child's whole process group; on Windows ``taskkill /F /T`` kills
    the process tree.

    The function never raises.  It is safe to call more than once for the
    same process: once ``proc.returncode`` is set the process has been
    reaped, so nothing is signalled.  Looking up the group of a reaped PID
    either fails or, if the PID has been recycled, would target an unrelated
    process group.

    Args:
        proc: The subprocess handle.  ``None`` is tolerated and ignored.
    """
    # Nothing to do for a missing handle or an already-reaped process.
    if proc is None or getattr(proc, "returncode", None) is not None:
        return
    try:
        if sys.platform != "win32":
            import signal as _signal
            try:
                os.killpg(os.getpgid(proc.pid), _signal.SIGKILL)
            except OSError:
                # Covers ProcessLookupError / PermissionError: the group is
                # already gone or is not ours to signal.
                pass
        else:
            import subprocess as _subprocess
            # Bounded wait: this runs synchronously, so a stuck taskkill
            # must not be able to block the caller forever.
            _subprocess.run(
                ["taskkill", "/F", "/T", "/PID", str(proc.pid)],
                capture_output=True,
                check=False,
                timeout=10,
            )
    except Exception:
        pass
    finally:
        # Fallback for the direct child in case the group/tree kill failed.
        try:
            proc.kill()
        except Exception:
            pass
1930
+
1931
+
1932
class _IdleTimeoutError(asyncio.TimeoutError):
    """Timeout raised when an agent process is judged to be hung.

    The stdout reader raises it once the agent has produced no stdout for the
    idle limit and the workspace filesystem shows no recent modification
    either.  Because it derives from ``asyncio.TimeoutError``, existing
    ``except asyncio.TimeoutError`` handlers still catch it; callers that
    need to tell it apart from the absolute wall-clock timeout can test
    ``isinstance(exc, _IdleTimeoutError)`` and read ``exc.idle_seconds``.

    Attributes:
        idle_seconds: How long the agent had been idle when it was killed.
    """

    def __init__(self, idle_seconds: float) -> None:
        # Message format is "idle:<whole seconds>s", e.g. "idle:600s".
        message = "idle:" + format(idle_seconds, ".0f") + "s"
        super().__init__(message)
        self.idle_seconds = idle_seconds
1943
+
1944
+
1945
def _workspace_has_recent_activity(
    workspace_path: "Path", since_monotonic: float, max_depth: int = 4
) -> bool:
    """Return True if anything under workspace_path was modified after since_monotonic.

    ``since_monotonic`` is a ``time.monotonic()`` reading; it is converted to
    a wall-clock timestamp so it can be compared with ``st_mtime``.  The tree
    is walked with ``os.scandir`` and the walk stops at the first entry (file
    or directory) whose mtime is newer than that timestamp.

    Limits of the check:

    * Entries named ``.git`` are skipped, so work that only touches git
      metadata is not seen.
    * Directories deeper than ``max_depth`` levels below ``workspace_path``
      are not entered; a negative ``max_depth`` disables the scan entirely.
    * Symlinks are neither followed nor descended into.

    This is the secondary signal used alongside stdout silence to tell
    "agent is quiet but still writing files" apart from "agent is hung".

    Args:
        workspace_path: Root directory to inspect (path-like or str).
        since_monotonic: Reference instant from ``time.monotonic()``.
        max_depth: Maximum directory depth to descend into.

    Returns:
        True on the first recently-modified entry found, otherwise False.
        Any error (missing directory, permission problem, ...) yields False;
        this function never raises.
    """
    if max_depth < 0:
        return False

    try:
        since_wall = time.time() - (time.monotonic() - since_monotonic)
        # Explicit stack instead of recursion: (directory, depth) pairs.
        pending = [(str(workspace_path), 0)]
        while pending:
            directory, depth = pending.pop()
            try:
                # The context manager closes the scandir handle even when we
                # return early on the first hit.
                with os.scandir(directory) as entries:
                    for entry in entries:
                        if entry.name == ".git":
                            continue  # skip version-control metadata
                        try:
                            if entry.stat(follow_symlinks=False).st_mtime > since_wall:
                                return True
                            if depth < max_depth and entry.is_dir(follow_symlinks=False):
                                pending.append((entry.path, depth + 1))
                        except OSError:
                            # Entry vanished or is unreadable; ignore it.
                            continue
            except OSError:
                # Directory vanished or is unreadable; keep scanning others.
                continue
    except Exception:
        return False  # never let a filesystem check crash the agent run
    return False
1986
+
1987
+
1868
1988
  class ProcessManager:
1869
1989
  """Manages Agent CLI subprocess lifecycle."""
1870
1990
 
@@ -2312,6 +2432,7 @@ class ProcessManager:
2312
2432
  timeout: int,
2313
2433
  task_id: str,
2314
2434
  on_chunk: Any,
2435
+ workspace_path: "Path | None" = None,
2315
2436
  ) -> tuple[str, str, int]:
2316
2437
  """Stream stdout line-by-line from a subprocess, flushing to on_chunk.
2317
2438
 
@@ -2323,7 +2444,22 @@ class ProcessManager:
2323
2444
  deadlock when the process fills the stderr buffer.
2324
2445
  - on_chunk(lines) is called with each decoded line so the caller can
2325
2446
  forward to the progress reporter without waiting for completion.
2447
+ - Idle timeout: if the agent produces no stdout for AGENT_IDLE_TIMEOUT
2448
+ seconds the code checks for filesystem activity in workspace_path
2449
+ before deciding to kill. If files were recently modified the agent
2450
+ is doing silent work (npm install, compilation, test runs, etc.) and
2451
+ the idle timer is reset. Only when BOTH stdout AND the filesystem
2452
+ are idle does the process get killed. This eliminates false-positive
2453
+ kills at the idle boundary.
2454
+ - Absolute timeout (``timeout`` param): hard ceiling for zombie-process
2455
+ prevention. Always kills at this boundary (no extension), but logs
2456
+ filesystem activity status for post-mortem observability.
2326
2457
  """
2458
+ idle_timeout: int = getattr(settings, "AGENT_IDLE_TIMEOUT", 600)
2459
+ _start_time = time.monotonic()
2460
+ # Mutable state shared between nested coroutines (list avoids nonlocal)
2461
+ _last_activity_at: list[float] = [time.monotonic()]
2462
+
2327
2463
  # Write prompt and close stdin so the agent knows input is done.
2328
2464
  if stdin_input and proc.stdin:
2329
2465
  try:
@@ -2345,13 +2481,76 @@ class ProcessManager:
2345
2481
  if not proc.stdout:
2346
2482
  return
2347
2483
  while True:
2484
+ # ── Timeout checks ────────────────────────────────────────────
2485
+ now = time.monotonic()
2486
+ elapsed_abs = now - _start_time
2487
+ if elapsed_abs >= timeout:
2488
+ # Absolute ceiling reached — hard zombie-process prevention.
2489
+ # Log filesystem status for observability but always kill;
2490
+ # never extend the absolute boundary.
2491
+ _fs_active = workspace_path and _workspace_has_recent_activity(
2492
+ workspace_path, _last_activity_at[0]
2493
+ )
2494
+ logger.warning(
2495
+ "Task %s absolute timeout %.0fs reached — killing "
2496
+ "(workspace filesystem %s)",
2497
+ task_id, elapsed_abs,
2498
+ "was active" if _fs_active else "was idle",
2499
+ )
2500
+ _kill_proc(proc)
2501
+ raise asyncio.TimeoutError(
2502
+ f"Timed out after {timeout}s (absolute limit)"
2503
+ )
2504
+
2505
+ idle_elapsed = now - _last_activity_at[0]
2506
+ if idle_elapsed >= idle_timeout:
2507
+ # Before killing, check if the agent is doing silent
2508
+ # filesystem work (npm install, compilation, test runs, git
2509
+ # operations, etc.). These produce no stdout but DO modify
2510
+ # files — killing at the idle boundary would be a false
2511
+ # positive. Only kill when BOTH signals agree: no stdout
2512
+ # AND no filesystem activity.
2513
+ if workspace_path and _workspace_has_recent_activity(
2514
+ workspace_path, _last_activity_at[0]
2515
+ ):
2516
+ # Files modified since last stdout → agent is working
2517
+ # silently. Reset idle timer and continue.
2518
+ _last_activity_at[0] = time.monotonic()
2519
+ logger.info(
2520
+ "Task %s: stdout idle %.0fs but workspace files "
2521
+ "modified — resetting idle timer (agent working "
2522
+ "silently)",
2523
+ task_id, idle_elapsed,
2524
+ )
2525
+ else:
2526
+ # No stdout AND no filesystem activity → truly hung.
2527
+ logger.warning(
2528
+ "Task %s agent idle %.0fs — no stdout, no "
2529
+ "filesystem activity; killing hung process",
2530
+ task_id, idle_elapsed,
2531
+ )
2532
+ _kill_proc(proc)
2533
+ raise _IdleTimeoutError(idle_elapsed)
2534
+
2535
+ # Check interval capped at 30s so the loop stays responsive
2536
+ # even when both timeouts are far away.
2537
+ check_interval = min(
2538
+ idle_timeout - idle_elapsed + 0.5, # until idle fires
2539
+ timeout - elapsed_abs + 0.5, # until absolute fires
2540
+ 30.0,
2541
+ )
2542
+
2543
+ # ── Read one line with a bounded wait ─────────────────────────
2348
2544
  try:
2349
- line_bytes = await proc.stdout.readline()
2545
+ line_bytes = await asyncio.wait_for(
2546
+ proc.stdout.readline(), timeout=check_interval
2547
+ )
2548
+ except asyncio.TimeoutError:
2549
+ # readline timed out within check_interval — no new output
2550
+ # yet. Loop back to re-evaluate idle/absolute conditions.
2551
+ continue
2350
2552
  except (ValueError, asyncio.LimitOverrunError, Exception) as exc:
2351
- # Line exceeded stream buffer limit (LimitOverrunError
2352
- # converted to ValueError by readline(), but catch broadly
2353
- # to handle edge cases in different Python versions).
2354
- # Fall back to reading remaining data in bulk.
2553
+ # Line exceeded stream buffer limit — drain remaining bulk.
2355
2554
  logger.warning(
2356
2555
  "Stream read error for task %s (%s: %s), draining remaining output",
2357
2556
  task_id, type(exc).__name__, exc,
@@ -2370,8 +2569,12 @@ class ProcessManager:
2370
2569
  except Exception:
2371
2570
  pass
2372
2571
  break
2572
+
2373
2573
  if not line_bytes:
2374
- break
2574
+ break # EOF — process exited normally
2575
+
2576
+ # ── New output received — reset idle timer ────────────────────
2577
+ _last_activity_at[0] = time.monotonic()
2375
2578
  line = line_bytes.decode(errors="replace").rstrip("\n")
2376
2579
  stdout_lines.append(line)
2377
2580
  if on_chunk:
@@ -2381,35 +2584,17 @@ class ProcessManager:
2381
2584
  pass # never let on_chunk crash the agent run
2382
2585
 
2383
2586
  try:
2587
+ # Outer wait_for uses timeout + idle_timeout + 60s as a generous safety net.
2588
+ # In practice _read_stdout handles both idle and absolute killing
2589
+ # before this fires.
2384
2590
  await asyncio.wait_for(
2385
2591
  asyncio.gather(_read_stdout(), _read_stderr()),
2386
- timeout=timeout,
2592
+ timeout=timeout + idle_timeout + 60,
2387
2593
  )
2388
- except asyncio.TimeoutError:
2389
- # Kill the entire process group so that child processes (npm, yarn,
2390
- # ssh, git, etc.) spawned by the agent are also terminated. A plain
2391
- # proc.kill() only kills the direct subprocess; any grandchildren
2392
- # become orphaned, keep pipes open, and can exhaust system resources.
2393
- try:
2394
- if sys.platform != "win32":
2395
- import signal as _signal
2396
- try:
2397
- os.killpg(os.getpgid(proc.pid), _signal.SIGKILL)
2398
- except (ProcessLookupError, PermissionError, OSError):
2399
- pass
2400
- else:
2401
- import subprocess as _subprocess
2402
- _subprocess.run(
2403
- ["taskkill", "/F", "/T", "/PID", str(proc.pid)],
2404
- capture_output=True,
2405
- )
2406
- except Exception:
2407
- pass
2408
- finally:
2409
- try:
2410
- proc.kill()
2411
- except Exception:
2412
- pass
2594
+ except asyncio.TimeoutError as _exc:
2595
+ # This branch fires if the outer safety net triggers (extremely
2596
+ # rare) or if _IdleTimeoutError propagates from _read_stdout.
2597
+ _kill_proc(proc)
2413
2598
  # Drain any remaining output after kill
2414
2599
  try:
2415
2600
  remaining, _ = await asyncio.wait_for(proc.communicate(), timeout=5)
@@ -2419,7 +2604,7 @@ class ProcessManager:
2419
2604
  stdout_lines.append(line)
2420
2605
  except Exception:
2421
2606
  pass
2422
- raise # re-raise so callers can set result.error
2607
+ raise # re-raise (_IdleTimeoutError preserves subclass type)
2423
2608
 
2424
2609
  await proc.wait()
2425
2610
  stdout = "\n".join(stdout_lines)
@@ -2480,7 +2665,8 @@ class ProcessManager:
2480
2665
  )
2481
2666
  self.active_processes[task_id] = proc
2482
2667
  stdout, stderr, returncode = await self._stream_process(
2483
- proc, prompt.encode(), timeout, task_id, on_chunk
2668
+ proc, prompt.encode(), timeout, task_id, on_chunk,
2669
+ workspace_path=cwd,
2484
2670
  )
2485
2671
 
2486
2672
  # Parse Claude JSON output for metrics
@@ -2521,13 +2707,13 @@ class ProcessManager:
2521
2707
  error=f"Claude exited with code {returncode}: {stderr[-500:]}",
2522
2708
  metrics=metrics,
2523
2709
  )
2524
- except asyncio.TimeoutError:
2525
- if task_id in self.active_processes:
2526
- self.active_processes[task_id].kill()
2527
- return TaskResult(
2528
- status="failed", exit_code=-1, stdout="", stderr="",
2529
- error=f"Timed out after {timeout}s",
2530
- )
2710
+ except asyncio.TimeoutError as exc:
2711
+ _kill_proc(self.active_processes.pop(task_id, None) or proc)
2712
+ _err = (
2713
+ f"Agent idle for {exc.idle_seconds:.0f}s without output — process terminated. "
2714
+ "Task may require more context decomposition or a different agent."
2715
+ ) if isinstance(exc, _IdleTimeoutError) else f"Timed out after {timeout}s (absolute limit)"
2716
+ return TaskResult(status="failed", exit_code=-1, stdout="", stderr="", error=_err)
2531
2717
  except Exception as exc:
2532
2718
  logger.exception("Claude stream error for task %s", task_id)
2533
2719
  if task_id in self.active_processes:
@@ -2697,7 +2883,8 @@ class ProcessManager:
2697
2883
  )
2698
2884
  self.active_processes[task_id] = proc
2699
2885
  stdout, stderr, returncode = await self._stream_process(
2700
- proc, None, timeout, task_id, on_chunk
2886
+ proc, None, timeout, task_id, on_chunk,
2887
+ workspace_path=cwd,
2701
2888
  )
2702
2889
 
2703
2890
  # Parse copilot JSONL output for metrics
@@ -2725,13 +2912,13 @@ class ProcessManager:
2725
2912
  error=f"Copilot exited with code {effective_rc}: {stderr[-500:]}",
2726
2913
  metrics=metrics,
2727
2914
  )
2728
- except asyncio.TimeoutError:
2729
- if task_id in self.active_processes:
2730
- self.active_processes[task_id].kill()
2731
- return TaskResult(
2732
- status="failed", exit_code=-1, stdout="", stderr="",
2733
- error=f"Timed out after {timeout}s",
2734
- )
2915
+ except asyncio.TimeoutError as exc:
2916
+ _kill_proc(self.active_processes.pop(task_id, None) or proc)
2917
+ _err = (
2918
+ f"Agent idle for {exc.idle_seconds:.0f}s without output — process terminated. "
2919
+ "Task may require more context decomposition or a different agent."
2920
+ ) if isinstance(exc, _IdleTimeoutError) else f"Timed out after {timeout}s (absolute limit)"
2921
+ return TaskResult(status="failed", exit_code=-1, stdout="", stderr="", error=_err)
2735
2922
  except Exception as exc:
2736
2923
  logger.exception("Copilot stream error for task %s", task_id)
2737
2924
  if task_id in self.active_processes:
@@ -2772,7 +2959,8 @@ class ProcessManager:
2772
2959
  self.active_processes[task_id] = proc
2773
2960
  stdin_bytes = stdin_input.encode() if stdin_input else None
2774
2961
  stdout, stderr, returncode = await self._stream_process(
2775
- proc, stdin_bytes, timeout, task_id, on_chunk
2962
+ proc, stdin_bytes, timeout, task_id, on_chunk,
2963
+ workspace_path=cwd,
2776
2964
  )
2777
2965
  status = "success" if returncode == 0 else "failed"
2778
2966
  return TaskResult(
@@ -2782,33 +2970,13 @@ class ProcessManager:
2782
2970
  stderr=stderr[-10000:],
2783
2971
  error="" if status == "success" else f"Exited with code {returncode}",
2784
2972
  )
2785
- except asyncio.TimeoutError:
2786
- proc = self.active_processes.pop(task_id, None)
2787
- if proc:
2788
- try:
2789
- if sys.platform != "win32":
2790
- import signal as _signal
2791
- try:
2792
- os.killpg(os.getpgid(proc.pid), _signal.SIGKILL)
2793
- except (ProcessLookupError, PermissionError, OSError):
2794
- pass
2795
- else:
2796
- import subprocess as _subprocess
2797
- _subprocess.run(
2798
- ["taskkill", "/F", "/T", "/PID", str(proc.pid)],
2799
- capture_output=True,
2800
- )
2801
- except Exception:
2802
- pass
2803
- finally:
2804
- try:
2805
- proc.kill()
2806
- except Exception:
2807
- pass
2808
- return TaskResult(
2809
- status="failed", exit_code=-1, stdout="", stderr="",
2810
- error=f"Timed out after {timeout}s",
2811
- )
2973
+ except asyncio.TimeoutError as exc:
2974
+ _kill_proc(self.active_processes.pop(task_id, None) or proc)
2975
+ _err = (
2976
+ f"Agent idle for {exc.idle_seconds:.0f}s without output — process terminated. "
2977
+ "Task may require more context decomposition or a different agent."
2978
+ ) if isinstance(exc, _IdleTimeoutError) else f"Timed out after {timeout}s (absolute limit)"
2979
+ return TaskResult(status="failed", exit_code=-1, stdout="", stderr="", error=_err)
2812
2980
  except Exception as exc:
2813
2981
  logger.exception("CLI stream error for task %s", task_id)
2814
2982
  if task_id in self.active_processes:
@@ -3351,6 +3519,7 @@ class ProgressReporter:
3351
3519
  "stdout_tail": result.stdout[-20000:] if result.stdout else "",
3352
3520
  "stderr_tail": result.stderr[-5000:] if result.stderr else "",
3353
3521
  "error": result.error,
3522
+ "failure_code": result.failure_code,
3354
3523
  "files_changed": result.files_changed,
3355
3524
  "lines_added": result.lines_added,
3356
3525
  "lines_removed": result.lines_removed,
@@ -4250,6 +4419,110 @@ class RuntimeDaemon:
4250
4419
  )
4251
4420
  logger.info("Workspace ready: %s", workspace_path)
4252
4421
 
4422
+ # 2.1 Workspace health check: detect broken checkout (Windows filename-
4423
+ # too-long or other git checkout failure that leaves the working tree
4424
+ # empty while the git index still tracks all source files).
4425
+ # If this is not caught the agent will run `git add -A` and commit a
4426
+ # catastrophic mass-deletion (e.g. SI-434: 47,566 files deleted).
4427
+ try:
4428
+ _index_count_out = await self._git(
4429
+ "ls-files", "--cached", "--", ".", cwd=workspace_path,
4430
+ timeout=30,
4431
+ )
4432
+ _index_count = len([l for l in _index_count_out.splitlines() if l.strip()])
4433
+ if _index_count > 500:
4434
+ # Count physical files (exclude .git/)
4435
+ _phys_count = sum(1 for _ in workspace_path.rglob("*")
4436
+ if _.is_file() and ".git" not in _.parts)
4437
+ _ratio = _phys_count / _index_count
4438
+ if _ratio < 0.20:
4439
+ # Less than 20 % of tracked files exist on disk — almost
4440
+ # certainly a failed git checkout (e.g. Windows path-length
4441
+ # limit). Abort rather than letting the agent commit a
4442
+ # mass-deletion.
4443
+ _longpath_hint = (
4444
+ " Enable Windows long-path support: run "
4445
+ "`git config --global core.longpaths true` and enable "
4446
+ "LongPathsEnabled in Windows Group Policy / Registry "
4447
+ "(HKLM\\SYSTEM\\CurrentControlSet\\Control\\FileSystem\\LongPathsEnabled=1)."
4448
+ if sys.platform == "win32" else ""
4449
+ )
4450
+ raise RuntimeError(
4451
+ f"Workspace health check failed: only {_phys_count}/{_index_count} "
4452
+ f"tracked files exist on disk ({_ratio:.0%}). "
4453
+ f"The git checkout likely failed due to filename-length limitations."
4454
+ f"{_longpath_hint}"
4455
+ )
4456
+ elif _ratio < 0.80:
4457
+ logger.warning(
4458
+ "Workspace health check warning: only %d/%d tracked files "
4459
+ "exist on disk (%.0f%%) for task %s — checkout may be incomplete.",
4460
+ _phys_count, _index_count, _ratio * 100, task.task_id,
4461
+ )
4462
+ except RuntimeError:
4463
+ raise
4464
+ except Exception as _health_exc:
4465
+ logger.warning("Workspace health check error (non-fatal): %s", _health_exc)
4466
+
4467
+
4468
+ # Fresh-analysis wipe: clear the analysis output directory first, so that the agent run starts from a completely clean slate. This covers:
4469
+ # • Type change: removes old-type files (e.g. PRD.md/SDD.md) so they
4470
+ # don't coexist with the new type's files (e.g. diagnosis.md).
4471
+ # • Same-type fresh re-analysis: removes extra files the agent may
4472
+ # have written that fall outside the expected type profile.
4473
+ # Using a whole-directory wipe is more reliable than the old
4474
+ # cleanup_stale_docs approach (which only deleted known-profile files).
4475
+ if task.node_type == "analysis" and (
4476
+ task.input_data.get("wipe_analysis_dir")
4477
+ # Backwards-compat: older server versions send cleanup_stale_docs
4478
+ or task.input_data.get("cleanup_stale_docs")
4479
+ # Also wipe if analysis_mode is explicitly "fresh" (belt+suspenders)
4480
+ or task.input_data.get("analysis_mode") == "fresh"
4481
+ ):
4482
+ output_dir_raw = (
4483
+ task.input_data.get("analysis_output_dir")
4484
+ or task.input_data.get("output_dir")
4485
+ or ""
4486
+ )
4487
+ output_dir_norm = str(output_dir_raw).replace("\\", "/").lstrip("./").rstrip("/")
4488
+ if output_dir_norm:
4489
+ dir_to_wipe = workspace_path / output_dir_norm
4490
+ if dir_to_wipe.is_dir():
4491
+ existing_files = [f for f in dir_to_wipe.iterdir() if f.is_file()]
4492
+ if existing_files:
4493
+ try:
4494
+ # Stage all deletions with git rm
4495
+ await self._git(
4496
+ "rm", "-r", "--cached", "--ignore-unmatch",
4497
+ output_dir_norm,
4498
+ cwd=workspace_path,
4499
+ )
4500
+ # Remove physical files
4501
+ shutil.rmtree(str(dir_to_wipe), ignore_errors=True)
4502
+ # Commit the wipe so the branch diff is clean
4503
+ await self._git(
4504
+ "-c", "user.name=Forgexa Agent",
4505
+ "-c", "user.email=agent@forgexa.net",
4506
+ "commit", "-m",
4507
+ f"cleanup: wipe analysis docs in {output_dir_norm} before fresh re-analysis",
4508
+ cwd=workspace_path,
4509
+ )
4510
+ logger.info(
4511
+ "Wiped %d analysis doc(s) from %s for task %s (fresh analysis)",
4512
+ len(existing_files), output_dir_norm, task.task_id,
4513
+ )
4514
+ except Exception:
4515
+ logger.warning(
4516
+ "Could not wipe analysis dir %s for task %s "
4517
+ "(proceeding anyway — agent will overwrite)",
4518
+ output_dir_norm, task.task_id, exc_info=True,
4519
+ )
4520
+ else:
4521
+ logger.debug(
4522
+ "Analysis dir %s is already empty for task %s",
4523
+ output_dir_norm, task.task_id,
4524
+ )
4525
+
4253
4526
  # 3. Run agent with real-time output streaming + periodic progress heartbeat
4254
4527
  await reporter.report_progress(task.task_id, 10, "running_agent")
4255
4528
 
@@ -4419,6 +4692,10 @@ class RuntimeDaemon:
4419
4692
  f"Original error: {result.error}"
4420
4693
  )
4421
4694
  result.status = "failed"
4695
+ # Signal to the server that ALL installed agents were tried and
4696
+ # all are rate/quota limited. The server must NOT re-enqueue on
4697
+ # the same runtime — that would hit the same quota wall.
4698
+ result.failure_code = "all_agents_rate_limited"
4422
4699
 
4423
4700
  # 4. Collect git info BEFORE commit (shows uncommitted changes)
4424
4701
  pre_commit_git = await self.process_manager._collect_git_info(workspace_path)
@@ -4447,26 +4724,49 @@ class RuntimeDaemon:
4447
4724
  f"(node_type={task.node_type}, agent={agent.agent_id})"
4448
4725
  )
4449
4726
 
4450
- # 4.1 Recovery: agent exited non-zero but already committed code
4451
- # (e.g. OpenCode EBADF crash on exit after successful work)
4452
- if result.status == "failed" and result.exit_code not in (None, -1):
4453
- committed_git = await self.process_manager._collect_git_info_vs_parent(workspace_path)
4454
- has_committed_changes = bool(committed_git.get("files_changed"))
4455
- has_no_uncommitted = not pre_commit_git.get("files_changed")
4456
- has_tokens = (
4457
- int(result.metrics.get("token_input", 0) or 0)
4458
- + int(result.metrics.get("token_output", 0) or 0)
4459
- ) > 0
4460
- has_meaningful_output = self.process_manager.has_meaningful_agent_output(result)
4461
- if has_committed_changes and has_no_uncommitted and (has_tokens or has_meaningful_output):
4462
- logger.warning(
4463
- "Task %s agent exited with code %s but has committed changes — "
4464
- "recovering as success (agent likely crashed during cleanup)",
4465
- task.task_id, result.exit_code,
4727
+ # 4.1 Recovery: agent exited non-zero but already committed code.
4728
+ # Covers two scenarios:
4729
+ # A. Process crash (e.g. OpenCode EBADF on exit after successful work):
4730
+ # exit_code is a real non-negative/non-(-1) value.
4731
+ # B. Timeout with committed work: agent finished its task and committed
4732
+ # before the idle/absolute timeout fired. exit_code=-1 but the commits
4733
+ # are real — don't discard them.
4734
+ if result.status == "failed":
4735
+ _error_lower = (result.error or "").lower()
4736
+ is_timeout_failure = (
4737
+ "idle for" in _error_lower
4738
+ or "timed out" in _error_lower
4739
+ or "absolute limit" in _error_lower
4740
+ )
4741
+ can_attempt_recovery = (
4742
+ is_timeout_failure # timeout: also allow exit_code=-1
4743
+ or result.exit_code not in (None, -1) # crash: original guard
4744
+ )
4745
+ if can_attempt_recovery:
4746
+ committed_git = await self.process_manager._collect_git_info_vs_parent(workspace_path)
4747
+ has_committed_changes = bool(committed_git.get("files_changed"))
4748
+ has_no_uncommitted = not pre_commit_git.get("files_changed")
4749
+ has_tokens = (
4750
+ int(result.metrics.get("token_input", 0) or 0)
4751
+ + int(result.metrics.get("token_output", 0) or 0)
4752
+ ) > 0
4753
+ has_meaningful_output = self.process_manager.has_meaningful_agent_output(result)
4754
+ # Timeout recovery requires stronger evidence: committed work + tokens + meaningful output.
4755
+ # Crash recovery (original): committed + (tokens OR meaningful output).
4756
+ sufficient_evidence = (
4757
+ (has_committed_changes and has_no_uncommitted and has_tokens and has_meaningful_output)
4758
+ if is_timeout_failure
4759
+ else (has_committed_changes and has_no_uncommitted and (has_tokens or has_meaningful_output))
4466
4760
  )
4467
- result.status = "success"
4468
- result.error = ""
4469
- result.metrics["recovered_from_exit_code"] = result.exit_code
4761
+ if sufficient_evidence:
4762
+ _reason = "timed out but already committed changes" if is_timeout_failure else f"exited with code {result.exit_code}"
4763
+ logger.warning(
4764
+ "Task %s agent %s — recovering as success",
4765
+ task.task_id, _reason,
4766
+ )
4767
+ result.status = "success"
4768
+ result.error = ""
4769
+ result.metrics["recovered_from_exit_code"] = result.exit_code
4470
4770
 
4471
4771
  # 4.5 Layer 2: Validation gate — check outputs before committing
4472
4772
  if result.status == "success":
@@ -5081,7 +5381,14 @@ class RuntimeDaemon:
5081
5381
  ],
5082
5382
  )
5083
5383
 
5084
- # Build a targeted fix prompt with output directory context
5384
+ # Save the original prompt BEFORE building the retry variant so we
5385
+ # can include it in fix_prompt. Without this the agent receives only
5386
+ # "fix validation errors" with zero task context and responds with
5387
+ # "I don't have a specific task to execute yet." (root cause confirmed
5388
+ # via Copilot JSONL output for SI-434/SI-446).
5389
+ original_prompt = task.input_prompt
5390
+
5391
+ # Build a targeted fix prompt: original task + validation issues.
5085
5392
  _input = task.input_data or {}
5086
5393
  _fix_doc_dir = (
5087
5394
  _input.get("output_dir")
@@ -5089,8 +5396,11 @@ class RuntimeDaemon:
5089
5396
  or ""
5090
5397
  )
5091
5398
  fix_prompt = (
5092
- "The previous execution produced output with validation errors.\n"
5093
- "Please fix ALL of the following issues:\n\n"
5399
+ f"{original_prompt}\n\n"
5400
+ "---\n\n"
5401
+ "**IMPORTANT – Validation Retry:** The previous execution attempt "
5402
+ "did not produce all required output. Please complete the task above "
5403
+ "and ensure ALL of the following issues are resolved:\n\n"
5094
5404
  f"{issues_text}\n\n"
5095
5405
  )
5096
5406
  if _fix_doc_dir:
@@ -5105,7 +5415,6 @@ class RuntimeDaemon:
5105
5415
  )
5106
5416
 
5107
5417
  # Override task prompt temporarily
5108
- original_prompt = task.input_prompt
5109
5418
  task.input_prompt = fix_prompt
5110
5419
 
5111
5420
  try:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: forgexa-cli
3
- Version: 1.8.7
3
+ Version: 1.8.10
4
4
  Summary: Forgexa CLI — command-line client and AI agent runtime for the Forgexa platform
5
5
  Author-email: Jason Sun <dev.winds@gmail.com>
6
6
  License: MIT
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "forgexa-cli"
3
- version = "1.8.7"
3
+ version = "1.8.10"
4
4
  description = "Forgexa CLI — command-line client and AI agent runtime for the Forgexa platform"
5
5
  requires-python = ">=3.9"
6
6
  license = { text = "MIT" }
File without changes
File without changes