forgexa-cli 1.8.7__tar.gz → 1.8.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {forgexa_cli-1.8.7 → forgexa_cli-1.8.8}/PKG-INFO +1 -1
- {forgexa_cli-1.8.7 → forgexa_cli-1.8.8}/forgexa_cli/__init__.py +1 -1
- {forgexa_cli-1.8.7 → forgexa_cli-1.8.8}/forgexa_cli/daemon.py +335 -101
- {forgexa_cli-1.8.7 → forgexa_cli-1.8.8}/forgexa_cli.egg-info/PKG-INFO +1 -1
- {forgexa_cli-1.8.7 → forgexa_cli-1.8.8}/pyproject.toml +1 -1
- {forgexa_cli-1.8.7 → forgexa_cli-1.8.8}/README.md +0 -0
- {forgexa_cli-1.8.7 → forgexa_cli-1.8.8}/forgexa_cli/_build_config.py +0 -0
- {forgexa_cli-1.8.7 → forgexa_cli-1.8.8}/forgexa_cli/main.py +0 -0
- {forgexa_cli-1.8.7 → forgexa_cli-1.8.8}/forgexa_cli/py.typed +0 -0
- {forgexa_cli-1.8.7 → forgexa_cli-1.8.8}/forgexa_cli.egg-info/SOURCES.txt +0 -0
- {forgexa_cli-1.8.7 → forgexa_cli-1.8.8}/forgexa_cli.egg-info/dependency_links.txt +0 -0
- {forgexa_cli-1.8.7 → forgexa_cli-1.8.8}/forgexa_cli.egg-info/entry_points.txt +0 -0
- {forgexa_cli-1.8.7 → forgexa_cli-1.8.8}/forgexa_cli.egg-info/requires.txt +0 -0
- {forgexa_cli-1.8.7 → forgexa_cli-1.8.8}/forgexa_cli.egg-info/top_level.txt +0 -0
- {forgexa_cli-1.8.7 → forgexa_cli-1.8.8}/setup.cfg +0 -0
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
"""forgexa-cli — Forgexa command-line client."""
|
|
2
|
-
__version__ = "1.8.7"
|
|
2
|
+
__version__ = "1.8.8"
|
|
@@ -392,7 +392,7 @@ except (ImportError, ModuleNotFoundError):
|
|
|
392
392
|
# DAEMON_VERSION is the protocol/logic version of the daemon code.
|
|
393
393
|
# Kept in sync with pyproject.toml version via bump-version.sh.
|
|
394
394
|
# CLIENT_TYPE identifies which packaging/distribution this daemon runs in.
|
|
395
|
-
DAEMON_VERSION = "1.8.7"
|
|
395
|
+
DAEMON_VERSION = "1.8.8"
|
|
396
396
|
|
|
397
397
|
|
|
398
398
|
def _detect_client_type() -> str:
|
|
@@ -1753,15 +1753,34 @@ class WorkspaceManager:
|
|
|
1753
1753
|
key_path_safe = key_path.replace("\\", "/") if sys.platform == "win32" else key_path
|
|
1754
1754
|
# RC1 (Windows): os.chmod(S_IRUSR) does not set proper NTFS ACLs.
|
|
1755
1755
|
# Windows OpenSSH rejects keys that aren't exclusively owner-readable
|
|
1756
|
-
# ("UNPROTECTED PRIVATE KEY FILE").
|
|
1756
|
+
# ("UNPROTECTED PRIVATE KEY FILE"). Fix with icacls to set the ACLs
|
|
1757
|
+
# correctly. NOTE: StrictModes is an sshd_config option (server side);
|
|
1758
|
+
# passing -o StrictModes=no to the SSH client is invalid and causes:
|
|
1759
|
+
# "command-line: line 0: Bad configuration option: strictmodes"
|
|
1757
1760
|
# RC2 (Windows): /dev/null doesn't exist on Windows native OpenSSH
|
|
1758
1761
|
# (C:\Windows\System32\OpenSSH\ssh.exe). Use NUL instead.
|
|
1759
1762
|
if sys.platform == "win32":
|
|
1760
1763
|
_known_hosts_null = "NUL"
|
|
1761
|
-
|
|
1764
|
+
try:
|
|
1765
|
+
import subprocess as _subp
|
|
1766
|
+
_username = (
|
|
1767
|
+
os.environ.get("USERNAME")
|
|
1768
|
+
or os.environ.get("USER")
|
|
1769
|
+
or ""
|
|
1770
|
+
)
|
|
1771
|
+
if _username:
|
|
1772
|
+
_subp.run(
|
|
1773
|
+
[
|
|
1774
|
+
"icacls", key_path,
|
|
1775
|
+
"/inheritance:r",
|
|
1776
|
+
"/grant:r", f"{_username}:(R)",
|
|
1777
|
+
],
|
|
1778
|
+
capture_output=True, check=False, timeout=10,
|
|
1779
|
+
)
|
|
1780
|
+
except Exception:
|
|
1781
|
+
pass
|
|
1762
1782
|
else:
|
|
1763
1783
|
_known_hosts_null = "/dev/null"
|
|
1764
|
-
_strict_modes_opt = ""
|
|
1765
1784
|
env = {
|
|
1766
1785
|
**os.environ,
|
|
1767
1786
|
"GIT_SSH_COMMAND": (
|
|
@@ -1777,7 +1796,6 @@ class WorkspaceManager:
|
|
|
1777
1796
|
f" -o ConnectTimeout=30"
|
|
1778
1797
|
f" -o ServerAliveInterval=30"
|
|
1779
1798
|
f" -o ServerAliveCountMax=3"
|
|
1780
|
-
f"{_strict_modes_opt}"
|
|
1781
1799
|
),
|
|
1782
1800
|
}
|
|
1783
1801
|
except Exception:
|
|
@@ -1865,6 +1883,92 @@ class WorkspaceManager:
|
|
|
1865
1883
|
# ── Process Manager ──
|
|
1866
1884
|
|
|
1867
1885
|
|
|
1886
|
+
def _kill_proc(proc: asyncio.subprocess.Process) -> None:
    """Kill a subprocess together with the rest of its process group/tree.

    A plain ``proc.kill()`` only terminates the direct child; grandchildren
    (npm, yarn, ssh, git, etc.) spawned by the agent stay alive, keep pipes
    open, and exhaust system resources.  On POSIX the child's process group
    receives SIGKILL via ``os.killpg``; on Windows ``taskkill /F /T`` is used
    to terminate the process tree.  ``proc.kill()`` is always attempted last
    as a fallback.

    The function is best-effort: every failure is swallowed so it can be
    called safely from exception handlers, and calling it more than once for
    the same process is harmless.

    Args:
        proc: The asyncio subprocess to terminate.
    """
    # Once ``returncode`` is set the child has been reaped and the OS is free
    # to hand its PID to a new process.  Signalling that PID (or its group)
    # could then kill something unrelated, so an exited process is a no-op.
    if getattr(proc, "returncode", None) is not None:
        return
    try:
        if sys.platform != "win32":
            import signal as _signal

            try:
                pgid = os.getpgid(proc.pid)
                # Never SIGKILL our own group: if the child was not started
                # in a separate group/session this would take the daemon
                # down with it.  The proc.kill() fallback below still runs.
                if pgid != os.getpgrp():
                    os.killpg(pgid, _signal.SIGKILL)
            except OSError:
                # Covers ProcessLookupError / PermissionError: the group is
                # already gone or not ours to signal.
                pass
        else:
            import subprocess as _subprocess

            # Bounded wait: this is a blocking call and must not stall the
            # caller indefinitely if taskkill itself hangs.
            _subprocess.run(
                ["taskkill", "/F", "/T", "/PID", str(proc.pid)],
                capture_output=True,
                check=False,
                timeout=10,
            )
    except Exception:
        pass
    finally:
        try:
            proc.kill()
        except Exception:
            pass
|
|
1914
|
+
|
|
1915
|
+
|
|
1916
|
+
class _IdleTimeoutError(asyncio.TimeoutError):
    """Raised when an agent process produces no stdout for longer than AGENT_IDLE_TIMEOUT.

    Subclasses asyncio.TimeoutError so existing ``except asyncio.TimeoutError``
    handlers catch it, but callers can distinguish it from an absolute wall-clock
    timeout via ``isinstance(exc, _IdleTimeoutError)`` or ``exc.idle_seconds``.

    Attributes:
        idle_seconds: How long the process had been idle when the error was
            raised, in seconds.
    """

    def __init__(self, idle_seconds: float) -> None:
        # The message keeps the compact "idle:<N>s" form; the precise value
        # stays available on the instance for callers that format their own.
        super().__init__(f"idle:{idle_seconds:.0f}s")
        self.idle_seconds = idle_seconds

    def __reduce__(self):
        # ``args`` holds the formatted message, not the constructor argument,
        # so the default exception reduce would re-call __init__ with a string
        # and fail on the ``:.0f`` format.  Rebuild from the numeric value.
        return (type(self), (self.idle_seconds,), dict(self.__dict__))
|
|
1927
|
+
|
|
1928
|
+
|
|
1929
|
+
def _workspace_has_recent_activity(
    workspace_path: "Path", since_monotonic: float, max_depth: int = 4
) -> bool:
    """Return True if any entry under workspace_path was modified after since_monotonic.

    Used as a secondary liveness signal next to stdout: an agent that prints
    nothing but keeps touching files (package installs, compilation, test
    runs) is presumably still working rather than hung.

    Args:
        workspace_path: Root directory to scan.
        since_monotonic: Reference instant as a ``time.monotonic()`` value.
        max_depth: How many directory levels below the root are descended
            into (the root itself is depth 0).

    Returns:
        True as soon as one file or directory (other than ``.git``) with an
        mtime newer than the reference instant is found; False otherwise,
        including when the directory cannot be read.  Never raises.
    """
    # mtimes are wall-clock values, so shift "now" back by the age of the
    # monotonic reference to get a comparable wall-clock threshold.
    since_wall = time.time() - (time.monotonic() - since_monotonic)

    def _scan(path: str, depth: int) -> bool:
        if depth > max_depth:
            return False
        try:
            # Context manager closes the directory handle even on the early
            # ``return True`` below instead of leaving it to the GC.
            with os.scandir(path) as entries:
                for entry in entries:
                    if entry.name == ".git":
                        continue  # skip version-control metadata
                    try:
                        if entry.stat(follow_symlinks=False).st_mtime > since_wall:
                            return True
                        if (
                            depth < max_depth
                            and entry.is_dir(follow_symlinks=False)
                            and _scan(entry.path, depth + 1)
                        ):
                            return True
                    except OSError:
                        # Entry vanished or is unreadable; keep scanning.
                        pass
        except OSError:
            # Directory missing or unreadable: treat as "no activity here".
            pass
        return False

    try:
        return _scan(str(workspace_path), 0)
    except Exception:
        return False  # never let a filesystem check crash the agent run
|
|
1970
|
+
|
|
1971
|
+
|
|
1868
1972
|
class ProcessManager:
|
|
1869
1973
|
"""Manages Agent CLI subprocess lifecycle."""
|
|
1870
1974
|
|
|
@@ -2312,6 +2416,7 @@ class ProcessManager:
|
|
|
2312
2416
|
timeout: int,
|
|
2313
2417
|
task_id: str,
|
|
2314
2418
|
on_chunk: Any,
|
|
2419
|
+
workspace_path: "Path | None" = None,
|
|
2315
2420
|
) -> tuple[str, str, int]:
|
|
2316
2421
|
"""Stream stdout line-by-line from a subprocess, flushing to on_chunk.
|
|
2317
2422
|
|
|
@@ -2323,7 +2428,22 @@ class ProcessManager:
|
|
|
2323
2428
|
deadlock when the process fills the stderr buffer.
|
|
2324
2429
|
- on_chunk(lines) is called with each decoded line so the caller can
|
|
2325
2430
|
forward to the progress reporter without waiting for completion.
|
|
2431
|
+
- Idle timeout: if the agent produces no stdout for AGENT_IDLE_TIMEOUT
|
|
2432
|
+
seconds the code checks for filesystem activity in workspace_path
|
|
2433
|
+
before deciding to kill. If files were recently modified the agent
|
|
2434
|
+
is doing silent work (npm install, compilation, test runs, etc.) and
|
|
2435
|
+
the idle timer is reset. Only when BOTH stdout AND the filesystem
|
|
2436
|
+
are idle does the process get killed. This eliminates false-positive
|
|
2437
|
+
kills at the idle boundary.
|
|
2438
|
+
- Absolute timeout (``timeout`` param): hard ceiling for zombie-process
|
|
2439
|
+
prevention. Always kills at this boundary (no extension), but logs
|
|
2440
|
+
filesystem activity status for post-mortem observability.
|
|
2326
2441
|
"""
|
|
2442
|
+
idle_timeout: int = getattr(settings, "AGENT_IDLE_TIMEOUT", 600)
|
|
2443
|
+
_start_time = time.monotonic()
|
|
2444
|
+
# Mutable state shared between nested coroutines (list avoids nonlocal)
|
|
2445
|
+
_last_activity_at: list[float] = [time.monotonic()]
|
|
2446
|
+
|
|
2327
2447
|
# Write prompt and close stdin so the agent knows input is done.
|
|
2328
2448
|
if stdin_input and proc.stdin:
|
|
2329
2449
|
try:
|
|
@@ -2345,13 +2465,76 @@ class ProcessManager:
|
|
|
2345
2465
|
if not proc.stdout:
|
|
2346
2466
|
return
|
|
2347
2467
|
while True:
|
|
2468
|
+
# ── Timeout checks ────────────────────────────────────────────
|
|
2469
|
+
now = time.monotonic()
|
|
2470
|
+
elapsed_abs = now - _start_time
|
|
2471
|
+
if elapsed_abs >= timeout:
|
|
2472
|
+
# Absolute ceiling reached — hard zombie-process prevention.
|
|
2473
|
+
# Log filesystem status for observability but always kill;
|
|
2474
|
+
# never extend the absolute boundary.
|
|
2475
|
+
_fs_active = workspace_path and _workspace_has_recent_activity(
|
|
2476
|
+
workspace_path, _last_activity_at[0]
|
|
2477
|
+
)
|
|
2478
|
+
logger.warning(
|
|
2479
|
+
"Task %s absolute timeout %.0fs reached — killing "
|
|
2480
|
+
"(workspace filesystem %s)",
|
|
2481
|
+
task_id, elapsed_abs,
|
|
2482
|
+
"was active" if _fs_active else "was idle",
|
|
2483
|
+
)
|
|
2484
|
+
_kill_proc(proc)
|
|
2485
|
+
raise asyncio.TimeoutError(
|
|
2486
|
+
f"Timed out after {timeout}s (absolute limit)"
|
|
2487
|
+
)
|
|
2488
|
+
|
|
2489
|
+
idle_elapsed = now - _last_activity_at[0]
|
|
2490
|
+
if idle_elapsed >= idle_timeout:
|
|
2491
|
+
# Before killing, check if the agent is doing silent
|
|
2492
|
+
# filesystem work (npm install, compilation, test runs, git
|
|
2493
|
+
# operations, etc.). These produce no stdout but DO modify
|
|
2494
|
+
# files — killing at the idle boundary would be a false
|
|
2495
|
+
# positive. Only kill when BOTH signals agree: no stdout
|
|
2496
|
+
# AND no filesystem activity.
|
|
2497
|
+
if workspace_path and _workspace_has_recent_activity(
|
|
2498
|
+
workspace_path, _last_activity_at[0]
|
|
2499
|
+
):
|
|
2500
|
+
# Files modified since last stdout → agent is working
|
|
2501
|
+
# silently. Reset idle timer and continue.
|
|
2502
|
+
_last_activity_at[0] = time.monotonic()
|
|
2503
|
+
logger.info(
|
|
2504
|
+
"Task %s: stdout idle %.0fs but workspace files "
|
|
2505
|
+
"modified — resetting idle timer (agent working "
|
|
2506
|
+
"silently)",
|
|
2507
|
+
task_id, idle_elapsed,
|
|
2508
|
+
)
|
|
2509
|
+
else:
|
|
2510
|
+
# No stdout AND no filesystem activity → truly hung.
|
|
2511
|
+
logger.warning(
|
|
2512
|
+
"Task %s agent idle %.0fs — no stdout, no "
|
|
2513
|
+
"filesystem activity; killing hung process",
|
|
2514
|
+
task_id, idle_elapsed,
|
|
2515
|
+
)
|
|
2516
|
+
_kill_proc(proc)
|
|
2517
|
+
raise _IdleTimeoutError(idle_elapsed)
|
|
2518
|
+
|
|
2519
|
+
# Check interval capped at 30s so the loop stays responsive
|
|
2520
|
+
# even when both timeouts are far away.
|
|
2521
|
+
check_interval = min(
|
|
2522
|
+
idle_timeout - idle_elapsed + 0.5, # until idle fires
|
|
2523
|
+
timeout - elapsed_abs + 0.5, # until absolute fires
|
|
2524
|
+
30.0,
|
|
2525
|
+
)
|
|
2526
|
+
|
|
2527
|
+
# ── Read one line with a bounded wait ─────────────────────────
|
|
2348
2528
|
try:
|
|
2349
|
-
line_bytes = await proc.stdout.readline()
|
|
2529
|
+
line_bytes = await asyncio.wait_for(
|
|
2530
|
+
proc.stdout.readline(), timeout=check_interval
|
|
2531
|
+
)
|
|
2532
|
+
except asyncio.TimeoutError:
|
|
2533
|
+
# readline timed out within check_interval — no new output
|
|
2534
|
+
# yet. Loop back to re-evaluate idle/absolute conditions.
|
|
2535
|
+
continue
|
|
2350
2536
|
except (ValueError, asyncio.LimitOverrunError, Exception) as exc:
|
|
2351
|
-
# Line exceeded stream buffer limit (LimitOverrunError is
|
|
2352
|
-
# converted to ValueError by readline(), but catch broadly
|
|
2353
|
-
# to handle edge cases in different Python versions).
|
|
2354
|
-
# Fall back to reading remaining data in bulk.
|
|
2537
|
+
# Line exceeded stream buffer limit — drain remaining bulk.
|
|
2355
2538
|
logger.warning(
|
|
2356
2539
|
"Stream read error for task %s (%s: %s), draining remaining output",
|
|
2357
2540
|
task_id, type(exc).__name__, exc,
|
|
@@ -2370,8 +2553,12 @@ class ProcessManager:
|
|
|
2370
2553
|
except Exception:
|
|
2371
2554
|
pass
|
|
2372
2555
|
break
|
|
2556
|
+
|
|
2373
2557
|
if not line_bytes:
|
|
2374
|
-
break
|
|
2558
|
+
break # EOF — process exited normally
|
|
2559
|
+
|
|
2560
|
+
# ── New output received — reset idle timer ────────────────────
|
|
2561
|
+
_last_activity_at[0] = time.monotonic()
|
|
2375
2562
|
line = line_bytes.decode(errors="replace").rstrip("\n")
|
|
2376
2563
|
stdout_lines.append(line)
|
|
2377
2564
|
if on_chunk:
|
|
@@ -2381,35 +2568,17 @@ class ProcessManager:
|
|
|
2381
2568
|
pass # never let on_chunk crash the agent run
|
|
2382
2569
|
|
|
2383
2570
|
try:
|
|
2571
|
+
# Outer wait_for uses timeout+idle_timeout as generous safety net.
|
|
2572
|
+
# In practice _read_stdout handles both idle and absolute killing
|
|
2573
|
+
# before this fires.
|
|
2384
2574
|
await asyncio.wait_for(
|
|
2385
2575
|
asyncio.gather(_read_stdout(), _read_stderr()),
|
|
2386
|
-
timeout=timeout,
|
|
2576
|
+
timeout=timeout + idle_timeout + 60,
|
|
2387
2577
|
)
|
|
2388
|
-
except asyncio.TimeoutError:
|
|
2389
|
-
#
|
|
2390
|
-
#
|
|
2391
|
-
|
|
2392
|
-
# become orphaned, keep pipes open, and can exhaust system resources.
|
|
2393
|
-
try:
|
|
2394
|
-
if sys.platform != "win32":
|
|
2395
|
-
import signal as _signal
|
|
2396
|
-
try:
|
|
2397
|
-
os.killpg(os.getpgid(proc.pid), _signal.SIGKILL)
|
|
2398
|
-
except (ProcessLookupError, PermissionError, OSError):
|
|
2399
|
-
pass
|
|
2400
|
-
else:
|
|
2401
|
-
import subprocess as _subprocess
|
|
2402
|
-
_subprocess.run(
|
|
2403
|
-
["taskkill", "/F", "/T", "/PID", str(proc.pid)],
|
|
2404
|
-
capture_output=True,
|
|
2405
|
-
)
|
|
2406
|
-
except Exception:
|
|
2407
|
-
pass
|
|
2408
|
-
finally:
|
|
2409
|
-
try:
|
|
2410
|
-
proc.kill()
|
|
2411
|
-
except Exception:
|
|
2412
|
-
pass
|
|
2578
|
+
except asyncio.TimeoutError as _exc:
|
|
2579
|
+
# This branch fires if the outer safety net triggers (extremely
|
|
2580
|
+
# rare) or if _IdleTimeoutError propagates from _read_stdout.
|
|
2581
|
+
_kill_proc(proc)
|
|
2413
2582
|
# Drain any remaining output after kill
|
|
2414
2583
|
try:
|
|
2415
2584
|
remaining, _ = await asyncio.wait_for(proc.communicate(), timeout=5)
|
|
@@ -2419,7 +2588,7 @@ class ProcessManager:
|
|
|
2419
2588
|
stdout_lines.append(line)
|
|
2420
2589
|
except Exception:
|
|
2421
2590
|
pass
|
|
2422
|
-
raise # re-raise
|
|
2591
|
+
raise # re-raise (_IdleTimeoutError preserves subclass type)
|
|
2423
2592
|
|
|
2424
2593
|
await proc.wait()
|
|
2425
2594
|
stdout = "\n".join(stdout_lines)
|
|
@@ -2480,7 +2649,8 @@ class ProcessManager:
|
|
|
2480
2649
|
)
|
|
2481
2650
|
self.active_processes[task_id] = proc
|
|
2482
2651
|
stdout, stderr, returncode = await self._stream_process(
|
|
2483
|
-
proc, prompt.encode(), timeout, task_id, on_chunk
|
|
2652
|
+
proc, prompt.encode(), timeout, task_id, on_chunk,
|
|
2653
|
+
workspace_path=cwd,
|
|
2484
2654
|
)
|
|
2485
2655
|
|
|
2486
2656
|
# Parse Claude JSON output for metrics
|
|
@@ -2521,13 +2691,13 @@ class ProcessManager:
|
|
|
2521
2691
|
error=f"Claude exited with code {returncode}: {stderr[-500:]}",
|
|
2522
2692
|
metrics=metrics,
|
|
2523
2693
|
)
|
|
2524
|
-
except asyncio.TimeoutError:
|
|
2525
|
-
|
|
2526
|
-
|
|
2527
|
-
|
|
2528
|
-
|
|
2529
|
-
|
|
2530
|
-
)
|
|
2694
|
+
except asyncio.TimeoutError as exc:
|
|
2695
|
+
_kill_proc(self.active_processes.pop(task_id, None) or proc)
|
|
2696
|
+
_err = (
|
|
2697
|
+
f"Agent idle for {exc.idle_seconds:.0f}s without output — process terminated. "
|
|
2698
|
+
"Task may require more context decomposition or a different agent."
|
|
2699
|
+
) if isinstance(exc, _IdleTimeoutError) else f"Timed out after {timeout}s (absolute limit)"
|
|
2700
|
+
return TaskResult(status="failed", exit_code=-1, stdout="", stderr="", error=_err)
|
|
2531
2701
|
except Exception as exc:
|
|
2532
2702
|
logger.exception("Claude stream error for task %s", task_id)
|
|
2533
2703
|
if task_id in self.active_processes:
|
|
@@ -2697,7 +2867,8 @@ class ProcessManager:
|
|
|
2697
2867
|
)
|
|
2698
2868
|
self.active_processes[task_id] = proc
|
|
2699
2869
|
stdout, stderr, returncode = await self._stream_process(
|
|
2700
|
-
proc, None, timeout, task_id, on_chunk
|
|
2870
|
+
proc, None, timeout, task_id, on_chunk,
|
|
2871
|
+
workspace_path=cwd,
|
|
2701
2872
|
)
|
|
2702
2873
|
|
|
2703
2874
|
# Parse copilot JSONL output for metrics
|
|
@@ -2725,13 +2896,13 @@ class ProcessManager:
|
|
|
2725
2896
|
error=f"Copilot exited with code {effective_rc}: {stderr[-500:]}",
|
|
2726
2897
|
metrics=metrics,
|
|
2727
2898
|
)
|
|
2728
|
-
except asyncio.TimeoutError:
|
|
2729
|
-
|
|
2730
|
-
|
|
2731
|
-
|
|
2732
|
-
|
|
2733
|
-
|
|
2734
|
-
)
|
|
2899
|
+
except asyncio.TimeoutError as exc:
|
|
2900
|
+
_kill_proc(self.active_processes.pop(task_id, None) or proc)
|
|
2901
|
+
_err = (
|
|
2902
|
+
f"Agent idle for {exc.idle_seconds:.0f}s without output — process terminated. "
|
|
2903
|
+
"Task may require more context decomposition or a different agent."
|
|
2904
|
+
) if isinstance(exc, _IdleTimeoutError) else f"Timed out after {timeout}s (absolute limit)"
|
|
2905
|
+
return TaskResult(status="failed", exit_code=-1, stdout="", stderr="", error=_err)
|
|
2735
2906
|
except Exception as exc:
|
|
2736
2907
|
logger.exception("Copilot stream error for task %s", task_id)
|
|
2737
2908
|
if task_id in self.active_processes:
|
|
@@ -2772,7 +2943,8 @@ class ProcessManager:
|
|
|
2772
2943
|
self.active_processes[task_id] = proc
|
|
2773
2944
|
stdin_bytes = stdin_input.encode() if stdin_input else None
|
|
2774
2945
|
stdout, stderr, returncode = await self._stream_process(
|
|
2775
|
-
proc, stdin_bytes, timeout, task_id, on_chunk
|
|
2946
|
+
proc, stdin_bytes, timeout, task_id, on_chunk,
|
|
2947
|
+
workspace_path=cwd,
|
|
2776
2948
|
)
|
|
2777
2949
|
status = "success" if returncode == 0 else "failed"
|
|
2778
2950
|
return TaskResult(
|
|
@@ -2782,33 +2954,13 @@ class ProcessManager:
|
|
|
2782
2954
|
stderr=stderr[-10000:],
|
|
2783
2955
|
error="" if status == "success" else f"Exited with code {returncode}",
|
|
2784
2956
|
)
|
|
2785
|
-
except asyncio.TimeoutError:
|
|
2786
|
-
|
|
2787
|
-
|
|
2788
|
-
|
|
2789
|
-
|
|
2790
|
-
|
|
2791
|
-
|
|
2792
|
-
os.killpg(os.getpgid(proc.pid), _signal.SIGKILL)
|
|
2793
|
-
except (ProcessLookupError, PermissionError, OSError):
|
|
2794
|
-
pass
|
|
2795
|
-
else:
|
|
2796
|
-
import subprocess as _subprocess
|
|
2797
|
-
_subprocess.run(
|
|
2798
|
-
["taskkill", "/F", "/T", "/PID", str(proc.pid)],
|
|
2799
|
-
capture_output=True,
|
|
2800
|
-
)
|
|
2801
|
-
except Exception:
|
|
2802
|
-
pass
|
|
2803
|
-
finally:
|
|
2804
|
-
try:
|
|
2805
|
-
proc.kill()
|
|
2806
|
-
except Exception:
|
|
2807
|
-
pass
|
|
2808
|
-
return TaskResult(
|
|
2809
|
-
status="failed", exit_code=-1, stdout="", stderr="",
|
|
2810
|
-
error=f"Timed out after {timeout}s",
|
|
2811
|
-
)
|
|
2957
|
+
except asyncio.TimeoutError as exc:
|
|
2958
|
+
_kill_proc(self.active_processes.pop(task_id, None) or proc)
|
|
2959
|
+
_err = (
|
|
2960
|
+
f"Agent idle for {exc.idle_seconds:.0f}s without output — process terminated. "
|
|
2961
|
+
"Task may require more context decomposition or a different agent."
|
|
2962
|
+
) if isinstance(exc, _IdleTimeoutError) else f"Timed out after {timeout}s (absolute limit)"
|
|
2963
|
+
return TaskResult(status="failed", exit_code=-1, stdout="", stderr="", error=_err)
|
|
2812
2964
|
except Exception as exc:
|
|
2813
2965
|
logger.exception("CLI stream error for task %s", task_id)
|
|
2814
2966
|
if task_id in self.active_processes:
|
|
@@ -4250,6 +4402,65 @@ class RuntimeDaemon:
|
|
|
4250
4402
|
)
|
|
4251
4403
|
logger.info("Workspace ready: %s", workspace_path)
|
|
4252
4404
|
|
|
4405
|
+
# 2.5 Wipe the analysis output directory on fresh analysis so the new
|
|
4406
|
+
# agent run starts from a completely clean slate. This covers:
|
|
4407
|
+
# • Type change: removes old-type files (e.g. PRD.md/SDD.md) so they
|
|
4408
|
+
# don't coexist with the new type's files (e.g. diagnosis.md).
|
|
4409
|
+
# • Same-type fresh re-analysis: removes extra files the agent may
|
|
4410
|
+
# have written that fall outside the expected type profile.
|
|
4411
|
+
# Using a whole-directory wipe is more reliable than the old
|
|
4412
|
+
# cleanup_stale_docs approach (which only deleted known-profile files).
|
|
4413
|
+
if task.node_type == "analysis" and (
|
|
4414
|
+
task.input_data.get("wipe_analysis_dir")
|
|
4415
|
+
# Backwards-compat: older server versions send cleanup_stale_docs
|
|
4416
|
+
or task.input_data.get("cleanup_stale_docs")
|
|
4417
|
+
# Also wipe if analysis_mode is explicitly "fresh" (belt+suspenders)
|
|
4418
|
+
or task.input_data.get("analysis_mode") == "fresh"
|
|
4419
|
+
):
|
|
4420
|
+
output_dir_raw = (
|
|
4421
|
+
task.input_data.get("analysis_output_dir")
|
|
4422
|
+
or task.input_data.get("output_dir")
|
|
4423
|
+
or ""
|
|
4424
|
+
)
|
|
4425
|
+
output_dir_norm = str(output_dir_raw).replace("\\", "/").lstrip("./").rstrip("/")
|
|
4426
|
+
if output_dir_norm:
|
|
4427
|
+
dir_to_wipe = workspace_path / output_dir_norm
|
|
4428
|
+
if dir_to_wipe.is_dir():
|
|
4429
|
+
existing_files = [f for f in dir_to_wipe.iterdir() if f.is_file()]
|
|
4430
|
+
if existing_files:
|
|
4431
|
+
try:
|
|
4432
|
+
# Stage all deletions with git rm
|
|
4433
|
+
await self._git(
|
|
4434
|
+
"rm", "-r", "--cached", "--ignore-unmatch",
|
|
4435
|
+
output_dir_norm,
|
|
4436
|
+
cwd=workspace_path,
|
|
4437
|
+
)
|
|
4438
|
+
# Remove physical files
|
|
4439
|
+
shutil.rmtree(str(dir_to_wipe), ignore_errors=True)
|
|
4440
|
+
# Commit the wipe so the branch diff is clean
|
|
4441
|
+
await self._git(
|
|
4442
|
+
"-c", "user.name=Forgexa Agent",
|
|
4443
|
+
"-c", "user.email=agent@forgexa.net",
|
|
4444
|
+
"commit", "-m",
|
|
4445
|
+
f"cleanup: wipe analysis docs in {output_dir_norm} before fresh re-analysis",
|
|
4446
|
+
cwd=workspace_path,
|
|
4447
|
+
)
|
|
4448
|
+
logger.info(
|
|
4449
|
+
"Wiped %d analysis doc(s) from %s for task %s (fresh analysis)",
|
|
4450
|
+
len(existing_files), output_dir_norm, task.task_id,
|
|
4451
|
+
)
|
|
4452
|
+
except Exception:
|
|
4453
|
+
logger.warning(
|
|
4454
|
+
"Could not wipe analysis dir %s for task %s "
|
|
4455
|
+
"(proceeding anyway — agent will overwrite)",
|
|
4456
|
+
output_dir_norm, task.task_id, exc_info=True,
|
|
4457
|
+
)
|
|
4458
|
+
else:
|
|
4459
|
+
logger.debug(
|
|
4460
|
+
"Analysis dir %s is already empty for task %s",
|
|
4461
|
+
output_dir_norm, task.task_id,
|
|
4462
|
+
)
|
|
4463
|
+
|
|
4253
4464
|
# 3. Run agent with real-time output streaming + periodic progress heartbeat
|
|
4254
4465
|
await reporter.report_progress(task.task_id, 10, "running_agent")
|
|
4255
4466
|
|
|
@@ -4447,26 +4658,49 @@ class RuntimeDaemon:
|
|
|
4447
4658
|
f"(node_type={task.node_type}, agent={agent.agent_id})"
|
|
4448
4659
|
)
|
|
4449
4660
|
|
|
4450
|
-
# 4.1 Recovery: agent exited non-zero but already committed code
|
|
4451
|
-
#
|
|
4452
|
-
|
|
4453
|
-
|
|
4454
|
-
|
|
4455
|
-
|
|
4456
|
-
|
|
4457
|
-
|
|
4458
|
-
|
|
4459
|
-
|
|
4460
|
-
|
|
4461
|
-
|
|
4462
|
-
|
|
4463
|
-
|
|
4464
|
-
|
|
4465
|
-
|
|
4661
|
+
# 4.1 Recovery: agent exited non-zero but already committed code.
|
|
4662
|
+
# Covers two scenarios:
|
|
4663
|
+
# A. Process crash (e.g. OpenCode EBADF on exit after successful work):
|
|
4664
|
+
# exit_code is a real non-negative/non-(-1) value.
|
|
4665
|
+
# B. Timeout with committed work: agent finished its task and committed
|
|
4666
|
+
# before the idle/absolute timeout fired. exit_code=-1 but the commits
|
|
4667
|
+
# are real — don't discard them.
|
|
4668
|
+
if result.status == "failed":
|
|
4669
|
+
_error_lower = (result.error or "").lower()
|
|
4670
|
+
is_timeout_failure = (
|
|
4671
|
+
"idle for" in _error_lower
|
|
4672
|
+
or "timed out" in _error_lower
|
|
4673
|
+
or "absolute limit" in _error_lower
|
|
4674
|
+
)
|
|
4675
|
+
can_attempt_recovery = (
|
|
4676
|
+
is_timeout_failure # timeout: also allow exit_code=-1
|
|
4677
|
+
or result.exit_code not in (None, -1) # crash: original guard
|
|
4678
|
+
)
|
|
4679
|
+
if can_attempt_recovery:
|
|
4680
|
+
committed_git = await self.process_manager._collect_git_info_vs_parent(workspace_path)
|
|
4681
|
+
has_committed_changes = bool(committed_git.get("files_changed"))
|
|
4682
|
+
has_no_uncommitted = not pre_commit_git.get("files_changed")
|
|
4683
|
+
has_tokens = (
|
|
4684
|
+
int(result.metrics.get("token_input", 0) or 0)
|
|
4685
|
+
+ int(result.metrics.get("token_output", 0) or 0)
|
|
4686
|
+
) > 0
|
|
4687
|
+
has_meaningful_output = self.process_manager.has_meaningful_agent_output(result)
|
|
4688
|
+
# Timeout recovery requires stronger evidence: committed work + tokens.
|
|
4689
|
+
# Crash recovery (original): committed + (tokens OR meaningful output).
|
|
4690
|
+
sufficient_evidence = (
|
|
4691
|
+
(has_committed_changes and has_no_uncommitted and has_tokens and has_meaningful_output)
|
|
4692
|
+
if is_timeout_failure
|
|
4693
|
+
else (has_committed_changes and has_no_uncommitted and (has_tokens or has_meaningful_output))
|
|
4466
4694
|
)
|
|
4467
|
-
|
|
4468
|
-
|
|
4469
|
-
|
|
4695
|
+
if sufficient_evidence:
|
|
4696
|
+
_reason = "timed out but already committed changes" if is_timeout_failure else f"exited with code {result.exit_code}"
|
|
4697
|
+
logger.warning(
|
|
4698
|
+
"Task %s agent %s — recovering as success",
|
|
4699
|
+
task.task_id, _reason,
|
|
4700
|
+
)
|
|
4701
|
+
result.status = "success"
|
|
4702
|
+
result.error = ""
|
|
4703
|
+
result.metrics["recovered_from_exit_code"] = result.exit_code
|
|
4470
4704
|
|
|
4471
4705
|
# 4.5 Layer 2: Validation gate — check outputs before committing
|
|
4472
4706
|
if result.status == "success":
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|