forgexa-cli 1.8.7__tar.gz → 1.8.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: forgexa-cli
3
- Version: 1.8.7
3
+ Version: 1.8.8
4
4
  Summary: Forgexa CLI — command-line client and AI agent runtime for the Forgexa platform
5
5
  Author-email: Jason Sun <dev.winds@gmail.com>
6
6
  License: MIT
@@ -1,2 +1,2 @@
1
1
  """forgexa-cli — Forgexa command-line client."""
2
- __version__ = "1.8.7"
2
+ __version__ = "1.8.8"
@@ -392,7 +392,7 @@ except (ImportError, ModuleNotFoundError):
392
392
  # DAEMON_VERSION is the protocol/logic version of the daemon code.
393
393
  # Kept in sync with pyproject.toml version via bump-version.sh.
394
394
  # CLIENT_TYPE identifies which packaging/distribution this daemon runs in.
395
- DAEMON_VERSION = "1.8.7"
395
+ DAEMON_VERSION = "1.8.8"
396
396
 
397
397
 
398
398
  def _detect_client_type() -> str:
@@ -1753,15 +1753,34 @@ class WorkspaceManager:
1753
1753
  key_path_safe = key_path.replace("\\", "/") if sys.platform == "win32" else key_path
1754
1754
  # RC1 (Windows): os.chmod(S_IRUSR) does not set proper NTFS ACLs.
1755
1755
  # Windows OpenSSH rejects keys that aren't exclusively owner-readable
1756
- # ("UNPROTECTED PRIVATE KEY FILE"). StrictModes=no bypasses this.
1756
+ # ("UNPROTECTED PRIVATE KEY FILE"). Fix with icacls to set the ACLs
1757
+ # correctly. NOTE: StrictModes is an sshd_config option (server side);
1758
+ # passing -o StrictModes=no to the SSH client is invalid and causes:
1759
+ # "command-line: line 0: Bad configuration option: strictmodes"
1757
1760
  # RC2 (Windows): /dev/null doesn't exist on Windows native OpenSSH
1758
1761
  # (C:\Windows\System32\OpenSSH\ssh.exe). Use NUL instead.
1759
1762
  if sys.platform == "win32":
1760
1763
  _known_hosts_null = "NUL"
1761
- _strict_modes_opt = " -o StrictModes=no"
1764
+ try:
1765
+ import subprocess as _subp
1766
+ _username = (
1767
+ os.environ.get("USERNAME")
1768
+ or os.environ.get("USER")
1769
+ or ""
1770
+ )
1771
+ if _username:
1772
+ _subp.run(
1773
+ [
1774
+ "icacls", key_path,
1775
+ "/inheritance:r",
1776
+ "/grant:r", f"{_username}:(R)",
1777
+ ],
1778
+ capture_output=True, check=False, timeout=10,
1779
+ )
1780
+ except Exception:
1781
+ pass
1762
1782
  else:
1763
1783
  _known_hosts_null = "/dev/null"
1764
- _strict_modes_opt = ""
1765
1784
  env = {
1766
1785
  **os.environ,
1767
1786
  "GIT_SSH_COMMAND": (
@@ -1777,7 +1796,6 @@ class WorkspaceManager:
1777
1796
  f" -o ConnectTimeout=30"
1778
1797
  f" -o ServerAliveInterval=30"
1779
1798
  f" -o ServerAliveCountMax=3"
1780
- f"{_strict_modes_opt}"
1781
1799
  ),
1782
1800
  }
1783
1801
  except Exception:
@@ -1865,6 +1883,92 @@ class WorkspaceManager:
1865
1883
  # ── Process Manager ──
1866
1884
 
1867
1885
 
1886
def _kill_proc(proc: asyncio.subprocess.Process) -> None:
    """Kill a subprocess and its entire process group (best effort).

    A plain ``proc.kill()`` only terminates the direct child; grandchildren
    (npm, yarn, ssh, git, etc.) spawned by the agent stay alive, keep pipes
    open, and exhaust system resources.  On POSIX the whole process group is
    sent SIGKILL via ``os.killpg``; on Windows ``taskkill /F /T`` terminates
    the process tree.

    The function is safe to call repeatedly for the same process:

    * If ``proc.returncode`` is already set the child has been reaped, so
      its PID may have been recycled by the OS.  Signalling it (or its
      group) could hit an unrelated process, therefore nothing is done.
    * The caller's own process group is never signalled, so a child that
      was not started in its own group/session cannot take the caller down
      with it; such a child is still killed directly via ``proc.kill()``.

    Args:
        proc: The asyncio subprocess to terminate.

    Returns:
        None.  All errors are swallowed — cleanup must never raise.
    """
    if proc.returncode is not None:
        # Already exited and reaped: the PID is no longer ours to signal.
        return
    try:
        if sys.platform != "win32":
            import signal as _signal
            try:
                pgid = os.getpgid(proc.pid)
                # Guard against suicide: only kill a group that is not ours.
                if pgid != os.getpgrp():
                    os.killpg(pgid, _signal.SIGKILL)
            except OSError:
                # ProcessLookupError / PermissionError are OSError subclasses.
                pass
        else:
            import subprocess as _subprocess
            _subprocess.run(
                ["taskkill", "/F", "/T", "/PID", str(proc.pid)],
                capture_output=True,
                check=False,
                # Bounded wait so a wedged taskkill cannot stall the caller.
                timeout=10,
            )
    except Exception:
        pass
    finally:
        # Always fall back to killing the direct child.
        try:
            proc.kill()
        except Exception:
            pass
1914
+
1915
+
1916
class _IdleTimeoutError(asyncio.TimeoutError):
    """Timeout caused by an agent process staying silent on stdout.

    Because this derives from ``asyncio.TimeoutError``, any handler written
    as ``except asyncio.TimeoutError`` also receives it.  Code that needs to
    tell an idle kill apart from an absolute wall-clock timeout can test
    ``isinstance(exc, _IdleTimeoutError)`` or read ``exc.idle_seconds``.

    Attributes:
        idle_seconds: How long the process had been idle when the error was
            raised, in seconds.
    """

    def __init__(self, idle_seconds: float) -> None:
        # Keep the raw float on the instance; the message shows it rounded.
        self.idle_seconds = idle_seconds
        message = f"idle:{idle_seconds:.0f}s"
        super().__init__(message)
1927
+
1928
+
1929
def _workspace_has_recent_activity(
    workspace_path: "Path", since_monotonic: float, max_depth: int = 4
) -> bool:
    """Return True if anything under workspace_path changed after since_monotonic.

    The monotonic timestamp is converted to a wall-clock value so it can be
    compared against file mtimes.  The directory tree is walked with
    ``os.scandir`` and the walk stops at the first entry (file or directory)
    whose mtime is newer than the cut-off.

    This is a secondary liveness signal: an agent that prints nothing but
    is installing packages, compiling, or running tests still modifies
    files in the workspace, so a True result means "silent but busy".

    Limitations:
        * Entries named ``.git`` are skipped, so activity confined to the
          repository metadata is NOT detected.
        * Only the root plus ``max_depth`` nested levels are inspected.
        * The root directory's own mtime is not checked, only its entries.

    Args:
        workspace_path: Root directory to inspect.
        since_monotonic: ``time.monotonic()`` reading marking the cut-off.
        max_depth: Number of directory levels below the root to descend.

    Returns:
        True when a newer entry is found; False otherwise, including when
        the path is missing or unreadable — this check never raises.
    """
    since_wall = time.time() - (time.monotonic() - since_monotonic)

    def _scan(path: str, depth: int) -> bool:
        if depth > max_depth:
            return False
        try:
            # The context manager closes the directory handle immediately,
            # even on an early return; a bare iterator would keep it open
            # until garbage collection.
            with os.scandir(path) as entries:
                for entry in entries:
                    if entry.name == ".git":
                        continue  # skip version-control metadata
                    try:
                        if entry.stat(follow_symlinks=False).st_mtime > since_wall:
                            return True
                        if (
                            depth < max_depth
                            and entry.is_dir(follow_symlinks=False)
                            and _scan(entry.path, depth + 1)
                        ):
                            return True
                    except OSError:
                        # Entry vanished or is unreadable; keep scanning.
                        pass
        except OSError:
            # Directory missing or unreadable (covers PermissionError).
            pass
        return False

    try:
        return _scan(str(workspace_path), 0)
    except Exception:
        return False  # never let a filesystem check crash the agent run
1970
+
1971
+
1868
1972
  class ProcessManager:
1869
1973
  """Manages Agent CLI subprocess lifecycle."""
1870
1974
 
@@ -2312,6 +2416,7 @@ class ProcessManager:
2312
2416
  timeout: int,
2313
2417
  task_id: str,
2314
2418
  on_chunk: Any,
2419
+ workspace_path: "Path | None" = None,
2315
2420
  ) -> tuple[str, str, int]:
2316
2421
  """Stream stdout line-by-line from a subprocess, flushing to on_chunk.
2317
2422
 
@@ -2323,7 +2428,22 @@ class ProcessManager:
2323
2428
  deadlock when the process fills the stderr buffer.
2324
2429
  - on_chunk(lines) is called with each decoded line so the caller can
2325
2430
  forward to the progress reporter without waiting for completion.
2431
+ - Idle timeout: if the agent produces no stdout for AGENT_IDLE_TIMEOUT
2432
+ seconds the code checks for filesystem activity in workspace_path
2433
+ before deciding to kill. If files were recently modified the agent
2434
+ is doing silent work (npm install, compilation, test runs, etc.) and
2435
+ the idle timer is reset. Only when BOTH stdout AND the filesystem
2436
+ are idle does the process get killed. This eliminates false-positive
2437
+ kills at the idle boundary.
2438
+ - Absolute timeout (``timeout`` param): hard ceiling for zombie-process
2439
+ prevention. Always kills at this boundary (no extension), but logs
2440
+ filesystem activity status for post-mortem observability.
2326
2441
  """
2442
+ idle_timeout: int = getattr(settings, "AGENT_IDLE_TIMEOUT", 600)
2443
+ _start_time = time.monotonic()
2444
+ # Mutable state shared between nested coroutines (list avoids nonlocal)
2445
+ _last_activity_at: list[float] = [time.monotonic()]
2446
+
2327
2447
  # Write prompt and close stdin so the agent knows input is done.
2328
2448
  if stdin_input and proc.stdin:
2329
2449
  try:
@@ -2345,13 +2465,76 @@ class ProcessManager:
2345
2465
  if not proc.stdout:
2346
2466
  return
2347
2467
  while True:
2468
+ # ── Timeout checks ────────────────────────────────────────────
2469
+ now = time.monotonic()
2470
+ elapsed_abs = now - _start_time
2471
+ if elapsed_abs >= timeout:
2472
+ # Absolute ceiling reached — hard zombie-process prevention.
2473
+ # Log filesystem status for observability but always kill;
2474
+ # never extend the absolute boundary.
2475
+ _fs_active = workspace_path and _workspace_has_recent_activity(
2476
+ workspace_path, _last_activity_at[0]
2477
+ )
2478
+ logger.warning(
2479
+ "Task %s absolute timeout %.0fs reached — killing "
2480
+ "(workspace filesystem %s)",
2481
+ task_id, elapsed_abs,
2482
+ "was active" if _fs_active else "was idle",
2483
+ )
2484
+ _kill_proc(proc)
2485
+ raise asyncio.TimeoutError(
2486
+ f"Timed out after {timeout}s (absolute limit)"
2487
+ )
2488
+
2489
+ idle_elapsed = now - _last_activity_at[0]
2490
+ if idle_elapsed >= idle_timeout:
2491
+ # Before killing, check if the agent is doing silent
2492
+ # filesystem work (npm install, compilation, test runs, git
2493
+ # operations, etc.). These produce no stdout but DO modify
2494
+ # files — killing at the idle boundary would be a false
2495
+ # positive. Only kill when BOTH signals agree: no stdout
2496
+ # AND no filesystem activity.
2497
+ if workspace_path and _workspace_has_recent_activity(
2498
+ workspace_path, _last_activity_at[0]
2499
+ ):
2500
+ # Files modified since last stdout → agent is working
2501
+ # silently. Reset idle timer and continue.
2502
+ _last_activity_at[0] = time.monotonic()
2503
+ logger.info(
2504
+ "Task %s: stdout idle %.0fs but workspace files "
2505
+ "modified — resetting idle timer (agent working "
2506
+ "silently)",
2507
+ task_id, idle_elapsed,
2508
+ )
2509
+ else:
2510
+ # No stdout AND no filesystem activity → truly hung.
2511
+ logger.warning(
2512
+ "Task %s agent idle %.0fs — no stdout, no "
2513
+ "filesystem activity; killing hung process",
2514
+ task_id, idle_elapsed,
2515
+ )
2516
+ _kill_proc(proc)
2517
+ raise _IdleTimeoutError(idle_elapsed)
2518
+
2519
+ # Check interval capped at 30s so the loop stays responsive
2520
+ # even when both timeouts are far away.
2521
+ check_interval = min(
2522
+ idle_timeout - idle_elapsed + 0.5, # until idle fires
2523
+ timeout - elapsed_abs + 0.5, # until absolute fires
2524
+ 30.0,
2525
+ )
2526
+
2527
+ # ── Read one line with a bounded wait ─────────────────────────
2348
2528
  try:
2349
- line_bytes = await proc.stdout.readline()
2529
+ line_bytes = await asyncio.wait_for(
2530
+ proc.stdout.readline(), timeout=check_interval
2531
+ )
2532
+ except asyncio.TimeoutError:
2533
+ # readline timed out within check_interval — no new output
2534
+ # yet. Loop back to re-evaluate idle/absolute conditions.
2535
+ continue
2350
2536
  except (ValueError, asyncio.LimitOverrunError, Exception) as exc:
2351
- # Line exceeded stream buffer limit (LimitOverrunError
2352
- # converted to ValueError by readline(), but catch broadly
2353
- # to handle edge cases in different Python versions).
2354
- # Fall back to reading remaining data in bulk.
2537
+ # Line exceeded stream buffer limit — drain remaining bulk.
2355
2538
  logger.warning(
2356
2539
  "Stream read error for task %s (%s: %s), draining remaining output",
2357
2540
  task_id, type(exc).__name__, exc,
@@ -2370,8 +2553,12 @@ class ProcessManager:
2370
2553
  except Exception:
2371
2554
  pass
2372
2555
  break
2556
+
2373
2557
  if not line_bytes:
2374
- break
2558
+ break # EOF — process exited normally
2559
+
2560
+ # ── New output received — reset idle timer ────────────────────
2561
+ _last_activity_at[0] = time.monotonic()
2375
2562
  line = line_bytes.decode(errors="replace").rstrip("\n")
2376
2563
  stdout_lines.append(line)
2377
2564
  if on_chunk:
@@ -2381,35 +2568,17 @@ class ProcessManager:
2381
2568
  pass # never let on_chunk crash the agent run
2382
2569
 
2383
2570
  try:
2571
+ # Outer wait_for uses timeout + idle_timeout + 60s as a generous safety net.
2572
+ # In practice _read_stdout handles both idle and absolute killing
2573
+ # before this fires.
2384
2574
  await asyncio.wait_for(
2385
2575
  asyncio.gather(_read_stdout(), _read_stderr()),
2386
- timeout=timeout,
2576
+ timeout=timeout + idle_timeout + 60,
2387
2577
  )
2388
- except asyncio.TimeoutError:
2389
- # Kill the entire process group so that child processes (npm, yarn,
2390
- # ssh, git, etc.) spawned by the agent are also terminated. A plain
2391
- # proc.kill() only kills the direct subprocess; any grandchildren
2392
- # become orphaned, keep pipes open, and can exhaust system resources.
2393
- try:
2394
- if sys.platform != "win32":
2395
- import signal as _signal
2396
- try:
2397
- os.killpg(os.getpgid(proc.pid), _signal.SIGKILL)
2398
- except (ProcessLookupError, PermissionError, OSError):
2399
- pass
2400
- else:
2401
- import subprocess as _subprocess
2402
- _subprocess.run(
2403
- ["taskkill", "/F", "/T", "/PID", str(proc.pid)],
2404
- capture_output=True,
2405
- )
2406
- except Exception:
2407
- pass
2408
- finally:
2409
- try:
2410
- proc.kill()
2411
- except Exception:
2412
- pass
2578
+ except asyncio.TimeoutError as _exc:
2579
+ # This branch fires if the outer safety net triggers (extremely
2580
+ # rare) or if an idle/absolute timeout raised in _read_stdout propagates.
2581
+ _kill_proc(proc)
2413
2582
  # Drain any remaining output after kill
2414
2583
  try:
2415
2584
  remaining, _ = await asyncio.wait_for(proc.communicate(), timeout=5)
@@ -2419,7 +2588,7 @@ class ProcessManager:
2419
2588
  stdout_lines.append(line)
2420
2589
  except Exception:
2421
2590
  pass
2422
- raise # re-raise so callers can set result.error
2591
+ raise # re-raise (_IdleTimeoutError preserves subclass type)
2423
2592
 
2424
2593
  await proc.wait()
2425
2594
  stdout = "\n".join(stdout_lines)
@@ -2480,7 +2649,8 @@ class ProcessManager:
2480
2649
  )
2481
2650
  self.active_processes[task_id] = proc
2482
2651
  stdout, stderr, returncode = await self._stream_process(
2483
- proc, prompt.encode(), timeout, task_id, on_chunk
2652
+ proc, prompt.encode(), timeout, task_id, on_chunk,
2653
+ workspace_path=cwd,
2484
2654
  )
2485
2655
 
2486
2656
  # Parse Claude JSON output for metrics
@@ -2521,13 +2691,13 @@ class ProcessManager:
2521
2691
  error=f"Claude exited with code {returncode}: {stderr[-500:]}",
2522
2692
  metrics=metrics,
2523
2693
  )
2524
- except asyncio.TimeoutError:
2525
- if task_id in self.active_processes:
2526
- self.active_processes[task_id].kill()
2527
- return TaskResult(
2528
- status="failed", exit_code=-1, stdout="", stderr="",
2529
- error=f"Timed out after {timeout}s",
2530
- )
2694
+ except asyncio.TimeoutError as exc:
2695
+ _kill_proc(self.active_processes.pop(task_id, None) or proc)
2696
+ _err = (
2697
+ f"Agent idle for {exc.idle_seconds:.0f}s without output — process terminated. "
2698
+ "Task may require more context decomposition or a different agent."
2699
+ ) if isinstance(exc, _IdleTimeoutError) else f"Timed out after {timeout}s (absolute limit)"
2700
+ return TaskResult(status="failed", exit_code=-1, stdout="", stderr="", error=_err)
2531
2701
  except Exception as exc:
2532
2702
  logger.exception("Claude stream error for task %s", task_id)
2533
2703
  if task_id in self.active_processes:
@@ -2697,7 +2867,8 @@ class ProcessManager:
2697
2867
  )
2698
2868
  self.active_processes[task_id] = proc
2699
2869
  stdout, stderr, returncode = await self._stream_process(
2700
- proc, None, timeout, task_id, on_chunk
2870
+ proc, None, timeout, task_id, on_chunk,
2871
+ workspace_path=cwd,
2701
2872
  )
2702
2873
 
2703
2874
  # Parse copilot JSONL output for metrics
@@ -2725,13 +2896,13 @@ class ProcessManager:
2725
2896
  error=f"Copilot exited with code {effective_rc}: {stderr[-500:]}",
2726
2897
  metrics=metrics,
2727
2898
  )
2728
- except asyncio.TimeoutError:
2729
- if task_id in self.active_processes:
2730
- self.active_processes[task_id].kill()
2731
- return TaskResult(
2732
- status="failed", exit_code=-1, stdout="", stderr="",
2733
- error=f"Timed out after {timeout}s",
2734
- )
2899
+ except asyncio.TimeoutError as exc:
2900
+ _kill_proc(self.active_processes.pop(task_id, None) or proc)
2901
+ _err = (
2902
+ f"Agent idle for {exc.idle_seconds:.0f}s without output — process terminated. "
2903
+ "Task may require more context decomposition or a different agent."
2904
+ ) if isinstance(exc, _IdleTimeoutError) else f"Timed out after {timeout}s (absolute limit)"
2905
+ return TaskResult(status="failed", exit_code=-1, stdout="", stderr="", error=_err)
2735
2906
  except Exception as exc:
2736
2907
  logger.exception("Copilot stream error for task %s", task_id)
2737
2908
  if task_id in self.active_processes:
@@ -2772,7 +2943,8 @@ class ProcessManager:
2772
2943
  self.active_processes[task_id] = proc
2773
2944
  stdin_bytes = stdin_input.encode() if stdin_input else None
2774
2945
  stdout, stderr, returncode = await self._stream_process(
2775
- proc, stdin_bytes, timeout, task_id, on_chunk
2946
+ proc, stdin_bytes, timeout, task_id, on_chunk,
2947
+ workspace_path=cwd,
2776
2948
  )
2777
2949
  status = "success" if returncode == 0 else "failed"
2778
2950
  return TaskResult(
@@ -2782,33 +2954,13 @@ class ProcessManager:
2782
2954
  stderr=stderr[-10000:],
2783
2955
  error="" if status == "success" else f"Exited with code {returncode}",
2784
2956
  )
2785
- except asyncio.TimeoutError:
2786
- proc = self.active_processes.pop(task_id, None)
2787
- if proc:
2788
- try:
2789
- if sys.platform != "win32":
2790
- import signal as _signal
2791
- try:
2792
- os.killpg(os.getpgid(proc.pid), _signal.SIGKILL)
2793
- except (ProcessLookupError, PermissionError, OSError):
2794
- pass
2795
- else:
2796
- import subprocess as _subprocess
2797
- _subprocess.run(
2798
- ["taskkill", "/F", "/T", "/PID", str(proc.pid)],
2799
- capture_output=True,
2800
- )
2801
- except Exception:
2802
- pass
2803
- finally:
2804
- try:
2805
- proc.kill()
2806
- except Exception:
2807
- pass
2808
- return TaskResult(
2809
- status="failed", exit_code=-1, stdout="", stderr="",
2810
- error=f"Timed out after {timeout}s",
2811
- )
2957
+ except asyncio.TimeoutError as exc:
2958
+ _kill_proc(self.active_processes.pop(task_id, None) or proc)
2959
+ _err = (
2960
+ f"Agent idle for {exc.idle_seconds:.0f}s without output — process terminated. "
2961
+ "Task may require more context decomposition or a different agent."
2962
+ ) if isinstance(exc, _IdleTimeoutError) else f"Timed out after {timeout}s (absolute limit)"
2963
+ return TaskResult(status="failed", exit_code=-1, stdout="", stderr="", error=_err)
2812
2964
  except Exception as exc:
2813
2965
  logger.exception("CLI stream error for task %s", task_id)
2814
2966
  if task_id in self.active_processes:
@@ -4250,6 +4402,65 @@ class RuntimeDaemon:
4250
4402
  )
4251
4403
  logger.info("Workspace ready: %s", workspace_path)
4252
4404
 
4405
+ # 2.5 Wipe the analysis output directory on fresh analysis so the new
4406
+ # agent run starts from a completely clean slate. This covers:
4407
+ # • Type change: removes old-type files (e.g. PRD.md/SDD.md) so they
4408
+ # don't coexist with the new type's files (e.g. diagnosis.md).
4409
+ # • Same-type fresh re-analysis: removes extra files the agent may
4410
+ # have written that fall outside the expected type profile.
4411
+ # Using a whole-directory wipe is more reliable than the old
4412
+ # cleanup_stale_docs approach (which only deleted known-profile files).
4413
+ if task.node_type == "analysis" and (
4414
+ task.input_data.get("wipe_analysis_dir")
4415
+ # Backwards-compat: older server versions send cleanup_stale_docs
4416
+ or task.input_data.get("cleanup_stale_docs")
4417
+ # Also wipe if analysis_mode is explicitly "fresh" (belt+suspenders)
4418
+ or task.input_data.get("analysis_mode") == "fresh"
4419
+ ):
4420
+ output_dir_raw = (
4421
+ task.input_data.get("analysis_output_dir")
4422
+ or task.input_data.get("output_dir")
4423
+ or ""
4424
+ )
4425
+ output_dir_norm = str(output_dir_raw).replace("\\", "/").lstrip("./").rstrip("/")
4426
+ if output_dir_norm:
4427
+ dir_to_wipe = workspace_path / output_dir_norm
4428
+ if dir_to_wipe.is_dir():
4429
+ existing_files = [f for f in dir_to_wipe.iterdir() if f.is_file()]
4430
+ if existing_files:
4431
+ try:
4432
+ # Stage all deletions with git rm
4433
+ await self._git(
4434
+ "rm", "-r", "--cached", "--ignore-unmatch",
4435
+ output_dir_norm,
4436
+ cwd=workspace_path,
4437
+ )
4438
+ # Remove physical files
4439
+ shutil.rmtree(str(dir_to_wipe), ignore_errors=True)
4440
+ # Commit the wipe so the branch diff is clean
4441
+ await self._git(
4442
+ "-c", "user.name=Forgexa Agent",
4443
+ "-c", "user.email=agent@forgexa.net",
4444
+ "commit", "-m",
4445
+ f"cleanup: wipe analysis docs in {output_dir_norm} before fresh re-analysis",
4446
+ cwd=workspace_path,
4447
+ )
4448
+ logger.info(
4449
+ "Wiped %d analysis doc(s) from %s for task %s (fresh analysis)",
4450
+ len(existing_files), output_dir_norm, task.task_id,
4451
+ )
4452
+ except Exception:
4453
+ logger.warning(
4454
+ "Could not wipe analysis dir %s for task %s "
4455
+ "(proceeding anyway — agent will overwrite)",
4456
+ output_dir_norm, task.task_id, exc_info=True,
4457
+ )
4458
+ else:
4459
+ logger.debug(
4460
+ "Analysis dir %s is already empty for task %s",
4461
+ output_dir_norm, task.task_id,
4462
+ )
4463
+
4253
4464
  # 3. Run agent with real-time output streaming + periodic progress heartbeat
4254
4465
  await reporter.report_progress(task.task_id, 10, "running_agent")
4255
4466
 
@@ -4447,26 +4658,49 @@ class RuntimeDaemon:
4447
4658
  f"(node_type={task.node_type}, agent={agent.agent_id})"
4448
4659
  )
4449
4660
 
4450
- # 4.1 Recovery: agent exited non-zero but already committed code
4451
- # (e.g. OpenCode EBADF crash on exit after successful work)
4452
- if result.status == "failed" and result.exit_code not in (None, -1):
4453
- committed_git = await self.process_manager._collect_git_info_vs_parent(workspace_path)
4454
- has_committed_changes = bool(committed_git.get("files_changed"))
4455
- has_no_uncommitted = not pre_commit_git.get("files_changed")
4456
- has_tokens = (
4457
- int(result.metrics.get("token_input", 0) or 0)
4458
- + int(result.metrics.get("token_output", 0) or 0)
4459
- ) > 0
4460
- has_meaningful_output = self.process_manager.has_meaningful_agent_output(result)
4461
- if has_committed_changes and has_no_uncommitted and (has_tokens or has_meaningful_output):
4462
- logger.warning(
4463
- "Task %s agent exited with code %s but has committed changes — "
4464
- "recovering as success (agent likely crashed during cleanup)",
4465
- task.task_id, result.exit_code,
4661
+ # 4.1 Recovery: agent exited non-zero but already committed code.
4662
+ # Covers two scenarios:
4663
+ # A. Process crash (e.g. OpenCode EBADF on exit after successful work):
4664
+ # exit_code is a real non-negative/non-(-1) value.
4665
+ # B. Timeout with committed work: agent finished its task and committed
4666
+ # before the idle/absolute timeout fired. exit_code=-1 but the commits
4667
+ # are real — don't discard them.
4668
+ if result.status == "failed":
4669
+ _error_lower = (result.error or "").lower()
4670
+ is_timeout_failure = (
4671
+ "idle for" in _error_lower
4672
+ or "timed out" in _error_lower
4673
+ or "absolute limit" in _error_lower
4674
+ )
4675
+ can_attempt_recovery = (
4676
+ is_timeout_failure # timeout: also allow exit_code=-1
4677
+ or result.exit_code not in (None, -1) # crash: original guard
4678
+ )
4679
+ if can_attempt_recovery:
4680
+ committed_git = await self.process_manager._collect_git_info_vs_parent(workspace_path)
4681
+ has_committed_changes = bool(committed_git.get("files_changed"))
4682
+ has_no_uncommitted = not pre_commit_git.get("files_changed")
4683
+ has_tokens = (
4684
+ int(result.metrics.get("token_input", 0) or 0)
4685
+ + int(result.metrics.get("token_output", 0) or 0)
4686
+ ) > 0
4687
+ has_meaningful_output = self.process_manager.has_meaningful_agent_output(result)
4688
+ # Timeout recovery requires stronger evidence: committed work + tokens + meaningful output.
4689
+ # Crash recovery (original): committed + (tokens OR meaningful output).
4690
+ sufficient_evidence = (
4691
+ (has_committed_changes and has_no_uncommitted and has_tokens and has_meaningful_output)
4692
+ if is_timeout_failure
4693
+ else (has_committed_changes and has_no_uncommitted and (has_tokens or has_meaningful_output))
4466
4694
  )
4467
- result.status = "success"
4468
- result.error = ""
4469
- result.metrics["recovered_from_exit_code"] = result.exit_code
4695
+ if sufficient_evidence:
4696
+ _reason = "timed out but already committed changes" if is_timeout_failure else f"exited with code {result.exit_code}"
4697
+ logger.warning(
4698
+ "Task %s agent %s — recovering as success",
4699
+ task.task_id, _reason,
4700
+ )
4701
+ result.status = "success"
4702
+ result.error = ""
4703
+ result.metrics["recovered_from_exit_code"] = result.exit_code
4470
4704
 
4471
4705
  # 4.5 Layer 2: Validation gate — check outputs before committing
4472
4706
  if result.status == "success":
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: forgexa-cli
3
- Version: 1.8.7
3
+ Version: 1.8.8
4
4
  Summary: Forgexa CLI — command-line client and AI agent runtime for the Forgexa platform
5
5
  Author-email: Jason Sun <dev.winds@gmail.com>
6
6
  License: MIT
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "forgexa-cli"
3
- version = "1.8.7"
3
+ version = "1.8.8"
4
4
  description = "Forgexa CLI — command-line client and AI agent runtime for the Forgexa platform"
5
5
  requires-python = ">=3.9"
6
6
  license = { text = "MIT" }
File without changes
File without changes