forgexa-cli 1.8.6__tar.gz → 1.8.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: forgexa-cli
3
- Version: 1.8.6
3
+ Version: 1.8.8
4
4
  Summary: Forgexa CLI — command-line client and AI agent runtime for the Forgexa platform
5
5
  Author-email: Jason Sun <dev.winds@gmail.com>
6
6
  License: MIT
@@ -1,2 +1,2 @@
1
1
  """forgexa-cli — Forgexa command-line client."""
2
- __version__ = "1.8.6"
2
+ __version__ = "1.8.8"
@@ -392,7 +392,7 @@ except (ImportError, ModuleNotFoundError):
392
392
  # DAEMON_VERSION is the protocol/logic version of the daemon code.
393
393
  # Kept in sync with pyproject.toml version via bump-version.sh.
394
394
  # CLIENT_TYPE identifies which packaging/distribution this daemon runs in.
395
- DAEMON_VERSION = "1.8.6"
395
+ DAEMON_VERSION = "1.8.8"
396
396
 
397
397
 
398
398
  def _detect_client_type() -> str:
@@ -648,6 +648,10 @@ _ANALYSIS_OUTPUTS_BY_TYPE: dict[str, list[str]] = {
648
648
  "documentation": ["outline.md", "analysis.json"],
649
649
  "improvement": ["improvement-spec.md", "TASKS.md", "analysis.json", "test-intent.json"],
650
650
  "task": ["task-plan.md", "analysis.json"],
651
+ # Research / feasibility study — no PRD/SDD/TASKS, only a research plan and metadata
652
+ "spike": ["research.md", "analysis.json"],
653
+ # Customer support Q&A — lightweight answer doc + metadata only
654
+ "faq": ["faq-answer.md", "analysis.json"],
651
655
  }
652
656
 
653
657
 
@@ -1123,17 +1127,34 @@ class WorkspaceManager:
1123
1127
  )
1124
1128
 
1125
1129
  if repo_url:
1130
+ # For non-fresh (refine/continuation) nodes, expand expect_branch to
1131
+ # cover any node that is part of a real requirement workflow AND is not
1132
+ # the initial analysis. This ensures a hard error (and workspace
1133
+ # re-clone) when the branch sync fails, rather than silently proceeding
1134
+ # with a stale workspace that will cause a non-fast-forward push later.
1135
+ expect_branch = bool(task.analysis_branch) or (
1136
+ bool(task.requirement_key) and not is_fresh_start and task.node_type != "analysis"
1137
+ )
1126
1138
  ws_path = await self._create_worktree(
1127
1139
  project_dir, repo_url, default_branch, workspace_key, branch_name,
1128
1140
  fresh_start=is_fresh_start,
1129
1141
  project_key=project_key,
1130
- expect_branch=bool(task.analysis_branch),
1142
+ expect_branch=expect_branch,
1131
1143
  )
1132
- # Refine mode: ensure we're on the analysis branch with its history
1133
- # (not reset to default_branch)
1134
- if analysis_mode == "refine" and task.node_type == "analysis":
1144
+ # After workspace creation, perform a final branch-specific fetch + reset
1145
+ # to ensure the working tree is at the absolute latest remote state.
1146
+ # This is critical in two scenarios:
1147
+ # 1. Analysis refine mode: must be on the analysis branch history.
1148
+ # 2. All continuation nodes (design/coding/testing): another runtime
1149
+ # may have pushed commits while this runtime's agent was executing.
1150
+ # A final sync here keeps the workspace current so the agent works
1151
+ # on the latest codebase and avoids non-fast-forward push failures.
1152
+ if not is_fresh_start:
1135
1153
  try:
1136
- await self._git("fetch", "origin", cwd=ws_path, project_key=project_key)
1154
+ await self._git(
1155
+ "fetch", "origin", branch_name,
1156
+ cwd=ws_path, project_key=project_key,
1157
+ )
1137
1158
  except RuntimeError:
1138
1159
  pass
1139
1160
  try:
@@ -1141,15 +1162,28 @@ class WorkspaceManager:
1141
1162
  await self._git("checkout", branch_name, cwd=ws_path)
1142
1163
  except RuntimeError:
1143
1164
  pass
1144
- # Pull latest from remote branch if it exists (preserves prior commits)
1165
+ # Use --ff-only to keep only fast-forward changes; if the branch has
1166
+ # diverged (force-pushed by prior phase), reset --hard is used below.
1167
+ pulled = False
1145
1168
  try:
1146
1169
  await self._git(
1147
1170
  "pull", "--ff-only", "origin", branch_name,
1148
1171
  cwd=ws_path, project_key=project_key,
1149
1172
  )
1173
+ pulled = True
1150
1174
  except RuntimeError:
1151
- # Remote branch might not exist yet or has diverged; that's OK
1152
1175
  pass
1176
+ if not pulled:
1177
+ # ff-only failed (diverged or remote not yet created) — try
1178
+ # reset --hard to force-sync to whatever the remote has.
1179
+ try:
1180
+ await self._git(
1181
+ "reset", "--hard", f"origin/{branch_name}",
1182
+ cwd=ws_path,
1183
+ )
1184
+ except RuntimeError:
1185
+ # Remote branch may not exist yet (first analysis on fresh repo)
1186
+ pass
1153
1187
  return ws_path
1154
1188
  else:
1155
1189
  # No repo — create a directory with git init for change tracking
@@ -1292,18 +1326,29 @@ class WorkspaceManager:
1292
1326
  # only fetches the default branch. Explicitly fetch the
1293
1327
  # feature branch with a full refspec so that
1294
1328
  # origin/{branch_name} is available for checkout/reset.
1329
+ _last_sync_err: str = ""
1295
1330
  try:
1296
1331
  await self._git("fetch", "origin", cwd=ws_path, project_key=project_key)
1297
- except RuntimeError:
1298
- pass
1332
+ except RuntimeError as _pre_fe:
1333
+ logger.warning(
1334
+ "fetch origin failed for worktree %s: %s "
1335
+ "(likely auth/SSH issue — will retry in sync loop)",
1336
+ ws_path, _pre_fe,
1337
+ )
1338
+ _last_sync_err = str(_pre_fe)[:300]
1299
1339
  try:
1300
1340
  await self._git(
1301
1341
  "fetch", "origin",
1302
1342
  f"{branch_name}:refs/remotes/origin/{branch_name}",
1303
1343
  cwd=ws_path, project_key=project_key,
1304
1344
  )
1305
- except RuntimeError:
1306
- pass # Branch may not exist on remote yet
1345
+ except RuntimeError as _pre_fe2:
1346
+ logger.warning(
1347
+ "fetch branch %s failed for worktree %s: %s "
1348
+ "(likely auth/SSH issue — will retry in sync loop)",
1349
+ branch_name, ws_path, _pre_fe2,
1350
+ )
1351
+ _last_sync_err = str(_pre_fe2)[:300]
1307
1352
 
1308
1353
  if fresh_start:
1309
1354
  # Safety check: if the branch already exists on remote with
@@ -1392,8 +1437,12 @@ class WorkspaceManager:
1392
1437
  cwd=ws_path,
1393
1438
  project_key=project_key,
1394
1439
  )
1395
- except RuntimeError:
1396
- pass
1440
+ except RuntimeError as _sf:
1441
+ logger.warning(
1442
+ "Re-fetch %s failed (attempt %d): %s",
1443
+ branch_name, _sync_attempt + 1, _sf,
1444
+ )
1445
+ _last_sync_err = str(_sf)[:300]
1397
1446
  continue
1398
1447
  else:
1399
1448
  logger.warning("Failed to checkout %s after retries: %s", branch_name, exc)
@@ -1419,8 +1468,12 @@ class WorkspaceManager:
1419
1468
  cwd=ws_path,
1420
1469
  project_key=project_key,
1421
1470
  )
1422
- except RuntimeError:
1423
- pass
1471
+ except RuntimeError as _sf2:
1472
+ logger.warning(
1473
+ "Re-fetch %s failed (attempt %d): %s",
1474
+ branch_name, _sync_attempt + 1, _sf2,
1475
+ )
1476
+ _last_sync_err = str(_sf2)[:300]
1424
1477
  else:
1425
1478
  logger.warning(
1426
1479
  "Could not reset to origin/%s after retries: %s — "
@@ -1448,10 +1501,28 @@ class WorkspaceManager:
1448
1501
  f"Stale local clone discarded. "
1449
1502
  f"The task will be retried with a fresh clone."
1450
1503
  )
1504
+ # Destroy the stale worktree before raising so the
1505
+ # next retry can re-create it fresh from origin.
1506
+ # Without this, every retry hits the same broken state.
1507
+ try:
1508
+ await self._remove_broken_worktree(
1509
+ main_repo, ws_path, workspace_key
1510
+ )
1511
+ logger.info(
1512
+ "Removed stale worktree %s — retry will re-clone from origin",
1513
+ ws_path,
1514
+ )
1515
+ except Exception as _rm_exc:
1516
+ logger.warning("Could not remove stale worktree %s: %s", ws_path, _rm_exc)
1517
+ _err_detail = (
1518
+ f"Git error: {_last_sync_err}" if _last_sync_err
1519
+ else "fetch timed out or credentials missing/invalid"
1520
+ )
1451
1521
  raise RuntimeError(
1452
- f"Failed to sync branch '{branch_name}' from remote after 3 attempts. "
1522
+ f"Failed to sync branch '{branch_name}' from remote after 3 attempts "
1523
+ f"({_err_detail}). "
1453
1524
  f"The branch should exist (pushed by prior analysis/design phase). "
1454
- f"This task will be retried by the orchestrator."
1525
+ f"Stale local workspace discarded — this task will be retried by the orchestrator."
1455
1526
  )
1456
1527
  else:
1457
1528
  logger.warning(
@@ -1680,12 +1751,42 @@ class WorkspaceManager:
1680
1751
  # interprets backslashes as escape sequences, corrupting the
1681
1752
  # path (e.g. C:\Users → C:Users).
1682
1753
  key_path_safe = key_path.replace("\\", "/") if sys.platform == "win32" else key_path
1754
+ # RC1 (Windows): os.chmod(S_IRUSR) does not set proper NTFS ACLs.
1755
+ # Windows OpenSSH rejects keys that aren't exclusively owner-readable
1756
+ # ("UNPROTECTED PRIVATE KEY FILE"). Fix with icacls to set the ACLs
1757
+ # correctly. NOTE: StrictModes is an sshd_config option (server side);
1758
+ # passing -o StrictModes=no to the SSH client is invalid and causes:
1759
+ # "command-line: line 0: Bad configuration option: strictmodes"
1760
+ # RC2 (Windows): /dev/null doesn't exist on Windows native OpenSSH
1761
+ # (C:\Windows\System32\OpenSSH\ssh.exe). Use NUL instead.
1762
+ if sys.platform == "win32":
1763
+ _known_hosts_null = "NUL"
1764
+ try:
1765
+ import subprocess as _subp
1766
+ _username = (
1767
+ os.environ.get("USERNAME")
1768
+ or os.environ.get("USER")
1769
+ or ""
1770
+ )
1771
+ if _username:
1772
+ _subp.run(
1773
+ [
1774
+ "icacls", key_path,
1775
+ "/inheritance:r",
1776
+ "/grant:r", f"{_username}:(R)",
1777
+ ],
1778
+ capture_output=True, check=False, timeout=10,
1779
+ )
1780
+ except Exception:
1781
+ pass
1782
+ else:
1783
+ _known_hosts_null = "/dev/null"
1683
1784
  env = {
1684
1785
  **os.environ,
1685
1786
  "GIT_SSH_COMMAND": (
1686
1787
  f'ssh -i "{key_path_safe}"'
1687
1788
  f" -o StrictHostKeyChecking=accept-new"
1688
- f" -o UserKnownHostsFile=/dev/null"
1789
+ f" -o UserKnownHostsFile={_known_hosts_null}"
1689
1790
  f" -o IdentitiesOnly=yes"
1690
1791
  # Detect a stalled TCP connection (server accepts but
1691
1792
  # never sends the git protocol banner). After 30 s of
@@ -1782,6 +1883,92 @@ class WorkspaceManager:
1782
1883
  # ── Process Manager ──
1783
1884
 
1784
1885
 
1886
+ def _kill_proc(proc: asyncio.subprocess.Process) -> None:
1887
+ """Kill a subprocess and its entire process group.
1888
+
1889
+ A plain ``proc.kill()`` only terminates the direct child; grandchildren
1890
+ (npm, yarn, ssh, git, etc.) spawned by the agent stay alive, keep pipes
1891
+ open, and exhaust system resources. ``os.killpg`` sends SIGKILL to the
1892
+ whole process group, reliably cleaning up all descendants.
1893
+ """
1894
+ try:
1895
+ if sys.platform != "win32":
1896
+ import signal as _signal
1897
+ try:
1898
+ os.killpg(os.getpgid(proc.pid), _signal.SIGKILL)
1899
+ except (ProcessLookupError, PermissionError, OSError):
1900
+ pass
1901
+ else:
1902
+ import subprocess as _subprocess
1903
+ _subprocess.run(
1904
+ ["taskkill", "/F", "/T", "/PID", str(proc.pid)],
1905
+ capture_output=True,
1906
+ )
1907
+ except Exception:
1908
+ pass
1909
+ finally:
1910
+ try:
1911
+ proc.kill()
1912
+ except Exception:
1913
+ pass
1914
+
1915
+
1916
+ class _IdleTimeoutError(asyncio.TimeoutError):
1917
+ """Raised when an agent process produces no stdout for longer than AGENT_IDLE_TIMEOUT.
1918
+
1919
+ Subclasses asyncio.TimeoutError so existing ``except asyncio.TimeoutError``
1920
+ handlers catch it, but callers can distinguish it from an absolute wall-clock
1921
+ timeout via ``isinstance(exc, _IdleTimeoutError)`` or ``exc.idle_seconds``.
1922
+ """
1923
+
1924
+ def __init__(self, idle_seconds: float) -> None:
1925
+ super().__init__(f"idle:{idle_seconds:.0f}s")
1926
+ self.idle_seconds = idle_seconds
1927
+
1928
+
1929
+ def _workspace_has_recent_activity(
1930
+ workspace_path: "Path", since_monotonic: float, max_depth: int = 4
1931
+ ) -> bool:
1932
+ """Return True if any file under workspace_path was modified after since_monotonic.
1933
+
1934
+ Converts the monotonic timestamp to a wall-clock value for mtime comparison.
1935
+ Scans the directory tree (up to max_depth levels) with os.scandir, skips
1936
+ .git, and returns on the first matching file for speed.
1937
+
1938
+ This is the key secondary signal used by _stream_process to distinguish
1939
+ "agent is silent but legitimately working" (e.g. running npm install,
1940
+ compiling TypeScript, executing test suites) from "agent is truly hung".
1941
+ All silent-but-busy operations (package installs, compilation, test runs,
1942
+ git operations) write files to disk, so a positive result here means we
1943
+ must NOT kill the process even if stdout has been idle for a long time.
1944
+ """
1945
+ since_wall = time.time() - (time.monotonic() - since_monotonic)
1946
+
1947
+ def _scan(path: "Path", depth: int) -> bool:
1948
+ if depth > max_depth:
1949
+ return False
1950
+ try:
1951
+ for entry in os.scandir(str(path)):
1952
+ if entry.name == ".git":
1953
+ continue # skip version-control metadata
1954
+ try:
1955
+ if entry.stat(follow_symlinks=False).st_mtime > since_wall:
1956
+ return True
1957
+ if entry.is_dir(follow_symlinks=False) and depth < max_depth:
1958
+ if _scan(Path(entry.path), depth + 1):
1959
+ return True
1960
+ except (OSError, PermissionError):
1961
+ pass
1962
+ except (OSError, PermissionError):
1963
+ pass
1964
+ return False
1965
+
1966
+ try:
1967
+ return _scan(workspace_path, 0)
1968
+ except Exception:
1969
+ return False # never let a filesystem check crash the agent run
1970
+
1971
+
1785
1972
  class ProcessManager:
1786
1973
  """Manages Agent CLI subprocess lifecycle."""
1787
1974
 
@@ -2229,6 +2416,7 @@ class ProcessManager:
2229
2416
  timeout: int,
2230
2417
  task_id: str,
2231
2418
  on_chunk: Any,
2419
+ workspace_path: "Path | None" = None,
2232
2420
  ) -> tuple[str, str, int]:
2233
2421
  """Stream stdout line-by-line from a subprocess, flushing to on_chunk.
2234
2422
 
@@ -2240,7 +2428,22 @@ class ProcessManager:
2240
2428
  deadlock when the process fills the stderr buffer.
2241
2429
  - on_chunk(lines) is called with each decoded line so the caller can
2242
2430
  forward to the progress reporter without waiting for completion.
2431
+ - Idle timeout: if the agent produces no stdout for AGENT_IDLE_TIMEOUT
2432
+ seconds the code checks for filesystem activity in workspace_path
2433
+ before deciding to kill. If files were recently modified the agent
2434
+ is doing silent work (npm install, compilation, test runs, etc.) and
2435
+ the idle timer is reset. Only when BOTH stdout AND the filesystem
2436
+ are idle does the process get killed. This eliminates false-positive
2437
+ kills at the idle boundary.
2438
+ - Absolute timeout (``timeout`` param): hard ceiling for zombie-process
2439
+ prevention. Always kills at this boundary (no extension), but logs
2440
+ filesystem activity status for post-mortem observability.
2243
2441
  """
2442
+ idle_timeout: int = getattr(settings, "AGENT_IDLE_TIMEOUT", 600)
2443
+ _start_time = time.monotonic()
2444
+ # Mutable state shared between nested coroutines (list avoids nonlocal)
2445
+ _last_activity_at: list[float] = [time.monotonic()]
2446
+
2244
2447
  # Write prompt and close stdin so the agent knows input is done.
2245
2448
  if stdin_input and proc.stdin:
2246
2449
  try:
@@ -2262,13 +2465,76 @@ class ProcessManager:
2262
2465
  if not proc.stdout:
2263
2466
  return
2264
2467
  while True:
2468
+ # ── Timeout checks ────────────────────────────────────────────
2469
+ now = time.monotonic()
2470
+ elapsed_abs = now - _start_time
2471
+ if elapsed_abs >= timeout:
2472
+ # Absolute ceiling reached — hard zombie-process prevention.
2473
+ # Log filesystem status for observability but always kill;
2474
+ # never extend the absolute boundary.
2475
+ _fs_active = workspace_path and _workspace_has_recent_activity(
2476
+ workspace_path, _last_activity_at[0]
2477
+ )
2478
+ logger.warning(
2479
+ "Task %s absolute timeout %.0fs reached — killing "
2480
+ "(workspace filesystem %s)",
2481
+ task_id, elapsed_abs,
2482
+ "was active" if _fs_active else "was idle",
2483
+ )
2484
+ _kill_proc(proc)
2485
+ raise asyncio.TimeoutError(
2486
+ f"Timed out after {timeout}s (absolute limit)"
2487
+ )
2488
+
2489
+ idle_elapsed = now - _last_activity_at[0]
2490
+ if idle_elapsed >= idle_timeout:
2491
+ # Before killing, check if the agent is doing silent
2492
+ # filesystem work (npm install, compilation, test runs, git
2493
+ # operations, etc.). These produce no stdout but DO modify
2494
+ # files — killing at the idle boundary would be a false
2495
+ # positive. Only kill when BOTH signals agree: no stdout
2496
+ # AND no filesystem activity.
2497
+ if workspace_path and _workspace_has_recent_activity(
2498
+ workspace_path, _last_activity_at[0]
2499
+ ):
2500
+ # Files modified since last stdout → agent is working
2501
+ # silently. Reset idle timer and continue.
2502
+ _last_activity_at[0] = time.monotonic()
2503
+ logger.info(
2504
+ "Task %s: stdout idle %.0fs but workspace files "
2505
+ "modified — resetting idle timer (agent working "
2506
+ "silently)",
2507
+ task_id, idle_elapsed,
2508
+ )
2509
+ else:
2510
+ # No stdout AND no filesystem activity → truly hung.
2511
+ logger.warning(
2512
+ "Task %s agent idle %.0fs — no stdout, no "
2513
+ "filesystem activity; killing hung process",
2514
+ task_id, idle_elapsed,
2515
+ )
2516
+ _kill_proc(proc)
2517
+ raise _IdleTimeoutError(idle_elapsed)
2518
+
2519
+ # Check interval capped at 30s so the loop stays responsive
2520
+ # even when both timeouts are far away.
2521
+ check_interval = min(
2522
+ idle_timeout - idle_elapsed + 0.5, # until idle fires
2523
+ timeout - elapsed_abs + 0.5, # until absolute fires
2524
+ 30.0,
2525
+ )
2526
+
2527
+ # ── Read one line with a bounded wait ─────────────────────────
2265
2528
  try:
2266
- line_bytes = await proc.stdout.readline()
2529
+ line_bytes = await asyncio.wait_for(
2530
+ proc.stdout.readline(), timeout=check_interval
2531
+ )
2532
+ except asyncio.TimeoutError:
2533
+ # readline timed out within check_interval — no new output
2534
+ # yet. Loop back to re-evaluate idle/absolute conditions.
2535
+ continue
2267
2536
  except (ValueError, asyncio.LimitOverrunError, Exception) as exc:
2268
- # Line exceeded stream buffer limit (LimitOverrunError
2269
- # converted to ValueError by readline(), but catch broadly
2270
- # to handle edge cases in different Python versions).
2271
- # Fall back to reading remaining data in bulk.
2537
+ # Line exceeded stream buffer limit — drain remaining bulk.
2272
2538
  logger.warning(
2273
2539
  "Stream read error for task %s (%s: %s), draining remaining output",
2274
2540
  task_id, type(exc).__name__, exc,
@@ -2287,8 +2553,12 @@ class ProcessManager:
2287
2553
  except Exception:
2288
2554
  pass
2289
2555
  break
2556
+
2290
2557
  if not line_bytes:
2291
- break
2558
+ break # EOF — process exited normally
2559
+
2560
+ # ── New output received — reset idle timer ────────────────────
2561
+ _last_activity_at[0] = time.monotonic()
2292
2562
  line = line_bytes.decode(errors="replace").rstrip("\n")
2293
2563
  stdout_lines.append(line)
2294
2564
  if on_chunk:
@@ -2298,35 +2568,17 @@ class ProcessManager:
2298
2568
  pass # never let on_chunk crash the agent run
2299
2569
 
2300
2570
  try:
2571
+ # Outer wait_for uses timeout + idle_timeout + 60s as a generous safety net.
2572
+ # In practice _read_stdout handles both idle and absolute killing
2573
+ # before this fires.
2301
2574
  await asyncio.wait_for(
2302
2575
  asyncio.gather(_read_stdout(), _read_stderr()),
2303
- timeout=timeout,
2576
+ timeout=timeout + idle_timeout + 60,
2304
2577
  )
2305
- except asyncio.TimeoutError:
2306
- # Kill the entire process group so that child processes (npm, yarn,
2307
- # ssh, git, etc.) spawned by the agent are also terminated. A plain
2308
- # proc.kill() only kills the direct subprocess; any grandchildren
2309
- # become orphaned, keep pipes open, and can exhaust system resources.
2310
- try:
2311
- if sys.platform != "win32":
2312
- import signal as _signal
2313
- try:
2314
- os.killpg(os.getpgid(proc.pid), _signal.SIGKILL)
2315
- except (ProcessLookupError, PermissionError, OSError):
2316
- pass
2317
- else:
2318
- import subprocess as _subprocess
2319
- _subprocess.run(
2320
- ["taskkill", "/F", "/T", "/PID", str(proc.pid)],
2321
- capture_output=True,
2322
- )
2323
- except Exception:
2324
- pass
2325
- finally:
2326
- try:
2327
- proc.kill()
2328
- except Exception:
2329
- pass
2578
+ except asyncio.TimeoutError as _exc:
2579
+ # This branch fires if the outer safety net triggers (extremely
2580
+ # rare) or if a timeout raised inside _read_stdout propagates (_IdleTimeoutError or the absolute-limit asyncio.TimeoutError).
2581
+ _kill_proc(proc)
2330
2582
  # Drain any remaining output after kill
2331
2583
  try:
2332
2584
  remaining, _ = await asyncio.wait_for(proc.communicate(), timeout=5)
@@ -2336,7 +2588,7 @@ class ProcessManager:
2336
2588
  stdout_lines.append(line)
2337
2589
  except Exception:
2338
2590
  pass
2339
- raise # re-raise so callers can set result.error
2591
+ raise # re-raise (_IdleTimeoutError preserves subclass type)
2340
2592
 
2341
2593
  await proc.wait()
2342
2594
  stdout = "\n".join(stdout_lines)
@@ -2397,7 +2649,8 @@ class ProcessManager:
2397
2649
  )
2398
2650
  self.active_processes[task_id] = proc
2399
2651
  stdout, stderr, returncode = await self._stream_process(
2400
- proc, prompt.encode(), timeout, task_id, on_chunk
2652
+ proc, prompt.encode(), timeout, task_id, on_chunk,
2653
+ workspace_path=cwd,
2401
2654
  )
2402
2655
 
2403
2656
  # Parse Claude JSON output for metrics
@@ -2438,13 +2691,13 @@ class ProcessManager:
2438
2691
  error=f"Claude exited with code {returncode}: {stderr[-500:]}",
2439
2692
  metrics=metrics,
2440
2693
  )
2441
- except asyncio.TimeoutError:
2442
- if task_id in self.active_processes:
2443
- self.active_processes[task_id].kill()
2444
- return TaskResult(
2445
- status="failed", exit_code=-1, stdout="", stderr="",
2446
- error=f"Timed out after {timeout}s",
2447
- )
2694
+ except asyncio.TimeoutError as exc:
2695
+ _kill_proc(self.active_processes.pop(task_id, None) or proc)
2696
+ _err = (
2697
+ f"Agent idle for {exc.idle_seconds:.0f}s without output — process terminated. "
2698
+ "Task may require more context decomposition or a different agent."
2699
+ ) if isinstance(exc, _IdleTimeoutError) else f"Timed out after {timeout}s (absolute limit)"
2700
+ return TaskResult(status="failed", exit_code=-1, stdout="", stderr="", error=_err)
2448
2701
  except Exception as exc:
2449
2702
  logger.exception("Claude stream error for task %s", task_id)
2450
2703
  if task_id in self.active_processes:
@@ -2614,7 +2867,8 @@ class ProcessManager:
2614
2867
  )
2615
2868
  self.active_processes[task_id] = proc
2616
2869
  stdout, stderr, returncode = await self._stream_process(
2617
- proc, None, timeout, task_id, on_chunk
2870
+ proc, None, timeout, task_id, on_chunk,
2871
+ workspace_path=cwd,
2618
2872
  )
2619
2873
 
2620
2874
  # Parse copilot JSONL output for metrics
@@ -2642,13 +2896,13 @@ class ProcessManager:
2642
2896
  error=f"Copilot exited with code {effective_rc}: {stderr[-500:]}",
2643
2897
  metrics=metrics,
2644
2898
  )
2645
- except asyncio.TimeoutError:
2646
- if task_id in self.active_processes:
2647
- self.active_processes[task_id].kill()
2648
- return TaskResult(
2649
- status="failed", exit_code=-1, stdout="", stderr="",
2650
- error=f"Timed out after {timeout}s",
2651
- )
2899
+ except asyncio.TimeoutError as exc:
2900
+ _kill_proc(self.active_processes.pop(task_id, None) or proc)
2901
+ _err = (
2902
+ f"Agent idle for {exc.idle_seconds:.0f}s without output — process terminated. "
2903
+ "Task may require more context decomposition or a different agent."
2904
+ ) if isinstance(exc, _IdleTimeoutError) else f"Timed out after {timeout}s (absolute limit)"
2905
+ return TaskResult(status="failed", exit_code=-1, stdout="", stderr="", error=_err)
2652
2906
  except Exception as exc:
2653
2907
  logger.exception("Copilot stream error for task %s", task_id)
2654
2908
  if task_id in self.active_processes:
@@ -2689,7 +2943,8 @@ class ProcessManager:
2689
2943
  self.active_processes[task_id] = proc
2690
2944
  stdin_bytes = stdin_input.encode() if stdin_input else None
2691
2945
  stdout, stderr, returncode = await self._stream_process(
2692
- proc, stdin_bytes, timeout, task_id, on_chunk
2946
+ proc, stdin_bytes, timeout, task_id, on_chunk,
2947
+ workspace_path=cwd,
2693
2948
  )
2694
2949
  status = "success" if returncode == 0 else "failed"
2695
2950
  return TaskResult(
@@ -2699,33 +2954,13 @@ class ProcessManager:
2699
2954
  stderr=stderr[-10000:],
2700
2955
  error="" if status == "success" else f"Exited with code {returncode}",
2701
2956
  )
2702
- except asyncio.TimeoutError:
2703
- proc = self.active_processes.pop(task_id, None)
2704
- if proc:
2705
- try:
2706
- if sys.platform != "win32":
2707
- import signal as _signal
2708
- try:
2709
- os.killpg(os.getpgid(proc.pid), _signal.SIGKILL)
2710
- except (ProcessLookupError, PermissionError, OSError):
2711
- pass
2712
- else:
2713
- import subprocess as _subprocess
2714
- _subprocess.run(
2715
- ["taskkill", "/F", "/T", "/PID", str(proc.pid)],
2716
- capture_output=True,
2717
- )
2718
- except Exception:
2719
- pass
2720
- finally:
2721
- try:
2722
- proc.kill()
2723
- except Exception:
2724
- pass
2725
- return TaskResult(
2726
- status="failed", exit_code=-1, stdout="", stderr="",
2727
- error=f"Timed out after {timeout}s",
2728
- )
2957
+ except asyncio.TimeoutError as exc:
2958
+ _kill_proc(self.active_processes.pop(task_id, None) or proc)
2959
+ _err = (
2960
+ f"Agent idle for {exc.idle_seconds:.0f}s without output — process terminated. "
2961
+ "Task may require more context decomposition or a different agent."
2962
+ ) if isinstance(exc, _IdleTimeoutError) else f"Timed out after {timeout}s (absolute limit)"
2963
+ return TaskResult(status="failed", exit_code=-1, stdout="", stderr="", error=_err)
2729
2964
  except Exception as exc:
2730
2965
  logger.exception("CLI stream error for task %s", task_id)
2731
2966
  if task_id in self.active_processes:
@@ -4167,6 +4402,65 @@ class RuntimeDaemon:
4167
4402
  )
4168
4403
  logger.info("Workspace ready: %s", workspace_path)
4169
4404
 
4405
+ # 2.5 Wipe the analysis output directory on fresh analysis so the new
4406
+ # agent run starts from a completely clean slate. This covers:
4407
+ # • Type change: removes old-type files (e.g. PRD.md/SDD.md) so they
4408
+ # don't coexist with the new type's files (e.g. diagnosis.md).
4409
+ # • Same-type fresh re-analysis: removes extra files the agent may
4410
+ # have written that fall outside the expected type profile.
4411
+ # Using a whole-directory wipe is more reliable than the old
4412
+ # cleanup_stale_docs approach (which only deleted known-profile files).
4413
+ if task.node_type == "analysis" and (
4414
+ task.input_data.get("wipe_analysis_dir")
4415
+ # Backwards-compat: older server versions send cleanup_stale_docs
4416
+ or task.input_data.get("cleanup_stale_docs")
4417
+ # Also wipe if analysis_mode is explicitly "fresh" (belt+suspenders)
4418
+ or task.input_data.get("analysis_mode") == "fresh"
4419
+ ):
4420
+ output_dir_raw = (
4421
+ task.input_data.get("analysis_output_dir")
4422
+ or task.input_data.get("output_dir")
4423
+ or ""
4424
+ )
4425
+ output_dir_norm = str(output_dir_raw).replace("\\", "/").lstrip("./").rstrip("/")
4426
+ if output_dir_norm:
4427
+ dir_to_wipe = workspace_path / output_dir_norm
4428
+ if dir_to_wipe.is_dir():
4429
+ existing_files = [f for f in dir_to_wipe.iterdir() if f.is_file()]
4430
+ if existing_files:
4431
+ try:
4432
+ # Stage all deletions with git rm
4433
+ await self._git(
4434
+ "rm", "-r", "--cached", "--ignore-unmatch",
4435
+ output_dir_norm,
4436
+ cwd=workspace_path,
4437
+ )
4438
+ # Remove physical files
4439
+ shutil.rmtree(str(dir_to_wipe), ignore_errors=True)
4440
+ # Commit the wipe so the branch diff is clean
4441
+ await self._git(
4442
+ "-c", "user.name=Forgexa Agent",
4443
+ "-c", "user.email=agent@forgexa.net",
4444
+ "commit", "-m",
4445
+ f"cleanup: wipe analysis docs in {output_dir_norm} before fresh re-analysis",
4446
+ cwd=workspace_path,
4447
+ )
4448
+ logger.info(
4449
+ "Wiped %d analysis doc(s) from %s for task %s (fresh analysis)",
4450
+ len(existing_files), output_dir_norm, task.task_id,
4451
+ )
4452
+ except Exception:
4453
+ logger.warning(
4454
+ "Could not wipe analysis dir %s for task %s "
4455
+ "(proceeding anyway — agent will overwrite)",
4456
+ output_dir_norm, task.task_id, exc_info=True,
4457
+ )
4458
+ else:
4459
+ logger.debug(
4460
+ "Analysis dir %s is already empty for task %s",
4461
+ output_dir_norm, task.task_id,
4462
+ )
4463
+
4170
4464
  # 3. Run agent with real-time output streaming + periodic progress heartbeat
4171
4465
  await reporter.report_progress(task.task_id, 10, "running_agent")
4172
4466
 
@@ -4364,26 +4658,49 @@ class RuntimeDaemon:
4364
4658
  f"(node_type={task.node_type}, agent={agent.agent_id})"
4365
4659
  )
4366
4660
 
4367
- # 4.1 Recovery: agent exited non-zero but already committed code
4368
- # (e.g. OpenCode EBADF crash on exit after successful work)
4369
- if result.status == "failed" and result.exit_code not in (None, -1):
4370
- committed_git = await self.process_manager._collect_git_info_vs_parent(workspace_path)
4371
- has_committed_changes = bool(committed_git.get("files_changed"))
4372
- has_no_uncommitted = not pre_commit_git.get("files_changed")
4373
- has_tokens = (
4374
- int(result.metrics.get("token_input", 0) or 0)
4375
- + int(result.metrics.get("token_output", 0) or 0)
4376
- ) > 0
4377
- has_meaningful_output = self.process_manager.has_meaningful_agent_output(result)
4378
- if has_committed_changes and has_no_uncommitted and (has_tokens or has_meaningful_output):
4379
- logger.warning(
4380
- "Task %s agent exited with code %s but has committed changes — "
4381
- "recovering as success (agent likely crashed during cleanup)",
4382
- task.task_id, result.exit_code,
4661
+ # 4.1 Recovery: agent exited non-zero but already committed code.
4662
+ # Covers two scenarios:
4663
+ # A. Process crash (e.g. OpenCode EBADF on exit after successful work):
4664
+ # exit_code is a real non-negative/non-(-1) value.
4665
+ # B. Timeout with committed work: agent finished its task and committed
4666
+ # before the idle/absolute timeout fired. exit_code=-1 but the commits
4667
+ # are real — don't discard them.
4668
+ if result.status == "failed":
4669
+ _error_lower = (result.error or "").lower()
4670
+ is_timeout_failure = (
4671
+ "idle for" in _error_lower
4672
+ or "timed out" in _error_lower
4673
+ or "absolute limit" in _error_lower
4674
+ )
4675
+ can_attempt_recovery = (
4676
+ is_timeout_failure # timeout: also allow exit_code=-1
4677
+ or result.exit_code not in (None, -1) # crash: original guard
4678
+ )
4679
+ if can_attempt_recovery:
4680
+ committed_git = await self.process_manager._collect_git_info_vs_parent(workspace_path)
4681
+ has_committed_changes = bool(committed_git.get("files_changed"))
4682
+ has_no_uncommitted = not pre_commit_git.get("files_changed")
4683
+ has_tokens = (
4684
+ int(result.metrics.get("token_input", 0) or 0)
4685
+ + int(result.metrics.get("token_output", 0) or 0)
4686
+ ) > 0
4687
+ has_meaningful_output = self.process_manager.has_meaningful_agent_output(result)
4688
+ # Timeout recovery requires stronger evidence: committed work + tokens AND meaningful output.
4689
+ # Crash recovery (original): committed + (tokens OR meaningful output).
4690
+ sufficient_evidence = (
4691
+ (has_committed_changes and has_no_uncommitted and has_tokens and has_meaningful_output)
4692
+ if is_timeout_failure
4693
+ else (has_committed_changes and has_no_uncommitted and (has_tokens or has_meaningful_output))
4383
4694
  )
4384
- result.status = "success"
4385
- result.error = ""
4386
- result.metrics["recovered_from_exit_code"] = result.exit_code
4695
+ if sufficient_evidence:
4696
+ _reason = "timed out but already committed changes" if is_timeout_failure else f"exited with code {result.exit_code}"
4697
+ logger.warning(
4698
+ "Task %s agent %s — recovering as success",
4699
+ task.task_id, _reason,
4700
+ )
4701
+ result.status = "success"
4702
+ result.error = ""
4703
+ result.metrics["recovered_from_exit_code"] = result.exit_code
4387
4704
 
4388
4705
  # 4.5 Layer 2: Validation gate — check outputs before committing
4389
4706
  if result.status == "success":
@@ -5784,6 +6101,24 @@ class RuntimeDaemon:
5784
6101
  branch = (await git("rev-parse", "--abbrev-ref", "HEAD", cwd=workspace_path)).strip()
5785
6102
 
5786
6103
  if branch and branch != "HEAD":
6104
+ # Always refresh the remote tracking ref before any divergence
6105
+ # checks. Without this, origin/{branch} may be stale if another
6106
+ # runtime pushed commits while our agent was executing, causing
6107
+ # the remote_ahead check to return empty and the naive push to
6108
+ # fail with "non-fast-forward". This is the single most reliable
6109
+ # guard for cross-runtime / cross-machine collaboration scenarios.
6110
+ try:
6111
+ await git(
6112
+ "fetch", "origin", branch,
6113
+ cwd=workspace_path, project_key=project_key,
6114
+ )
6115
+ except RuntimeError as _pre_push_fetch_exc:
6116
+ logger.warning(
6117
+ "Pre-push fetch of branch '%s' failed: %s — "
6118
+ "divergence check will use possibly stale tracking ref",
6119
+ branch, _pre_push_fetch_exc,
6120
+ )
6121
+
5787
6122
  # Check if there are unpushed commits
5788
6123
  try:
5789
6124
  unpushed = (await git(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: forgexa-cli
3
- Version: 1.8.6
3
+ Version: 1.8.8
4
4
  Summary: Forgexa CLI — command-line client and AI agent runtime for the Forgexa platform
5
5
  Author-email: Jason Sun <dev.winds@gmail.com>
6
6
  License: MIT
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "forgexa-cli"
3
- version = "1.8.6"
3
+ version = "1.8.8"
4
4
  description = "Forgexa CLI — command-line client and AI agent runtime for the Forgexa platform"
5
5
  requires-python = ">=3.9"
6
6
  license = { text = "MIT" }
File without changes
File without changes