forgexa-cli 1.8.8__tar.gz → 1.9.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: forgexa-cli
3
- Version: 1.8.8
3
+ Version: 1.9.0
4
4
  Summary: Forgexa CLI — command-line client and AI agent runtime for the Forgexa platform
5
5
  Author-email: Jason Sun <dev.winds@gmail.com>
6
6
  License: MIT
@@ -1,2 +1,2 @@
1
1
  """forgexa-cli — Forgexa command-line client."""
2
- __version__ = "1.8.8"
2
+ __version__ = "1.9.0"
@@ -352,7 +352,11 @@ except (ImportError, ModuleNotFoundError):
352
352
 
353
353
  @property
354
354
  def AGENT_TIMEOUT(self) -> int:
355
- return int(os.environ.get("AGENT_TIMEOUT", "3600"))
355
+ return int(os.environ.get("AGENT_TIMEOUT", "14400")) # 4-hour absolute ceiling
356
+
357
+ @property
358
+ def AGENT_IDLE_TIMEOUT(self) -> int:
359
+ return int(os.environ.get("AGENT_IDLE_TIMEOUT", "600")) # 10-min idle (stdout+fs) = hung agent
356
360
 
357
361
  @property
358
362
  def GIT_CLONE_TIMEOUT(self) -> int:
@@ -392,7 +396,7 @@ except (ImportError, ModuleNotFoundError):
392
396
  # DAEMON_VERSION is the protocol/logic version of the daemon code.
393
397
  # Kept in sync with pyproject.toml version via bump-version.sh.
394
398
  # CLIENT_TYPE identifies which packaging/distribution this daemon runs in.
395
- DAEMON_VERSION = "1.8.8"
399
+ DAEMON_VERSION = "1.9.0"
396
400
 
397
401
 
398
402
  def _detect_client_type() -> str:
@@ -633,6 +637,11 @@ class TaskResult:
633
637
  lines_added: int = 0
634
638
  lines_removed: int = 0
635
639
  error: str = ""
640
+ # failure_code is forwarded to the server to drive retry policy.
641
+ # Key values:
642
+ # "all_agents_rate_limited" — daemon tried every installed agent, all
643
+ # hit rate/quota limits. Server must NOT retry on the same runtime.
644
+ failure_code: str = ""
636
645
  artifacts: list[dict] = field(default_factory=list)
637
646
  observations: list[dict] = field(default_factory=list)
638
647
  metrics: dict = field(default_factory=dict)
@@ -1815,6 +1824,13 @@ class WorkspaceManager:
1815
1824
  if git_prefix_args:
1816
1825
  env = {**(env or os.environ), "GIT_TERMINAL_PROMPT": "0"}
1817
1826
 
1827
+ # Always enable long-path support. On Windows this removes git's own
1828
+ # 260-char path limit (Windows also needs HKLM LongPathsEnabled=1 or
1829
+ # the Win10 1607+ Group Policy, but at a minimum we ensure git won't
1830
+ # reject long paths on platforms where it is already enabled).
1831
+ # On Linux/macOS this is a no-op.
1832
+ longpath_args = ["-c", "core.longpaths=true"]
1833
+
1818
1834
  # start_new_session=True puts git in its own process group.
1819
1835
  # On timeout we send SIGKILL to the entire group, which includes
1820
1836
  # any ssh/gpg/credential-helper children that git forked — preventing
@@ -1822,7 +1838,7 @@ class WorkspaceManager:
1822
1838
  # Windows note: start_new_session creates a new console process group;
1823
1839
  # we use taskkill /T there instead of killpg.
1824
1840
  proc = await asyncio.create_subprocess_exec(
1825
- "git", *git_prefix_args, *args,
1841
+ "git", *longpath_args, *git_prefix_args, *args,
1826
1842
  stdout=asyncio.subprocess.PIPE,
1827
1843
  stderr=asyncio.subprocess.PIPE,
1828
1844
  cwd=str(cwd) if cwd else None,
@@ -2134,10 +2150,54 @@ class ProcessManager:
2134
2150
  "has_turn_failed": has_turn_failed,
2135
2151
  "has_result": has_result,
2136
2152
  "has_meaningful_content": has_meaningful_content,
2153
+ "has_assistant_events": has_assistant_events,
2137
2154
  "error_messages": error_messages,
2138
2155
  "json_line_count": json_line_count,
2139
2156
  }
2140
2157
 
2158
@staticmethod
def _should_scan_short_success_stdout(
    stdout: str,
    signals: dict[str, Any],
    max_chars: int = 500,
) -> bool:
    """Return True when success-shaped stdout is short enough to be an error blob.

    Real agent work output can legitimately mention quota/rate-limit terms, so
    arbitrary stdout is never scanned for success cases. The safe exception is
    a tiny stdout payload with no assistant/result/turn-complete signals; in
    that shape the CLI usually failed before making a real model call and
    printed a plain-text transport/quota error like "API Error: 429 ...".

    Args:
        stdout: Raw agent stdout. ``None`` or whitespace-only text is treated
            as "nothing to scan".
        signals: Output-signal mapping. Only the ``has_result``,
            ``has_turn_completed`` and ``has_assistant_events`` keys are read;
            a missing key counts as falsy.
        max_chars: Exclusive upper bound on the stripped stdout length.
            Defaults to 500, the threshold that used to be hard-coded.

    Returns:
        True only when the stripped stdout is non-empty, shorter than
        ``max_chars``, and none of the three success signals is truthy.
    """
    stdout_stripped = (stdout or "").strip()
    # Empty output has nothing to scan; long output is real agent work whose
    # text may mention quota terms without being an error.
    if not stdout_stripped or len(stdout_stripped) >= max_chars:
        return False

    # Any structural success signal means a model call actually happened, so
    # the payload must not be treated as a plain-text error blob.
    success_keys = ("has_result", "has_turn_completed", "has_assistant_events")
    return not any(signals.get(key) for key in success_keys)
2176
+
2177
@staticmethod
def _failure_pattern_channels(result: "TaskResult") -> str:
    """Build the text window safe to scan for quota/backend failure patterns.

    Args:
        result: Task result whose ``status``, ``exit_code``, ``stdout``,
            ``stderr``, ``error`` and ``metrics`` attributes are read. Any of
            the text fields (and ``metrics``) may be ``None``.

    Returns:
        The newline-joined, non-empty channels chosen as follows:

        * status != "success" and exit code 0: stderr + error only.
        * status != "success" and non-zero exit code: stderr + error + the
          last 3000 characters of stdout.
        * status == "success": stderr + error, plus the stripped stdout only
          when no tokens were consumed and the stdout is a tiny payload with
          no assistant/result/turn-complete signals.
    """
    stdout = result.stdout or ""
    stderr = result.stderr or ""
    error = result.error or ""
    error_channels = "\n".join(part for part in (stderr, error) if part)

    if result.status != "success":
        if result.exit_code == 0:
            # Exit code 0 with a failed status: stdout is not scanned here.
            return error_channels
        # Non-zero exit: the tail of stdout is included in the scan window.
        return "\n".join(part for part in (stderr, error, stdout[-3000:]) if part)

    # Success path. Check the cheap disqualifiers first so that large, healthy
    # outputs are never parsed just to be rejected afterwards.
    metrics = result.metrics or {}  # tolerate metrics=None
    has_token_usage = (
        int(metrics.get("token_input", 0) or 0)
        + int(metrics.get("token_output", 0) or 0)
    ) > 0
    if has_token_usage:
        return error_channels

    # With an empty signal mapping the helper reduces to its size gate. A False
    # answer here can never become True once real signals are supplied, so the
    # signal extraction can be skipped entirely.
    if not ProcessManager._should_scan_short_success_stdout(stdout, {}):
        return error_channels

    signals = ProcessManager._extract_output_signals(
        "\n".join(part for part in (stdout, stderr) if part)
    )
    if ProcessManager._should_scan_short_success_stdout(stdout, signals):
        error_channels = "\n".join(filter(None, [error_channels, stdout.strip()]))
    return error_channels
2200
+
2141
2201
  @staticmethod
2142
2202
  def has_meaningful_agent_output(result: "TaskResult") -> bool:
2143
2203
  """Return True when the agent emitted real user-meaningful output."""
@@ -2156,29 +2216,12 @@ class ProcessManager:
2156
2216
  Returns True for rate/quota limits AND API unavailability errors,
2157
2217
  since a different agent (using a different API backend) may succeed.
2158
2218
 
2159
- IMPORTANT: Only checks stderr and error message. When exit code is
2160
- non-zero, also checks the tail of stdout (last 3000 chars) since the
2161
- error is likely at the end. When exit code is 0 (agent reported
2162
- success but _detect_agent_output_failure set status to failed), do
2163
- NOT scan stdout — it contains the agent's work output (configs, code)
2164
- which naturally has terms like "rate_limit", "API_RATE_LIMIT_PER_MINUTE"
2165
- that trigger false positives.
2219
+ For true success cases we still avoid scanning arbitrary stdout.
2220
+ The one safe exception is a tiny stdout payload with no assistant/result
2221
+ signals, which strongly indicates a pre-call CLI failure printed as
2222
+ plain text (for example "API Error: 429 ...").
2166
2223
  """
2167
- if result.status == "success":
2168
- return False
2169
- # When exit code is 0, _detect_agent_output_failure already checked
2170
- # stderr+error for rate-limit patterns. Don't re-scan stdout here.
2171
- if result.exit_code == 0:
2172
- error_text = (
2173
- (result.stderr or "")
2174
- + "\n" + (result.error or "")
2175
- ).lower()
2176
- else:
2177
- error_text = (
2178
- (result.stderr or "")
2179
- + "\n" + (result.error or "")
2180
- + "\n" + (result.stdout or "")[-3000:]
2181
- ).lower()
2224
+ error_text = ProcessManager._failure_pattern_channels(result).lower()
2182
2225
  return (
2183
2226
  any(p in error_text for p in ProcessManager.RATE_LIMIT_PATTERNS)
2184
2227
  or any(p in error_text for p in ProcessManager.AGENT_UNAVAILABLE_PATTERNS)
@@ -2198,16 +2241,13 @@ class ProcessManager:
2198
2241
  if result.status != "success":
2199
2242
  return None
2200
2243
 
2201
- # For exit-code-0 (success) cases, only scan stderr and the error field
2202
- # for rate-limit / unavailability patterns. Stdout contains the agent's
2203
- # actual task output (code, configs, analysis docs) which may legitimately
2204
- # contain substrings like "rate_limit", "429", "quota", etc. — e.g. writing
2205
- # a config file with API_RATE_LIMIT_PER_MINUTE=1000 would previously trigger
2206
- # a false "quota exhaustion" failure even though the agent succeeded.
2207
- # stdout[-N:] is only safe to scan when the agent already failed (exit != 0),
2208
- # which is handled by is_rate_limited() called at the orchestrator level.
2209
- error_only_channels = (result.stderr or "") + "\n" + (result.error or "")
2210
- pattern_failure = ProcessManager._has_failure_pattern(error_only_channels)
2244
+ # For exit-code-0 (success) cases, avoid scanning arbitrary stdout for
2245
+ # quota keywords. The only safe stdout exception is a tiny payload with
2246
+ # no success signals, which typically means the CLI failed before making
2247
+ # a real model call and printed a plain-text error like "API Error: 429".
2248
+ pattern_failure = ProcessManager._has_failure_pattern(
2249
+ ProcessManager._failure_pattern_channels(result)
2250
+ )
2211
2251
  if pattern_failure:
2212
2252
  return pattern_failure
2213
2253
 
@@ -2223,8 +2263,13 @@ class ProcessManager:
2223
2263
  has_turn_failed = signals["has_turn_failed"]
2224
2264
  has_result = signals["has_result"]
2225
2265
  has_meaningful_content = signals["has_meaningful_content"]
2266
+ has_assistant_events = signals["has_assistant_events"]
2226
2267
  error_messages = signals["error_messages"]
2227
2268
  json_line_count = signals["json_line_count"]
2269
+ has_token_usage = (
2270
+ int(result.metrics.get("token_input", 0) or 0)
2271
+ + int(result.metrics.get("token_output", 0) or 0)
2272
+ ) > 0
2228
2273
 
2229
2274
  stderr_lower = stderr.lower()
2230
2275
  if (
@@ -2256,14 +2301,27 @@ class ProcessManager:
2256
2301
  and not has_meaningful_content and json_line_count > 0):
2257
2302
  return f"Agent encountered errors without producing output: {error_messages[0]}"
2258
2303
 
2304
+ # Plain-text pre-call failures (not JSONL) can still exit 0 on some
2305
+ # agent CLIs. When stdout is tiny and lacks any structural success
2306
+ # signals, treat explicit error markers as agent failure so fallback can
2307
+ # run locally instead of relying on server-side re-enqueue.
2308
+ stdout_stripped = stdout.strip()
2309
+ stdout_lower = stdout_stripped.lower()
2310
+ if (
2311
+ not has_token_usage
2312
+ and ProcessManager._should_scan_short_success_stdout(stdout, signals)
2313
+ and any(marker in stdout_lower for marker in ("api error", "exception", "forbidden", "unauthorized"))
2314
+ ):
2315
+ return stdout_stripped.splitlines()[-1][:300]
2316
+
2259
2317
  # ── Claude: JSON output mode but no result object and no content ──
2260
2318
  if agent_id == "claude" and json_line_count > 0:
2261
- if not has_result and not has_meaningful_content:
2319
+ if not has_result and not has_meaningful_content and not has_assistant_events:
2262
2320
  return "Claude produced no result output"
2263
2321
 
2264
2322
  # ── Copilot: JSONL mode but no turn completion and no content ──
2265
2323
  if agent_id == "copilot" and json_line_count > 0:
2266
- if not has_result and not has_meaningful_content:
2324
+ if not has_result and not has_meaningful_content and not has_assistant_events:
2267
2325
  return "Copilot produced no result output (check GitHub authentication: run 'gh auth login')"
2268
2326
 
2269
2327
  return None
@@ -2331,8 +2389,9 @@ class ProcessManager:
2331
2389
  return normalized
2332
2390
 
2333
2391
  def _required_deliverable_paths(self, task: TaskInfo) -> set[str]:
2334
- # For analysis nodes, deliverables live in analysis_output_dir (docs/requirements/...)
2335
- # For other nodes, use output_dir (docs/implements/...)
2392
+ # For analysis nodes, deliverables live in analysis_output_dir (docs/requirements/<key>/analysis)
2393
+ # For delivery nodes, deliverables live in output_dir (docs/requirements/<key>/delivery)
2394
+ # For other nodes, use output_dir (docs/requirements/<key>/implement)
2336
2395
  if task.node_type == "analysis":
2337
2396
  output_dir = str(
2338
2397
  (task.input_data or {}).get("analysis_output_dir", "")
@@ -2350,6 +2409,9 @@ class ProcessManager:
2350
2409
  required_files = _get_analysis_outputs_for_type(req_type)
2351
2410
  elif task.node_type == "design":
2352
2411
  required_files = ["design.md"]
2412
+ elif task.node_type == "delivery":
2413
+ # Required docs come from node input_data (set by delivery_doc_service)
2414
+ required_files = (task.input_data or {}).get("required_docs") or ["release-note.md"]
2353
2415
  else:
2354
2416
  return set()
2355
2417
 
@@ -3503,6 +3565,7 @@ class ProgressReporter:
3503
3565
  "stdout_tail": result.stdout[-20000:] if result.stdout else "",
3504
3566
  "stderr_tail": result.stderr[-5000:] if result.stderr else "",
3505
3567
  "error": result.error,
3568
+ "failure_code": result.failure_code,
3506
3569
  "files_changed": result.files_changed,
3507
3570
  "lines_added": result.lines_added,
3508
3571
  "lines_removed": result.lines_removed,
@@ -4402,7 +4465,52 @@ class RuntimeDaemon:
4402
4465
  )
4403
4466
  logger.info("Workspace ready: %s", workspace_path)
4404
4467
 
4405
- # 2.5 Wipe the analysis output directory on fresh analysis so the new
4468
+ # 2.1 Workspace health check: detect broken checkout (Windows filename-
4469
+ # too-long or other git checkout failure that leaves the working tree
4470
+ # empty while the git index still tracks all source files).
4471
+ # If this is not caught the agent will run `git add -A` and commit a
4472
+ # catastrophic mass-deletion (e.g. SI-434: 47,566 files deleted).
4473
+ try:
4474
+ _index_count_out = await self._git(
4475
+ "ls-files", "--cached", "--", ".", cwd=workspace_path,
4476
+ timeout=30,
4477
+ )
4478
+ _index_count = len([l for l in _index_count_out.splitlines() if l.strip()])
4479
+ if _index_count > 500:
4480
+ # Count physical files (exclude .git/)
4481
+ _phys_count = sum(1 for _ in workspace_path.rglob("*")
4482
+ if _.is_file() and ".git" not in _.parts)
4483
+ _ratio = _phys_count / _index_count
4484
+ if _ratio < 0.20:
4485
+ # Less than 20 % of tracked files exist on disk — almost
4486
+ # certainly a failed git checkout (e.g. Windows path-length
4487
+ # limit). Abort rather than letting the agent commit a
4488
+ # mass-deletion.
4489
+ _longpath_hint = (
4490
+ " Enable Windows long-path support: run "
4491
+ "`git config --global core.longpaths true` and enable "
4492
+ "LongPathsEnabled in Windows Group Policy / Registry "
4493
+ "(HKLM\\SYSTEM\\CurrentControlSet\\Control\\FileSystem\\LongPathsEnabled=1)."
4494
+ if sys.platform == "win32" else ""
4495
+ )
4496
+ raise RuntimeError(
4497
+ f"Workspace health check failed: only {_phys_count}/{_index_count} "
4498
+ f"tracked files exist on disk ({_ratio:.0%}). "
4499
+ f"The git checkout likely failed due to filename-length limitations."
4500
+ f"{_longpath_hint}"
4501
+ )
4502
+ elif _ratio < 0.80:
4503
+ logger.warning(
4504
+ "Workspace health check warning: only %d/%d tracked files "
4505
+ "exist on disk (%.0f%%) for task %s — checkout may be incomplete.",
4506
+ _phys_count, _index_count, _ratio * 100, task.task_id,
4507
+ )
4508
+ except RuntimeError:
4509
+ raise
4510
+ except Exception as _health_exc:
4511
+ logger.warning("Workspace health check error (non-fatal): %s", _health_exc)
4512
+
4513
+
4406
4514
  # agent run starts from a completely clean slate. This covers:
4407
4515
  # • Type change: removes old-type files (e.g. PRD.md/SDD.md) so they
4408
4516
  # don't coexist with the new type's files (e.g. diagnosis.md).
@@ -4630,6 +4738,10 @@ class RuntimeDaemon:
4630
4738
  f"Original error: {result.error}"
4631
4739
  )
4632
4740
  result.status = "failed"
4741
+ # Signal to the server that ALL installed agents were tried and
4742
+ # all are rate/quota limited. The server must NOT re-enqueue on
4743
+ # the same runtime — that would hit the same quota wall.
4744
+ result.failure_code = "all_agents_rate_limited"
4633
4745
 
4634
4746
  # 4. Collect git info BEFORE commit (shows uncommitted changes)
4635
4747
  pre_commit_git = await self.process_manager._collect_git_info(workspace_path)
@@ -4715,6 +4827,72 @@ class RuntimeDaemon:
4715
4827
  except Exception:
4716
4828
  logger.exception("Validation gate error for task %s (proceeding anyway)", task.task_id)
4717
4829
 
4830
+ # 4.6 Post-validation rate-limit fallback.
4831
+ # _validate_and_retry returns early (preserving the rate-limit error)
4832
+ # when the agent hits a quota wall mid-retry. The initial-run fallback
4833
+ # block (step 3) only checked the *initial* run; if that succeeded but
4834
+ # the agent became rate-limited during a validation retry, we need a
4835
+ # second fallback pass here so the task is attempted on a fresh agent.
4836
+ if result.status == "failed" and self.process_manager.is_rate_limited(result) and not _skip_fallback:
4837
+ logger.warning(
4838
+ "Agent '%s' rate-limited during validation retry for task %s — "
4839
+ "attempting post-validation agent fallback",
4840
+ agent.agent_id, task.task_id,
4841
+ )
4842
+ _pv_fallback = self._select_fallback_agent(
4843
+ agent.agent_id, task.fallback_chain, tried_agents
4844
+ )
4845
+ while _pv_fallback:
4846
+ logger.info(
4847
+ "Post-validation fallback: '%s' → '%s' for task %s",
4848
+ agent.agent_id, _pv_fallback.agent_id, task.task_id,
4849
+ )
4850
+ agent = _pv_fallback
4851
+ tried_agents.add(agent.agent_id)
4852
+ await reporter.report_progress(
4853
+ task.task_id, 10,
4854
+ f"agent_fallback: retrying with {agent.agent_id}",
4855
+ output_lines=[
4856
+ f"[daemon] Agent rate-limited during validation, "
4857
+ f"switching to {agent.agent_id}",
4858
+ ],
4859
+ )
4860
+ result = await self.process_manager.run_agent(
4861
+ agent, task, workspace_path, on_chunk=on_output_chunk,
4862
+ )
4863
+ if not self.process_manager.is_rate_limited(result):
4864
+ # Fallback agent ran successfully (or hit a non-rate-limit
4865
+ # failure) — re-run the validation gate and update git state.
4866
+ if result.status == "success":
4867
+ try:
4868
+ result = await self._validate_and_retry(
4869
+ agent, task, workspace_path, result,
4870
+ reporter, on_output_chunk, max_retries=2,
4871
+ )
4872
+ pre_commit_git = await self.process_manager._collect_git_info(workspace_path)
4873
+ except Exception:
4874
+ logger.exception(
4875
+ "Post-validation gate error for task %s (proceeding anyway)",
4876
+ task.task_id,
4877
+ )
4878
+ break
4879
+ logger.warning(
4880
+ "Post-validation fallback agent '%s' also rate-limited for task %s",
4881
+ agent.agent_id, task.task_id,
4882
+ )
4883
+ _pv_fallback = self._select_fallback_agent(
4884
+ agent.agent_id, task.fallback_chain, tried_agents
4885
+ )
4886
+ # If every agent we tried is still rate-limited, signal the server
4887
+ # NOT to re-enqueue — it would hit the same quota wall immediately.
4888
+ if self.process_manager.is_rate_limited(result):
4889
+ result.error = (
4890
+ f"All agents unavailable/rate-limited (tried: {', '.join(tried_agents)}). "
4891
+ f"Original error: {result.error}"
4892
+ )
4893
+ result.status = "failed"
4894
+ result.failure_code = "all_agents_rate_limited"
4895
+
4718
4896
  # 4.55 Analysis/design nodes must update their deliverables in THIS run.
4719
4897
  # Existing files from a prior iteration are not sufficient evidence.
4720
4898
  if result.status == "success" and task.node_type in ("analysis", "design"):
@@ -5196,6 +5374,38 @@ class RuntimeDaemon:
5196
5374
  # Flush any remaining buffered lines after agent finishes
5197
5375
  await _flush_output_to_server()
5198
5376
 
5377
+ # 3.5 Agent fallback: if the chosen agent hit a rate/quota limit,
5378
+ # try the next available agent before giving up.
5379
+ _aj_tried: set[str] = {agent.agent_id}
5380
+ while self.process_manager.is_rate_limited(result):
5381
+ _aj_fallback = self._select_fallback_agent(agent.agent_id, [], _aj_tried)
5382
+ if not _aj_fallback:
5383
+ # All agents exhausted — signal server not to re-enqueue.
5384
+ result.failure_code = "all_agents_rate_limited"
5385
+ break
5386
+ logger.warning(
5387
+ "AIJob %s: agent '%s' rate-limited, falling back to '%s'",
5388
+ job_id, agent.agent_id, _aj_fallback.agent_id,
5389
+ )
5390
+ await conn.client.post(
5391
+ f"{reporter_url}/progress",
5392
+ json={
5393
+ "current_step": f"agent_fallback: retrying with {_aj_fallback.agent_id}",
5394
+ "output_lines": [
5395
+ f"[daemon] Agent rate-limited, switching to {_aj_fallback.agent_id}",
5396
+ ],
5397
+ "progress_pct": 15,
5398
+ },
5399
+ timeout=5,
5400
+ )
5401
+ agent = _aj_fallback
5402
+ _aj_tried.add(agent.agent_id)
5403
+ fake_task.agent_type = agent.agent_id
5404
+ result = await self.process_manager.run_agent(
5405
+ agent, fake_task, workspace_path, on_chunk=on_chunk,
5406
+ )
5407
+ await _flush_output_to_server()
5408
+
5199
5409
  # 4. Auto-commit if successful
5200
5410
  input_ctx = aj.get("input_context", {})
5201
5411
  git_info = {}
@@ -5242,6 +5452,10 @@ class RuntimeDaemon:
5242
5452
  except Exception:
5243
5453
  pass
5244
5454
 
5455
+ # Preserve all_agents_rate_limited so the server does NOT re-enqueue.
5456
+ _failure_code = result.failure_code if result.failure_code else (
5457
+ "agent_error" if result.status != "success" else ""
5458
+ )
5245
5459
  complete_payload = {
5246
5460
  "status": "success" if result.status == "success" else "failed",
5247
5461
  "output_content": output_content,
@@ -5255,7 +5469,7 @@ class RuntimeDaemon:
5255
5469
  "resolved_agent": agent.agent_id,
5256
5470
  "git_info": git_info,
5257
5471
  "error": result.error if result.status != "success" else "",
5258
- "failure_code": "agent_error" if result.status != "success" else "",
5472
+ "failure_code": _failure_code,
5259
5473
  }
5260
5474
 
5261
5475
  await conn.client.post(
@@ -5315,7 +5529,14 @@ class RuntimeDaemon:
5315
5529
  ],
5316
5530
  )
5317
5531
 
5318
- # Build a targeted fix prompt with output directory context
5532
+ # Save the original prompt BEFORE building the retry variant so we
5533
+ # can include it in fix_prompt. Without this the agent receives only
5534
+ # "fix validation errors" with zero task context and responds with
5535
+ # "I don't have a specific task to execute yet." (root cause confirmed
5536
+ # via Copilot JSONL output for SI-434/SI-446).
5537
+ original_prompt = task.input_prompt
5538
+
5539
+ # Build a targeted fix prompt: original task + validation issues.
5319
5540
  _input = task.input_data or {}
5320
5541
  _fix_doc_dir = (
5321
5542
  _input.get("output_dir")
@@ -5323,8 +5544,11 @@ class RuntimeDaemon:
5323
5544
  or ""
5324
5545
  )
5325
5546
  fix_prompt = (
5326
- "The previous execution produced output with validation errors.\n"
5327
- "Please fix ALL of the following issues:\n\n"
5547
+ f"{original_prompt}\n\n"
5548
+ "---\n\n"
5549
+ "**IMPORTANT – Validation Retry:** The previous execution attempt "
5550
+ "did not produce all required output. Please complete the task above "
5551
+ "and ensure ALL of the following issues are resolved:\n\n"
5328
5552
  f"{issues_text}\n\n"
5329
5553
  )
5330
5554
  if _fix_doc_dir:
@@ -5339,7 +5563,6 @@ class RuntimeDaemon:
5339
5563
  )
5340
5564
 
5341
5565
  # Override task prompt temporarily
5342
- original_prompt = task.input_prompt
5343
5566
  task.input_prompt = fix_prompt
5344
5567
 
5345
5568
  try:
@@ -5349,6 +5572,20 @@ class RuntimeDaemon:
5349
5572
  finally:
5350
5573
  task.input_prompt = original_prompt
5351
5574
 
5575
+ # If the agent hit a rate/quota limit during this validation retry,
5576
+ # bail out immediately so the outer execution loop can trigger agent
5577
+ # fallback. Continuing to retry with the same rate-limited agent is
5578
+ # pointless; it will hit the same wall every time.
5579
+ # Returning early also preserves the rate-limit error in result.error
5580
+ # so that is_rate_limited() can detect it in the caller.
5581
+ if ProcessManager.is_rate_limited(result):
5582
+ logger.warning(
5583
+ "Agent '%s' rate-limited during validation retry for task %s "
5584
+ "(attempt %d/%d) — aborting validation retries for agent fallback",
5585
+ agent.agent_id, task.task_id, attempt + 1, max_retries,
5586
+ )
5587
+ return result
5588
+
5352
5589
  # Final check after all retries
5353
5590
  remaining = self._validate_outputs(workspace_path, task, result)
5354
5591
  if remaining:
@@ -5822,7 +6059,7 @@ class RuntimeDaemon:
5822
6059
  for f in files[:30]:
5823
6060
  path = f["path"].lower()
5824
6061
  fname = path.rsplit("/", 1)[-1]
5825
- if "docs/requirements" in path:
6062
+ if "docs/requirements" in path or "docs/workitems" in path:
5826
6063
  buckets["Analysis deliverables"].append(f)
5827
6064
  elif (
5828
6065
  "_test" in fname or fname.startswith("test_")
@@ -6032,24 +6269,49 @@ class RuntimeDaemon:
6032
6269
  # 5. Run the agent with the conflict resolution prompt
6033
6270
  logger.info("Invoking %s to resolve %d conflict(s)...", agent.agent_id, len(conflicted_files))
6034
6271
  try:
6272
+ _cr_task = TaskInfo(
6273
+ task_id=f"{task.task_id}-conflict-resolve",
6274
+ graph_id=task.graph_id,
6275
+ node_type="conflict_resolution",
6276
+ agent_type=agent.agent_id,
6277
+ input_prompt=resolve_prompt,
6278
+ input_data={},
6279
+ timeout_seconds=min(task.timeout_seconds, 300), # cap at 5 min
6280
+ max_retries=0,
6281
+ retry_count=0,
6282
+ project=task.project,
6283
+ work_item=task.work_item,
6284
+ )
6035
6285
  resolve_result = await self.process_manager.run_agent(
6036
6286
  agent,
6037
- TaskInfo(
6038
- task_id=f"{task.task_id}-conflict-resolve",
6039
- graph_id=task.graph_id,
6040
- node_type="conflict_resolution",
6041
- agent_type=agent.agent_id,
6042
- input_prompt=resolve_prompt,
6043
- input_data={},
6044
- timeout_seconds=min(task.timeout_seconds, 300), # cap at 5 min
6045
- max_retries=0,
6046
- retry_count=0,
6047
- project=task.project,
6048
- work_item=task.work_item,
6049
- ),
6287
+ _cr_task,
6050
6288
  workspace_path,
6051
6289
  )
6052
6290
 
6291
+ # Agent fallback: if the primary agent is rate-limited, try others.
6292
+ _cr_tried: set[str] = {agent.agent_id}
6293
+ while self.process_manager.is_rate_limited(resolve_result):
6294
+ _cr_fallback = self._select_fallback_agent(agent.agent_id, task.fallback_chain, _cr_tried)
6295
+ if not _cr_fallback:
6296
+ logger.warning(
6297
+ "All agents rate-limited for conflict resolution of task %s — aborting merge",
6298
+ task.task_id,
6299
+ )
6300
+ try:
6301
+ await git("merge", "--abort", cwd=workspace_path)
6302
+ except RuntimeError:
6303
+ pass
6304
+ return
6305
+ logger.warning(
6306
+ "Conflict resolution: agent '%s' rate-limited for task %s, "
6307
+ "falling back to '%s'",
6308
+ agent.agent_id, task.task_id, _cr_fallback.agent_id,
6309
+ )
6310
+ agent = _cr_fallback
6311
+ _cr_tried.add(agent.agent_id)
6312
+ _cr_task.agent_type = agent.agent_id
6313
+ resolve_result = await self.process_manager.run_agent(agent, _cr_task, workspace_path)
6314
+
6053
6315
  # 6. Check if conflicts are resolved
6054
6316
  proc = await asyncio.create_subprocess_exec(
6055
6317
  "git", "diff", "--name-only", "--diff-filter=U",
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: forgexa-cli
3
- Version: 1.8.8
3
+ Version: 1.9.0
4
4
  Summary: Forgexa CLI — command-line client and AI agent runtime for the Forgexa platform
5
5
  Author-email: Jason Sun <dev.winds@gmail.com>
6
6
  License: MIT
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "forgexa-cli"
3
- version = "1.8.8"
3
+ version = "1.9.0"
4
4
  description = "Forgexa CLI — command-line client and AI agent runtime for the Forgexa platform"
5
5
  requires-python = ">=3.9"
6
6
  license = { text = "MIT" }
File without changes
File without changes