forgexa-cli 1.8.8__tar.gz → 1.9.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {forgexa_cli-1.8.8 → forgexa_cli-1.9.0}/PKG-INFO +1 -1
- {forgexa_cli-1.8.8 → forgexa_cli-1.9.0}/forgexa_cli/__init__.py +1 -1
- {forgexa_cli-1.8.8 → forgexa_cli-1.9.0}/forgexa_cli/daemon.py +321 -59
- {forgexa_cli-1.8.8 → forgexa_cli-1.9.0}/forgexa_cli.egg-info/PKG-INFO +1 -1
- {forgexa_cli-1.8.8 → forgexa_cli-1.9.0}/pyproject.toml +1 -1
- {forgexa_cli-1.8.8 → forgexa_cli-1.9.0}/README.md +0 -0
- {forgexa_cli-1.8.8 → forgexa_cli-1.9.0}/forgexa_cli/_build_config.py +0 -0
- {forgexa_cli-1.8.8 → forgexa_cli-1.9.0}/forgexa_cli/main.py +0 -0
- {forgexa_cli-1.8.8 → forgexa_cli-1.9.0}/forgexa_cli/py.typed +0 -0
- {forgexa_cli-1.8.8 → forgexa_cli-1.9.0}/forgexa_cli.egg-info/SOURCES.txt +0 -0
- {forgexa_cli-1.8.8 → forgexa_cli-1.9.0}/forgexa_cli.egg-info/dependency_links.txt +0 -0
- {forgexa_cli-1.8.8 → forgexa_cli-1.9.0}/forgexa_cli.egg-info/entry_points.txt +0 -0
- {forgexa_cli-1.8.8 → forgexa_cli-1.9.0}/forgexa_cli.egg-info/requires.txt +0 -0
- {forgexa_cli-1.8.8 → forgexa_cli-1.9.0}/forgexa_cli.egg-info/top_level.txt +0 -0
- {forgexa_cli-1.8.8 → forgexa_cli-1.9.0}/setup.cfg +0 -0
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
"""forgexa-cli — Forgexa command-line client."""
|
|
2
|
-
__version__ = "1.8.8"
|
|
2
|
+
__version__ = "1.9.0"
|
|
@@ -352,7 +352,11 @@ except (ImportError, ModuleNotFoundError):
|
|
|
352
352
|
|
|
353
353
|
@property
|
|
354
354
|
def AGENT_TIMEOUT(self) -> int:
|
|
355
|
-
return int(os.environ.get("AGENT_TIMEOUT", "
|
|
355
|
+
return int(os.environ.get("AGENT_TIMEOUT", "14400")) # 4-hour absolute ceiling
|
|
356
|
+
|
|
357
|
+
@property
|
|
358
|
+
def AGENT_IDLE_TIMEOUT(self) -> int:
|
|
359
|
+
return int(os.environ.get("AGENT_IDLE_TIMEOUT", "600")) # 10-min idle (stdout+fs) = hung agent
|
|
356
360
|
|
|
357
361
|
@property
|
|
358
362
|
def GIT_CLONE_TIMEOUT(self) -> int:
|
|
@@ -392,7 +396,7 @@ except (ImportError, ModuleNotFoundError):
|
|
|
392
396
|
# DAEMON_VERSION is the protocol/logic version of the daemon code.
|
|
393
397
|
# Kept in sync with pyproject.toml version via bump-version.sh.
|
|
394
398
|
# CLIENT_TYPE identifies which packaging/distribution this daemon runs in.
|
|
395
|
-
DAEMON_VERSION = "1.8.8"
|
|
399
|
+
DAEMON_VERSION = "1.9.0"
|
|
396
400
|
|
|
397
401
|
|
|
398
402
|
def _detect_client_type() -> str:
|
|
@@ -633,6 +637,11 @@ class TaskResult:
|
|
|
633
637
|
lines_added: int = 0
|
|
634
638
|
lines_removed: int = 0
|
|
635
639
|
error: str = ""
|
|
640
|
+
# failure_code is forwarded to the server to drive retry policy.
|
|
641
|
+
# Key values:
|
|
642
|
+
# "all_agents_rate_limited" — daemon tried every installed agent, all
|
|
643
|
+
# hit rate/quota limits. Server must NOT retry on the same runtime.
|
|
644
|
+
failure_code: str = ""
|
|
636
645
|
artifacts: list[dict] = field(default_factory=list)
|
|
637
646
|
observations: list[dict] = field(default_factory=list)
|
|
638
647
|
metrics: dict = field(default_factory=dict)
|
|
@@ -1815,6 +1824,13 @@ class WorkspaceManager:
|
|
|
1815
1824
|
if git_prefix_args:
|
|
1816
1825
|
env = {**(env or os.environ), "GIT_TERMINAL_PROMPT": "0"}
|
|
1817
1826
|
|
|
1827
|
+
# Always enable long-path support. On Windows this removes git's own
|
|
1828
|
+
# 260-char path limit (Windows also needs HKLM LongPathsEnabled=1 or
|
|
1829
|
+
# the Win10 1607+ Group Policy, but at a minimum we ensure git won't
|
|
1830
|
+
# reject long paths on platforms where it is already enabled).
|
|
1831
|
+
# On Linux/macOS this is a no-op.
|
|
1832
|
+
longpath_args = ["-c", "core.longpaths=true"]
|
|
1833
|
+
|
|
1818
1834
|
# start_new_session=True puts git in its own process group.
|
|
1819
1835
|
# On timeout we send SIGKILL to the entire group, which includes
|
|
1820
1836
|
# any ssh/gpg/credential-helper children that git forked — preventing
|
|
@@ -1822,7 +1838,7 @@ class WorkspaceManager:
|
|
|
1822
1838
|
# Windows note: start_new_session creates a new console process group;
|
|
1823
1839
|
# we use taskkill /T there instead of killpg.
|
|
1824
1840
|
proc = await asyncio.create_subprocess_exec(
|
|
1825
|
-
"git", *git_prefix_args, *args,
|
|
1841
|
+
"git", *longpath_args, *git_prefix_args, *args,
|
|
1826
1842
|
stdout=asyncio.subprocess.PIPE,
|
|
1827
1843
|
stderr=asyncio.subprocess.PIPE,
|
|
1828
1844
|
cwd=str(cwd) if cwd else None,
|
|
@@ -2134,10 +2150,54 @@ class ProcessManager:
|
|
|
2134
2150
|
"has_turn_failed": has_turn_failed,
|
|
2135
2151
|
"has_result": has_result,
|
|
2136
2152
|
"has_meaningful_content": has_meaningful_content,
|
|
2153
|
+
"has_assistant_events": has_assistant_events,
|
|
2137
2154
|
"error_messages": error_messages,
|
|
2138
2155
|
"json_line_count": json_line_count,
|
|
2139
2156
|
}
|
|
2140
2157
|
|
|
2158
|
+
@staticmethod
|
|
2159
|
+
def _should_scan_short_success_stdout(stdout: str, signals: dict[str, Any]) -> bool:
|
|
2160
|
+
"""Return True when success-shaped stdout is short enough to be an error blob.
|
|
2161
|
+
|
|
2162
|
+
Real agent work output can legitimately mention quota/rate-limit terms, so
|
|
2163
|
+
we never scan arbitrary stdout for success cases. The safe exception is a
|
|
2164
|
+
tiny stdout payload with no assistant/result/turn-complete signals; in that
|
|
2165
|
+
shape the CLI usually failed before making a real model call and printed a
|
|
2166
|
+
plain-text transport/quota error like "API Error: 429 ...".
|
|
2167
|
+
"""
|
|
2168
|
+
stdout_stripped = (stdout or "").strip()
|
|
2169
|
+
if not stdout_stripped or len(stdout_stripped) >= 500:
|
|
2170
|
+
return False
|
|
2171
|
+
return not (
|
|
2172
|
+
signals.get("has_result")
|
|
2173
|
+
or signals.get("has_turn_completed")
|
|
2174
|
+
or signals.get("has_assistant_events")
|
|
2175
|
+
)
|
|
2176
|
+
|
|
2177
|
+
@staticmethod
|
|
2178
|
+
def _failure_pattern_channels(result: "TaskResult") -> str:
|
|
2179
|
+
"""Build the text window safe to scan for quota/backend failure patterns."""
|
|
2180
|
+
stdout = result.stdout or ""
|
|
2181
|
+
stderr = result.stderr or ""
|
|
2182
|
+
error = result.error or ""
|
|
2183
|
+
|
|
2184
|
+
if result.status != "success":
|
|
2185
|
+
if result.exit_code == 0:
|
|
2186
|
+
return "\n".join(part for part in (stderr, error) if part)
|
|
2187
|
+
return "\n".join(part for part in (stderr, error, stdout[-3000:]) if part)
|
|
2188
|
+
|
|
2189
|
+
error_channels = "\n".join(part for part in (stderr, error) if part)
|
|
2190
|
+
has_token_usage = (
|
|
2191
|
+
int(result.metrics.get("token_input", 0) or 0)
|
|
2192
|
+
+ int(result.metrics.get("token_output", 0) or 0)
|
|
2193
|
+
) > 0
|
|
2194
|
+
signals = ProcessManager._extract_output_signals(
|
|
2195
|
+
"\n".join(part for part in (stdout, stderr) if part)
|
|
2196
|
+
)
|
|
2197
|
+
if not has_token_usage and ProcessManager._should_scan_short_success_stdout(stdout, signals):
|
|
2198
|
+
error_channels = "\n".join(filter(None, [error_channels, stdout.strip()]))
|
|
2199
|
+
return error_channels
|
|
2200
|
+
|
|
2141
2201
|
@staticmethod
|
|
2142
2202
|
def has_meaningful_agent_output(result: "TaskResult") -> bool:
|
|
2143
2203
|
"""Return True when the agent emitted real user-meaningful output."""
|
|
@@ -2156,29 +2216,12 @@ class ProcessManager:
|
|
|
2156
2216
|
Returns True for rate/quota limits AND API unavailability errors,
|
|
2157
2217
|
since a different agent (using a different API backend) may succeed.
|
|
2158
2218
|
|
|
2159
|
-
|
|
2160
|
-
|
|
2161
|
-
|
|
2162
|
-
|
|
2163
|
-
NOT scan stdout — it contains the agent's work output (configs, code)
|
|
2164
|
-
which naturally has terms like "rate_limit", "API_RATE_LIMIT_PER_MINUTE"
|
|
2165
|
-
that trigger false positives.
|
|
2219
|
+
For true success cases we still avoid scanning arbitrary stdout.
|
|
2220
|
+
The one safe exception is a tiny stdout payload with no assistant/result
|
|
2221
|
+
signals, which strongly indicates a pre-call CLI failure printed as
|
|
2222
|
+
plain text (for example "API Error: 429 ...").
|
|
2166
2223
|
"""
|
|
2167
|
-
|
|
2168
|
-
return False
|
|
2169
|
-
# When exit code is 0, _detect_agent_output_failure already checked
|
|
2170
|
-
# stderr+error for rate-limit patterns. Don't re-scan stdout here.
|
|
2171
|
-
if result.exit_code == 0:
|
|
2172
|
-
error_text = (
|
|
2173
|
-
(result.stderr or "")
|
|
2174
|
-
+ "\n" + (result.error or "")
|
|
2175
|
-
).lower()
|
|
2176
|
-
else:
|
|
2177
|
-
error_text = (
|
|
2178
|
-
(result.stderr or "")
|
|
2179
|
-
+ "\n" + (result.error or "")
|
|
2180
|
-
+ "\n" + (result.stdout or "")[-3000:]
|
|
2181
|
-
).lower()
|
|
2224
|
+
error_text = ProcessManager._failure_pattern_channels(result).lower()
|
|
2182
2225
|
return (
|
|
2183
2226
|
any(p in error_text for p in ProcessManager.RATE_LIMIT_PATTERNS)
|
|
2184
2227
|
or any(p in error_text for p in ProcessManager.AGENT_UNAVAILABLE_PATTERNS)
|
|
@@ -2198,16 +2241,13 @@ class ProcessManager:
|
|
|
2198
2241
|
if result.status != "success":
|
|
2199
2242
|
return None
|
|
2200
2243
|
|
|
2201
|
-
# For exit-code-0 (success) cases,
|
|
2202
|
-
#
|
|
2203
|
-
#
|
|
2204
|
-
#
|
|
2205
|
-
|
|
2206
|
-
|
|
2207
|
-
|
|
2208
|
-
# which is handled by is_rate_limited() called at the orchestrator level.
|
|
2209
|
-
error_only_channels = (result.stderr or "") + "\n" + (result.error or "")
|
|
2210
|
-
pattern_failure = ProcessManager._has_failure_pattern(error_only_channels)
|
|
2244
|
+
# For exit-code-0 (success) cases, avoid scanning arbitrary stdout for
|
|
2245
|
+
# quota keywords. The only safe stdout exception is a tiny payload with
|
|
2246
|
+
# no success signals, which typically means the CLI failed before making
|
|
2247
|
+
# a real model call and printed a plain-text error like "API Error: 429".
|
|
2248
|
+
pattern_failure = ProcessManager._has_failure_pattern(
|
|
2249
|
+
ProcessManager._failure_pattern_channels(result)
|
|
2250
|
+
)
|
|
2211
2251
|
if pattern_failure:
|
|
2212
2252
|
return pattern_failure
|
|
2213
2253
|
|
|
@@ -2223,8 +2263,13 @@ class ProcessManager:
|
|
|
2223
2263
|
has_turn_failed = signals["has_turn_failed"]
|
|
2224
2264
|
has_result = signals["has_result"]
|
|
2225
2265
|
has_meaningful_content = signals["has_meaningful_content"]
|
|
2266
|
+
has_assistant_events = signals["has_assistant_events"]
|
|
2226
2267
|
error_messages = signals["error_messages"]
|
|
2227
2268
|
json_line_count = signals["json_line_count"]
|
|
2269
|
+
has_token_usage = (
|
|
2270
|
+
int(result.metrics.get("token_input", 0) or 0)
|
|
2271
|
+
+ int(result.metrics.get("token_output", 0) or 0)
|
|
2272
|
+
) > 0
|
|
2228
2273
|
|
|
2229
2274
|
stderr_lower = stderr.lower()
|
|
2230
2275
|
if (
|
|
@@ -2256,14 +2301,27 @@ class ProcessManager:
|
|
|
2256
2301
|
and not has_meaningful_content and json_line_count > 0):
|
|
2257
2302
|
return f"Agent encountered errors without producing output: {error_messages[0]}"
|
|
2258
2303
|
|
|
2304
|
+
# Plain-text pre-call failures (not JSONL) can still exit 0 on some
|
|
2305
|
+
# agent CLIs. When stdout is tiny and lacks any structural success
|
|
2306
|
+
# signals, treat explicit error markers as agent failure so fallback can
|
|
2307
|
+
# run locally instead of relying on server-side re-enqueue.
|
|
2308
|
+
stdout_stripped = stdout.strip()
|
|
2309
|
+
stdout_lower = stdout_stripped.lower()
|
|
2310
|
+
if (
|
|
2311
|
+
not has_token_usage
|
|
2312
|
+
and ProcessManager._should_scan_short_success_stdout(stdout, signals)
|
|
2313
|
+
and any(marker in stdout_lower for marker in ("api error", "exception", "forbidden", "unauthorized"))
|
|
2314
|
+
):
|
|
2315
|
+
return stdout_stripped.splitlines()[-1][:300]
|
|
2316
|
+
|
|
2259
2317
|
# ── Claude: JSON output mode but no result object and no content ──
|
|
2260
2318
|
if agent_id == "claude" and json_line_count > 0:
|
|
2261
|
-
if not has_result and not has_meaningful_content:
|
|
2319
|
+
if not has_result and not has_meaningful_content and not has_assistant_events:
|
|
2262
2320
|
return "Claude produced no result output"
|
|
2263
2321
|
|
|
2264
2322
|
# ── Copilot: JSONL mode but no turn completion and no content ──
|
|
2265
2323
|
if agent_id == "copilot" and json_line_count > 0:
|
|
2266
|
-
if not has_result and not has_meaningful_content:
|
|
2324
|
+
if not has_result and not has_meaningful_content and not has_assistant_events:
|
|
2267
2325
|
return "Copilot produced no result output (check GitHub authentication: run 'gh auth login')"
|
|
2268
2326
|
|
|
2269
2327
|
return None
|
|
@@ -2331,8 +2389,9 @@ class ProcessManager:
|
|
|
2331
2389
|
return normalized
|
|
2332
2390
|
|
|
2333
2391
|
def _required_deliverable_paths(self, task: TaskInfo) -> set[str]:
|
|
2334
|
-
# For analysis nodes, deliverables live in analysis_output_dir (docs/requirements
|
|
2335
|
-
# For
|
|
2392
|
+
# For analysis nodes, deliverables live in analysis_output_dir (docs/requirements/<key>/analysis)
|
|
2393
|
+
# For delivery nodes, deliverables live in output_dir (docs/requirements/<key>/delivery)
|
|
2394
|
+
# For other nodes, use output_dir (docs/requirements/<key>/implement)
|
|
2336
2395
|
if task.node_type == "analysis":
|
|
2337
2396
|
output_dir = str(
|
|
2338
2397
|
(task.input_data or {}).get("analysis_output_dir", "")
|
|
@@ -2350,6 +2409,9 @@ class ProcessManager:
|
|
|
2350
2409
|
required_files = _get_analysis_outputs_for_type(req_type)
|
|
2351
2410
|
elif task.node_type == "design":
|
|
2352
2411
|
required_files = ["design.md"]
|
|
2412
|
+
elif task.node_type == "delivery":
|
|
2413
|
+
# Required docs come from node input_data (set by delivery_doc_service)
|
|
2414
|
+
required_files = (task.input_data or {}).get("required_docs") or ["release-note.md"]
|
|
2353
2415
|
else:
|
|
2354
2416
|
return set()
|
|
2355
2417
|
|
|
@@ -3503,6 +3565,7 @@ class ProgressReporter:
|
|
|
3503
3565
|
"stdout_tail": result.stdout[-20000:] if result.stdout else "",
|
|
3504
3566
|
"stderr_tail": result.stderr[-5000:] if result.stderr else "",
|
|
3505
3567
|
"error": result.error,
|
|
3568
|
+
"failure_code": result.failure_code,
|
|
3506
3569
|
"files_changed": result.files_changed,
|
|
3507
3570
|
"lines_added": result.lines_added,
|
|
3508
3571
|
"lines_removed": result.lines_removed,
|
|
@@ -4402,7 +4465,52 @@ class RuntimeDaemon:
|
|
|
4402
4465
|
)
|
|
4403
4466
|
logger.info("Workspace ready: %s", workspace_path)
|
|
4404
4467
|
|
|
4405
|
-
# 2.
|
|
4468
|
+
# 2.1 Workspace health check: detect broken checkout (Windows filename-
|
|
4469
|
+
# too-long or other git checkout failure that leaves the working tree
|
|
4470
|
+
# empty while the git index still tracks all source files).
|
|
4471
|
+
# If this is not caught the agent will run `git add -A` and commit a
|
|
4472
|
+
# catastrophic mass-deletion (e.g. SI-434: 47,566 files deleted).
|
|
4473
|
+
try:
|
|
4474
|
+
_index_count_out = await self._git(
|
|
4475
|
+
"ls-files", "--cached", "--", ".", cwd=workspace_path,
|
|
4476
|
+
timeout=30,
|
|
4477
|
+
)
|
|
4478
|
+
_index_count = len([l for l in _index_count_out.splitlines() if l.strip()])
|
|
4479
|
+
if _index_count > 500:
|
|
4480
|
+
# Count physical files (exclude .git/)
|
|
4481
|
+
_phys_count = sum(1 for _ in workspace_path.rglob("*")
|
|
4482
|
+
if _.is_file() and ".git" not in _.parts)
|
|
4483
|
+
_ratio = _phys_count / _index_count
|
|
4484
|
+
if _ratio < 0.20:
|
|
4485
|
+
# Less than 20 % of tracked files exist on disk — almost
|
|
4486
|
+
# certainly a failed git checkout (e.g. Windows path-length
|
|
4487
|
+
# limit). Abort rather than letting the agent commit a
|
|
4488
|
+
# mass-deletion.
|
|
4489
|
+
_longpath_hint = (
|
|
4490
|
+
" Enable Windows long-path support: run "
|
|
4491
|
+
"`git config --global core.longpaths true` and enable "
|
|
4492
|
+
"LongPathsEnabled in Windows Group Policy / Registry "
|
|
4493
|
+
"(HKLM\\SYSTEM\\CurrentControlSet\\Control\\FileSystem\\LongPathsEnabled=1)."
|
|
4494
|
+
if sys.platform == "win32" else ""
|
|
4495
|
+
)
|
|
4496
|
+
raise RuntimeError(
|
|
4497
|
+
f"Workspace health check failed: only {_phys_count}/{_index_count} "
|
|
4498
|
+
f"tracked files exist on disk ({_ratio:.0%}). "
|
|
4499
|
+
f"The git checkout likely failed due to filename-length limitations."
|
|
4500
|
+
f"{_longpath_hint}"
|
|
4501
|
+
)
|
|
4502
|
+
elif _ratio < 0.80:
|
|
4503
|
+
logger.warning(
|
|
4504
|
+
"Workspace health check warning: only %d/%d tracked files "
|
|
4505
|
+
"exist on disk (%.0f%%) for task %s — checkout may be incomplete.",
|
|
4506
|
+
_phys_count, _index_count, _ratio * 100, task.task_id,
|
|
4507
|
+
)
|
|
4508
|
+
except RuntimeError:
|
|
4509
|
+
raise
|
|
4510
|
+
except Exception as _health_exc:
|
|
4511
|
+
logger.warning("Workspace health check error (non-fatal): %s", _health_exc)
|
|
4512
|
+
|
|
4513
|
+
|
|
4406
4514
|
# agent run starts from a completely clean slate. This covers:
|
|
4407
4515
|
# • Type change: removes old-type files (e.g. PRD.md/SDD.md) so they
|
|
4408
4516
|
# don't coexist with the new type's files (e.g. diagnosis.md).
|
|
@@ -4630,6 +4738,10 @@ class RuntimeDaemon:
|
|
|
4630
4738
|
f"Original error: {result.error}"
|
|
4631
4739
|
)
|
|
4632
4740
|
result.status = "failed"
|
|
4741
|
+
# Signal to the server that ALL installed agents were tried and
|
|
4742
|
+
# all are rate/quota limited. The server must NOT re-enqueue on
|
|
4743
|
+
# the same runtime — that would hit the same quota wall.
|
|
4744
|
+
result.failure_code = "all_agents_rate_limited"
|
|
4633
4745
|
|
|
4634
4746
|
# 4. Collect git info BEFORE commit (shows uncommitted changes)
|
|
4635
4747
|
pre_commit_git = await self.process_manager._collect_git_info(workspace_path)
|
|
@@ -4715,6 +4827,72 @@ class RuntimeDaemon:
|
|
|
4715
4827
|
except Exception:
|
|
4716
4828
|
logger.exception("Validation gate error for task %s (proceeding anyway)", task.task_id)
|
|
4717
4829
|
|
|
4830
|
+
# 4.6 Post-validation rate-limit fallback.
|
|
4831
|
+
# _validate_and_retry returns early (preserving the rate-limit error)
|
|
4832
|
+
# when the agent hits a quota wall mid-retry. The initial-run fallback
|
|
4833
|
+
# block (step 3) only checked the *initial* run; if that succeeded but
|
|
4834
|
+
# the agent became rate-limited during a validation retry, we need a
|
|
4835
|
+
# second fallback pass here so the task is attempted on a fresh agent.
|
|
4836
|
+
if result.status == "failed" and self.process_manager.is_rate_limited(result) and not _skip_fallback:
|
|
4837
|
+
logger.warning(
|
|
4838
|
+
"Agent '%s' rate-limited during validation retry for task %s — "
|
|
4839
|
+
"attempting post-validation agent fallback",
|
|
4840
|
+
agent.agent_id, task.task_id,
|
|
4841
|
+
)
|
|
4842
|
+
_pv_fallback = self._select_fallback_agent(
|
|
4843
|
+
agent.agent_id, task.fallback_chain, tried_agents
|
|
4844
|
+
)
|
|
4845
|
+
while _pv_fallback:
|
|
4846
|
+
logger.info(
|
|
4847
|
+
"Post-validation fallback: '%s' → '%s' for task %s",
|
|
4848
|
+
agent.agent_id, _pv_fallback.agent_id, task.task_id,
|
|
4849
|
+
)
|
|
4850
|
+
agent = _pv_fallback
|
|
4851
|
+
tried_agents.add(agent.agent_id)
|
|
4852
|
+
await reporter.report_progress(
|
|
4853
|
+
task.task_id, 10,
|
|
4854
|
+
f"agent_fallback: retrying with {agent.agent_id}",
|
|
4855
|
+
output_lines=[
|
|
4856
|
+
f"[daemon] Agent rate-limited during validation, "
|
|
4857
|
+
f"switching to {agent.agent_id}",
|
|
4858
|
+
],
|
|
4859
|
+
)
|
|
4860
|
+
result = await self.process_manager.run_agent(
|
|
4861
|
+
agent, task, workspace_path, on_chunk=on_output_chunk,
|
|
4862
|
+
)
|
|
4863
|
+
if not self.process_manager.is_rate_limited(result):
|
|
4864
|
+
# Fallback agent ran successfully (or hit a non-rate-limit
|
|
4865
|
+
# failure) — re-run the validation gate and update git state.
|
|
4866
|
+
if result.status == "success":
|
|
4867
|
+
try:
|
|
4868
|
+
result = await self._validate_and_retry(
|
|
4869
|
+
agent, task, workspace_path, result,
|
|
4870
|
+
reporter, on_output_chunk, max_retries=2,
|
|
4871
|
+
)
|
|
4872
|
+
pre_commit_git = await self.process_manager._collect_git_info(workspace_path)
|
|
4873
|
+
except Exception:
|
|
4874
|
+
logger.exception(
|
|
4875
|
+
"Post-validation gate error for task %s (proceeding anyway)",
|
|
4876
|
+
task.task_id,
|
|
4877
|
+
)
|
|
4878
|
+
break
|
|
4879
|
+
logger.warning(
|
|
4880
|
+
"Post-validation fallback agent '%s' also rate-limited for task %s",
|
|
4881
|
+
agent.agent_id, task.task_id,
|
|
4882
|
+
)
|
|
4883
|
+
_pv_fallback = self._select_fallback_agent(
|
|
4884
|
+
agent.agent_id, task.fallback_chain, tried_agents
|
|
4885
|
+
)
|
|
4886
|
+
# If every agent we tried is still rate-limited, signal the server
|
|
4887
|
+
# NOT to re-enqueue — it would hit the same quota wall immediately.
|
|
4888
|
+
if self.process_manager.is_rate_limited(result):
|
|
4889
|
+
result.error = (
|
|
4890
|
+
f"All agents unavailable/rate-limited (tried: {', '.join(tried_agents)}). "
|
|
4891
|
+
f"Original error: {result.error}"
|
|
4892
|
+
)
|
|
4893
|
+
result.status = "failed"
|
|
4894
|
+
result.failure_code = "all_agents_rate_limited"
|
|
4895
|
+
|
|
4718
4896
|
# 4.55 Analysis/design nodes must update their deliverables in THIS run.
|
|
4719
4897
|
# Existing files from a prior iteration are not sufficient evidence.
|
|
4720
4898
|
if result.status == "success" and task.node_type in ("analysis", "design"):
|
|
@@ -5196,6 +5374,38 @@ class RuntimeDaemon:
|
|
|
5196
5374
|
# Flush any remaining buffered lines after agent finishes
|
|
5197
5375
|
await _flush_output_to_server()
|
|
5198
5376
|
|
|
5377
|
+
# 3.5 Agent fallback: if the chosen agent hit a rate/quota limit,
|
|
5378
|
+
# try the next available agent before giving up.
|
|
5379
|
+
_aj_tried: set[str] = {agent.agent_id}
|
|
5380
|
+
while self.process_manager.is_rate_limited(result):
|
|
5381
|
+
_aj_fallback = self._select_fallback_agent(agent.agent_id, [], _aj_tried)
|
|
5382
|
+
if not _aj_fallback:
|
|
5383
|
+
# All agents exhausted — signal server not to re-enqueue.
|
|
5384
|
+
result.failure_code = "all_agents_rate_limited"
|
|
5385
|
+
break
|
|
5386
|
+
logger.warning(
|
|
5387
|
+
"AIJob %s: agent '%s' rate-limited, falling back to '%s'",
|
|
5388
|
+
job_id, agent.agent_id, _aj_fallback.agent_id,
|
|
5389
|
+
)
|
|
5390
|
+
await conn.client.post(
|
|
5391
|
+
f"{reporter_url}/progress",
|
|
5392
|
+
json={
|
|
5393
|
+
"current_step": f"agent_fallback: retrying with {_aj_fallback.agent_id}",
|
|
5394
|
+
"output_lines": [
|
|
5395
|
+
f"[daemon] Agent rate-limited, switching to {_aj_fallback.agent_id}",
|
|
5396
|
+
],
|
|
5397
|
+
"progress_pct": 15,
|
|
5398
|
+
},
|
|
5399
|
+
timeout=5,
|
|
5400
|
+
)
|
|
5401
|
+
agent = _aj_fallback
|
|
5402
|
+
_aj_tried.add(agent.agent_id)
|
|
5403
|
+
fake_task.agent_type = agent.agent_id
|
|
5404
|
+
result = await self.process_manager.run_agent(
|
|
5405
|
+
agent, fake_task, workspace_path, on_chunk=on_chunk,
|
|
5406
|
+
)
|
|
5407
|
+
await _flush_output_to_server()
|
|
5408
|
+
|
|
5199
5409
|
# 4. Auto-commit if successful
|
|
5200
5410
|
input_ctx = aj.get("input_context", {})
|
|
5201
5411
|
git_info = {}
|
|
@@ -5242,6 +5452,10 @@ class RuntimeDaemon:
|
|
|
5242
5452
|
except Exception:
|
|
5243
5453
|
pass
|
|
5244
5454
|
|
|
5455
|
+
# Preserve all_agents_rate_limited so the server does NOT re-enqueue.
|
|
5456
|
+
_failure_code = result.failure_code if result.failure_code else (
|
|
5457
|
+
"agent_error" if result.status != "success" else ""
|
|
5458
|
+
)
|
|
5245
5459
|
complete_payload = {
|
|
5246
5460
|
"status": "success" if result.status == "success" else "failed",
|
|
5247
5461
|
"output_content": output_content,
|
|
@@ -5255,7 +5469,7 @@ class RuntimeDaemon:
|
|
|
5255
5469
|
"resolved_agent": agent.agent_id,
|
|
5256
5470
|
"git_info": git_info,
|
|
5257
5471
|
"error": result.error if result.status != "success" else "",
|
|
5258
|
-
"failure_code":
|
|
5472
|
+
"failure_code": _failure_code,
|
|
5259
5473
|
}
|
|
5260
5474
|
|
|
5261
5475
|
await conn.client.post(
|
|
@@ -5315,7 +5529,14 @@ class RuntimeDaemon:
|
|
|
5315
5529
|
],
|
|
5316
5530
|
)
|
|
5317
5531
|
|
|
5318
|
-
#
|
|
5532
|
+
# Save the original prompt BEFORE building the retry variant so we
|
|
5533
|
+
# can include it in fix_prompt. Without this the agent receives only
|
|
5534
|
+
# "fix validation errors" with zero task context and responds with
|
|
5535
|
+
# "I don't have a specific task to execute yet." (root cause confirmed
|
|
5536
|
+
# via Copilot JSONL output for SI-434/SI-446).
|
|
5537
|
+
original_prompt = task.input_prompt
|
|
5538
|
+
|
|
5539
|
+
# Build a targeted fix prompt: original task + validation issues.
|
|
5319
5540
|
_input = task.input_data or {}
|
|
5320
5541
|
_fix_doc_dir = (
|
|
5321
5542
|
_input.get("output_dir")
|
|
@@ -5323,8 +5544,11 @@ class RuntimeDaemon:
|
|
|
5323
5544
|
or ""
|
|
5324
5545
|
)
|
|
5325
5546
|
fix_prompt = (
|
|
5326
|
-
"
|
|
5327
|
-
"
|
|
5547
|
+
f"{original_prompt}\n\n"
|
|
5548
|
+
"---\n\n"
|
|
5549
|
+
"**IMPORTANT – Validation Retry:** The previous execution attempt "
|
|
5550
|
+
"did not produce all required output. Please complete the task above "
|
|
5551
|
+
"and ensure ALL of the following issues are resolved:\n\n"
|
|
5328
5552
|
f"{issues_text}\n\n"
|
|
5329
5553
|
)
|
|
5330
5554
|
if _fix_doc_dir:
|
|
@@ -5339,7 +5563,6 @@ class RuntimeDaemon:
|
|
|
5339
5563
|
)
|
|
5340
5564
|
|
|
5341
5565
|
# Override task prompt temporarily
|
|
5342
|
-
original_prompt = task.input_prompt
|
|
5343
5566
|
task.input_prompt = fix_prompt
|
|
5344
5567
|
|
|
5345
5568
|
try:
|
|
@@ -5349,6 +5572,20 @@ class RuntimeDaemon:
|
|
|
5349
5572
|
finally:
|
|
5350
5573
|
task.input_prompt = original_prompt
|
|
5351
5574
|
|
|
5575
|
+
# If the agent hit a rate/quota limit during this validation retry,
|
|
5576
|
+
# bail out immediately so the outer execution loop can trigger agent
|
|
5577
|
+
# fallback. Continuing to retry with the same rate-limited agent is
|
|
5578
|
+
# pointless; it will hit the same wall every time.
|
|
5579
|
+
# Returning early also preserves the rate-limit error in result.error
|
|
5580
|
+
# so that is_rate_limited() can detect it in the caller.
|
|
5581
|
+
if ProcessManager.is_rate_limited(result):
|
|
5582
|
+
logger.warning(
|
|
5583
|
+
"Agent '%s' rate-limited during validation retry for task %s "
|
|
5584
|
+
"(attempt %d/%d) — aborting validation retries for agent fallback",
|
|
5585
|
+
agent.agent_id, task.task_id, attempt + 1, max_retries,
|
|
5586
|
+
)
|
|
5587
|
+
return result
|
|
5588
|
+
|
|
5352
5589
|
# Final check after all retries
|
|
5353
5590
|
remaining = self._validate_outputs(workspace_path, task, result)
|
|
5354
5591
|
if remaining:
|
|
@@ -5822,7 +6059,7 @@ class RuntimeDaemon:
|
|
|
5822
6059
|
for f in files[:30]:
|
|
5823
6060
|
path = f["path"].lower()
|
|
5824
6061
|
fname = path.rsplit("/", 1)[-1]
|
|
5825
|
-
if "docs/requirements" in path:
|
|
6062
|
+
if "docs/requirements" in path or "docs/workitems" in path:
|
|
5826
6063
|
buckets["Analysis deliverables"].append(f)
|
|
5827
6064
|
elif (
|
|
5828
6065
|
"_test" in fname or fname.startswith("test_")
|
|
@@ -6032,24 +6269,49 @@ class RuntimeDaemon:
|
|
|
6032
6269
|
# 5. Run the agent with the conflict resolution prompt
|
|
6033
6270
|
logger.info("Invoking %s to resolve %d conflict(s)...", agent.agent_id, len(conflicted_files))
|
|
6034
6271
|
try:
|
|
6272
|
+
_cr_task = TaskInfo(
|
|
6273
|
+
task_id=f"{task.task_id}-conflict-resolve",
|
|
6274
|
+
graph_id=task.graph_id,
|
|
6275
|
+
node_type="conflict_resolution",
|
|
6276
|
+
agent_type=agent.agent_id,
|
|
6277
|
+
input_prompt=resolve_prompt,
|
|
6278
|
+
input_data={},
|
|
6279
|
+
timeout_seconds=min(task.timeout_seconds, 300), # cap at 5 min
|
|
6280
|
+
max_retries=0,
|
|
6281
|
+
retry_count=0,
|
|
6282
|
+
project=task.project,
|
|
6283
|
+
work_item=task.work_item,
|
|
6284
|
+
)
|
|
6035
6285
|
resolve_result = await self.process_manager.run_agent(
|
|
6036
6286
|
agent,
|
|
6037
|
-
|
|
6038
|
-
task_id=f"{task.task_id}-conflict-resolve",
|
|
6039
|
-
graph_id=task.graph_id,
|
|
6040
|
-
node_type="conflict_resolution",
|
|
6041
|
-
agent_type=agent.agent_id,
|
|
6042
|
-
input_prompt=resolve_prompt,
|
|
6043
|
-
input_data={},
|
|
6044
|
-
timeout_seconds=min(task.timeout_seconds, 300), # cap at 5 min
|
|
6045
|
-
max_retries=0,
|
|
6046
|
-
retry_count=0,
|
|
6047
|
-
project=task.project,
|
|
6048
|
-
work_item=task.work_item,
|
|
6049
|
-
),
|
|
6287
|
+
_cr_task,
|
|
6050
6288
|
workspace_path,
|
|
6051
6289
|
)
|
|
6052
6290
|
|
|
6291
|
+
# Agent fallback: if the primary agent is rate-limited, try others.
|
|
6292
|
+
_cr_tried: set[str] = {agent.agent_id}
|
|
6293
|
+
while self.process_manager.is_rate_limited(resolve_result):
|
|
6294
|
+
_cr_fallback = self._select_fallback_agent(agent.agent_id, task.fallback_chain, _cr_tried)
|
|
6295
|
+
if not _cr_fallback:
|
|
6296
|
+
logger.warning(
|
|
6297
|
+
"All agents rate-limited for conflict resolution of task %s — aborting merge",
|
|
6298
|
+
task.task_id,
|
|
6299
|
+
)
|
|
6300
|
+
try:
|
|
6301
|
+
await git("merge", "--abort", cwd=workspace_path)
|
|
6302
|
+
except RuntimeError:
|
|
6303
|
+
pass
|
|
6304
|
+
return
|
|
6305
|
+
logger.warning(
|
|
6306
|
+
"Conflict resolution: agent '%s' rate-limited for task %s, "
|
|
6307
|
+
"falling back to '%s'",
|
|
6308
|
+
agent.agent_id, task.task_id, _cr_fallback.agent_id,
|
|
6309
|
+
)
|
|
6310
|
+
agent = _cr_fallback
|
|
6311
|
+
_cr_tried.add(agent.agent_id)
|
|
6312
|
+
_cr_task.agent_type = agent.agent_id
|
|
6313
|
+
resolve_result = await self.process_manager.run_agent(agent, _cr_task, workspace_path)
|
|
6314
|
+
|
|
6053
6315
|
# 6. Check if conflicts are resolved
|
|
6054
6316
|
proc = await asyncio.create_subprocess_exec(
|
|
6055
6317
|
"git", "diff", "--name-only", "--diff-filter=U",
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|