@team-agent/installer 0.1.10 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/crates/team-agent-core/src/lib.rs +50 -5
- package/package.json +1 -1
- package/schemas/team.schema.json +1 -0
- package/skills/team-agent/SKILL.md +1 -1
- package/src/team_agent/approvals/__init__.py +65 -0
- package/src/team_agent/approvals/constants.py +6 -0
- package/src/team_agent/approvals/parsing.py +176 -0
- package/src/team_agent/approvals/runtime_prompts.py +171 -0
- package/src/team_agent/approvals/status.py +165 -0
- package/src/team_agent/cli/__init__.py +135 -0
- package/src/team_agent/cli/commands.py +335 -0
- package/src/team_agent/cli/e2e.py +202 -0
- package/src/team_agent/cli/helpers.py +137 -0
- package/src/team_agent/cli/parser.py +470 -0
- package/src/team_agent/compiler.py +98 -33
- package/src/team_agent/coordinator/__init__.py +53 -0
- package/src/team_agent/{coordinator.py → coordinator/__main__.py} +3 -1
- package/src/team_agent/coordinator/lifecycle.py +319 -0
- package/src/team_agent/coordinator/metadata.py +61 -0
- package/src/team_agent/coordinator/paths.py +17 -0
- package/src/team_agent/diagnose/__init__.py +48 -0
- package/src/team_agent/diagnose/checks.py +101 -0
- package/src/team_agent/diagnose/health.py +241 -0
- package/src/team_agent/diagnose/preflight.py +194 -0
- package/src/team_agent/diagnose/quick_start.py +233 -0
- package/src/team_agent/display/__init__.py +61 -0
- package/src/team_agent/display/close.py +147 -0
- package/src/team_agent/display/ghostty.py +77 -0
- package/src/team_agent/display/worker_window.py +110 -0
- package/src/team_agent/display/workspace.py +473 -0
- package/src/team_agent/launch/__init__.py +41 -0
- package/src/team_agent/launch/bootstrap.py +85 -0
- package/src/team_agent/launch/config.py +106 -0
- package/src/team_agent/launch/core.py +291 -0
- package/src/team_agent/launch/requirements.py +57 -0
- package/src/team_agent/leader/__init__.py +320 -0
- package/src/team_agent/lifecycle/__init__.py +5 -0
- package/src/team_agent/lifecycle/agents.py +226 -0
- package/src/team_agent/lifecycle/operations.py +321 -0
- package/src/team_agent/lifecycle/start.py +360 -0
- package/src/team_agent/mcp_server/__init__.py +42 -0
- package/src/team_agent/mcp_server/__main__.py +7 -0
- package/src/team_agent/mcp_server/contracts.py +148 -0
- package/src/team_agent/mcp_server/normalize.py +257 -0
- package/src/team_agent/mcp_server/server.py +150 -0
- package/src/team_agent/mcp_server/tools.py +205 -0
- package/src/team_agent/message_store/__init__.py +23 -0
- package/src/team_agent/message_store/agent_health.py +109 -0
- package/src/team_agent/{message_store.py → message_store/core.py} +188 -245
- package/src/team_agent/message_store/result_watchers.py +102 -0
- package/src/team_agent/message_store/schema.py +266 -0
- package/src/team_agent/messaging/__init__.py +1 -0
- package/src/team_agent/messaging/activity_detector.py +190 -0
- package/src/team_agent/messaging/delivery.py +128 -0
- package/src/team_agent/messaging/deps.py +263 -0
- package/src/team_agent/messaging/idle_alerts.py +217 -0
- package/src/team_agent/messaging/internal_delivery.py +46 -0
- package/src/team_agent/messaging/leader.py +317 -0
- package/src/team_agent/messaging/leader_panes.py +343 -0
- package/src/team_agent/messaging/result_delivery.py +300 -0
- package/src/team_agent/messaging/results.py +456 -0
- package/src/team_agent/messaging/scheduler.py +418 -0
- package/src/team_agent/messaging/send.py +493 -0
- package/src/team_agent/messaging/tmux_io.py +337 -0
- package/src/team_agent/messaging/tmux_prompt.py +229 -0
- package/src/team_agent/orchestrator/__init__.py +376 -0
- package/src/team_agent/orchestrator/plan.py +122 -0
- package/src/team_agent/orchestrator/state.py +128 -0
- package/src/team_agent/profiles/__init__.py +82 -0
- package/src/team_agent/profiles/constants.py +19 -0
- package/src/team_agent/profiles/core.py +407 -0
- package/src/team_agent/profiles/helpers.py +69 -0
- package/src/team_agent/profiles/provider_env.py +188 -0
- package/src/team_agent/profiles/smoke.py +201 -0
- package/src/team_agent/provider_cli/__init__.py +43 -0
- package/src/team_agent/provider_cli/adapter.py +167 -0
- package/src/team_agent/provider_cli/base.py +48 -0
- package/src/team_agent/provider_cli/claude.py +457 -0
- package/src/team_agent/provider_cli/codex.py +319 -0
- package/src/team_agent/provider_cli/copilot.py +8 -0
- package/src/team_agent/provider_cli/fake.py +39 -0
- package/src/team_agent/provider_cli/gemini.py +95 -0
- package/src/team_agent/provider_cli/opencode.py +8 -0
- package/src/team_agent/provider_cli/prompt.py +62 -0
- package/src/team_agent/provider_cli/registry.py +18 -0
- package/src/team_agent/provider_cli/unsupported.py +32 -0
- package/src/team_agent/providers.py +67 -949
- package/src/team_agent/quality_gates.py +104 -0
- package/src/team_agent/restart/__init__.py +34 -0
- package/src/team_agent/restart/orchestration.py +328 -0
- package/src/team_agent/restart/selection.py +89 -0
- package/src/team_agent/restart/snapshot.py +70 -0
- package/src/team_agent/runtime.py +802 -5740
- package/src/team_agent/rust_core.py +22 -5
- package/src/team_agent/sessions/__init__.py +25 -0
- package/src/team_agent/sessions/capture.py +93 -0
- package/src/team_agent/sessions/inventory.py +44 -0
- package/src/team_agent/sessions/resume.py +135 -0
- package/src/team_agent/spec.py +3 -1
- package/src/team_agent/state.py +204 -4
- package/src/team_agent/status/__init__.py +63 -0
- package/src/team_agent/status/approvals.py +52 -0
- package/src/team_agent/status/compact.py +158 -0
- package/src/team_agent/status/constants.py +18 -0
- package/src/team_agent/status/inbox.py +28 -0
- package/src/team_agent/status/peek.py +117 -0
- package/src/team_agent/status/queries.py +168 -0
- package/src/team_agent/terminal.py +57 -0
- package/src/team_agent/cli.py +0 -857
- package/src/team_agent/mcp_server.py +0 -579
- package/src/team_agent/profiles.py +0 -882
|
@@ -0,0 +1,418 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from team_agent.messaging.deps import (
|
|
4
|
+
EventLog,
|
|
5
|
+
MessageStore,
|
|
6
|
+
check_team_owner,
|
|
7
|
+
datetime,
|
|
8
|
+
json,
|
|
9
|
+
load_runtime_state,
|
|
10
|
+
load_spec,
|
|
11
|
+
save_runtime_state,
|
|
12
|
+
send_message,
|
|
13
|
+
team_state_key,
|
|
14
|
+
timedelta,
|
|
15
|
+
timezone,
|
|
16
|
+
)
|
|
17
|
+
from team_agent.messaging.activity_detector import classify_agent_activity, detect_compaction_degradation
|
|
18
|
+
from team_agent.messaging.internal_delivery import deliver_stored_message
|
|
19
|
+
from team_agent.messaging.result_delivery import delivered_result_message, result_id_from_text
|
|
20
|
+
from team_agent.state import team_state_candidates
|
|
21
|
+
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
from typing import Any
|
|
24
|
+
|
|
25
|
+
_ACTIVE_TASK_STATUSES = {"pending", "assigned", "in_progress", "ready", "running", "needs_retry"}
|
|
26
|
+
_INBOUND_WORK_STATUSES = {"pending", "accepted", "target_resolved", "injected"}
|
|
27
|
+
_DELIVERED_MESSAGE_STATUSES = {"visible", "submitted", "delivered", "acknowledged"}
|
|
28
|
+
_PROGRESS_EVENTS = {
|
|
29
|
+
"mcp.report_result",
|
|
30
|
+
"report_result.accepted",
|
|
31
|
+
"send.deliver_attempt",
|
|
32
|
+
"send.submitted",
|
|
33
|
+
"leader_receiver.deliver_attempt",
|
|
34
|
+
"leader_receiver.submitted",
|
|
35
|
+
"communication.peer_mirrored",
|
|
36
|
+
}
|
|
37
|
+
_RESTART_RESET_EVENTS = {"restart.agent_start", "restart.complete", "reset_agent.complete", "start_agent.complete"}
|
|
38
|
+
_ALERT_TYPES = {"stuck", "idle_fallback", "cross_worker_deadlock"}
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _fire_due_scheduled_events(workspace: Path, store: MessageStore, event_log: EventLog) -> list[int]:
|
|
42
|
+
fired: list[int] = []
|
|
43
|
+
for row in store.due_scheduled_events():
|
|
44
|
+
payload = json.loads(row["payload_json"] or "{}")
|
|
45
|
+
try:
|
|
46
|
+
if row["kind"] == "send":
|
|
47
|
+
content = str(payload.get("content") or "")
|
|
48
|
+
result_id = result_id_from_text(content)
|
|
49
|
+
existing = delivered_result_message(
|
|
50
|
+
store,
|
|
51
|
+
result_id or "",
|
|
52
|
+
task_id=payload.get("task_id"),
|
|
53
|
+
owner_team_id=row.get("owner_team_id"),
|
|
54
|
+
)
|
|
55
|
+
if existing:
|
|
56
|
+
result = {
|
|
57
|
+
"ok": True,
|
|
58
|
+
"status": "already_delivered",
|
|
59
|
+
"message_id": existing.get("message_id"),
|
|
60
|
+
"deduped": True,
|
|
61
|
+
}
|
|
62
|
+
event_log.write(
|
|
63
|
+
"coordinator.scheduled_result_deduped",
|
|
64
|
+
id=row["id"],
|
|
65
|
+
target=row["target"],
|
|
66
|
+
result_id=result_id,
|
|
67
|
+
message_id=existing.get("message_id"),
|
|
68
|
+
)
|
|
69
|
+
store.mark_scheduled_event(int(row["id"]), "done", result)
|
|
70
|
+
fired.append(int(row["id"]))
|
|
71
|
+
continue
|
|
72
|
+
deliver = deliver_stored_message if row.get("owner_team_id") else send_message
|
|
73
|
+
result = deliver(
|
|
74
|
+
workspace,
|
|
75
|
+
row["target"],
|
|
76
|
+
content,
|
|
77
|
+
task_id=payload.get("task_id"),
|
|
78
|
+
sender=payload.get("sender", "coordinator"),
|
|
79
|
+
requires_ack=bool(payload.get("requires_ack", True)),
|
|
80
|
+
wait_visible=bool(payload.get("wait_visible", True)),
|
|
81
|
+
timeout=float(payload.get("timeout", 30)),
|
|
82
|
+
team=row.get("owner_team_id"),
|
|
83
|
+
)
|
|
84
|
+
elif row["kind"] == "health_ping":
|
|
85
|
+
result = {"ok": True, "status": "logged"}
|
|
86
|
+
event_log.write("coordinator.health_ping", target=row["target"], payload=payload)
|
|
87
|
+
else:
|
|
88
|
+
result = {"ok": False, "error": f"unknown scheduled event kind: {row['kind']}"}
|
|
89
|
+
if not result.get("ok") and row["kind"] == "send":
|
|
90
|
+
retry = _schedule_send_retry(store, row, payload, result)
|
|
91
|
+
if retry:
|
|
92
|
+
result = {**result, **retry}
|
|
93
|
+
store.mark_scheduled_event(int(row["id"]), "retry_scheduled", result)
|
|
94
|
+
event_log.write(
|
|
95
|
+
"coordinator.scheduled_retry",
|
|
96
|
+
id=row["id"],
|
|
97
|
+
retry_event_id=retry["retry_event_id"],
|
|
98
|
+
target=row["target"],
|
|
99
|
+
attempt=retry["next_attempt"],
|
|
100
|
+
)
|
|
101
|
+
fired.append(int(row["id"]))
|
|
102
|
+
continue
|
|
103
|
+
store.mark_scheduled_event(int(row["id"]), "done" if result.get("ok") else "failed", result)
|
|
104
|
+
fired.append(int(row["id"]))
|
|
105
|
+
except Exception as exc:
|
|
106
|
+
result = {"ok": False, "error": str(exc)}
|
|
107
|
+
store.mark_scheduled_event(int(row["id"]), "failed", result)
|
|
108
|
+
event_log.write("coordinator.scheduled_failed", id=row["id"], error=str(exc))
|
|
109
|
+
return fired
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _schedule_send_retry(
|
|
113
|
+
store: MessageStore,
|
|
114
|
+
row: dict[str, Any],
|
|
115
|
+
payload: dict[str, Any],
|
|
116
|
+
result: dict[str, Any],
|
|
117
|
+
) -> dict[str, Any] | None:
|
|
118
|
+
attempt = int(payload.get("attempt") or 1)
|
|
119
|
+
max_attempts = int(payload.get("max_attempts") or 1)
|
|
120
|
+
if attempt >= max_attempts:
|
|
121
|
+
return None
|
|
122
|
+
retry_payload = dict(payload)
|
|
123
|
+
retry_payload["attempt"] = attempt + 1
|
|
124
|
+
due_at = datetime.now(timezone.utc) + timedelta(seconds=min(2 * attempt, 5))
|
|
125
|
+
retry_id = store.add_scheduled_event(due_at.isoformat(), row["target"], row["kind"], retry_payload, owner_team_id=row.get("owner_team_id"))
|
|
126
|
+
return {
|
|
127
|
+
"retry_event_id": retry_id,
|
|
128
|
+
"next_attempt": attempt + 1,
|
|
129
|
+
"max_attempts": max_attempts,
|
|
130
|
+
"retry_reason": result.get("reason") or result.get("error"),
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _detect_stuck_agents(
    workspace: Path,
    state: dict[str, Any],
    store: MessageStore,
    event_log: EventLog,
) -> list[str]:
    """Return ids of RUNNING agents with no output for at least the stuck timeout.

    The timeout (``stuck_timeout_sec``, default 300) and the minimum interval
    between leader notifications (``push_min_interval_sec``, default 60) are
    read from the ``runtime`` section of the spec when the spec file exists.

    An agent past the timeout is skipped, rather than reported, when it has no
    active task or inbound message, when an alert suppression is active for
    it, or when a progress event at or after its last output is found in the
    event log.

    For each reported agent a ``coordinator.agent_stuck`` event is written and,
    subject to the push interval, a message is sent to ``"leader"``. The last
    push time is stored in ``state["coordinator"]``; this function does not
    save ``state`` itself.
    """
    spec_path = Path(state.get("spec_path", workspace / "team.spec.yaml"))
    spec = load_spec(spec_path) if spec_path.exists() else {}
    # `.get("runtime", {})` would return None for a key that is present but
    # null; treat that the same as a missing section.
    runtime_cfg = spec.get("runtime") or {}
    stuck_timeout = int(runtime_cfg.get("stuck_timeout_sec", 300))
    push_min_interval = int(runtime_cfg.get("push_min_interval_sec", 60))
    owner_team_id = team_state_key(state)
    health = store.agent_health(owner_team_id=owner_team_id)
    stuck: list[str] = []
    now = datetime.now(timezone.utc)
    for agent_id, row in health.items():
        if row.get("status") not in {"RUNNING"} or not row.get("last_output_at"):
            continue
        try:
            last = datetime.fromisoformat(row["last_output_at"])
        except ValueError:
            # Unparseable timestamp: skip this agent.
            continue
        if last.tzinfo is None:
            # Naive timestamps are interpreted as UTC.
            last = last.replace(tzinfo=timezone.utc)
        if (now - last).total_seconds() < stuck_timeout:
            continue
        # Evaluated before the work check on purpose: this call can clear a
        # stale suppression and log that, so it must keep running for every
        # agent past the timeout.
        suppression = _active_alert_suppression(state, store, event_log, agent_id, "stuck")
        has_work, work_reason = _agent_has_stuck_relevant_work(state, store, agent_id)
        if not has_work:
            event_log.write("coordinator.agent_stuck_suppressed", agent_id=agent_id, reason="idle_no_work", last_output_at=row["last_output_at"])
            continue
        if suppression:
            continue
        progress_event = _recent_agent_progress_event(event_log, agent_id, last)
        if progress_event:
            event_log.write(
                "coordinator.agent_stuck_suppressed",
                agent_id=agent_id,
                reason="recent_progress_event",
                progress_event=progress_event.get("event"),
                progress_ts=progress_event.get("ts"),
                last_output_at=row["last_output_at"],
                work_reason=work_reason,
            )
            continue
        stuck.append(agent_id)
        state.setdefault("coordinator", {})
        push_key = f"last_stuck_push_at:{agent_id}"
        last_push_raw = state["coordinator"].get(push_key)
        should_push = True
        if last_push_raw:
            try:
                last_push = datetime.fromisoformat(last_push_raw)
                if last_push.tzinfo is None:
                    last_push = last_push.replace(tzinfo=timezone.utc)
                should_push = (now - last_push).total_seconds() >= push_min_interval
            except ValueError:
                # A corrupt stored timestamp must not block notifications.
                should_push = True
        event_log.write("coordinator.agent_stuck", agent_id=agent_id, last_output_at=row["last_output_at"], work_reason=work_reason)
        if should_push:
            # Recorded before sending, so a failed send is also rate-limited.
            state["coordinator"][push_key] = now.isoformat()
            try:
                send_message(
                    workspace,
                    "leader",
                    f"agent {agent_id} appears stuck: no output for {stuck_timeout}s",
                    sender="coordinator",
                    requires_ack=False,
                    wait_visible=False,
                    team=owner_team_id,
                )
            except Exception as exc:
                # Notification is best-effort; log and continue with other agents.
                event_log.write("coordinator.stuck_push_failed", agent_id=agent_id, error=str(exc))
    return stuck
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def stuck_list(workspace: Path) -> dict[str, Any]:
    """Report the suppressed idle alerts stored in the workspace runtime state.

    When suppressions are team-scoped (more than one team candidate), the
    caller identity from the environment is matched against each candidate's
    ``team_owner`` on ``pane_id``, ``provider`` and ``machine_fingerprint``.
    The matching team's suppressions are returned, or a ``refused`` result
    when no candidate matches.

    Otherwise the stored mapping is returned as is, except that a mapping with
    exactly one entry is unwrapped one level when its key is not a known team
    key, its value is a dict containing no alert-type keys, and all of that
    dict's values are dicts.
    """
    state = load_runtime_state(workspace)
    alerts = state.get("coordinator", {}).get("suppressed_idle_alerts", {})

    if _use_team_scoped_suppressions(state):
        from team_agent.state import _caller_identity_from_env

        caller = _caller_identity_from_env()
        candidates = team_state_candidates(state)

        def _owned_by_caller(candidate: dict[str, Any]) -> bool:
            # Compare the three identity fields in order, stopping at the first mismatch.
            owner = candidate.get("team_owner") or {}
            return all(
                caller[field] == (owner.get(field) or "")
                for field in ("pane_id", "provider", "machine_fingerprint")
            )

        caller_team = None
        if caller.get("pane_id"):
            caller_team = next(
                (key for key, candidate in candidates.items() if _owned_by_caller(candidate)),
                None,
            )
        if caller_team is not None:
            return {"ok": True, "suppressed_idle_alerts": alerts.get(caller_team, {}), "team": caller_team}
        return {
            "ok": False,
            "status": "refused",
            "reason": "team_owner_unresolved",
            "action": "set TEAM_AGENT_LEADER_PANE_ID/PROVIDER/MACHINE_FINGERPRINT to your team's claimed identity, or use team-agent takeover --confirm",
            "candidates": sorted(candidates),
        }

    team_keys = set(team_state_candidates(state).keys())
    overlaps_team_keys = bool(team_keys & set(alerts.keys()))
    if len(alerts) == 1 and not overlaps_team_keys:
        (lone,) = alerts.values()
        unwrap = (
            isinstance(lone, dict)
            and not (set(lone) & _ALERT_TYPES)
            and all(isinstance(item, dict) for item in lone.values())
        )
        if unwrap:
            alerts = lone
    return {"ok": True, "suppressed_idle_alerts": alerts}
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def stuck_cancel(
    workspace: Path,
    agent_id: str,
    alert_type: str = "stuck",
    suppressed_by: str = "leader",
) -> dict[str, Any]:
    """Suppress one alert type, or all of them, for an agent.

    ``alert_type`` must be one of ``_ALERT_TYPES`` or ``"all"``; anything else
    yields a ``refused`` result before any state is loaded. The result of
    ``check_team_owner`` is returned unchanged when it is truthy.

    Each suppression entry records the time (UTC, ISO format), who suppressed
    it, and the agent's current alert snapshot. Entries are stored under the
    team key when suppressions are team-scoped, otherwise directly under the
    agent id. The state is saved and an event is logged.
    """
    if alert_type != "all" and alert_type not in _ALERT_TYPES:
        return {"ok": False, "status": "refused", "reason": "invalid_alert_type", "alert_type": alert_type}
    alert_types = sorted(_ALERT_TYPES) if alert_type == "all" else [alert_type]

    state = load_runtime_state(workspace)
    refusal = check_team_owner(state)
    if refusal:
        return refusal

    store = MessageStore(workspace)
    team_key = team_state_key(state)
    alerts_root = state.setdefault("coordinator", {}).setdefault("suppressed_idle_alerts", {})
    if _use_team_scoped_suppressions(state):
        bucket = alerts_root.setdefault(team_key, {})
    else:
        bucket = alerts_root
    per_agent = bucket.setdefault(agent_id, {})

    stamp = datetime.now(timezone.utc).isoformat()
    snapshot = _agent_alert_snapshot(state, store, agent_id, team_key)
    per_agent.update(
        {
            name: {"suppressed_at": stamp, "suppressed_by": suppressed_by, "snapshot": snapshot}
            for name in alert_types
        }
    )

    save_runtime_state(workspace, state)
    EventLog(workspace).write("coordinator.idle_alert_suppressed", agent_id=agent_id, alert_types=alert_types, suppressed_by=suppressed_by)
    return {"ok": True, "agent_id": agent_id, "alert_types": alert_types, "suppressed": per_agent}
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
def _active_alert_suppression(
    state: dict[str, Any],
    store: MessageStore,
    event_log: EventLog,
    agent_id: str,
    alert_type: str,
) -> dict[str, Any] | None:
    """Return the suppression entry still in force for an agent/alert type, if any.

    The entry is looked up under the team key first and then directly under
    the agent id. If ``_suppression_clear_reason`` reports a reason, the entry
    is removed from ``state``, a ``coordinator.idle_alert_suppression_cleared``
    event is written, and ``None`` is returned.
    """
    team_key = team_state_key(state)
    alerts = state.get("coordinator", {}).get("suppressed_idle_alerts", {})

    found = alerts.get(team_key, {}).get(agent_id, {}).get(alert_type)
    if not isinstance(found, dict):
        # Fall back to the layout keyed directly by agent id.
        found = alerts.get(agent_id, {}).get(alert_type)
        if not isinstance(found, dict):
            return None

    reason = _suppression_clear_reason(state, store, event_log, agent_id, found)
    if not reason:
        return found

    _clear_alert_suppression(state, agent_id, alert_type, team_key)
    event_log.write("coordinator.idle_alert_suppression_cleared", agent_id=agent_id, alert_type=alert_type, reason=reason)
    return None
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
def _suppression_clear_reason(
    state: dict[str, Any],
    store: MessageStore,
    event_log: EventLog,
    agent_id: str,
    entry: dict[str, Any],
) -> str | None:
    """Explain why a suppression entry should be dropped, or return ``None`` to keep it.

    Checks, in order: the agent's assigned task ids differ from the stored
    snapshot; its delivered message ids differ; ``suppressed_at`` cannot be
    parsed; a progress event exists at or after the suppression time; a
    restart/reset event exists at or after the suppression time.
    """
    baseline = entry.get("snapshot")
    if not isinstance(baseline, dict):
        baseline = {}
    current = _agent_alert_snapshot(state, store, agent_id)

    snapshot_checks = (
        ("assigned_task_ids", "task_assignment_changed"),
        ("delivered_message_ids", "inbound_delivery_changed"),
    )
    for field, reason in snapshot_checks:
        if current.get(field) != baseline.get(field):
            return reason

    try:
        anchor = datetime.fromisoformat(str(entry.get("suppressed_at")))
    except ValueError:
        return "invalid_suppression_timestamp"
    if anchor.tzinfo is None:
        # Naive timestamps are interpreted as UTC.
        anchor = anchor.replace(tzinfo=timezone.utc)

    event_checks = (
        (_recent_agent_progress_event, "progress_event"),
        (_recent_restart_or_reset_event, "restart_or_reset"),
    )
    for probe, reason in event_checks:
        if probe(event_log, agent_id, anchor):
            return reason
    return None
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
def _clear_alert_suppression(state: dict[str, Any], agent_id: str, alert_type: str, owner_team_id: str | None = None) -> None:
|
|
334
|
+
suppressed = state.get("coordinator", {}).get("suppressed_idle_alerts", {})
|
|
335
|
+
if agent_id in suppressed:
|
|
336
|
+
agent_suppressions = suppressed.get(agent_id, {})
|
|
337
|
+
agent_suppressions.pop(alert_type, None)
|
|
338
|
+
if not agent_suppressions:
|
|
339
|
+
suppressed.pop(agent_id, None)
|
|
340
|
+
return
|
|
341
|
+
team_suppressions = suppressed.get(owner_team_id or team_state_key(state), {})
|
|
342
|
+
agent_suppressions = team_suppressions.get(agent_id, {})
|
|
343
|
+
agent_suppressions.pop(alert_type, None)
|
|
344
|
+
if not agent_suppressions:
|
|
345
|
+
team_suppressions.pop(agent_id, None)
|
|
346
|
+
if not team_suppressions:
|
|
347
|
+
suppressed.pop(owner_team_id or team_state_key(state), None)
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
def _use_team_scoped_suppressions(state: dict[str, Any]) -> bool:
    """Return True when the state has more than one team candidate."""
    candidate_count = len(team_state_candidates(state))
    return candidate_count > 1
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
def _agent_alert_snapshot(state: dict[str, Any], store: MessageStore, agent_id: str, owner_team_id: str | None = None) -> dict[str, Any]:
|
|
355
|
+
assigned_task_ids = sorted(str(task.get("id")) for task in state.get("tasks", []) if task.get("assignee") == agent_id)
|
|
356
|
+
delivered_message_ids = sorted(
|
|
357
|
+
str(message.get("message_id"))
|
|
358
|
+
for message in store.messages(owner_team_id=owner_team_id or team_state_key(state))
|
|
359
|
+
if message.get("recipient") == agent_id and message.get("status") in _DELIVERED_MESSAGE_STATUSES
|
|
360
|
+
)
|
|
361
|
+
return {"assigned_task_ids": assigned_task_ids, "delivered_message_ids": delivered_message_ids}
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
def _agent_has_stuck_relevant_work(state: dict[str, Any], store: MessageStore, agent_id: str) -> tuple[bool, str]:
    """Tell whether an agent has outstanding work, and which kind.

    Returns:
        ``(True, "active_task")`` when a task assigned to the agent has a
        status in ``_ACTIVE_TASK_STATUSES`` (a missing status counts as
        ``"pending"``); ``(True, "inbound_message")`` when a stored message
        addressed to the agent has a status in ``_INBOUND_WORK_STATUSES``;
        otherwise ``(False, "idle_no_work")``. Tasks are checked first.
    """
    owns_active_task = any(
        task.get("assignee") == agent_id and task.get("status", "pending") in _ACTIVE_TASK_STATUSES
        for task in state.get("tasks", [])
    )
    if owns_active_task:
        return True, "active_task"

    inbox = store.messages(owner_team_id=team_state_key(state))
    has_inbound = any(
        message.get("recipient") == agent_id and message.get("status") in _INBOUND_WORK_STATUSES
        for message in inbox
    )
    if has_inbound:
        return True, "inbound_message"
    return False, "idle_no_work"
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
def _recent_agent_progress_event(event_log: EventLog, agent_id: str, since: datetime) -> dict[str, Any] | None:
|
|
375
|
+
for event in reversed(event_log.tail(200)):
|
|
376
|
+
if event.get("event") not in _PROGRESS_EVENTS:
|
|
377
|
+
continue
|
|
378
|
+
if not _event_mentions_agent(event, agent_id):
|
|
379
|
+
continue
|
|
380
|
+
try:
|
|
381
|
+
ts = datetime.fromisoformat(str(event.get("ts")))
|
|
382
|
+
except ValueError:
|
|
383
|
+
continue
|
|
384
|
+
if ts.tzinfo is None:
|
|
385
|
+
ts = ts.replace(tzinfo=timezone.utc)
|
|
386
|
+
if ts >= since:
|
|
387
|
+
return event
|
|
388
|
+
return None
|
|
389
|
+
|
|
390
|
+
|
|
391
|
+
def _event_mentions_agent(event: dict[str, Any], agent_id: str) -> bool:
|
|
392
|
+
if event.get("agent_id") == agent_id or event.get("sender") == agent_id or event.get("target") == agent_id:
|
|
393
|
+
return True
|
|
394
|
+
payload = event.get("payload")
|
|
395
|
+
return isinstance(payload, dict) and (payload.get("from") == agent_id or payload.get("to") == agent_id)
|
|
396
|
+
|
|
397
|
+
|
|
398
|
+
def _recent_restart_or_reset_event(event_log: EventLog, agent_id: str, since: datetime) -> dict[str, Any] | None:
|
|
399
|
+
for event in reversed(event_log.tail(200)):
|
|
400
|
+
if event.get("event") not in _RESTART_RESET_EVENTS:
|
|
401
|
+
continue
|
|
402
|
+
if event.get("agent_id") != agent_id and agent_id not in set(event.get("agents") or []):
|
|
403
|
+
continue
|
|
404
|
+
try:
|
|
405
|
+
ts = datetime.fromisoformat(str(event.get("ts")))
|
|
406
|
+
except ValueError:
|
|
407
|
+
continue
|
|
408
|
+
if ts.tzinfo is None:
|
|
409
|
+
ts = ts.replace(tzinfo=timezone.utc)
|
|
410
|
+
if ts >= since:
|
|
411
|
+
return event
|
|
412
|
+
return None
|
|
413
|
+
|
|
414
|
+
|
|
415
|
+
from team_agent.messaging.idle_alerts import (
|
|
416
|
+
detect_cross_worker_deadlocks,
|
|
417
|
+
detect_idle_fallbacks,
|
|
418
|
+
)
|