@team-agent/installer 0.2.2 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/schemas/team.schema.json +6 -0
- package/src/team_agent/abnormal_track.py +253 -0
- package/src/team_agent/approvals/runtime_prompts.py +1 -1
- package/src/team_agent/cli/commands.py +104 -3
- package/src/team_agent/cli/parser.py +10 -1
- package/src/team_agent/compiler.py +1 -1
- package/src/team_agent/coordinator/lifecycle.py +23 -2
- package/src/team_agent/diagnose/orphan_cleanup.py +199 -28
- package/src/team_agent/display/__init__.py +31 -0
- package/src/team_agent/display/adaptive.py +425 -0
- package/src/team_agent/display/backend.py +46 -0
- package/src/team_agent/display/close.py +6 -0
- package/src/team_agent/display/rebuild.py +102 -0
- package/src/team_agent/display/tiling.py +156 -0
- package/src/team_agent/display/worker_window.py +4 -0
- package/src/team_agent/display/workspace.py +36 -127
- package/src/team_agent/idle_predicate.py +200 -0
- package/src/team_agent/idle_takeover.py +59 -0
- package/src/team_agent/idle_takeover_wiring.py +111 -0
- package/src/team_agent/launch/core.py +14 -4
- package/src/team_agent/leader/__init__.py +444 -61
- package/src/team_agent/lifecycle/operations.py +1 -0
- package/src/team_agent/lifecycle/start.py +1 -1
- package/src/team_agent/message_store/core.py +38 -11
- package/src/team_agent/message_store/leader_notification_log.py +47 -26
- package/src/team_agent/message_store/schema.py +8 -2
- package/src/team_agent/messaging/delivery.py +336 -1
- package/src/team_agent/messaging/leader.py +13 -4
- package/src/team_agent/messaging/leader_api_errors.py +216 -0
- package/src/team_agent/messaging/leader_panes.py +294 -0
- package/src/team_agent/messaging/scheduler.py +12 -0
- package/src/team_agent/messaging/send.py +54 -26
- package/src/team_agent/messaging/tmux_io.py +202 -33
- package/src/team_agent/messaging/tmux_prompt.py +87 -0
- package/src/team_agent/messaging/trust_auto_answer.py +52 -0
- package/src/team_agent/provider_state/README.md +78 -0
- package/src/team_agent/provider_state/__init__.py +86 -0
- package/src/team_agent/provider_state/claude.py +86 -0
- package/src/team_agent/provider_state/codex.py +84 -0
- package/src/team_agent/provider_state/common.py +207 -0
- package/src/team_agent/provider_state/registry.py +118 -0
- package/src/team_agent/restart/orchestration.py +215 -12
- package/src/team_agent/runtime.py +65 -15
- package/src/team_agent/sessions/capture.py +65 -15
- package/src/team_agent/spec.py +63 -3
- package/src/team_agent/status/queries.py +32 -1
- package/src/team_agent/wake.py +58 -0
- package/src/team_agent/watch/__init__.py +145 -0
package/package.json
CHANGED
package/schemas/team.schema.json
CHANGED
|
@@ -72,6 +72,12 @@
|
|
|
72
72
|
"startup_order": {
|
|
73
73
|
"type": "array",
|
|
74
74
|
"items": { "type": "string" }
|
|
75
|
+
},
|
|
76
|
+
"auto_trust_own_workspace": {
|
|
77
|
+
"type": "boolean",
|
|
78
|
+
"default": false,
|
|
79
|
+
"deprecated": true,
|
|
80
|
+
"description": "DEPRECATED: use env TEAM_AGENT_AUTO_TRUST_OWN_WORKSPACE per session. Will be removed in 0.3.0."
|
|
75
81
|
}
|
|
76
82
|
}
|
|
77
83
|
},
|
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
"""Provider-neutral abnormal-state track (Gap 32 §4).
|
|
2
|
+
|
|
3
|
+
Reads structured fault records + process identity; never reads a screen and
|
|
4
|
+
never names a provider. Catch-bias for structured error/failed-class records
|
|
5
|
+
(C9), dedup by (signature, turn) (C8), and coordinator-independent whole-team
|
|
6
|
+
disappearance with clean-shutdown vs unexpected distinction (C10).
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def process_abnormal_records(
    records: list[dict[str, Any]],
    *,
    registry: Any,
    notification_state: dict[str, Any] | None,
    event_sink: Any = None,
) -> dict[str, Any]:
    """Turn raw provider session records into deduplicated fault notifications.

    Args:
        records: Raw session records handed to the provider reader.
        registry: Either a mapping naming the provider (``provider`` or
            ``kind`` key) or a mapping that carries its own
            ``error_whitelist`` / ``error_blacklist`` entries.
        notification_state: Previous state; its ``seen`` list holds the
            dedupe keys that were already notified. ``None`` means empty.
        event_sink: Optional callable that receives an ``abnormal.notify``
            event for every new notification.

    Returns:
        A dict with ``notifications`` (new, deduplicated faults),
        ``discovery_log`` (one entry per structured fault, including skipped
        ones), ``diagnostics`` and ``notification_state`` (a shallow copy of
        the input state with an updated, sorted ``seen`` list).
    """
    from team_agent.provider_state import read_fault_facts
    from team_agent.provider_state.registry import get_provider_registry

    next_state = dict(notification_state or {})
    already_seen = set(next_state.get("seen") or [])
    to_notify: list[dict[str, Any]] = []
    audit_trail: list[dict[str, Any]] = []
    diagnostics: list[dict[str, Any]] = []

    provider = _provider_of(registry)
    white, black = _lists_for(provider, registry, get_provider_registry)

    if provider:
        faults = read_fault_facts(provider, records or [])
    else:
        faults = []

    if records and not faults:
        # C9: records that yield no structured fault fact never notify by
        # default; they are only counted as a diagnostic.
        diagnostics.append({"kind": "no_structured_fault", "count": len(records)})

    for fact in faults:
        signature = str(fact.get("signature") or fact.get("reason") or "fault")
        turn_id = fact.get("turn_id")
        fragments = (signature, fact.get("reason"), _raw_message(fact))
        text = " ".join(str(part) for part in fragments if part).lower()
        decision = _classify(text, signature, white, black)

        # Every structured fault is logged, including whitelisted ones.
        audit_trail.append({
            "signature": signature,
            "turn_id": turn_id,
            "decision": decision,
            "kind": fact.get("kind"),
            "provider": provider,
        })
        if decision == "skip":
            continue

        # C8: dedupe on (signature, turn_id) so a retry loop inside one turn
        # produces a single notification. Without a turn_id, fall back to a
        # per-record content fingerprint so distinct faults are not merged
        # into one global bucket (identical duplicates still fold).
        if turn_id is None:
            bucket = f"norow:{_record_fingerprint(fact)}"
        else:
            bucket = turn_id
        seen_key = f"{signature}\x00{bucket}"
        if seen_key in already_seen:
            continue
        already_seen.add(seen_key)

        raw_payload = fact.get("raw", fact)
        if fact.get("kind") == "approval":
            fault_state = "blocked_on_human"
        else:
            fault_state = "abnormal"
        to_notify.append({
            "signature": signature,
            "turn_id": turn_id,
            "dedupe_key": (signature, bucket),
            "state": fault_state,
            "decision": decision,
            "provider": provider,
            "raw": raw_payload,
            "raw_record": raw_payload,
        })
        _emit(event_sink, "abnormal.notify", signature=signature, turn_id=turn_id, decision=decision)

    next_state["seen"] = sorted(already_seen)
    return {
        "notifications": to_notify,
        "discovery_log": audit_trail,
        "diagnostics": diagnostics,
        "notification_state": next_state,
    }
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def detect_whole_team_gone(
    snapshot: dict[str, Any],
    *,
    marker_store: Any,
    event_sink: Any = None,
) -> dict[str, Any]:
    """Coordinator-independent whole-team-gone detection (C10/C13).

    The team counts as gone only when the coordinator, the leader, every
    provider process and every tmux session are all absent. A disappearance
    flagged in the snapshot as ``clean_shutdown`` or ``restart_in_progress``
    is reported silently. Any other disappearance writes a durable
    ``whole_team_gone`` marker, emits an ``abnormal.whole_team_gone`` event
    and asks for user escalation on the next leader command.

    Args:
        snapshot: Process/session facts. Provider processes are read from
            ``provider_processes`` and, when that key is missing or ``None``,
            from ``nodes`` or ``agents``. Either a sequence of entries or a
            mapping of id -> entry is accepted.
        marker_store: Destination for the durable marker (see ``_marker_set``).
        event_sink: Optional event callable (see ``_emit``).

    Returns:
        A dict with ``state``, ``whole_team_gone``, ``classification``,
        ``notify``, ``escalate_user_on_next_leader_command`` and
        ``marker_written``.
    """
    coordinator = snapshot.get("coordinator") or {}
    leader = snapshot.get("leader") or {}
    provider_processes = snapshot.get("provider_processes")
    if provider_processes is None:
        provider_processes = snapshot.get("nodes") or snapshot.get("agents") or []
    if isinstance(provider_processes, dict):
        # Iterating a mapping yields its keys; a non-empty id string is always
        # truthy, which would make every team with an id -> entry mapping look
        # alive forever. Liveness has to be judged on the entries themselves.
        provider_processes = list(provider_processes.values())
    tmux_sessions = snapshot.get("tmux_sessions") or []

    coord_alive = _alive(coordinator)
    leader_alive = _alive(leader)
    any_worker_alive = any(_alive(p) for p in provider_processes)
    sessions_present = bool(tmux_sessions)

    whole_gone = not (coord_alive or leader_alive or any_worker_alive or sessions_present)

    if not whole_gone:
        return {
            "state": "alive",
            "whole_team_gone": False,
            "classification": "alive",
            "notify": False,
            "escalate_user_on_next_leader_command": False,
            "marker_written": False,
        }

    # Expected disappearances stay silent: no marker, no event, no escalation.
    if snapshot.get("clean_shutdown"):
        return _silent_gone("clean_shutdown")
    if snapshot.get("restart_in_progress"):
        return _silent_gone("restart_in_progress")

    # Unexpected disappearance (crash-style exit): durable marker + deferred
    # escalation.
    marker_written = _marker_set(marker_store, "whole_team_gone", {
        "classification": "unexpected_exit",
        "provider_processes": len(provider_processes),
    })
    _emit(event_sink, "abnormal.whole_team_gone", classification="unexpected_exit")
    return {
        "state": "whole_team_gone",
        "whole_team_gone": True,
        "classification": "unexpected_exit",
        "notify": True,
        "escalate_user_on_next_leader_command": True,
        "marker_written": bool(marker_written),
    }
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def _silent_gone(classification: str) -> dict[str, Any]:
|
|
151
|
+
return {
|
|
152
|
+
"state": classification,
|
|
153
|
+
"whole_team_gone": True,
|
|
154
|
+
"classification": classification,
|
|
155
|
+
"notify": False,
|
|
156
|
+
"escalate_user_on_next_leader_command": False,
|
|
157
|
+
"marker_written": False,
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def _alive(entry: Any) -> bool:
|
|
162
|
+
from team_agent.provider_state.common import process_is_live
|
|
163
|
+
|
|
164
|
+
if isinstance(entry, dict):
|
|
165
|
+
if "alive" in entry:
|
|
166
|
+
return entry.get("alive") is True
|
|
167
|
+
if "process" in entry:
|
|
168
|
+
ok, _r, _d = process_is_live(entry.get("process"))
|
|
169
|
+
return ok
|
|
170
|
+
ok, _r, _d = process_is_live(entry)
|
|
171
|
+
return ok
|
|
172
|
+
return bool(entry)
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def _provider_of(registry: Any) -> str | None:
|
|
176
|
+
if isinstance(registry, dict):
|
|
177
|
+
if isinstance(registry.get("provider"), str):
|
|
178
|
+
return registry.get("provider")
|
|
179
|
+
if isinstance(registry.get("kind"), str):
|
|
180
|
+
return registry.get("kind")
|
|
181
|
+
return None
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def _lists_for(provider: str | None, registry: Any, get_provider_registry: Any) -> tuple[list[str], list[str]]:
|
|
185
|
+
entry: Any = None
|
|
186
|
+
if isinstance(registry, dict) and ("error_whitelist" in registry or "error_blacklist" in registry):
|
|
187
|
+
entry = registry
|
|
188
|
+
elif provider is not None:
|
|
189
|
+
entry = get_provider_registry(provider)
|
|
190
|
+
if not isinstance(entry, dict):
|
|
191
|
+
return [], []
|
|
192
|
+
lists = entry.get("error_lists") if isinstance(entry.get("error_lists"), dict) else {}
|
|
193
|
+
white = [str(x).lower() for x in (lists.get("whitelist") or entry.get("error_whitelist") or [])]
|
|
194
|
+
black = [str(x).lower() for x in (lists.get("blacklist") or entry.get("error_blacklist") or [])]
|
|
195
|
+
return white, black
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def _classify(text: str, signature: str, white: list[str], black: list[str]) -> str:
|
|
199
|
+
sig = signature.lower()
|
|
200
|
+
if any(w and (w in text or w in sig) for w in white):
|
|
201
|
+
return "skip" # whitelist > blacklist > default
|
|
202
|
+
if any(b and (b in text or b in sig) for b in black):
|
|
203
|
+
return "notify_blacklist"
|
|
204
|
+
return "notify_default" # C9 catch-bias for structured faults
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def _record_fingerprint(fact: dict[str, Any]) -> str:
|
|
208
|
+
import hashlib
|
|
209
|
+
import json
|
|
210
|
+
|
|
211
|
+
raw = fact.get("raw", fact)
|
|
212
|
+
try:
|
|
213
|
+
blob = json.dumps(raw, sort_keys=True, default=str)
|
|
214
|
+
except (TypeError, ValueError):
|
|
215
|
+
blob = repr(raw)
|
|
216
|
+
return hashlib.sha256(blob.encode("utf-8", errors="ignore")).hexdigest()[:16]
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def _raw_message(fact: dict[str, Any]) -> str:
|
|
220
|
+
raw = fact.get("raw")
|
|
221
|
+
if isinstance(raw, dict):
|
|
222
|
+
return str(raw.get("message") or "")
|
|
223
|
+
return ""
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def _marker_set(marker_store: Any, name: str, value: Any) -> bool:
|
|
227
|
+
if marker_store is None:
|
|
228
|
+
return False
|
|
229
|
+
if isinstance(marker_store, dict):
|
|
230
|
+
marker_store[name] = value
|
|
231
|
+
return True
|
|
232
|
+
setter = getattr(marker_store, "set", None) or getattr(marker_store, "write", None)
|
|
233
|
+
if callable(setter):
|
|
234
|
+
try:
|
|
235
|
+
setter(name, value)
|
|
236
|
+
return True
|
|
237
|
+
except Exception:
|
|
238
|
+
return False
|
|
239
|
+
return False
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
def _emit(event_sink: Any, name: str, **fields: Any) -> None:
|
|
243
|
+
if event_sink is None:
|
|
244
|
+
return
|
|
245
|
+
try:
|
|
246
|
+
event_sink(name, fields)
|
|
247
|
+
except TypeError:
|
|
248
|
+
try:
|
|
249
|
+
event_sink({"event": name, **fields})
|
|
250
|
+
except Exception:
|
|
251
|
+
pass
|
|
252
|
+
except Exception:
|
|
253
|
+
pass
|
|
@@ -64,7 +64,7 @@ def handle_provider_startup_prompts(workspace: Path, state: dict[str, Any], even
|
|
|
64
64
|
continue
|
|
65
65
|
agent_state["startup_prompt_check_count"] = check_count + 1
|
|
66
66
|
adapter = get_adapter(agent_state["provider"])
|
|
67
|
-
for prompt_event in adapter.handle_startup_prompts(session_name, window, checks=
|
|
67
|
+
for prompt_event in adapter.handle_startup_prompts(session_name, window, checks=20, sleep_s=0.5):
|
|
68
68
|
event_log.write(
|
|
69
69
|
"runtime.startup_prompt_handled",
|
|
70
70
|
agent_id=agent_id,
|
|
@@ -88,9 +88,25 @@ def cmd_settle(args: argparse.Namespace) -> dict[str, Any]:
|
|
|
88
88
|
|
|
89
89
|
|
|
90
90
|
def cmd_status(args: argparse.Namespace) -> dict[str, Any] | str:
    """Handle ``status``: summary text, JSON data, or the formatted view.

    * ``--summary`` returns the string built by ``_format_status_summary``
      from the non-compact JSON status. It cannot be combined with ``--json``
      or with an agent argument.
    * ``--json`` returns ``runtime.status(...)``, compact unless ``--detail``
      is set.
    * Otherwise returns ``runtime.format_status(...)`` for the optional agent.

    Flags are only honoured when they are exactly ``True``.

    Raises:
        TeamAgentError: ``--summary`` combined with ``--json`` or an agent.
    """
    wants_summary = getattr(args, "summary", False) is True
    wants_json = getattr(args, "json", False) is True

    if wants_summary:
        if wants_json:
            raise TeamAgentError("--summary and --json are mutually exclusive")
        if getattr(args, "agent", None):
            raise TeamAgentError("status --summary does not accept an agent argument")
        data = runtime.status(Path(args.workspace).resolve(), as_json=True, compact=False)
        return _format_status_summary(data)

    if wants_json:
        wants_detail = getattr(args, "detail", False) is True
        return runtime.status(Path(args.workspace).resolve(), as_json=True, compact=not wants_detail)

    return runtime.format_status(Path(args.workspace).resolve(), getattr(args, "agent", None))
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def cmd_watch(args: argparse.Namespace) -> None:
    """Run the watch loop for the workspace, then exit with status 0.

    ``SystemExit(0)` is raised both when ``run_watch`` returns and when it is
    interrupted with Ctrl-C, so this function never returns normally.
    """
    from team_agent.watch import run_watch

    try:
        run_watch(Path(args.workspace).resolve(), team=getattr(args, "team", None))
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop watching; treat it as success.
        pass
    raise SystemExit(0)
|
|
94
110
|
|
|
95
111
|
|
|
96
112
|
def cmd_approvals(args: argparse.Namespace) -> dict[str, Any]:
|
|
@@ -200,6 +216,14 @@ def cmd_validate_result(args: argparse.Namespace) -> dict[str, Any]:
|
|
|
200
216
|
|
|
201
217
|
|
|
202
218
|
def cmd_doctor(args: argparse.Namespace) -> dict[str, Any] | str:
|
|
219
|
+
gate = getattr(args, "gate", None)
|
|
220
|
+
if getattr(args, "fix", False) is True and not gate:
|
|
221
|
+
raise TeamAgentError("--fix requires --gate")
|
|
222
|
+
if isinstance(gate, str) and gate:
|
|
223
|
+
from team_agent.diagnose.orphan_cleanup import orphan_gate
|
|
224
|
+
if gate != "orphans":
|
|
225
|
+
raise TeamAgentError(f"unknown doctor gate: {gate}")
|
|
226
|
+
return orphan_gate(fix=bool(getattr(args, "fix", False)), confirm=bool(getattr(args, "confirm", False)))
|
|
203
227
|
if getattr(args, "cleanup_orphans", False):
|
|
204
228
|
from team_agent.diagnose.orphan_cleanup import cleanup_orphan_coordinators, format_cleanup_orphans
|
|
205
229
|
result = cleanup_orphan_coordinators(confirm=bool(getattr(args, "confirm", False)))
|
|
@@ -210,6 +234,83 @@ def cmd_doctor(args: argparse.Namespace) -> dict[str, Any] | str:
|
|
|
210
234
|
return runtime.doctor(spec)
|
|
211
235
|
|
|
212
236
|
|
|
237
|
+
def _format_status_summary(data: dict[str, Any]) -> str:
    """Render the five-line triage summary from a status payload.

    The lines are, in order: coordinator, receiver, agents, queued messages
    and latest result, joined with newlines.
    """
    coordinator = data.get("coordinator") or {}
    receiver = data.get("leader_receiver") or {}
    agents = data.get("agents") or {}
    health = data.get("agent_health") or {}
    results = data.get("latest_results")
    latest = results[0] if results else None

    counts = _agent_summary_counts(agents, health)
    agents_line = (
        f"agents: {len(agents)} — running={counts['running']} busy={counts['busy']} "
        f"idle={counts['idle']} stopped={counts['stopped']} failed={counts['failed']} "
        f"unknown={counts['unknown']}"
    )

    # C3 (cr verdict, 2026-05-27): the "(N interacted, M never)" marker is
    # appended only when at least one worker counts as interacted. With N == 0
    # the agents line stays byte-identical to the pre-Route-B output, keeping
    # the Gap 18a triage contract (strict five-line shape, exact line[2]
    # string) unchanged.
    interacted_count, never_count = _interaction_counts(agents)
    if interacted_count > 0:
        agents_line = f"{agents_line} ({interacted_count} interacted, {never_count} never)"

    coordinator_line = f"coordinator: {coordinator.get('status') or 'stopped'} schema_ok={bool(coordinator.get('schema_ok'))} tmux={bool(data.get('tmux_session_present'))}"
    receiver_line = f"receiver: {receiver.get('pane_id') or '-'} cmd={receiver.get('pane_current_command') or receiver.get('current_command') or '-'}"
    queued_line = f"queued: {len(data.get('queued_messages') or [])} mailbox messages awaiting delivery"
    latest_line = _latest_result_line(latest)

    lines = [coordinator_line, receiver_line, agents_line, queued_line, latest_line]
    return "\n".join(lines)
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def _interaction_counts(agents: dict[str, Any]) -> tuple[int, int]:
|
|
267
|
+
"""Return (interacted, never_interacted) over the agents dict. An agent is
|
|
268
|
+
interacted when its `interacted` field (added by status.queries.status) is
|
|
269
|
+
a non-empty string other than the literal "never". This intentionally
|
|
270
|
+
sources from the enriched per-status interacted field rather than re-
|
|
271
|
+
parsing first_send_at so the summary stays a derived view."""
|
|
272
|
+
interacted = 0
|
|
273
|
+
never = 0
|
|
274
|
+
for entry in agents.values():
|
|
275
|
+
marker = (entry or {}).get("interacted") if isinstance(entry, dict) else None
|
|
276
|
+
if isinstance(marker, str) and marker and marker != "never":
|
|
277
|
+
interacted += 1
|
|
278
|
+
else:
|
|
279
|
+
never += 1
|
|
280
|
+
return interacted, never
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
def _agent_summary_counts(agents: dict[str, Any], health: dict[str, Any]) -> dict[str, int]:
|
|
284
|
+
counts = dict.fromkeys(("running", "busy", "idle", "stopped", "failed", "unknown"), 0)
|
|
285
|
+
for agent_id, agent in agents.items():
|
|
286
|
+
raw = str((agent or {}).get("status") or "").lower()
|
|
287
|
+
hstatus = str((health.get(agent_id) or {}).get("status") or "").lower()
|
|
288
|
+
if raw in {"failed", "error"} or hstatus in {"failed", "error"}:
|
|
289
|
+
counts["failed"] += 1
|
|
290
|
+
elif raw in {"stopped", "done"} or hstatus == "done":
|
|
291
|
+
counts["stopped"] += 1
|
|
292
|
+
elif raw == "busy" or hstatus in {"running", "working"}:
|
|
293
|
+
counts["busy"] += 1
|
|
294
|
+
elif hstatus == "idle":
|
|
295
|
+
counts["idle"] += 1
|
|
296
|
+
elif raw in {"blocked", "awaiting_approval", "interrupted", "missing", "stuck", "uncertain"} or hstatus in {
|
|
297
|
+
"blocked", "awaiting_approval", "interrupted", "missing", "stuck", "uncertain"
|
|
298
|
+
}:
|
|
299
|
+
counts["unknown"] += 1
|
|
300
|
+
elif raw == "running":
|
|
301
|
+
counts["running"] += 1
|
|
302
|
+
else:
|
|
303
|
+
counts["unknown"] += 1
|
|
304
|
+
return counts
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
def _latest_result_line(result: dict[str, Any] | None) -> str:
|
|
308
|
+
if not result:
|
|
309
|
+
return "latest result: none"
|
|
310
|
+
summary = str(result.get("summary") or "").replace("\n", " ")[:80]
|
|
311
|
+
return f"latest result: {result.get('agent_id') or '-'} -> {summary or '-'} @ {runtime._age_text(result.get('created_at'))}"
|
|
312
|
+
|
|
313
|
+
|
|
213
314
|
def cmd_shutdown(args: argparse.Namespace) -> dict[str, Any]:
|
|
214
315
|
return runtime.shutdown(Path(args.workspace).resolve(), keep_logs=args.keep_logs, team=args.team)
|
|
215
316
|
|
|
@@ -24,6 +24,7 @@ from team_agent.cli.commands import (
|
|
|
24
24
|
cmd_wait_ready,
|
|
25
25
|
cmd_settle,
|
|
26
26
|
cmd_status,
|
|
27
|
+
cmd_watch,
|
|
27
28
|
cmd_approvals,
|
|
28
29
|
cmd_peek,
|
|
29
30
|
cmd_inbox,
|
|
@@ -182,9 +183,15 @@ def main(argv: list[str] | None = None) -> None:
|
|
|
182
183
|
p.add_argument("agent", nargs="?")
|
|
183
184
|
p.add_argument("--workspace", default=".")
|
|
184
185
|
p.add_argument("--detail", action="store_true", help="Include full raw runtime state in --json output")
|
|
186
|
+
p.add_argument("--summary", action="store_true", help="Emit five-line human-readable triage summary")
|
|
185
187
|
add_json(p)
|
|
186
188
|
p.set_defaults(func=cmd_status)
|
|
187
189
|
|
|
190
|
+
p = sub.add_parser("watch", help="Watch leader-visible team events")
|
|
191
|
+
p.add_argument("--workspace", default=".")
|
|
192
|
+
p.add_argument("--team", help="Explicit team/session selector when a workspace has multiple teams")
|
|
193
|
+
p.set_defaults(func=cmd_watch)
|
|
194
|
+
|
|
188
195
|
p = sub.add_parser("approvals", help="Show structured pending worker approval prompts")
|
|
189
196
|
p.add_argument("agent", nargs="?")
|
|
190
197
|
p.add_argument("--workspace", default=".")
|
|
@@ -310,6 +317,8 @@ def main(argv: list[str] | None = None) -> None:
|
|
|
310
317
|
|
|
311
318
|
p = sub.add_parser("doctor", help="Check local dependencies, providers, auth hints, tmux, and MCP")
|
|
312
319
|
p.add_argument("spec", nargs="?")
|
|
320
|
+
p.add_argument("--gate", choices=["orphans"], help="Run a CI-friendly doctor gate")
|
|
321
|
+
p.add_argument("--fix", action="store_true", help="With --gate orphans: apply the gate fix")
|
|
313
322
|
p.add_argument(
|
|
314
323
|
"--cleanup-orphans",
|
|
315
324
|
action="store_true",
|
|
@@ -461,7 +470,7 @@ def main(argv: list[str] | None = None) -> None:
|
|
|
461
470
|
sub._choices_actions = [ # type: ignore[attr-defined]
|
|
462
471
|
action for action in sub._choices_actions if action.help != argparse.SUPPRESS # type: ignore[attr-defined]
|
|
463
472
|
]
|
|
464
|
-
sub.metavar = "{codex,claude,quick-start,send,status,approvals,inbox,takeover,claim-leader,identity,shutdown,restart,start-agent,stop-agent,reset-agent,add-agent,fork-agent,remove-agent,stuck-list,stuck-cancel,acknowledge-idle,doctor}"
|
|
473
|
+
sub.metavar = "{codex,claude,quick-start,send,status,watch,approvals,inbox,takeover,claim-leader,identity,shutdown,restart,start-agent,stop-agent,reset-agent,add-agent,fork-agent,remove-agent,stuck-list,stuck-cancel,acknowledge-idle,doctor}"
|
|
465
474
|
|
|
466
475
|
args = parser.parse_args(raw_argv)
|
|
467
476
|
try:
|
|
@@ -93,7 +93,7 @@ def compile_team(team_dir: Path, out_path: Path | None = None) -> dict[str, Any]
|
|
|
93
93
|
},
|
|
94
94
|
"runtime": {
|
|
95
95
|
"backend": "tmux",
|
|
96
|
-
"display_backend": str(team_meta.get("display_backend") or "
|
|
96
|
+
"display_backend": str(team_meta.get("display_backend") or "adaptive"),
|
|
97
97
|
"session_name": str(team_meta.get("session_name") or f"team-{_slug(team_name)}"),
|
|
98
98
|
"auto_launch": True,
|
|
99
99
|
"require_user_approval_before_launch": True,
|
|
@@ -262,9 +262,12 @@ def coordinator_tick(workspace: Path) -> dict[str, Any]:
|
|
|
262
262
|
)
|
|
263
263
|
from team_agent.messaging.idle_alerts import (
|
|
264
264
|
detect_cross_worker_deadlocks,
|
|
265
|
-
detect_idle_fallbacks,
|
|
266
265
|
)
|
|
266
|
+
from team_agent.idle_predicate import evaluate_takeover_reminder
|
|
267
|
+
from team_agent.idle_takeover_wiring import build_idle_nodes, push_idle_reminder, IDLE_DEBOUNCE_SECONDS
|
|
268
|
+
import time as _time
|
|
267
269
|
from team_agent.messaging.activity_detector import detect_compaction_degradation
|
|
270
|
+
from team_agent.messaging.leader_api_errors import detect_leader_api_errors
|
|
268
271
|
from team_agent.messaging.session_drift import detect_session_drift
|
|
269
272
|
from team_agent.state import load_runtime_state, save_runtime_state
|
|
270
273
|
state = load_runtime_state(workspace)
|
|
@@ -282,7 +285,23 @@ def coordinator_tick(workspace: Path) -> dict[str, Any]:
|
|
|
282
285
|
delivered = _deliver_pending_messages(workspace, state, event_log)
|
|
283
286
|
fired = _fire_due_scheduled_events(workspace, store, event_log)
|
|
284
287
|
stuck = _detect_stuck_agents(workspace, state, store, event_log)
|
|
285
|
-
|
|
288
|
+
# Gap 32: the take-over reminder is driven by file-fact turn-state via the
|
|
289
|
+
# idle_takeover predicate (the legacy screen-scrape obligation path is retired).
|
|
290
|
+
_coord_meta = state.setdefault("coordinator", {})
|
|
291
|
+
idle_eval = evaluate_takeover_reminder(
|
|
292
|
+
build_idle_nodes(state),
|
|
293
|
+
monitor_state=_coord_meta.get("idle_takeover_monitor"),
|
|
294
|
+
now_monotonic=_time.monotonic(),
|
|
295
|
+
debounce_seconds=IDLE_DEBOUNCE_SECONDS,
|
|
296
|
+
)
|
|
297
|
+
_coord_meta["idle_takeover_monitor"] = idle_eval.get("monitor_state")
|
|
298
|
+
push_idle_reminder(workspace, state, event_log, idle_eval)
|
|
299
|
+
idle_alerts = (
|
|
300
|
+
[{"alert_type": "idle_takeover", "message": idle_eval.get("message"),
|
|
301
|
+
"reason": idle_eval.get("reason"), "interrupted": idle_eval.get("interrupted_nodes")}]
|
|
302
|
+
if idle_eval.get("should_ping")
|
|
303
|
+
else []
|
|
304
|
+
)
|
|
286
305
|
deadlock_alerts = detect_cross_worker_deadlocks(workspace, state, store, event_log)
|
|
287
306
|
compaction_results: list[dict[str, Any]] = []
|
|
288
307
|
for agent_id, agent_state in state.get("agents", {}).items():
|
|
@@ -318,6 +337,7 @@ def coordinator_tick(workspace: Path) -> dict[str, Any]:
|
|
|
318
337
|
)
|
|
319
338
|
if drift:
|
|
320
339
|
drift_results.append(drift)
|
|
340
|
+
api_errors = detect_leader_api_errors(workspace, state, store, event_log)
|
|
321
341
|
save_runtime_state(workspace, state)
|
|
322
342
|
results = _collect_results_and_notify_watchers(workspace, event_log)
|
|
323
343
|
# Stage 12: prune the dedupe log every tick — cheap O(n) delete bounded by 24h window.
|
|
@@ -338,5 +358,6 @@ def coordinator_tick(workspace: Path) -> dict[str, Any]:
|
|
|
338
358
|
"deadlock_alerts": deadlock_alerts,
|
|
339
359
|
"compaction": compaction_results,
|
|
340
360
|
"session_drift": drift_results,
|
|
361
|
+
"api_errors": api_errors,
|
|
341
362
|
"results": results,
|
|
342
363
|
}
|