@team-agent/installer 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/schemas/team.schema.json +6 -0
- package/src/team_agent/approvals/runtime_prompts.py +1 -1
- package/src/team_agent/cli/commands.py +104 -3
- package/src/team_agent/cli/parser.py +10 -1
- package/src/team_agent/coordinator/lifecycle.py +3 -0
- package/src/team_agent/diagnose/orphan_cleanup.py +199 -28
- package/src/team_agent/launch/core.py +2 -1
- package/src/team_agent/lifecycle/operations.py +1 -0
- package/src/team_agent/lifecycle/start.py +1 -1
- package/src/team_agent/message_store/core.py +8 -7
- package/src/team_agent/message_store/schema.py +8 -2
- package/src/team_agent/messaging/delivery.py +293 -1
- package/src/team_agent/messaging/leader.py +13 -4
- package/src/team_agent/messaging/leader_api_errors.py +216 -0
- package/src/team_agent/messaging/leader_panes.py +200 -0
- package/src/team_agent/messaging/scheduler.py +12 -0
- package/src/team_agent/messaging/send.py +21 -26
- package/src/team_agent/messaging/tmux_io.py +153 -23
- package/src/team_agent/messaging/tmux_prompt.py +87 -0
- package/src/team_agent/messaging/trust_auto_answer.py +44 -0
- package/src/team_agent/restart/orchestration.py +207 -4
- package/src/team_agent/runtime.py +3 -3
- package/src/team_agent/sessions/capture.py +65 -15
- package/src/team_agent/spec.py +59 -0
- package/src/team_agent/status/queries.py +32 -1
- package/src/team_agent/watch/__init__.py +145 -0
package/package.json
CHANGED
package/schemas/team.schema.json
CHANGED
|
@@ -72,6 +72,12 @@
|
|
|
72
72
|
"startup_order": {
|
|
73
73
|
"type": "array",
|
|
74
74
|
"items": { "type": "string" }
|
|
75
|
+
},
|
|
76
|
+
"auto_trust_own_workspace": {
|
|
77
|
+
"type": "boolean",
|
|
78
|
+
"default": false,
|
|
79
|
+
"deprecated": true,
|
|
80
|
+
"description": "DEPRECATED: use env TEAM_AGENT_AUTO_TRUST_OWN_WORKSPACE per session. Will be removed in 0.3.0."
|
|
75
81
|
}
|
|
76
82
|
}
|
|
77
83
|
},
|
|
@@ -64,7 +64,7 @@ def handle_provider_startup_prompts(workspace: Path, state: dict[str, Any], even
|
|
|
64
64
|
continue
|
|
65
65
|
agent_state["startup_prompt_check_count"] = check_count + 1
|
|
66
66
|
adapter = get_adapter(agent_state["provider"])
|
|
67
|
-
for prompt_event in adapter.handle_startup_prompts(session_name, window, checks=
|
|
67
|
+
for prompt_event in adapter.handle_startup_prompts(session_name, window, checks=20, sleep_s=0.5):
|
|
68
68
|
event_log.write(
|
|
69
69
|
"runtime.startup_prompt_handled",
|
|
70
70
|
agent_id=agent_id,
|
|
@@ -88,9 +88,25 @@ def cmd_settle(args: argparse.Namespace) -> dict[str, Any]:
|
|
|
88
88
|
|
|
89
89
|
|
|
90
90
|
def cmd_status(args: argparse.Namespace) -> dict[str, Any]:
|
|
91
|
-
if args
|
|
92
|
-
|
|
93
|
-
|
|
91
|
+
if getattr(args, "summary", False) is True:
|
|
92
|
+
if getattr(args, "json", False) is True:
|
|
93
|
+
raise TeamAgentError("--summary and --json are mutually exclusive")
|
|
94
|
+
if getattr(args, "agent", None):
|
|
95
|
+
raise TeamAgentError("status --summary does not accept an agent argument")
|
|
96
|
+
data = runtime.status(Path(args.workspace).resolve(), as_json=True, compact=False)
|
|
97
|
+
return _format_status_summary(data)
|
|
98
|
+
if getattr(args, "json", False) is True:
|
|
99
|
+
return runtime.status(Path(args.workspace).resolve(), as_json=True, compact=not (getattr(args, "detail", False) is True))
|
|
100
|
+
return runtime.format_status(Path(args.workspace).resolve(), getattr(args, "agent", None))
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def cmd_watch(args: argparse.Namespace) -> None:
|
|
104
|
+
from team_agent.watch import run_watch
|
|
105
|
+
try:
|
|
106
|
+
run_watch(Path(args.workspace).resolve(), team=getattr(args, "team", None))
|
|
107
|
+
except KeyboardInterrupt:
|
|
108
|
+
raise SystemExit(0)
|
|
109
|
+
raise SystemExit(0)
|
|
94
110
|
|
|
95
111
|
|
|
96
112
|
def cmd_approvals(args: argparse.Namespace) -> dict[str, Any]:
|
|
@@ -200,6 +216,14 @@ def cmd_validate_result(args: argparse.Namespace) -> dict[str, Any]:
|
|
|
200
216
|
|
|
201
217
|
|
|
202
218
|
def cmd_doctor(args: argparse.Namespace) -> dict[str, Any] | str:
|
|
219
|
+
gate = getattr(args, "gate", None)
|
|
220
|
+
if getattr(args, "fix", False) is True and not gate:
|
|
221
|
+
raise TeamAgentError("--fix requires --gate")
|
|
222
|
+
if isinstance(gate, str) and gate:
|
|
223
|
+
from team_agent.diagnose.orphan_cleanup import orphan_gate
|
|
224
|
+
if gate != "orphans":
|
|
225
|
+
raise TeamAgentError(f"unknown doctor gate: {gate}")
|
|
226
|
+
return orphan_gate(fix=bool(getattr(args, "fix", False)), confirm=bool(getattr(args, "confirm", False)))
|
|
203
227
|
if getattr(args, "cleanup_orphans", False):
|
|
204
228
|
from team_agent.diagnose.orphan_cleanup import cleanup_orphan_coordinators, format_cleanup_orphans
|
|
205
229
|
result = cleanup_orphan_coordinators(confirm=bool(getattr(args, "confirm", False)))
|
|
@@ -210,6 +234,83 @@ def cmd_doctor(args: argparse.Namespace) -> dict[str, Any] | str:
|
|
|
210
234
|
return runtime.doctor(spec)
|
|
211
235
|
|
|
212
236
|
|
|
237
|
+
def _format_status_summary(data: dict[str, Any]) -> str:
|
|
238
|
+
coordinator = data.get("coordinator") or {}
|
|
239
|
+
receiver = data.get("leader_receiver") or {}
|
|
240
|
+
agents = data.get("agents") or {}
|
|
241
|
+
health = data.get("agent_health") or {}
|
|
242
|
+
latest = (data.get("latest_results") or [{}])[0] if data.get("latest_results") else None
|
|
243
|
+
counts = _agent_summary_counts(agents, health)
|
|
244
|
+
agents_line = (
|
|
245
|
+
f"agents: {len(agents)} — running={counts['running']} busy={counts['busy']} "
|
|
246
|
+
f"idle={counts['idle']} stopped={counts['stopped']} failed={counts['failed']} "
|
|
247
|
+
f"unknown={counts['unknown']}"
|
|
248
|
+
)
|
|
249
|
+
# C3 (cr verdict, 2026-05-27): append a (N interacted, M never) marker
|
|
250
|
+
# only when at least one worker has a valid first_send_at stamp. When N
|
|
251
|
+
# is zero, the agents line stays byte-identical to the pre-Route-B
|
|
252
|
+
# output so the Gap 18a triage contract (strict five-line shape with
|
|
253
|
+
# exact line[2] string) remains unchanged.
|
|
254
|
+
interacted_count, never_count = _interaction_counts(agents)
|
|
255
|
+
if interacted_count > 0:
|
|
256
|
+
agents_line = f"{agents_line} ({interacted_count} interacted, {never_count} never)"
|
|
257
|
+
return "\n".join([
|
|
258
|
+
f"coordinator: {coordinator.get('status') or 'stopped'} schema_ok={bool(coordinator.get('schema_ok'))} tmux={bool(data.get('tmux_session_present'))}",
|
|
259
|
+
f"receiver: {receiver.get('pane_id') or '-'} cmd={receiver.get('pane_current_command') or receiver.get('current_command') or '-'}",
|
|
260
|
+
agents_line,
|
|
261
|
+
f"queued: {len(data.get('queued_messages') or [])} mailbox messages awaiting delivery",
|
|
262
|
+
_latest_result_line(latest),
|
|
263
|
+
])
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def _interaction_counts(agents: dict[str, Any]) -> tuple[int, int]:
|
|
267
|
+
"""Return (interacted, never_interacted) over the agents dict. An agent is
|
|
268
|
+
interacted when its `interacted` field (added by status.queries.status) is
|
|
269
|
+
a non-empty string other than the literal "never". This intentionally
|
|
270
|
+
sources from the enriched per-status interacted field rather than re-
|
|
271
|
+
parsing first_send_at so the summary stays a derived view."""
|
|
272
|
+
interacted = 0
|
|
273
|
+
never = 0
|
|
274
|
+
for entry in agents.values():
|
|
275
|
+
marker = (entry or {}).get("interacted") if isinstance(entry, dict) else None
|
|
276
|
+
if isinstance(marker, str) and marker and marker != "never":
|
|
277
|
+
interacted += 1
|
|
278
|
+
else:
|
|
279
|
+
never += 1
|
|
280
|
+
return interacted, never
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
def _agent_summary_counts(agents: dict[str, Any], health: dict[str, Any]) -> dict[str, int]:
|
|
284
|
+
counts = dict.fromkeys(("running", "busy", "idle", "stopped", "failed", "unknown"), 0)
|
|
285
|
+
for agent_id, agent in agents.items():
|
|
286
|
+
raw = str((agent or {}).get("status") or "").lower()
|
|
287
|
+
hstatus = str((health.get(agent_id) or {}).get("status") or "").lower()
|
|
288
|
+
if raw in {"failed", "error"} or hstatus in {"failed", "error"}:
|
|
289
|
+
counts["failed"] += 1
|
|
290
|
+
elif raw in {"stopped", "done"} or hstatus == "done":
|
|
291
|
+
counts["stopped"] += 1
|
|
292
|
+
elif raw == "busy" or hstatus in {"running", "working"}:
|
|
293
|
+
counts["busy"] += 1
|
|
294
|
+
elif hstatus == "idle":
|
|
295
|
+
counts["idle"] += 1
|
|
296
|
+
elif raw in {"blocked", "awaiting_approval", "interrupted", "missing", "stuck", "uncertain"} or hstatus in {
|
|
297
|
+
"blocked", "awaiting_approval", "interrupted", "missing", "stuck", "uncertain"
|
|
298
|
+
}:
|
|
299
|
+
counts["unknown"] += 1
|
|
300
|
+
elif raw == "running":
|
|
301
|
+
counts["running"] += 1
|
|
302
|
+
else:
|
|
303
|
+
counts["unknown"] += 1
|
|
304
|
+
return counts
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
def _latest_result_line(result: dict[str, Any] | None) -> str:
|
|
308
|
+
if not result:
|
|
309
|
+
return "latest result: none"
|
|
310
|
+
summary = str(result.get("summary") or "").replace("\n", " ")[:80]
|
|
311
|
+
return f"latest result: {result.get('agent_id') or '-'} -> {summary or '-'} @ {runtime._age_text(result.get('created_at'))}"
|
|
312
|
+
|
|
313
|
+
|
|
213
314
|
def cmd_shutdown(args: argparse.Namespace) -> dict[str, Any]:
|
|
214
315
|
return runtime.shutdown(Path(args.workspace).resolve(), keep_logs=args.keep_logs, team=args.team)
|
|
215
316
|
|
|
@@ -24,6 +24,7 @@ from team_agent.cli.commands import (
|
|
|
24
24
|
cmd_wait_ready,
|
|
25
25
|
cmd_settle,
|
|
26
26
|
cmd_status,
|
|
27
|
+
cmd_watch,
|
|
27
28
|
cmd_approvals,
|
|
28
29
|
cmd_peek,
|
|
29
30
|
cmd_inbox,
|
|
@@ -182,9 +183,15 @@ def main(argv: list[str] | None = None) -> None:
|
|
|
182
183
|
p.add_argument("agent", nargs="?")
|
|
183
184
|
p.add_argument("--workspace", default=".")
|
|
184
185
|
p.add_argument("--detail", action="store_true", help="Include full raw runtime state in --json output")
|
|
186
|
+
p.add_argument("--summary", action="store_true", help="Emit five-line human-readable triage summary")
|
|
185
187
|
add_json(p)
|
|
186
188
|
p.set_defaults(func=cmd_status)
|
|
187
189
|
|
|
190
|
+
p = sub.add_parser("watch", help="Watch leader-visible team events")
|
|
191
|
+
p.add_argument("--workspace", default=".")
|
|
192
|
+
p.add_argument("--team", help="Explicit team/session selector when a workspace has multiple teams")
|
|
193
|
+
p.set_defaults(func=cmd_watch)
|
|
194
|
+
|
|
188
195
|
p = sub.add_parser("approvals", help="Show structured pending worker approval prompts")
|
|
189
196
|
p.add_argument("agent", nargs="?")
|
|
190
197
|
p.add_argument("--workspace", default=".")
|
|
@@ -310,6 +317,8 @@ def main(argv: list[str] | None = None) -> None:
|
|
|
310
317
|
|
|
311
318
|
p = sub.add_parser("doctor", help="Check local dependencies, providers, auth hints, tmux, and MCP")
|
|
312
319
|
p.add_argument("spec", nargs="?")
|
|
320
|
+
p.add_argument("--gate", choices=["orphans"], help="Run a CI-friendly doctor gate")
|
|
321
|
+
p.add_argument("--fix", action="store_true", help="With --gate orphans: apply the gate fix")
|
|
313
322
|
p.add_argument(
|
|
314
323
|
"--cleanup-orphans",
|
|
315
324
|
action="store_true",
|
|
@@ -461,7 +470,7 @@ def main(argv: list[str] | None = None) -> None:
|
|
|
461
470
|
sub._choices_actions = [ # type: ignore[attr-defined]
|
|
462
471
|
action for action in sub._choices_actions if action.help != argparse.SUPPRESS # type: ignore[attr-defined]
|
|
463
472
|
]
|
|
464
|
-
sub.metavar = "{codex,claude,quick-start,send,status,approvals,inbox,takeover,claim-leader,identity,shutdown,restart,start-agent,stop-agent,reset-agent,add-agent,fork-agent,remove-agent,stuck-list,stuck-cancel,acknowledge-idle,doctor}"
|
|
473
|
+
sub.metavar = "{codex,claude,quick-start,send,status,watch,approvals,inbox,takeover,claim-leader,identity,shutdown,restart,start-agent,stop-agent,reset-agent,add-agent,fork-agent,remove-agent,stuck-list,stuck-cancel,acknowledge-idle,doctor}"
|
|
465
474
|
|
|
466
475
|
args = parser.parse_args(raw_argv)
|
|
467
476
|
try:
|
|
@@ -265,6 +265,7 @@ def coordinator_tick(workspace: Path) -> dict[str, Any]:
|
|
|
265
265
|
detect_idle_fallbacks,
|
|
266
266
|
)
|
|
267
267
|
from team_agent.messaging.activity_detector import detect_compaction_degradation
|
|
268
|
+
from team_agent.messaging.leader_api_errors import detect_leader_api_errors
|
|
268
269
|
from team_agent.messaging.session_drift import detect_session_drift
|
|
269
270
|
from team_agent.state import load_runtime_state, save_runtime_state
|
|
270
271
|
state = load_runtime_state(workspace)
|
|
@@ -318,6 +319,7 @@ def coordinator_tick(workspace: Path) -> dict[str, Any]:
|
|
|
318
319
|
)
|
|
319
320
|
if drift:
|
|
320
321
|
drift_results.append(drift)
|
|
322
|
+
api_errors = detect_leader_api_errors(workspace, state, store, event_log)
|
|
321
323
|
save_runtime_state(workspace, state)
|
|
322
324
|
results = _collect_results_and_notify_watchers(workspace, event_log)
|
|
323
325
|
# Stage 12: prune the dedupe log every tick — cheap O(n) delete bounded by 24h window.
|
|
@@ -338,5 +340,6 @@ def coordinator_tick(workspace: Path) -> dict[str, Any]:
|
|
|
338
340
|
"deadlock_alerts": deadlock_alerts,
|
|
339
341
|
"compaction": compaction_results,
|
|
340
342
|
"session_drift": drift_results,
|
|
343
|
+
"api_errors": api_errors,
|
|
341
344
|
"results": results,
|
|
342
345
|
}
|
|
@@ -16,7 +16,6 @@ import signal
|
|
|
16
16
|
import subprocess
|
|
17
17
|
import time
|
|
18
18
|
from datetime import datetime, timezone
|
|
19
|
-
from pathlib import Path
|
|
20
19
|
from typing import Any
|
|
21
20
|
|
|
22
21
|
# Pattern: argv contains "team_agent.coordinator --workspace <path>" anywhere.
|
|
@@ -39,6 +38,7 @@ _EPHEMERAL_PATH_HINTS = (
|
|
|
39
38
|
"team-agent-test-",
|
|
40
39
|
)
|
|
41
40
|
_SIGTERM_WAIT_SECONDS = 3.0
|
|
41
|
+
_SIGKILL_WAIT_SECONDS = 2.0
|
|
42
42
|
|
|
43
43
|
|
|
44
44
|
def find_coordinator_processes(*, runner=subprocess.run) -> list[dict[str, Any]]:
|
|
@@ -91,7 +91,7 @@ def classify_orphan(entry: dict[str, Any]) -> tuple[bool, str]:
|
|
|
91
91
|
workspace = entry.get("workspace")
|
|
92
92
|
if not workspace:
|
|
93
93
|
return False, "cmdline_unparsed"
|
|
94
|
-
if not
|
|
94
|
+
if not os.path.exists(workspace):
|
|
95
95
|
return True, "workspace_path_missing"
|
|
96
96
|
for hint in _EPHEMERAL_PATH_HINTS:
|
|
97
97
|
if hint in workspace:
|
|
@@ -104,12 +104,35 @@ def cleanup_orphan_coordinators(
|
|
|
104
104
|
confirm: bool = False,
|
|
105
105
|
runner=subprocess.run,
|
|
106
106
|
killer=os.kill,
|
|
107
|
+
pg_killer=None,
|
|
108
|
+
pgid_getter=None,
|
|
107
109
|
sleeper=time.sleep,
|
|
110
|
+
sigterm_wait_seconds: float = _SIGTERM_WAIT_SECONDS,
|
|
111
|
+
sigkill_wait_seconds: float = _SIGKILL_WAIT_SECONDS,
|
|
108
112
|
) -> dict[str, Any]:
|
|
109
113
|
"""Scan for orphan coordinators. Without confirm: dry-run (just classify and report).
|
|
110
|
-
With confirm: SIGTERM each orphan
|
|
111
|
-
|
|
114
|
+
With confirm: SIGTERM each orphan, wait up to _SIGTERM_WAIT_SECONDS for graceful
|
|
115
|
+
exit; if still alive, escalate to SIGKILL and wait _SIGKILL_WAIT_SECONDS. Only
|
|
116
|
+
report status='failed' (with error='alive_after_sigkill') when the process
|
|
117
|
+
survives BOTH signals — that's extremely rare and almost always indicates a
|
|
118
|
+
zombie/uninterruptible-sleep kernel state.
|
|
119
|
+
|
|
120
|
+
Mac mini 2026-05-26 evidence: real orphan coordinators have been observed alive
|
|
121
|
+
40+ hours; many of them never exit on SIGTERM (signal handler suppressed during
|
|
122
|
+
long sqlite reads, or the python interpreter is hosting an async loop that
|
|
123
|
+
swallows the term signal). SIGKILL escalation is required for production.
|
|
124
|
+
|
|
125
|
+
pg_killer / pgid_getter default to os.killpg / os.getpgid; mock them in tests.
|
|
126
|
+
If pgid_getter succeeds AND returns a pgid > 1 AND the pgid != pid (i.e. the
|
|
127
|
+
process leads its own process group with children), we signal the WHOLE group;
|
|
128
|
+
otherwise we signal the pid directly. This catches orphan coordinators that
|
|
129
|
+
spawned subprocess.Popen children which would otherwise survive a pid-only
|
|
130
|
+
SIGTERM."""
|
|
112
131
|
now = datetime.now(timezone.utc).isoformat()
|
|
132
|
+
if pg_killer is None:
|
|
133
|
+
pg_killer = getattr(os, "killpg", None)
|
|
134
|
+
if pgid_getter is None:
|
|
135
|
+
pgid_getter = getattr(os, "getpgid", None)
|
|
113
136
|
entries = find_coordinator_processes(runner=runner)
|
|
114
137
|
classified: list[dict[str, Any]] = []
|
|
115
138
|
orphans: list[dict[str, Any]] = []
|
|
@@ -131,30 +154,19 @@ def cleanup_orphan_coordinators(
|
|
|
131
154
|
killed: list[dict[str, Any]] = []
|
|
132
155
|
failed: list[dict[str, Any]] = []
|
|
133
156
|
for entry in orphans:
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
try:
|
|
146
|
-
killer(pid, 0)
|
|
147
|
-
except ProcessLookupError:
|
|
148
|
-
gone = True
|
|
149
|
-
break
|
|
150
|
-
except (PermissionError, OSError):
|
|
151
|
-
gone = True
|
|
152
|
-
break
|
|
153
|
-
sleeper(0.1)
|
|
154
|
-
if gone:
|
|
155
|
-
killed.append(entry)
|
|
157
|
+
outcome = _terminate_orphan(
|
|
158
|
+
entry["pid"], killer=killer, pg_killer=pg_killer,
|
|
159
|
+
pgid_getter=pgid_getter, sleeper=sleeper,
|
|
160
|
+
sigterm_wait_seconds=sigterm_wait_seconds,
|
|
161
|
+
sigkill_wait_seconds=sigkill_wait_seconds,
|
|
162
|
+
)
|
|
163
|
+
annotated = {**entry, **outcome}
|
|
164
|
+
if outcome.get("status") == "killed":
|
|
165
|
+
killed.append(annotated)
|
|
166
|
+
elif outcome.get("status") == "missing":
|
|
167
|
+
killed.append(annotated)
|
|
156
168
|
else:
|
|
157
|
-
failed.append(
|
|
169
|
+
failed.append(annotated)
|
|
158
170
|
return {
|
|
159
171
|
"ok": True,
|
|
160
172
|
"scanned": len(classified),
|
|
@@ -166,6 +178,162 @@ def cleanup_orphan_coordinators(
|
|
|
166
178
|
}
|
|
167
179
|
|
|
168
180
|
|
|
181
|
+
def _terminate_orphan(
|
|
182
|
+
pid: int,
|
|
183
|
+
*,
|
|
184
|
+
killer,
|
|
185
|
+
pg_killer,
|
|
186
|
+
pgid_getter,
|
|
187
|
+
sleeper,
|
|
188
|
+
sigterm_wait_seconds: float = _SIGTERM_WAIT_SECONDS,
|
|
189
|
+
sigkill_wait_seconds: float = _SIGKILL_WAIT_SECONDS,
|
|
190
|
+
) -> dict[str, Any]:
|
|
191
|
+
"""SIGTERM → wait 3s → SIGKILL → wait 2s escalation. Returns one of:
|
|
192
|
+
{status: 'killed', sigkill_required: False, signaled: 'pid'|'pgid'}
|
|
193
|
+
{status: 'killed', sigkill_required: True, signaled: 'pid'|'pgid'}
|
|
194
|
+
{status: 'missing', error: '<exc>'} — process gone before SIGTERM
|
|
195
|
+
{status: 'failed', error: 'alive_after_sigkill'} — process survived both
|
|
196
|
+
{status: 'failed', error: '<exc>'} — permission denied / OS error
|
|
197
|
+
"""
|
|
198
|
+
pgid, pgid_error = _safe_getpgid(pid, pgid_getter)
|
|
199
|
+
use_group = bool(pg_killer and pgid is not None and pgid > 1 and pgid != pid)
|
|
200
|
+
signaled = "pgid" if use_group else "pid"
|
|
201
|
+
|
|
202
|
+
def send(sig: int) -> tuple[bool, str | None]:
|
|
203
|
+
try:
|
|
204
|
+
if use_group:
|
|
205
|
+
pg_killer(pgid, sig)
|
|
206
|
+
else:
|
|
207
|
+
killer(pid, sig)
|
|
208
|
+
except ProcessLookupError:
|
|
209
|
+
return False, "process_lookup_error"
|
|
210
|
+
except (PermissionError, OSError) as exc:
|
|
211
|
+
return False, str(exc)
|
|
212
|
+
return True, None
|
|
213
|
+
|
|
214
|
+
ok, err = send(signal.SIGTERM)
|
|
215
|
+
if not ok:
|
|
216
|
+
if err == "process_lookup_error":
|
|
217
|
+
return {"status": "missing", "signaled": signaled, "pgid": pgid}
|
|
218
|
+
return {"status": "failed", "error": err, "signaled": signaled, "pgid": pgid}
|
|
219
|
+
if _wait_for_exit(pid, sigterm_wait_seconds, killer=killer, sleeper=sleeper):
|
|
220
|
+
return {
|
|
221
|
+
"status": "killed",
|
|
222
|
+
"sigkill_required": False,
|
|
223
|
+
"signaled": signaled,
|
|
224
|
+
"pgid": pgid,
|
|
225
|
+
"pgid_error": pgid_error,
|
|
226
|
+
}
|
|
227
|
+
# SIGTERM did not work — escalate.
|
|
228
|
+
ok, err = send(signal.SIGKILL)
|
|
229
|
+
if not ok:
|
|
230
|
+
if err == "process_lookup_error":
|
|
231
|
+
# Race: died between checks.
|
|
232
|
+
return {
|
|
233
|
+
"status": "killed",
|
|
234
|
+
"sigkill_required": False,
|
|
235
|
+
"signaled": signaled,
|
|
236
|
+
"pgid": pgid,
|
|
237
|
+
"pgid_error": pgid_error,
|
|
238
|
+
}
|
|
239
|
+
return {
|
|
240
|
+
"status": "failed",
|
|
241
|
+
"error": err,
|
|
242
|
+
"signaled": signaled,
|
|
243
|
+
"pgid": pgid,
|
|
244
|
+
"sigkill_attempted": True,
|
|
245
|
+
}
|
|
246
|
+
if _wait_for_exit(pid, sigkill_wait_seconds, killer=killer, sleeper=sleeper):
|
|
247
|
+
return {
|
|
248
|
+
"status": "killed",
|
|
249
|
+
"sigkill_required": True,
|
|
250
|
+
"signaled": signaled,
|
|
251
|
+
"pgid": pgid,
|
|
252
|
+
"pgid_error": pgid_error,
|
|
253
|
+
}
|
|
254
|
+
return {
|
|
255
|
+
"status": "failed",
|
|
256
|
+
"error": "alive_after_sigkill",
|
|
257
|
+
"signaled": signaled,
|
|
258
|
+
"pgid": pgid,
|
|
259
|
+
"sigkill_required": True,
|
|
260
|
+
}
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def _safe_getpgid(pid: int, pgid_getter) -> tuple[int | None, str | None]:
|
|
264
|
+
if pgid_getter is None:
|
|
265
|
+
return None, "getpgid_unavailable"
|
|
266
|
+
try:
|
|
267
|
+
return pgid_getter(pid), None
|
|
268
|
+
except (ProcessLookupError, PermissionError, OSError) as exc:
|
|
269
|
+
return None, str(exc)
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def _wait_for_exit(pid: int, timeout: float, *, killer, sleeper) -> bool:
|
|
273
|
+
deadline = time.monotonic() + max(timeout, 0.0)
|
|
274
|
+
while time.monotonic() < deadline:
|
|
275
|
+
try:
|
|
276
|
+
killer(pid, 0)
|
|
277
|
+
except ProcessLookupError:
|
|
278
|
+
return True
|
|
279
|
+
except (PermissionError, OSError):
|
|
280
|
+
return True
|
|
281
|
+
sleeper(0.1)
|
|
282
|
+
# Final check after the deadline elapses.
|
|
283
|
+
try:
|
|
284
|
+
killer(pid, 0)
|
|
285
|
+
except ProcessLookupError:
|
|
286
|
+
return True
|
|
287
|
+
except (PermissionError, OSError):
|
|
288
|
+
return True
|
|
289
|
+
return False
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
def orphan_gate(
|
|
293
|
+
*,
|
|
294
|
+
fix: bool = False,
|
|
295
|
+
confirm: bool = False,
|
|
296
|
+
runner=subprocess.run,
|
|
297
|
+
killer=os.kill,
|
|
298
|
+
pg_killer=None,
|
|
299
|
+
pgid_getter=None,
|
|
300
|
+
sleeper=time.sleep,
|
|
301
|
+
sigterm_wait_seconds: float = _SIGTERM_WAIT_SECONDS,
|
|
302
|
+
sigkill_wait_seconds: float = _SIGKILL_WAIT_SECONDS,
|
|
303
|
+
) -> dict[str, Any]:
|
|
304
|
+
if fix and not confirm:
|
|
305
|
+
return {
|
|
306
|
+
"ok": False,
|
|
307
|
+
"gate": "orphans",
|
|
308
|
+
"status": "refused",
|
|
309
|
+
"reason": "fix_requires_confirm",
|
|
310
|
+
"action": "re-run with --gate orphans --fix --confirm",
|
|
311
|
+
}
|
|
312
|
+
result = cleanup_orphan_coordinators(
|
|
313
|
+
confirm=fix and confirm,
|
|
314
|
+
runner=runner,
|
|
315
|
+
killer=killer,
|
|
316
|
+
pg_killer=pg_killer,
|
|
317
|
+
pgid_getter=pgid_getter,
|
|
318
|
+
sleeper=sleeper,
|
|
319
|
+
sigterm_wait_seconds=sigterm_wait_seconds,
|
|
320
|
+
sigkill_wait_seconds=sigkill_wait_seconds,
|
|
321
|
+
)
|
|
322
|
+
orphans = result.get("orphans") or []
|
|
323
|
+
failed = result.get("failed") or []
|
|
324
|
+
passed = not orphans if not fix else not failed
|
|
325
|
+
envelope = {
|
|
326
|
+
**result,
|
|
327
|
+
"ok": passed,
|
|
328
|
+
"gate": "orphans",
|
|
329
|
+
"status": "passed" if passed else "failed",
|
|
330
|
+
"fix": bool(fix),
|
|
331
|
+
}
|
|
332
|
+
if not fix and orphans:
|
|
333
|
+
envelope["action_required"] = "re-run with --gate orphans --fix --confirm"
|
|
334
|
+
return envelope
|
|
335
|
+
|
|
336
|
+
|
|
169
337
|
def format_cleanup_orphans(result: dict[str, Any]) -> str:
|
|
170
338
|
lines = [
|
|
171
339
|
f"Coordinator orphan scan @ {result.get('scanned_at')}",
|
|
@@ -175,7 +343,9 @@ def format_cleanup_orphans(result: dict[str, Any]) -> str:
|
|
|
175
343
|
if result.get("dry_run"):
|
|
176
344
|
lines.append(" mode: DRY-RUN (no SIGTERM sent; re-run with --confirm)")
|
|
177
345
|
else:
|
|
178
|
-
|
|
346
|
+
killed_entries = result.get("killed") or []
|
|
347
|
+
escalated = sum(1 for k in killed_entries if k.get("sigkill_required"))
|
|
348
|
+
lines.append(f" killed: {len(killed_entries)} (sigkill_required: {escalated})")
|
|
179
349
|
lines.append(f" failed: {len(result.get('failed') or [])}")
|
|
180
350
|
for orphan in result.get("orphans") or []:
|
|
181
351
|
lines.append(
|
|
@@ -190,4 +360,5 @@ __all__ = [
|
|
|
190
360
|
"classify_orphan",
|
|
191
361
|
"find_coordinator_processes",
|
|
192
362
|
"format_cleanup_orphans",
|
|
363
|
+
"orphan_gate",
|
|
193
364
|
]
|
|
@@ -215,7 +215,7 @@ def launch(
|
|
|
215
215
|
stdout=proc.stdout,
|
|
216
216
|
)
|
|
217
217
|
raise RuntimeError(f"Failed to start agent {agent['id']}: {proc.stderr.strip()}")
|
|
218
|
-
handled_prompts = adapter.handle_startup_prompts(session_name, agent["id"], checks=
|
|
218
|
+
handled_prompts = adapter.handle_startup_prompts(session_name, agent["id"], checks=20, sleep_s=0.5)
|
|
219
219
|
for prompt_event in handled_prompts:
|
|
220
220
|
event_log.write(
|
|
221
221
|
"launch.startup_prompt_handled",
|
|
@@ -261,6 +261,7 @@ def launch(
|
|
|
261
261
|
event_log,
|
|
262
262
|
timeout_s=1.5,
|
|
263
263
|
exclude_session_ids=known_session_ids,
|
|
264
|
+
raise_on_missed=False,
|
|
264
265
|
)
|
|
265
266
|
if state.get("display_backend") in GHOSTTY_DISPLAY_BACKENDS:
|
|
266
267
|
display_jobs.append((agent["id"], agent))
|
|
@@ -358,6 +358,7 @@ def fork_agent(
|
|
|
358
358
|
event_log,
|
|
359
359
|
timeout_s=1.5,
|
|
360
360
|
exclude_session_ids={source_session_id},
|
|
361
|
+
raise_on_missed=False,
|
|
361
362
|
)
|
|
362
363
|
if open_display and state.get("display_backend") in {"ghostty", "ghostty_window"}:
|
|
363
364
|
agent_state["display"] = _open_ghostty_worker_window(workspace, session_name, as_agent_id, new_agent, event_log)
|
|
@@ -319,7 +319,7 @@ def _start_agent_unlocked(workspace: Path, agent_id: str, force: bool, open_disp
|
|
|
319
319
|
_clear_session_capture_fields(agent_state)
|
|
320
320
|
if command_agent.get("_session_id"):
|
|
321
321
|
agent_state["_pending_session_id"] = command_agent["_session_id"]
|
|
322
|
-
_capture_agent_session(workspace, agent_id, agent_state, event_log, timeout_s=1.5, exclude_session_ids=known_session_ids)
|
|
322
|
+
_capture_agent_session(workspace, agent_id, agent_state, event_log, timeout_s=1.5, exclude_session_ids=known_session_ids, raise_on_missed=False)
|
|
323
323
|
if open_display and state.get("display_backend") in {"ghostty", "ghostty_window"}:
|
|
324
324
|
agent_state["display"] = _open_ghostty_worker_window(workspace, session_name, agent_id, agent, event_log)
|
|
325
325
|
elif open_display and state.get("display_backend") == "ghostty_workspace":
|
|
@@ -331,17 +331,17 @@ class MessageStore:
|
|
|
331
331
|
return counts
|
|
332
332
|
|
|
333
333
|
def add_result(self, envelope: dict[str, Any], owner_team_id: str | None = None) -> str:
|
|
334
|
-
_ = owner_team_id
|
|
335
334
|
validate_result_envelope(envelope)
|
|
336
335
|
result_id = f"res_{uuid.uuid4().hex[:12]}"
|
|
337
336
|
with closing(self.connect()) as conn:
|
|
338
337
|
with conn:
|
|
339
338
|
conn.execute(
|
|
340
339
|
"""
|
|
341
|
-
insert into results(result_id, task_id, agent_id, envelope, status, created_at)
|
|
342
|
-
values (?, ?, ?, ?, ?, ?)
|
|
340
|
+
insert into results(owner_team_id, result_id, task_id, agent_id, envelope, status, created_at)
|
|
341
|
+
values (?, ?, ?, ?, ?, ?, ?)
|
|
343
342
|
""",
|
|
344
343
|
(
|
|
344
|
+
owner_team_id,
|
|
345
345
|
result_id,
|
|
346
346
|
envelope["task_id"],
|
|
347
347
|
envelope["agent_id"],
|
|
@@ -423,16 +423,17 @@ class MessageStore:
|
|
|
423
423
|
return dict(row) if row else None
|
|
424
424
|
|
|
425
425
|
def latest_results(self, limit: int = 5, owner_team_id: str | None = None) -> list[dict[str, Any]]:
|
|
426
|
-
|
|
426
|
+
owner_clause = "and owner_team_id = ?" if owner_team_id else ""
|
|
427
|
+
args: tuple[Any, ...] = (owner_team_id, limit) if owner_team_id else (limit,)
|
|
427
428
|
with closing(self.connect()) as conn:
|
|
428
429
|
rows = conn.execute(
|
|
429
|
-
"""
|
|
430
|
+
f"""
|
|
430
431
|
select * from results
|
|
431
|
-
where status != 'invalid'
|
|
432
|
+
where status != 'invalid' {owner_clause}
|
|
432
433
|
order by created_at desc
|
|
433
434
|
limit ?
|
|
434
435
|
""",
|
|
435
|
-
|
|
436
|
+
args,
|
|
436
437
|
).fetchall()
|
|
437
438
|
return [dict(row) for row in reversed(rows)]
|
|
438
439
|
|
|
@@ -22,7 +22,7 @@ MESSAGE_COLUMNS = {
|
|
|
22
22
|
"error",
|
|
23
23
|
"delivery_attempts",
|
|
24
24
|
}
|
|
25
|
-
RESULT_COLUMNS = {"result_id", "task_id", "agent_id", "envelope", "status", "created_at"}
|
|
25
|
+
RESULT_COLUMNS = {"owner_team_id", "result_id", "task_id", "agent_id", "envelope", "status", "created_at"}
|
|
26
26
|
SCHEDULED_EVENT_COLUMNS = {
|
|
27
27
|
"id",
|
|
28
28
|
"owner_team_id",
|
|
@@ -125,6 +125,7 @@ def initialize_schema(conn: sqlite3.Connection) -> None:
|
|
|
125
125
|
"""
|
|
126
126
|
create table if not exists results (
|
|
127
127
|
result_id text primary key,
|
|
128
|
+
owner_team_id text,
|
|
128
129
|
task_id text not null,
|
|
129
130
|
agent_id text not null,
|
|
130
131
|
envelope text not null,
|
|
@@ -215,7 +216,12 @@ def initialize_schema(conn: sqlite3.Connection) -> None:
|
|
|
215
216
|
"owner_team_id": "alter table messages add column owner_team_id text",
|
|
216
217
|
},
|
|
217
218
|
)
|
|
218
|
-
_ensure_table_columns(
|
|
219
|
+
_ensure_table_columns(
|
|
220
|
+
conn,
|
|
221
|
+
"results",
|
|
222
|
+
RESULT_COLUMNS,
|
|
223
|
+
{"owner_team_id": "alter table results add column owner_team_id text"},
|
|
224
|
+
)
|
|
219
225
|
_ensure_table_columns(
|
|
220
226
|
conn,
|
|
221
227
|
"scheduled_events",
|