@team-agent/installer 0.2.1 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. package/package.json +1 -1
  2. package/schemas/team.schema.json +6 -0
  3. package/src/team_agent/approvals/runtime_prompts.py +1 -1
  4. package/src/team_agent/cli/commands.py +122 -6
  5. package/src/team_agent/cli/parser.py +42 -1
  6. package/src/team_agent/coordinator/__main__.py +21 -2
  7. package/src/team_agent/coordinator/lifecycle.py +11 -0
  8. package/src/team_agent/diagnose/orphan_cleanup.py +364 -0
  9. package/src/team_agent/events.py +47 -0
  10. package/src/team_agent/launch/core.py +2 -1
  11. package/src/team_agent/leader/__init__.py +273 -60
  12. package/src/team_agent/lifecycle/agents.py +54 -2
  13. package/src/team_agent/lifecycle/operations.py +87 -9
  14. package/src/team_agent/lifecycle/start.py +1 -1
  15. package/src/team_agent/message_store/core.py +8 -7
  16. package/src/team_agent/message_store/leader_notification_log.py +132 -0
  17. package/src/team_agent/message_store/result_watchers.py +144 -1
  18. package/src/team_agent/message_store/schema.py +31 -2
  19. package/src/team_agent/messaging/delivery.py +293 -1
  20. package/src/team_agent/messaging/idle_alerts.py +109 -9
  21. package/src/team_agent/messaging/leader.py +179 -10
  22. package/src/team_agent/messaging/leader_api_errors.py +216 -0
  23. package/src/team_agent/messaging/leader_panes.py +393 -23
  24. package/src/team_agent/messaging/result_delivery.py +219 -4
  25. package/src/team_agent/messaging/results.py +12 -21
  26. package/src/team_agent/messaging/scheduler.py +24 -2
  27. package/src/team_agent/messaging/send.py +21 -26
  28. package/src/team_agent/messaging/tmux_io.py +153 -23
  29. package/src/team_agent/messaging/tmux_prompt.py +87 -0
  30. package/src/team_agent/messaging/trust_auto_answer.py +44 -0
  31. package/src/team_agent/restart/orchestration.py +207 -4
  32. package/src/team_agent/runtime.py +7 -7
  33. package/src/team_agent/rust_core.py +157 -3
  34. package/src/team_agent/sessions/capture.py +65 -15
  35. package/src/team_agent/spec.py +59 -0
  36. package/src/team_agent/state.py +153 -10
  37. package/src/team_agent/status/inbox.py +33 -3
  38. package/src/team_agent/status/queries.py +32 -1
  39. package/src/team_agent/watch/__init__.py +145 -0
@@ -0,0 +1,364 @@
1
+ """Stage 14 (Gap 37a) — `team-agent doctor --cleanup-orphans` implementation.
2
+
3
+ Scans `ps` for processes matching `team_agent.coordinator --workspace <path>` and
4
+ classifies any whose workspace path no longer exists (or matches the test-tempdir
5
+ pattern) as an orphan. Dry-run by default; --confirm sends SIGTERM, then SIGKILL if the process does not exit.
6
+
7
+ Mac mini 2026-05-26 evidence: 35 orphan coordinator processes alive simultaneously
8
+ pointing at /var/folders/.../T/team-agent-watcher-dedupe-* paths that had been removed
9
+ hours earlier. Each holds a long-lived Python interpreter + SQLite connection.
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import os
14
+ import re
15
+ import signal
16
+ import subprocess
17
+ import time
18
+ from datetime import datetime, timezone
19
+ from typing import Any
20
+
21
# Pattern: argv contains "team_agent.coordinator --workspace <path>" anywhere.
# An optional ".__main__" module suffix and intervening arguments are accepted.
# The capture group is `\S+`, so it stops at the first whitespace character that
# follows the start of the path.
_COORDINATOR_ARGV_RE = re.compile(
    r"team_agent\.coordinator(?:\.__main__)?(?:\s+|.*?)\s--workspace\s+(\S+)"
)
# Test-tempdir patterns that indicate the workspace is ephemeral and almost certainly
# an orphan. classify_orphan() matches each hint as a plain substring of the
# workspace path.
_EPHEMERAL_PATH_HINTS = (
    "team-agent-watcher-dedupe-",
    "team-agent-gap",
    "team-agent-stage",
    "team-agent-orchestrator-",
    "team-agent-rm-",
    "team-agent-claim-",
    "team-agent-hotfix",
    "team-agent-multi",
    "team-agent-progress-",
    "team-agent-fanout-",
    "team-agent-in-flight-",
    "team-agent-test-",
)
# Default number of seconds to wait for a graceful exit after SIGTERM.
_SIGTERM_WAIT_SECONDS = 3.0
# Default number of seconds to wait after SIGKILL before reporting failure.
_SIGKILL_WAIT_SECONDS = 2.0
42
+
43
+
44
def find_coordinator_processes(*, runner=subprocess.run) -> list[dict[str, Any]]:
    """List every running ``team_agent.coordinator`` process that ``ps`` reports.

    Args:
        runner: ``subprocess.run``-compatible callable; injectable for tests.

    Returns:
        One ``{"pid", "etime", "cmdline", "workspace"}`` dict per coordinator
        process, excluding the current process and the ``ps`` invocation
        itself. ``workspace`` is ``None`` when no ``--workspace <path>``
        argument could be parsed. An empty list is returned when ``ps`` cannot
        be run, times out, exits non-zero, or prints nothing.
    """
    try:
        proc = runner(
            ["ps", "-Awwo", "pid=,etime=,command="],
            text=True,
            capture_output=True,
            timeout=5,
            check=False,
        )
    except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
        return []
    if proc.returncode != 0 or not proc.stdout:
        return []
    rows: list[dict[str, Any]] = []
    for line in proc.stdout.splitlines():
        parts = line.strip().split(None, 2)
        if len(parts) < 3:
            continue
        pid_s, etime, cmdline = parts
        if "team_agent.coordinator" not in cmdline:
            continue
        # Skip the ps invocation itself should its argv ever mention the module.
        if "ps -Awwo" in cmdline:
            continue
        try:
            pid = int(pid_s)
        except ValueError:
            continue
        if pid == os.getpid():
            continue
        match = _COORDINATOR_ARGV_RE.search(cmdline)
        # The regex only captures up to the first whitespace. A truncated path
        # would look "missing" on disk and get a live coordinator flagged as an
        # orphan, so re-resolve the path from everything after --workspace.
        workspace = (
            _resolve_workspace_arg(cmdline[match.start(1):]) if match else None
        )
        rows.append({
            "pid": pid,
            "etime": etime,
            "cmdline": cmdline,
            "workspace": workspace,
        })
    return rows


def _resolve_workspace_arg(tail: str, *, exists=os.path.exists) -> str:
    """Pick the workspace path out of the text that follows ``--workspace``.

    ``ps`` prints argv joined by spaces without quoting, so a path containing
    spaces cannot be separated from trailing arguments by syntax alone. Every
    whitespace boundary is tried, longest candidate first, and the first
    candidate that exists on disk wins. When none exists, the first
    whitespace-delimited token is returned (the value the regex capture alone
    would have produced).

    Args:
        tail: Command-line text starting at the first character of the path.
        exists: Path-existence predicate; injectable for tests.
    """
    cut_points = [gap.start() for gap in re.finditer(r"\s+", tail)]
    cut_points.append(len(tail))
    for end in reversed(cut_points):
        candidate = tail[:end]
        if exists(candidate):
            return candidate
    return tail[: cut_points[0]]
85
+
86
+
87
def classify_orphan(entry: dict[str, Any]) -> tuple[bool, str]:
    """Decide whether a coordinator row refers to an orphaned process.

    Returns ``(is_orphan, reason)``. A row is an orphan when its workspace path
    is gone from disk, or when the path contains one of the known
    ephemeral-tempdir hints. Rows without a parsed workspace are never
    classified as orphans.
    """
    path = entry.get("workspace")
    if not path:
        return False, "cmdline_unparsed"
    if not os.path.exists(path):
        return True, "workspace_path_missing"
    # First hint (in declaration order) that occurs anywhere in the path.
    matched = next((hint for hint in _EPHEMERAL_PATH_HINTS if hint in path), None)
    if matched is not None:
        return True, f"ephemeral_tempdir_pattern:{matched}"
    return False, "workspace_alive"
100
+
101
+
102
def cleanup_orphan_coordinators(
    *,
    confirm: bool = False,
    runner=subprocess.run,
    killer=os.kill,
    pg_killer=None,
    pgid_getter=None,
    sleeper=time.sleep,
    sigterm_wait_seconds: float = _SIGTERM_WAIT_SECONDS,
    sigkill_wait_seconds: float = _SIGKILL_WAIT_SECONDS,
) -> dict[str, Any]:
    """Scan for orphan coordinators and, when confirmed, terminate them.

    Without ``confirm`` this is a dry run: processes are classified and
    reported, and nothing is signalled. With ``confirm`` each orphan is handed
    to ``_terminate_orphan``, which sends SIGTERM and escalates to SIGKILL when
    the process does not exit in time (see that function for when a whole
    process group is signalled instead of a single pid).

    Args:
        confirm: Actually signal the orphans instead of only reporting them.
        runner: ``subprocess.run``-compatible callable used to invoke ``ps``.
        killer: ``os.kill``-compatible callable.
        pg_killer: ``os.killpg``-compatible callable; defaults to ``os.killpg``
            when the platform provides it.
        pgid_getter: ``os.getpgid``-compatible callable; defaults to
            ``os.getpgid`` when the platform provides it.
        sleeper: ``time.sleep``-compatible callable used while polling.
        sigterm_wait_seconds: Grace period after SIGTERM.
        sigkill_wait_seconds: Wait after SIGKILL before reporting failure.

    Returns:
        A report dict. Both modes carry ``ok``, ``scanned``, ``orphans``,
        ``dry_run`` and ``scanned_at``; the dry run adds ``action_required``
        and the confirmed run adds ``killed`` and ``failed``. Orphans that had
        already disappeared (status ``missing``) are counted under ``killed``.
    """
    scanned_at = datetime.now(timezone.utc).isoformat()
    pg_killer = getattr(os, "killpg", None) if pg_killer is None else pg_killer
    pgid_getter = getattr(os, "getpgid", None) if pgid_getter is None else pgid_getter

    classified: list[dict[str, Any]] = []
    for proc_entry in find_coordinator_processes(runner=runner):
        is_orphan, reason = classify_orphan(proc_entry)
        classified.append({**proc_entry, "is_orphan": is_orphan, "reason": reason})
    orphans = [row for row in classified if row["is_orphan"]]

    if not confirm:
        return {
            "ok": True,
            "scanned": len(classified),
            "orphans": orphans,
            "dry_run": True,
            "scanned_at": scanned_at,
            "action_required": "re-run with --confirm to send SIGTERM",
        }

    killed: list[dict[str, Any]] = []
    failed: list[dict[str, Any]] = []
    for orphan in orphans:
        outcome = _terminate_orphan(
            orphan["pid"],
            killer=killer,
            pg_killer=pg_killer,
            pgid_getter=pgid_getter,
            sleeper=sleeper,
            sigterm_wait_seconds=sigterm_wait_seconds,
            sigkill_wait_seconds=sigkill_wait_seconds,
        )
        # A process that vanished before SIGTERM counts as cleaned up too.
        gone = outcome.get("status") in ("killed", "missing")
        (killed if gone else failed).append({**orphan, **outcome})
    return {
        "ok": True,
        "scanned": len(classified),
        "orphans": orphans,
        "killed": killed,
        "failed": failed,
        "dry_run": False,
        "scanned_at": scanned_at,
    }
179
+
180
+
181
def _terminate_orphan(
    pid: int,
    *,
    killer,
    pg_killer,
    pgid_getter,
    sleeper,
    sigterm_wait_seconds: float = _SIGTERM_WAIT_SECONDS,
    sigkill_wait_seconds: float = _SIGKILL_WAIT_SECONDS,
) -> dict[str, Any]:
    """Terminate one process: SIGTERM, wait, then SIGKILL, wait.

    The whole process group is signalled (via ``pg_killer``) when the target's
    pgid is known, is greater than 1, differs from ``pid`` and is NOT the
    caller's own process group; otherwise only ``pid`` is signalled.

    Args:
        pid: Process id of the orphan.
        killer: ``os.kill``-compatible callable (also used for liveness probes).
        pg_killer: ``os.killpg``-compatible callable, or ``None`` if unavailable.
        pgid_getter: ``os.getpgid``-compatible callable, or ``None``.
        sleeper: ``time.sleep``-compatible callable used while polling.
        sigterm_wait_seconds: Grace period after SIGTERM.
        sigkill_wait_seconds: Wait after SIGKILL before reporting failure.

    Returns:
        One of (every shape also carries ``signaled: 'pid'|'pgid'`` and ``pgid``):
          {status: 'killed', sigkill_required: False}  exited after SIGTERM, or
                                                       vanished before SIGKILL
          {status: 'killed', sigkill_required: True}   exited after SIGKILL
          {status: 'missing'}                          gone before SIGTERM
          {status: 'failed', error: 'alive_after_sigkill', sigkill_required: True}
          {status: 'failed', error: '<exc>'}           OS error on SIGTERM
          {status: 'failed', error: '<exc>', sigkill_attempted: True}
                                                       OS error on SIGKILL
        The 'killed' shapes additionally carry ``pgid_error``.
    """
    pgid, pgid_error = _safe_getpgid(pid, pgid_getter)
    # Never signal our own process group: an orphan started from the same
    # non-interactive script or test runner shares the caller's pgid, and a
    # group-wide SIGTERM/SIGKILL would take this cleanup command down with it.
    getpgrp = getattr(os, "getpgrp", None)
    own_pgid = getpgrp() if getpgrp is not None else None
    # NOTE(review): the group is signalled when the target is NOT its own group
    # leader (pgid != pid). Confirm that is intended; the group may then belong
    # to whatever launched the coordinator.
    use_group = bool(
        pg_killer
        and pgid is not None
        and pgid > 1
        and pgid != pid
        and pgid != own_pgid
    )
    signaled = "pgid" if use_group else "pid"

    def send(sig: int) -> tuple[bool, str | None]:
        """Deliver ``sig``; return ``(delivered, error_text)``."""
        try:
            if use_group:
                pg_killer(pgid, sig)
            else:
                killer(pid, sig)
        except ProcessLookupError:
            return False, "process_lookup_error"
        except OSError as exc:  # includes PermissionError
            return False, str(exc)
        return True, None

    def killed(sigkill_required: bool) -> dict[str, Any]:
        """Build the success report."""
        return {
            "status": "killed",
            "sigkill_required": sigkill_required,
            "signaled": signaled,
            "pgid": pgid,
            "pgid_error": pgid_error,
        }

    ok, err = send(signal.SIGTERM)
    if not ok:
        if err == "process_lookup_error":
            return {"status": "missing", "signaled": signaled, "pgid": pgid}
        return {"status": "failed", "error": err, "signaled": signaled, "pgid": pgid}
    if _wait_for_exit(pid, sigterm_wait_seconds, killer=killer, sleeper=sleeper):
        return killed(False)

    # SIGTERM did not work — escalate.
    ok, err = send(signal.SIGKILL)
    if not ok:
        if err == "process_lookup_error":
            # Race: the process died between the last probe and SIGKILL.
            return killed(False)
        return {
            "status": "failed",
            "error": err,
            "signaled": signaled,
            "pgid": pgid,
            "sigkill_attempted": True,
        }
    if _wait_for_exit(pid, sigkill_wait_seconds, killer=killer, sleeper=sleeper):
        return killed(True)
    return {
        "status": "failed",
        "error": "alive_after_sigkill",
        "signaled": signaled,
        "pgid": pgid,
        "sigkill_required": True,
    }
261
+
262
+
263
def _safe_getpgid(pid: int, pgid_getter) -> tuple[int | None, str | None]:
    """Look up the process-group id of ``pid`` without raising.

    Returns ``(pgid, None)`` on success, or ``(None, reason)`` when no getter is
    available or the lookup raised an ``OSError``.
    """
    if pgid_getter is None:
        return None, "getpgid_unavailable"
    try:
        group_id = pgid_getter(pid)
    except OSError as exc:  # ProcessLookupError and PermissionError are subclasses
        return None, str(exc)
    return group_id, None
270
+
271
+
272
def _wait_for_exit(pid: int, timeout: float, *, killer, sleeper) -> bool:
    """Poll ``pid`` with signal 0 until it is gone or ``timeout`` seconds pass.

    Returns ``True`` as soon as a probe raises an ``OSError``, ``False`` when
    the process still answers the final probe made after the deadline. A
    negative ``timeout`` is treated as zero, which yields exactly one probe.
    """

    def gone() -> bool:
        # NOTE(review): PermissionError (EPERM) is counted as "gone" here,
        # although it normally means a process with this pid exists but is not
        # ours to signal. Confirm this is the intended outcome.
        try:
            killer(pid, 0)
        except OSError:  # ProcessLookupError, PermissionError, anything else
            return True
        return False

    deadline = time.monotonic() + max(timeout, 0.0)
    while time.monotonic() < deadline:
        if gone():
            return True
        sleeper(0.1)
    # Final check after the deadline elapses.
    return gone()
290
+
291
+
292
def orphan_gate(
    *,
    fix: bool = False,
    confirm: bool = False,
    runner=subprocess.run,
    killer=os.kill,
    pg_killer=None,
    pgid_getter=None,
    sleeper=time.sleep,
    sigterm_wait_seconds: float = _SIGTERM_WAIT_SECONDS,
    sigkill_wait_seconds: float = _SIGKILL_WAIT_SECONDS,
) -> dict[str, Any]:
    """Run the orphan scan as a pass/fail gate.

    ``fix`` without ``confirm`` is refused before any scan happens. Without
    ``fix`` the gate only scans and passes when no orphan was found. With
    ``fix`` and ``confirm`` the orphans are terminated and the gate passes when
    none of them failed to terminate.

    Returns:
        The scan/cleanup report, overlaid with ``ok``, ``gate``, ``status`` and
        ``fix``; in scan-only mode with orphans present, ``action_required``
        names the command to re-run.
    """
    if fix and not confirm:
        return {
            "ok": False,
            "gate": "orphans",
            "status": "refused",
            "reason": "fix_requires_confirm",
            "action": "re-run with --gate orphans --fix --confirm",
        }

    report = cleanup_orphan_coordinators(
        confirm=fix and confirm,
        runner=runner,
        killer=killer,
        pg_killer=pg_killer,
        pgid_getter=pgid_getter,
        sleeper=sleeper,
        sigterm_wait_seconds=sigterm_wait_seconds,
        sigkill_wait_seconds=sigkill_wait_seconds,
    )
    found = report.get("orphans") or []
    not_terminated = report.get("failed") or []
    if fix:
        passed = not not_terminated
    else:
        passed = not found

    envelope = dict(report)
    envelope.update(
        {
            "ok": passed,
            "gate": "orphans",
            "status": "passed" if passed else "failed",
            "fix": bool(fix),
        }
    )
    if found and not fix:
        envelope["action_required"] = "re-run with --gate orphans --fix --confirm"
    return envelope
335
+
336
+
337
def format_cleanup_orphans(result: dict[str, Any]) -> str:
    """Render a scan/cleanup report as human-readable text.

    The output has a three-line header, then either a dry-run notice or the
    killed/failed counters, then one line per orphan.
    """
    orphan_rows = result.get("orphans") or []
    out = [
        f"Coordinator orphan scan @ {result.get('scanned_at')}",
        f" scanned: {result.get('scanned', 0)} coordinator processes",
        f" orphans: {len(orphan_rows)}",
    ]
    if result.get("dry_run"):
        out.append(" mode: DRY-RUN (no SIGTERM sent; re-run with --confirm)")
    else:
        killed_rows = result.get("killed") or []
        failed_rows = result.get("failed") or []
        # How many of the kills needed the SIGKILL escalation.
        escalations = len([row for row in killed_rows if row.get("sigkill_required")])
        out.append(f" killed: {len(killed_rows)} (sigkill_required: {escalations})")
        out.append(f" failed: {len(failed_rows)}")
    out.extend(
        f" PID {row['pid']} etime={row['etime']} "
        f"workspace={row.get('workspace') or '?'} reason={row.get('reason')}"
        for row in orphan_rows
    )
    return "\n".join(out)
356
+
357
+
358
+ __all__ = [
359
+ "cleanup_orphan_coordinators",
360
+ "classify_orphan",
361
+ "find_coordinator_processes",
362
+ "format_cleanup_orphans",
363
+ "orphan_gate",
364
+ ]
@@ -1,6 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import json
4
+ import os
4
5
  from datetime import datetime, timezone
5
6
  from pathlib import Path
6
7
  from typing import Any
@@ -8,6 +9,15 @@ from typing import Any
8
9
  from team_agent.paths import logs_dir
9
10
 
10
11
 
12
# Stage 14 (Gap 36a) — bounded retention. 5 MB cap × 5 archives ≈ 25 MB of archives,
# plus the live segment, which can itself grow to roughly the cap before it is rotated.
# Mac mini 2026-05-26 evidence: events.jsonl grew to 28 MB / 128k lines in one day with
# unbounded retention; coordinator's tick-time scan over the file was a ~22% CPU hot path.
# Rotation keeps the current segment small so reads are cheap; archives preserve forensic
# history but are NOT consulted by hot-path scans.
# Size, in bytes, at which the live segment is rotated before the next write.
EVENT_LOG_ROTATE_BYTES = 5 * 1024 * 1024
# Number of numbered archives (.1 … .N) kept alongside the live segment.
EVENT_LOG_ARCHIVE_KEEP = 5
19
+
20
+
11
21
  class EventLog:
12
22
  def __init__(self, workspace: Path):
13
23
  self.workspace = workspace
@@ -20,11 +30,14 @@ class EventLog:
20
30
  "event": event_type,
21
31
  **fields,
22
32
  }
33
+ self._maybe_rotate()
23
34
  with self.path.open("a", encoding="utf-8") as f:
24
35
  f.write(json.dumps(event, ensure_ascii=False, sort_keys=True) + "\n")
25
36
  return event
26
37
 
27
38
  def tail(self, limit: int = 20) -> list[dict[str, Any]]:
39
+ # Hot-path scan reads only the current segment. Archives are forensic; if a
40
+ # caller genuinely needs longer history it can iterate _archive_paths explicitly.
28
41
  if not self.path.exists():
29
42
  return []
30
43
  lines = self.path.read_text(encoding="utf-8").splitlines()[-limit:]
@@ -35,3 +48,37 @@ class EventLog:
35
48
  except json.JSONDecodeError:
36
49
  out.append({"raw": line})
37
50
  return out
51
+
52
def _maybe_rotate(self) -> None:
    """Rotate the live log into numbered archives once it reaches the size cap.

    Does nothing when the log file does not exist yet or is still below
    ``EVENT_LOG_ROTATE_BYTES``. Otherwise the oldest archive is dropped, every
    remaining archive moves up one slot, and the live file becomes archive 1.
    Each filesystem step is best-effort: an ``OSError`` is ignored and the
    remaining steps still run.
    """
    try:
        current_size = self.path.stat().st_size
    except FileNotFoundError:
        return
    if current_size < EVENT_LOG_ROTATE_BYTES:
        return

    def best_effort(action, *args) -> None:
        try:
            action(*args)
        except OSError:
            pass

    # Drop the oldest archive so the shift below cannot overflow the keep budget.
    last_slot = self._archive_path(EVENT_LOG_ARCHIVE_KEEP)
    if last_slot.exists():
        best_effort(last_slot.unlink)
    # Shift from the highest slot down: .4 → .5, .3 → .4, …, .1 → .2.
    for slot in reversed(range(1, EVENT_LOG_ARCHIVE_KEEP)):
        newer = self._archive_path(slot)
        older = self._archive_path(slot + 1)
        if newer.exists():
            best_effort(os.replace, newer, older)
    # Finally the live segment becomes archive .1.
    best_effort(os.replace, self.path, self._archive_path(1))
79
+
80
+ def _archive_path(self, index: int) -> Path:
81
+ return self.path.with_name(f"{self.path.name}.{index}")
82
+
83
def _archive_paths(self) -> list[Path]:
    """Return the archive paths for slots 1 through ``EVENT_LOG_ARCHIVE_KEEP``, in order."""
    slots = range(1, EVENT_LOG_ARCHIVE_KEEP + 1)
    return list(map(self._archive_path, slots))
@@ -215,7 +215,7 @@ def launch(
215
215
  stdout=proc.stdout,
216
216
  )
217
217
  raise RuntimeError(f"Failed to start agent {agent['id']}: {proc.stderr.strip()}")
218
- handled_prompts = adapter.handle_startup_prompts(session_name, agent["id"], checks=1, sleep_s=0.0)
218
+ handled_prompts = adapter.handle_startup_prompts(session_name, agent["id"], checks=20, sleep_s=0.5)
219
219
  for prompt_event in handled_prompts:
220
220
  event_log.write(
221
221
  "launch.startup_prompt_handled",
@@ -261,6 +261,7 @@ def launch(
261
261
  event_log,
262
262
  timeout_s=1.5,
263
263
  exclude_session_ids=known_session_ids,
264
+ raise_on_missed=False,
264
265
  )
265
266
  if state.get("display_backend") in GHOSTTY_DISPLAY_BACKENDS:
266
267
  display_jobs.append((agent["id"], agent))