@team-agent/installer 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@team-agent/installer",
3
- "version": "0.2.1",
3
+ "version": "0.2.2",
4
4
  "description": "npx installer for Team Agent",
5
5
  "keywords": [
6
6
  "codex",
@@ -119,9 +119,10 @@ def cmd_peek(args: argparse.Namespace) -> dict[str, Any]:
119
119
 
120
120
 
121
121
def cmd_inbox(args: argparse.Namespace) -> dict[str, Any]:
    """Handle the ``inbox`` subcommand.

    Forwards the resolved workspace path, ``args.agent``, ``args.limit`` and the
    optional ``since`` filter to the runtime. A truthy ``args.json`` selects
    ``runtime.inbox``; otherwise ``runtime.format_inbox`` is used.
    """
    # Read defensively: a namespace built without a --since option yields None.
    since = getattr(args, "since", None)
    handler = runtime.inbox if args.json else runtime.format_inbox
    workspace = Path(args.workspace).resolve()
    return handler(workspace, args.agent, limit=args.limit, since=since)
125
126
 
126
127
 
127
128
  def cmd_sessions(args: argparse.Namespace) -> dict[str, Any]:
@@ -136,6 +137,14 @@ def cmd_takeover(args: argparse.Namespace) -> dict[str, Any]:
136
137
  return runtime.takeover(Path(args.workspace).resolve(), team=args.team, confirm=args.confirm)
137
138
 
138
139
 
140
def cmd_claim_leader(args: argparse.Namespace) -> dict[str, Any]:
    """Handle the ``claim-leader`` subcommand by delegating to ``runtime.claim_leader``.

    The workspace is resolved to an absolute path; ``args.team`` and
    ``args.confirm`` are passed through unchanged.
    """
    claim = runtime.claim_leader
    workspace = Path(args.workspace).resolve()
    return claim(workspace, team=args.team, confirm=args.confirm)
142
+
143
+
144
def cmd_identity(args: argparse.Namespace) -> dict[str, Any]:
    """Handle the ``identity`` subcommand by delegating to ``runtime.leader_identity``.

    The workspace is resolved to an absolute path and ``args.team`` is passed
    through unchanged.
    """
    identify = runtime.leader_identity
    workspace = Path(args.workspace).resolve()
    return identify(workspace, team=args.team)
146
+
147
+
139
148
  def cmd_send(args: argparse.Namespace) -> dict[str, Any]:
140
149
  target = _send_target(args)
141
150
  return runtime.send_message(
@@ -190,7 +199,13 @@ def cmd_validate_result(args: argparse.Namespace) -> dict[str, Any]:
190
199
  return {"ok": True, "task_id": envelope["task_id"], "agent_id": envelope["agent_id"], "status": envelope["status"]}
191
200
 
192
201
 
193
- def cmd_doctor(args: argparse.Namespace) -> dict[str, Any]:
202
def cmd_doctor(args: argparse.Namespace) -> dict[str, Any] | str:
    """Handle the ``doctor`` subcommand.

    Without ``--cleanup-orphans`` this runs ``runtime.doctor`` on the optional
    spec path (resolved to an absolute path, or ``None`` when no spec is given).

    With ``--cleanup-orphans`` it runs the orphan-coordinator scan instead and
    returns the raw result dict when ``args.json`` is truthy, or the formatted
    text report otherwise. ``--confirm`` is forwarded as a strict bool.
    """
    if not getattr(args, "cleanup_orphans", False):
        spec = Path(args.spec).resolve() if args.spec else None
        return runtime.doctor(spec)

    # Imported lazily so the regular doctor path never loads the cleanup module.
    from team_agent.diagnose.orphan_cleanup import cleanup_orphan_coordinators, format_cleanup_orphans

    confirmed = bool(getattr(args, "confirm", False))
    result = cleanup_orphan_coordinators(confirm=confirmed)
    return result if args.json else format_cleanup_orphans(result)
196
211
 
@@ -30,6 +30,8 @@ from team_agent.cli.commands import (
30
30
  cmd_sessions,
31
31
  cmd_attach_leader,
32
32
  cmd_takeover,
33
+ cmd_claim_leader,
34
+ cmd_identity,
33
35
  cmd_send,
34
36
  cmd_collect,
35
37
  cmd_diagnose,
@@ -209,6 +211,12 @@ def main(argv: list[str] | None = None) -> None:
209
211
  p.add_argument("agent")
210
212
  p.add_argument("--workspace", default=".")
211
213
  p.add_argument("--limit", type=int, default=20)
214
+ p.add_argument(
215
+ "--since",
216
+ help="ISO 8601 timestamp; only show messages created at-or-after this time. "
217
+ "Use the timestamp from claim-leader's inbox_hint to retrieve messages "
218
+ "missed during a prior ambiguous-leader state.",
219
+ )
212
220
  add_json(p)
213
221
  p.set_defaults(func=cmd_inbox)
214
222
 
@@ -231,6 +239,19 @@ def main(argv: list[str] | None = None) -> None:
231
239
  add_json(p)
232
240
  p.set_defaults(func=cmd_takeover)
233
241
 
242
+ p = sub.add_parser("claim-leader", help="Claim this pane as leader after ambiguous leader recovery")
243
+ p.add_argument("--workspace", default=".")
244
+ p.add_argument("--team", help="Explicit team/session selector when a workspace has multiple teams")
245
+ p.add_argument("--confirm", action="store_true", help="Apply the claim; without this, show a dry-run summary")
246
+ add_json(p)
247
+ p.set_defaults(func=cmd_claim_leader)
248
+
249
+ p = sub.add_parser("identity", help="Show leader identity diagnostics")
250
+ p.add_argument("--workspace", default=".")
251
+ p.add_argument("--team", help="Explicit team/session selector when a workspace has multiple teams")
252
+ add_json(p)
253
+ p.set_defaults(func=cmd_identity)
254
+
234
255
  p = sub.add_parser(
235
256
  "send",
236
257
  help="Send a message to an agent, task assignee, or attached leader",
@@ -289,6 +310,17 @@ def main(argv: list[str] | None = None) -> None:
289
310
 
290
311
  p = sub.add_parser("doctor", help="Check local dependencies, providers, auth hints, tmux, and MCP")
291
312
  p.add_argument("spec", nargs="?")
313
+ p.add_argument(
314
+ "--cleanup-orphans",
315
+ action="store_true",
316
+ help="Scan for orphan team_agent.coordinator processes pointing at non-existent or "
317
+ "ephemeral-tempdir workspaces (dry-run unless --confirm is also passed).",
318
+ )
319
+ p.add_argument(
320
+ "--confirm",
321
+ action="store_true",
322
+ help="With --cleanup-orphans: send SIGTERM to each orphan (default is dry-run).",
323
+ )
292
324
  add_json(p)
293
325
  p.set_defaults(func=cmd_doctor)
294
326
 
@@ -429,7 +461,7 @@ def main(argv: list[str] | None = None) -> None:
429
461
  sub._choices_actions = [ # type: ignore[attr-defined]
430
462
  action for action in sub._choices_actions if action.help != argparse.SUPPRESS # type: ignore[attr-defined]
431
463
  ]
432
- sub.metavar = "{codex,claude,quick-start,send,status,approvals,inbox,shutdown,restart,start-agent,stop-agent,reset-agent,add-agent,fork-agent,remove-agent,stuck-list,stuck-cancel,acknowledge-idle,doctor}"
464
+ sub.metavar = "{codex,claude,quick-start,send,status,approvals,inbox,takeover,claim-leader,identity,shutdown,restart,start-agent,stop-agent,reset-agent,add-agent,fork-agent,remove-agent,stuck-list,stuck-cancel,acknowledge-idle,doctor}"
433
465
 
434
466
  args = parser.parse_args(raw_argv)
435
467
  try:
@@ -38,7 +38,23 @@ def main(argv: list[str] | None = None) -> None:
38
38
  signal.signal(signal.SIGINT, _stop)
39
39
 
40
40
  interval = args.tick_interval if args.tick_interval is not None else _tick_interval(workspace)
41
+ initial_ppid = os.getppid()
41
42
  while not STOP:
43
+ # Stage 14 (Gap 37b) — orphan self-detection. If our original parent (test harness,
44
+ # shell, or supervisor) died, our ppid is reparented to 1 (or to a launchd shim on
45
+ # macOS). When that happens AND the workspace no longer exists on disk, we are an
46
+ # orphan from a torn-down test environment and must self-terminate so we don't
47
+ # accumulate (today's evidence: 35 orphans pointing at /var/folders/...team-agent-
48
+ # watcher-dedupe-* paths long since cleaned up).
49
+ current_ppid = os.getppid()
50
+ if current_ppid != initial_ppid and current_ppid == 1 and not workspace.exists():
51
+ event_log.write(
52
+ "coordinator.orphan_self_terminate",
53
+ initial_ppid=initial_ppid,
54
+ current_ppid=current_ppid,
55
+ workspace=str(workspace),
56
+ )
57
+ break
42
58
  result = runtime.coordinator_tick(workspace)
43
59
  if result.get("stop") or args.once:
44
60
  break
@@ -46,18 +62,21 @@ def main(argv: list[str] | None = None) -> None:
46
62
  event_log.write("coordinator.exit", stop=STOP)
47
63
 
48
64
 
65
+ DEFAULT_TICK_INTERVAL_SEC = 5.0 # Stage 14 (Gap 36c) — bumped from 2.0 (2.5x less CPU)
66
+
67
+
49
68
def _tick_interval(workspace: Path) -> float:
    """Return the coordinator tick interval, in seconds, for ``workspace``.

    The spec file is taken from the runtime state's ``spec_path`` entry, falling
    back to ``<workspace>/team.spec.yaml``. When that file exists and loads, the
    value of ``runtime.tick_interval_sec`` is returned as a float (defaulting to
    ``DEFAULT_TICK_INTERVAL_SEC`` when the key is absent). Any failure while
    loading or converting falls through to the default.
    """
    runtime_state = load_runtime_state(workspace)
    spec_file = Path(runtime_state.get("spec_path", workspace / "team.spec.yaml"))
    if spec_file.exists():
        try:
            runtime_section = load_spec(spec_file).get("runtime", {})
            return float(runtime_section.get("tick_interval_sec", DEFAULT_TICK_INTERVAL_SEC))
        except Exception:
            # Best-effort: an unreadable or malformed spec means "use the default".
            pass
    # Ensure schema exists even before launch; this makes doctor/tick diagnostics deterministic.
    MessageStore(workspace)
    return DEFAULT_TICK_INTERVAL_SEC
61
80
 
62
81
 
63
82
  if __name__ == "__main__":
@@ -320,6 +320,14 @@ def coordinator_tick(workspace: Path) -> dict[str, Any]:
320
320
  drift_results.append(drift)
321
321
  save_runtime_state(workspace, state)
322
322
  results = _collect_results_and_notify_watchers(workspace, event_log)
323
+ # Stage 12: prune the dedupe log every tick — cheap O(n) delete bounded by 24h window.
324
+ from team_agent.message_store.leader_notification_log import prune_leader_notification_log
325
+ try:
326
+ pruned = prune_leader_notification_log(store, max_age_hours=24)
327
+ if pruned:
328
+ event_log.write("leader_notification.log_pruned", removed=pruned)
329
+ except Exception as exc:
330
+ event_log.write("leader_notification.prune_failed", error=str(exc))
323
331
  return {
324
332
  "ok": True,
325
333
  "stop": False,
@@ -0,0 +1,193 @@
1
+ """Stage 14 (Gap 37a) — `team-agent doctor --cleanup-orphans` implementation.
2
+
3
+ Scans `ps` for processes matching `team_agent.coordinator --workspace <path>` and
4
+ classifies any whose workspace path no longer exists (or matches the test-tempdir
5
+ pattern) as an orphan. Dry-run by default; --confirm sends SIGTERM.
6
+
7
+ Mac mini 2026-05-26 evidence: 35 orphan coordinator processes alive simultaneously
8
+ pointing at /var/folders/.../T/team-agent-watcher-dedupe-* paths that had been removed
9
+ hours earlier. Each holds a long-lived Python interpreter + SQLite connection.
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import os
14
+ import re
15
+ import signal
16
+ import subprocess
17
+ import time
18
+ from datetime import datetime, timezone
19
+ from pathlib import Path
20
+ from typing import Any
21
+
22
# Pattern: argv contains "team_agent.coordinator --workspace <path>" anywhere.
# Group 1 is only the first whitespace-delimited token after --workspace; paths
# that contain spaces are recovered afterwards by _resolve_workspace().
_COORDINATOR_ARGV_RE = re.compile(
    r"team_agent\.coordinator(?:\.__main__)?(?:\s+|.*?)\s--workspace\s+(\S+)"
)
# Test-tempdir patterns that indicate the workspace is ephemeral and almost certainly orphan.
_EPHEMERAL_PATH_HINTS = (
    "team-agent-watcher-dedupe-",
    "team-agent-gap",
    "team-agent-stage",
    "team-agent-orchestrator-",
    "team-agent-rm-",
    "team-agent-claim-",
    "team-agent-hotfix",
    "team-agent-multi",
    "team-agent-progress-",
    "team-agent-fanout-",
    "team-agent-in-flight-",
    "team-agent-test-",
)
_SIGTERM_WAIT_SECONDS = 3.0
# Splits the text that follows the workspace token into (whitespace, token) pieces.
_ARGV_TAIL_TOKEN_RE = re.compile(r"(\s+)(\S+)")


def _path_exists(candidate: str) -> bool:
    """Return True when ``candidate`` exists on disk; lookup errors count as missing."""
    try:
        return Path(candidate).exists()
    except (OSError, ValueError):
        return False


def _resolve_workspace(cmdline: str) -> str | None:
    """Extract the ``--workspace`` path from a ``ps`` command line.

    ``ps`` prints argv joined by spaces, so a workspace path that itself contains
    a space is indistinguishable from several arguments. Taking only the first
    token would make a live workspace such as ``/Users/me/My Projects/app`` look
    like the missing path ``/Users/me/My``.

    Resolution order:
      1. no ``--workspace`` match -> ``None``;
      2. the first token exists on disk -> that token;
      3. otherwise append the following tokens one by one (stopping at the first
         token that starts with ``-``) and return the first candidate that exists;
      4. nothing exists -> the first token.
    """
    match = _COORDINATOR_ARGV_RE.search(cmdline)
    if match is None:
        return None
    first_token = match.group(1)
    if _path_exists(first_token):
        return first_token
    candidate = first_token
    for piece in _ARGV_TAIL_TOKEN_RE.finditer(cmdline, match.end(1)):
        if piece.group(2).startswith("-"):
            # The next option flag ends any plausible continuation of the path.
            break
        # group(0) keeps the original separator, so runs of spaces survive.
        candidate += piece.group(0)
        if _path_exists(candidate):
            return candidate
    return first_token


def find_coordinator_processes(*, runner=subprocess.run) -> list[dict[str, Any]]:
    """List running ``team_agent.coordinator`` processes visible to ``ps``.

    Args:
        runner: ``subprocess.run``-compatible callable (injectable for tests).

    Returns:
        One ``{pid, etime, cmdline, workspace}`` dict per matching process, in
        ``ps`` output order. ``workspace`` is ``None`` when the command line has
        no parsable ``--workspace`` argument. The list is empty when ``ps``
        cannot be run, times out, exits non-zero, or prints nothing.
    """
    try:
        proc = runner(
            ["ps", "-Awwo", "pid=,etime=,command="],
            text=True,
            capture_output=True,
            timeout=5,
            check=False,
        )
    except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
        return []
    if proc.returncode != 0 or not proc.stdout:
        return []

    own_pid = os.getpid()
    rows: list[dict[str, Any]] = []
    for line in proc.stdout.splitlines():
        parts = line.strip().split(None, 2)
        if len(parts) < 3:
            continue
        pid_s, etime, cmdline = parts
        if "team_agent.coordinator" not in cmdline:
            continue
        if "ps -Awwo" in cmdline:
            # Skip the ps invocation itself if its own argv happens to match.
            continue
        try:
            pid = int(pid_s)
        except ValueError:
            continue
        if pid == own_pid:
            continue
        rows.append({
            "pid": pid,
            "etime": etime,
            "cmdline": cmdline,
            "workspace": _resolve_workspace(cmdline),
        })
    return rows
85
+
86
+
87
def classify_orphan(entry: dict[str, Any]) -> tuple[bool, str]:
    """Decide whether a scanned coordinator entry is an orphan.

    Returns ``(is_orphan, reason)``:
      * ``(False, "cmdline_unparsed")`` when the entry has no workspace;
      * ``(True, "workspace_path_missing")`` when the workspace is not on disk;
      * ``(True, "ephemeral_tempdir_pattern:<hint>")`` when the workspace exists
        but contains one of ``_EPHEMERAL_PATH_HINTS`` as a substring (first
        matching hint, in tuple order);
      * ``(False, "workspace_alive")`` otherwise.
    """
    path_text = entry.get("workspace")
    if not path_text:
        return False, "cmdline_unparsed"
    if not Path(path_text).exists():
        return True, "workspace_path_missing"
    # Substring match anywhere in the path, not just the final component.
    matched_hint = next((hint for hint in _EPHEMERAL_PATH_HINTS if hint in path_text), None)
    if matched_hint is not None:
        return True, f"ephemeral_tempdir_pattern:{matched_hint}"
    return False, "workspace_alive"
100
+
101
+
102
def cleanup_orphan_coordinators(
    *,
    confirm: bool = False,
    runner=subprocess.run,
    killer=os.kill,
    sleeper=time.sleep,
) -> dict[str, Any]:
    """Scan for orphan coordinators and, when ``confirm`` is set, SIGTERM them.

    Args:
        confirm: False (default) only classifies and reports; True signals every orphan.
        runner: ``subprocess.run``-compatible callable forwarded to the process scan.
        killer: ``os.kill``-compatible callable, used to send SIGTERM and to probe
            liveness with signal 0.
        sleeper: ``time.sleep``-compatible callable used between liveness probes.

    Returns:
        A report dict with ``ok``, ``scanned``, ``orphans``, ``dry_run`` and
        ``scanned_at``. Dry runs add ``action_required``; confirmed runs add
        ``killed`` and ``failed`` (failed entries carry an ``error`` string).
    """
    scanned_at = datetime.now(timezone.utc).isoformat()

    annotated_entries: list[dict[str, Any]] = []
    for proc_entry in find_coordinator_processes(runner=runner):
        is_orphan, reason = classify_orphan(proc_entry)
        annotated_entries.append({**proc_entry, "is_orphan": is_orphan, "reason": reason})
    orphans = [item for item in annotated_entries if item["is_orphan"]]

    report: dict[str, Any] = {
        "ok": True,
        "scanned": len(annotated_entries),
        "orphans": orphans,
    }
    if not confirm:
        report.update(
            dry_run=True,
            scanned_at=scanned_at,
            action_required="re-run with --confirm to send SIGTERM",
        )
        return report

    killed: list[dict[str, Any]] = []
    failed: list[dict[str, Any]] = []
    for orphan in orphans:
        pid = orphan["pid"]
        try:
            killer(pid, signal.SIGTERM)
        except OSError as exc:
            # ProcessLookupError and PermissionError are both OSError subclasses.
            failed.append({**orphan, "error": str(exc)})
            continue

        # Poll with signal 0 until the pid stops answering or the grace period
        # runs out; a survivor is reported as failed (caller may want to SIGKILL).
        deadline = time.monotonic() + _SIGTERM_WAIT_SECONDS
        exited = False
        while not exited and time.monotonic() < deadline:
            try:
                killer(pid, 0)
            except OSError:
                # Any probe error (no such process, or no longer permitted to
                # signal the pid) is treated as "the process is gone".
                exited = True
            else:
                sleeper(0.1)

        if exited:
            killed.append(orphan)
        else:
            failed.append({**orphan, "error": "still_alive_after_sigterm"})

    report["killed"] = killed
    report["failed"] = failed
    report["dry_run"] = False
    report["scanned_at"] = scanned_at
    return report
167
+
168
+
169
def format_cleanup_orphans(result: dict[str, Any]) -> str:
    """Render a cleanup report dict as a newline-joined, human-readable summary.

    The summary has a header with the scan timestamp, the scanned and orphan
    counts, then either a dry-run notice or the killed/failed counts, followed
    by one line per orphan. Missing keys fall back to ``0``, an empty list, or
    ``?`` for an absent workspace.
    """
    orphans = result.get("orphans") or []
    report = [
        f"Coordinator orphan scan @ {result.get('scanned_at')}",
        f" scanned: {result.get('scanned', 0)} coordinator processes",
        f" orphans: {len(orphans)}",
    ]
    if not result.get("dry_run"):
        report.append(f" killed: {len(result.get('killed') or [])}")
        report.append(f" failed: {len(result.get('failed') or [])}")
    else:
        report.append(" mode: DRY-RUN (no SIGTERM sent; re-run with --confirm)")
    report.extend(
        f" PID {orphan['pid']} etime={orphan['etime']} "
        f"workspace={orphan.get('workspace') or '?'} reason={orphan.get('reason')}"
        for orphan in orphans
    )
    return "\n".join(report)
186
+
187
+
188
+ __all__ = [
189
+ "cleanup_orphan_coordinators",
190
+ "classify_orphan",
191
+ "find_coordinator_processes",
192
+ "format_cleanup_orphans",
193
+ ]
@@ -1,6 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import json
4
+ import os
4
5
  from datetime import datetime, timezone
5
6
  from pathlib import Path
6
7
  from typing import Any
@@ -8,6 +9,15 @@ from typing import Any
8
9
  from team_agent.paths import logs_dir
9
10
 
10
11
 
12
+ # Stage 14 (Gap 36a) — bounded retention. 5 MB cap × 5 archives = 25 MB worst-case.
13
+ # Mac mini 2026-05-26 evidence: events.jsonl grew to 28 MB / 128k lines in one day with
14
+ # unbounded retention; coordinator's tick-time scan over the file was a ~22% CPU hot path.
15
+ # Rotation keeps the current segment small so reads are cheap; archives preserve forensic
16
+ # history but are NOT consulted by hot-path scans.
17
+ EVENT_LOG_ROTATE_BYTES = 5 * 1024 * 1024
18
+ EVENT_LOG_ARCHIVE_KEEP = 5
19
+
20
+
11
21
  class EventLog:
12
22
  def __init__(self, workspace: Path):
13
23
  self.workspace = workspace
@@ -20,11 +30,14 @@ class EventLog:
20
30
  "event": event_type,
21
31
  **fields,
22
32
  }
33
+ self._maybe_rotate()
23
34
  with self.path.open("a", encoding="utf-8") as f:
24
35
  f.write(json.dumps(event, ensure_ascii=False, sort_keys=True) + "\n")
25
36
  return event
26
37
 
27
38
  def tail(self, limit: int = 20) -> list[dict[str, Any]]:
39
+ # Hot-path scan reads only the current segment. Archives are forensic; if a
40
+ # caller genuinely needs longer history it can iterate _archive_paths explicitly.
28
41
  if not self.path.exists():
29
42
  return []
30
43
  lines = self.path.read_text(encoding="utf-8").splitlines()[-limit:]
@@ -35,3 +48,37 @@ class EventLog:
35
48
  except json.JSONDecodeError:
36
49
  out.append({"raw": line})
37
50
  return out
51
+
52
def _maybe_rotate(self) -> None:
    """Rotate the current log segment once it reaches ``EVENT_LOG_ROTATE_BYTES``.

    Method of ``EventLog``. Does nothing when the log file does not exist or is
    still below the size threshold. Otherwise the oldest archive slot is removed,
    each remaining archive moves up one index (highest first), and the current
    file becomes archive ``.1``. Every filesystem step is best-effort: an
    ``OSError`` is swallowed so rotation never raises into the caller.
    """
    try:
        current_size = self.path.stat().st_size
    except FileNotFoundError:
        return
    if current_size < EVENT_LOG_ROTATE_BYTES:
        return

    # Free the last slot first so the archive count never exceeds the keep budget.
    overflow = self._archive_path(EVENT_LOG_ARCHIVE_KEEP)
    if overflow.exists():
        try:
            overflow.unlink()
        except OSError:
            pass

    # Walk from the highest surviving index down so no archive is overwritten
    # before it has been moved: .4 -> .5, .3 -> .4, ..., .1 -> .2.
    for slot in reversed(range(1, EVENT_LOG_ARCHIVE_KEEP)):
        source = self._archive_path(slot)
        if not source.exists():
            continue
        try:
            os.replace(source, self._archive_path(slot + 1))
        except OSError:
            pass

    # Finally the live segment becomes the newest archive.
    try:
        os.replace(self.path, self._archive_path(1))
    except OSError:
        pass
79
+
80
+ def _archive_path(self, index: int) -> Path:
81
+ return self.path.with_name(f"{self.path.name}.{index}")
82
+
83
def _archive_paths(self) -> list[Path]:
    """Return the archive paths for indices ``1`` through ``EVENT_LOG_ARCHIVE_KEEP``, in order.

    Method of ``EventLog``. Paths are computed only; they are not checked for existence.
    """
    indices = range(1, EVENT_LOG_ARCHIVE_KEEP + 1)
    return list(map(self._archive_path, indices))