@team-agent/installer 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/team_agent/cli/__init__.py +2 -0
- package/src/team_agent/cli/commands.py +22 -3
- package/src/team_agent/cli/parser.py +40 -1
- package/src/team_agent/coordinator/__main__.py +21 -2
- package/src/team_agent/coordinator/lifecycle.py +23 -0
- package/src/team_agent/diagnose/orphan_cleanup.py +193 -0
- package/src/team_agent/events.py +47 -0
- package/src/team_agent/leader/__init__.py +273 -60
- package/src/team_agent/lifecycle/agents.py +54 -2
- package/src/team_agent/lifecycle/operations.py +86 -9
- package/src/team_agent/lifecycle/paste_buffer_hygiene.py +39 -0
- package/src/team_agent/lifecycle/start.py +3 -0
- package/src/team_agent/message_store/leader_notification_log.py +132 -0
- package/src/team_agent/message_store/result_watchers.py +144 -1
- package/src/team_agent/message_store/schema.py +23 -0
- package/src/team_agent/messaging/delivery.py +10 -0
- package/src/team_agent/messaging/idle_alerts.py +227 -21
- package/src/team_agent/messaging/leader.py +166 -6
- package/src/team_agent/messaging/leader_panes.py +193 -23
- package/src/team_agent/messaging/owner_bypass.py +29 -0
- package/src/team_agent/messaging/result_delivery.py +219 -4
- package/src/team_agent/messaging/results.py +12 -21
- package/src/team_agent/messaging/scheduler.py +22 -2
- package/src/team_agent/messaging/send.py +9 -2
- package/src/team_agent/messaging/session_drift.py +94 -0
- package/src/team_agent/runtime.py +22 -14
- package/src/team_agent/rust_core.py +157 -3
- package/src/team_agent/state.py +167 -10
- package/src/team_agent/status/inbox.py +33 -3
package/package.json
CHANGED
|
@@ -56,6 +56,7 @@ from team_agent.cli.commands import (
|
|
|
56
56
|
cmd_remove_agent,
|
|
57
57
|
cmd_stuck_list,
|
|
58
58
|
cmd_stuck_cancel,
|
|
59
|
+
cmd_acknowledge_idle,
|
|
59
60
|
cmd_allow_peer_talk,
|
|
60
61
|
cmd_advanced,
|
|
61
62
|
cmd_install_skill,
|
|
@@ -122,6 +123,7 @@ __all__ = [
|
|
|
122
123
|
'cmd_remove_agent',
|
|
123
124
|
'cmd_stuck_list',
|
|
124
125
|
'cmd_stuck_cancel',
|
|
126
|
+
'cmd_acknowledge_idle',
|
|
125
127
|
'cmd_allow_peer_talk',
|
|
126
128
|
'cmd_advanced',
|
|
127
129
|
'cmd_install_skill',
|
|
@@ -119,9 +119,10 @@ def cmd_peek(args: argparse.Namespace) -> dict[str, Any]:
|
|
|
119
119
|
|
|
120
120
|
|
|
121
121
|
def cmd_inbox(args: argparse.Namespace) -> dict[str, Any]:
    """Show an agent's inbox, optionally filtered by the ``--since`` option.

    ``since`` is read with ``getattr`` so a namespace that was built without
    the ``--since`` option still works; it then defaults to ``None``.

    With ``--json`` the call is routed to ``runtime.inbox``; otherwise to
    ``runtime.format_inbox``. Both receive the resolved workspace, the agent
    id, ``limit`` and ``since``.
    """
    since = getattr(args, "since", None)
    # Both runtime entry points take the same arguments; pick one and call it once.
    reader = runtime.inbox if args.json else runtime.format_inbox
    return reader(Path(args.workspace).resolve(), args.agent, limit=args.limit, since=since)
|
|
125
126
|
|
|
126
127
|
|
|
127
128
|
def cmd_sessions(args: argparse.Namespace) -> dict[str, Any]:
|
|
@@ -136,6 +137,14 @@ def cmd_takeover(args: argparse.Namespace) -> dict[str, Any]:
|
|
|
136
137
|
return runtime.takeover(Path(args.workspace).resolve(), team=args.team, confirm=args.confirm)
|
|
137
138
|
|
|
138
139
|
|
|
140
|
+
def cmd_claim_leader(args: argparse.Namespace) -> dict[str, Any]:
    """Forward the ``claim-leader`` subcommand to ``runtime.claim_leader``.

    The workspace argument is resolved to an absolute path first; ``team`` and
    ``confirm`` are passed through unchanged.
    """
    workspace = Path(args.workspace).resolve()
    return runtime.claim_leader(workspace, team=args.team, confirm=args.confirm)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def cmd_identity(args: argparse.Namespace) -> dict[str, Any]:
    """Forward the ``identity`` subcommand to ``runtime.leader_identity``.

    The workspace argument is resolved to an absolute path first; ``team`` is
    passed through unchanged.
    """
    workspace = Path(args.workspace).resolve()
    return runtime.leader_identity(workspace, team=args.team)
|
|
146
|
+
|
|
147
|
+
|
|
139
148
|
def cmd_send(args: argparse.Namespace) -> dict[str, Any]:
|
|
140
149
|
target = _send_target(args)
|
|
141
150
|
return runtime.send_message(
|
|
@@ -190,7 +199,13 @@ def cmd_validate_result(args: argparse.Namespace) -> dict[str, Any]:
|
|
|
190
199
|
return {"ok": True, "task_id": envelope["task_id"], "agent_id": envelope["agent_id"], "status": envelope["status"]}
|
|
191
200
|
|
|
192
201
|
|
|
193
|
-
def cmd_doctor(args: argparse.Namespace) -> dict[str, Any] | str:
    """Run ``doctor``, or the orphan-coordinator scan when ``--cleanup-orphans`` is set.

    Without ``--cleanup-orphans`` this calls ``runtime.doctor`` with the
    resolved spec path (or ``None`` when no spec was given).

    With ``--cleanup-orphans`` it calls ``cleanup_orphan_coordinators``,
    passing ``confirm`` as a bool, and returns the raw result dict under
    ``--json`` or the text from ``format_cleanup_orphans`` otherwise.
    ``cleanup_orphans`` and ``confirm`` are read with ``getattr`` so
    namespaces lacking those attributes fall back to ``False``.
    """
    if not getattr(args, "cleanup_orphans", False):
        spec = Path(args.spec).resolve() if args.spec else None
        return runtime.doctor(spec)

    # Imported here, as in the original, rather than at module import time.
    from team_agent.diagnose.orphan_cleanup import cleanup_orphan_coordinators, format_cleanup_orphans

    apply_changes = bool(getattr(args, "confirm", False))
    result = cleanup_orphan_coordinators(confirm=apply_changes)
    return result if args.json else format_cleanup_orphans(result)
|
|
196
211
|
|
|
@@ -273,6 +288,10 @@ def cmd_stuck_cancel(args: argparse.Namespace) -> dict[str, Any]:
|
|
|
273
288
|
)
|
|
274
289
|
|
|
275
290
|
|
|
291
|
+
def cmd_acknowledge_idle(args: argparse.Namespace) -> dict[str, Any]:
    """Forward the ``acknowledge-idle`` subcommand to ``runtime.acknowledge_idle``.

    The workspace argument is resolved to an absolute path first; ``team`` is
    passed through unchanged.
    """
    workspace = Path(args.workspace).resolve()
    return runtime.acknowledge_idle(workspace, team=args.team)
|
|
293
|
+
|
|
294
|
+
|
|
276
295
|
def cmd_allow_peer_talk(args: argparse.Namespace) -> dict[str, Any]:
    """Forward the two agent ids to ``runtime.allow_peer_talk``.

    The workspace argument is resolved to an absolute path first;
    ``agent_a`` and ``agent_b`` are passed positionally, in that order.
    """
    workspace = Path(args.workspace).resolve()
    return runtime.allow_peer_talk(workspace, args.agent_a, args.agent_b)
|
|
278
297
|
|
|
@@ -30,6 +30,8 @@ from team_agent.cli.commands import (
|
|
|
30
30
|
cmd_sessions,
|
|
31
31
|
cmd_attach_leader,
|
|
32
32
|
cmd_takeover,
|
|
33
|
+
cmd_claim_leader,
|
|
34
|
+
cmd_identity,
|
|
33
35
|
cmd_send,
|
|
34
36
|
cmd_collect,
|
|
35
37
|
cmd_diagnose,
|
|
@@ -46,6 +48,7 @@ from team_agent.cli.commands import (
|
|
|
46
48
|
cmd_remove_agent,
|
|
47
49
|
cmd_stuck_list,
|
|
48
50
|
cmd_stuck_cancel,
|
|
51
|
+
cmd_acknowledge_idle,
|
|
49
52
|
cmd_allow_peer_talk,
|
|
50
53
|
cmd_advanced,
|
|
51
54
|
cmd_install_skill,
|
|
@@ -208,6 +211,12 @@ def main(argv: list[str] | None = None) -> None:
|
|
|
208
211
|
p.add_argument("agent")
|
|
209
212
|
p.add_argument("--workspace", default=".")
|
|
210
213
|
p.add_argument("--limit", type=int, default=20)
|
|
214
|
+
p.add_argument(
|
|
215
|
+
"--since",
|
|
216
|
+
help="ISO 8601 timestamp; only show messages created at-or-after this time. "
|
|
217
|
+
"Use the timestamp from claim-leader's inbox_hint to retrieve messages "
|
|
218
|
+
"missed during a prior ambiguous-leader state.",
|
|
219
|
+
)
|
|
211
220
|
add_json(p)
|
|
212
221
|
p.set_defaults(func=cmd_inbox)
|
|
213
222
|
|
|
@@ -230,6 +239,19 @@ def main(argv: list[str] | None = None) -> None:
|
|
|
230
239
|
add_json(p)
|
|
231
240
|
p.set_defaults(func=cmd_takeover)
|
|
232
241
|
|
|
242
|
+
p = sub.add_parser("claim-leader", help="Claim this pane as leader after ambiguous leader recovery")
|
|
243
|
+
p.add_argument("--workspace", default=".")
|
|
244
|
+
p.add_argument("--team", help="Explicit team/session selector when a workspace has multiple teams")
|
|
245
|
+
p.add_argument("--confirm", action="store_true", help="Apply the claim; without this, show a dry-run summary")
|
|
246
|
+
add_json(p)
|
|
247
|
+
p.set_defaults(func=cmd_claim_leader)
|
|
248
|
+
|
|
249
|
+
p = sub.add_parser("identity", help="Show leader identity diagnostics")
|
|
250
|
+
p.add_argument("--workspace", default=".")
|
|
251
|
+
p.add_argument("--team", help="Explicit team/session selector when a workspace has multiple teams")
|
|
252
|
+
add_json(p)
|
|
253
|
+
p.set_defaults(func=cmd_identity)
|
|
254
|
+
|
|
233
255
|
p = sub.add_parser(
|
|
234
256
|
"send",
|
|
235
257
|
help="Send a message to an agent, task assignee, or attached leader",
|
|
@@ -288,6 +310,17 @@ def main(argv: list[str] | None = None) -> None:
|
|
|
288
310
|
|
|
289
311
|
p = sub.add_parser("doctor", help="Check local dependencies, providers, auth hints, tmux, and MCP")
|
|
290
312
|
p.add_argument("spec", nargs="?")
|
|
313
|
+
p.add_argument(
|
|
314
|
+
"--cleanup-orphans",
|
|
315
|
+
action="store_true",
|
|
316
|
+
help="Scan for orphan team_agent.coordinator processes pointing at non-existent or "
|
|
317
|
+
"ephemeral-tempdir workspaces (dry-run unless --confirm is also passed).",
|
|
318
|
+
)
|
|
319
|
+
p.add_argument(
|
|
320
|
+
"--confirm",
|
|
321
|
+
action="store_true",
|
|
322
|
+
help="With --cleanup-orphans: send SIGTERM to each orphan (default is dry-run).",
|
|
323
|
+
)
|
|
291
324
|
add_json(p)
|
|
292
325
|
p.set_defaults(func=cmd_doctor)
|
|
293
326
|
|
|
@@ -372,6 +405,12 @@ def main(argv: list[str] | None = None) -> None:
|
|
|
372
405
|
add_json(p)
|
|
373
406
|
p.set_defaults(func=cmd_stuck_cancel)
|
|
374
407
|
|
|
408
|
+
p = sub.add_parser("acknowledge-idle", help="Suppress idle-fallback reminders for this team for a bounded window (default 30 minutes)")
|
|
409
|
+
p.add_argument("team", nargs="?", help="Explicit team/session target when a workspace has multiple teams")
|
|
410
|
+
p.add_argument("--workspace", default=".")
|
|
411
|
+
add_json(p)
|
|
412
|
+
p.set_defaults(func=cmd_acknowledge_idle)
|
|
413
|
+
|
|
375
414
|
p = sub.add_parser("install-skill", help=argparse.SUPPRESS)
|
|
376
415
|
p.add_argument("--target", choices=["codex", "claude", "all"], default="codex")
|
|
377
416
|
p.add_argument("--dest", help="Explicit destination directory; overrides --target")
|
|
@@ -422,7 +461,7 @@ def main(argv: list[str] | None = None) -> None:
|
|
|
422
461
|
sub._choices_actions = [ # type: ignore[attr-defined]
|
|
423
462
|
action for action in sub._choices_actions if action.help != argparse.SUPPRESS # type: ignore[attr-defined]
|
|
424
463
|
]
|
|
425
|
-
sub.metavar = "{codex,claude,quick-start,send,status,approvals,inbox,shutdown,restart,start-agent,stop-agent,reset-agent,add-agent,fork-agent,remove-agent,stuck-list,stuck-cancel,doctor}"
|
|
464
|
+
sub.metavar = "{codex,claude,quick-start,send,status,approvals,inbox,takeover,claim-leader,identity,shutdown,restart,start-agent,stop-agent,reset-agent,add-agent,fork-agent,remove-agent,stuck-list,stuck-cancel,acknowledge-idle,doctor}"
|
|
426
465
|
|
|
427
466
|
args = parser.parse_args(raw_argv)
|
|
428
467
|
try:
|
|
@@ -38,7 +38,23 @@ def main(argv: list[str] | None = None) -> None:
|
|
|
38
38
|
signal.signal(signal.SIGINT, _stop)
|
|
39
39
|
|
|
40
40
|
interval = args.tick_interval if args.tick_interval is not None else _tick_interval(workspace)
|
|
41
|
+
initial_ppid = os.getppid()
|
|
41
42
|
while not STOP:
|
|
43
|
+
# Stage 14 (Gap 37b) — orphan self-detection. If our original parent (test harness,
|
|
44
|
+
# shell, or supervisor) died, our ppid is reparented to 1 (or to a launchd shim on
|
|
45
|
+
# macOS). When that happens AND the workspace no longer exists on disk, we are an
|
|
46
|
+
# orphan from a torn-down test environment and must self-terminate so we don't
|
|
47
|
+
# accumulate (today's evidence: 35 orphans pointing at /var/folders/...team-agent-
|
|
48
|
+
# watcher-dedupe-* paths long since cleaned up).
|
|
49
|
+
current_ppid = os.getppid()
|
|
50
|
+
if current_ppid != initial_ppid and current_ppid == 1 and not workspace.exists():
|
|
51
|
+
event_log.write(
|
|
52
|
+
"coordinator.orphan_self_terminate",
|
|
53
|
+
initial_ppid=initial_ppid,
|
|
54
|
+
current_ppid=current_ppid,
|
|
55
|
+
workspace=str(workspace),
|
|
56
|
+
)
|
|
57
|
+
break
|
|
42
58
|
result = runtime.coordinator_tick(workspace)
|
|
43
59
|
if result.get("stop") or args.once:
|
|
44
60
|
break
|
|
@@ -46,18 +62,21 @@ def main(argv: list[str] | None = None) -> None:
|
|
|
46
62
|
event_log.write("coordinator.exit", stop=STOP)
|
|
47
63
|
|
|
48
64
|
|
|
65
|
+
DEFAULT_TICK_INTERVAL_SEC = 5.0 # Stage 14 (Gap 36c) — bumped from 2.0 (2.5x less CPU)
|
|
66
|
+
|
|
67
|
+
|
|
49
68
|
def _tick_interval(workspace: Path) -> float:
    """Return the coordinator tick interval, in seconds, for *workspace*.

    The spec file is located through the ``spec_path`` entry of the runtime
    state, falling back to ``<workspace>/team.spec.yaml``. If that file exists
    and loads, ``runtime.tick_interval_sec`` from the spec is returned as a
    float, defaulting to ``DEFAULT_TICK_INTERVAL_SEC`` when the key is absent.

    If the spec is missing, or anything goes wrong while loading or converting
    it, ``DEFAULT_TICK_INTERVAL_SEC`` is returned instead.
    """
    runtime_state = load_runtime_state(workspace)
    fallback_spec = workspace / "team.spec.yaml"
    spec_path = Path(runtime_state.get("spec_path", fallback_spec))
    if spec_path.exists():
        try:
            runtime_section = load_spec(spec_path).get("runtime", {})
            return float(runtime_section.get("tick_interval_sec", DEFAULT_TICK_INTERVAL_SEC))
        except Exception:
            # Deliberate best effort: an unreadable or malformed spec must not
            # stop the coordinator from starting, so fall through to the default.
            pass
    # Ensure schema exists even before launch; this makes doctor/tick diagnostics deterministic.
    MessageStore(workspace)
    return DEFAULT_TICK_INTERVAL_SEC
|
|
61
80
|
|
|
62
81
|
|
|
63
82
|
if __name__ == "__main__":
|
|
@@ -265,6 +265,7 @@ def coordinator_tick(workspace: Path) -> dict[str, Any]:
|
|
|
265
265
|
detect_idle_fallbacks,
|
|
266
266
|
)
|
|
267
267
|
from team_agent.messaging.activity_detector import detect_compaction_degradation
|
|
268
|
+
from team_agent.messaging.session_drift import detect_session_drift
|
|
268
269
|
from team_agent.state import load_runtime_state, save_runtime_state
|
|
269
270
|
state = load_runtime_state(workspace)
|
|
270
271
|
event_log = EventLog(workspace)
|
|
@@ -304,8 +305,29 @@ def coordinator_tick(workspace: Path) -> dict[str, Any]:
|
|
|
304
305
|
)
|
|
305
306
|
if result.get("event") and result.get("event") != "compaction_threshold_crossed.none":
|
|
306
307
|
compaction_results.append(result)
|
|
308
|
+
drift_results: list[dict[str, Any]] = []
|
|
309
|
+
for agent_id, agent_state in state.get("agents", {}).items():
|
|
310
|
+
if str(agent_state.get("provider") or "") != "codex":
|
|
311
|
+
continue
|
|
312
|
+
scrollback = str((captures.get(agent_id) or {}).get("scrollback") or "")
|
|
313
|
+
if not scrollback:
|
|
314
|
+
continue
|
|
315
|
+
drift = detect_session_drift(
|
|
316
|
+
workspace, state, event_log,
|
|
317
|
+
agent_id=agent_id, agent_state=agent_state, scrollback=scrollback,
|
|
318
|
+
)
|
|
319
|
+
if drift:
|
|
320
|
+
drift_results.append(drift)
|
|
307
321
|
save_runtime_state(workspace, state)
|
|
308
322
|
results = _collect_results_and_notify_watchers(workspace, event_log)
|
|
323
|
+
# Stage 12: prune the dedupe log every tick — cheap O(n) delete bounded by 24h window.
|
|
324
|
+
from team_agent.message_store.leader_notification_log import prune_leader_notification_log
|
|
325
|
+
try:
|
|
326
|
+
pruned = prune_leader_notification_log(store, max_age_hours=24)
|
|
327
|
+
if pruned:
|
|
328
|
+
event_log.write("leader_notification.log_pruned", removed=pruned)
|
|
329
|
+
except Exception as exc:
|
|
330
|
+
event_log.write("leader_notification.prune_failed", error=str(exc))
|
|
309
331
|
return {
|
|
310
332
|
"ok": True,
|
|
311
333
|
"stop": False,
|
|
@@ -315,5 +337,6 @@ def coordinator_tick(workspace: Path) -> dict[str, Any]:
|
|
|
315
337
|
"idle_alerts": idle_alerts,
|
|
316
338
|
"deadlock_alerts": deadlock_alerts,
|
|
317
339
|
"compaction": compaction_results,
|
|
340
|
+
"session_drift": drift_results,
|
|
318
341
|
"results": results,
|
|
319
342
|
}
|
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
"""Stage 14 (Gap 37a) — `team-agent doctor --cleanup-orphans` implementation.
|
|
2
|
+
|
|
3
|
+
Scans `ps` for processes matching `team_agent.coordinator --workspace <path>` and
|
|
4
|
+
classifies any whose workspace path no longer exists (or matches the test-tempdir
|
|
5
|
+
pattern) as an orphan. Dry-run by default; --confirm sends SIGTERM.
|
|
6
|
+
|
|
7
|
+
Mac mini 2026-05-26 evidence: 35 orphan coordinator processes alive simultaneously
|
|
8
|
+
pointing at /var/folders/.../T/team-agent-watcher-dedupe-* paths that had been removed
|
|
9
|
+
hours earlier. Each holds a long-lived Python interpreter + SQLite connection.
|
|
10
|
+
"""
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import os
|
|
14
|
+
import re
|
|
15
|
+
import signal
|
|
16
|
+
import subprocess
|
|
17
|
+
import time
|
|
18
|
+
from datetime import datetime, timezone
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
from typing import Any
|
|
21
|
+
|
|
22
|
+
# Pattern: argv contains "team_agent.coordinator --workspace <path>" anywhere.
# The capture group is only the FIRST whitespace-delimited token of the path,
# because `ps` joins argv with spaces; _workspace_from_cmdline() widens it again
# for workspaces whose path contains spaces.
_COORDINATOR_ARGV_RE = re.compile(
    r"team_agent\.coordinator(?:\.__main__)?(?:\s+|.*?)\s--workspace\s+(\S+)"
)
# Test-tempdir patterns that indicate the workspace is ephemeral and almost certainly orphan.
_EPHEMERAL_PATH_HINTS = (
    "team-agent-watcher-dedupe-",
    "team-agent-gap",
    "team-agent-stage",
    "team-agent-orchestrator-",
    "team-agent-rm-",
    "team-agent-claim-",
    "team-agent-hotfix",
    "team-agent-multi",
    "team-agent-progress-",
    "team-agent-fanout-",
    "team-agent-in-flight-",
    "team-agent-test-",
)
_SIGTERM_WAIT_SECONDS = 3.0


def _workspace_from_cmdline(cmdline: str, match: re.Match) -> str:
    """Recover the ``--workspace`` value from a flattened ``ps`` command line.

    ``ps`` prints argv joined by spaces, so a workspace such as
    ``/Users/me/My Projects/app`` is indistinguishable from several arguments
    and the regex only captures ``/Users/me/My``. Reporting that truncated
    path would make a live coordinator look like its workspace is missing.

    The candidate is therefore grown one token at a time, stopping at the
    first later token that looks like an option (starts with ``-``). The
    longest candidate that exists on disk is returned. When no candidate
    exists, the first token is returned, which is exactly what the regex
    captured.
    """
    first_token = match.group(1)
    tail = cmdline[match.start(1):]
    best = first_token
    for token in re.finditer(r"\S+", tail):
        if token.start() > 0 and token.group().startswith("-"):
            break
        # Slice the original text so runs of spaces inside the path survive.
        candidate = tail[: token.end()]
        try:
            if Path(candidate).exists():
                best = candidate
        except (OSError, ValueError):
            # An unusable candidate (e.g. over-long name) is simply not a match.
            continue
    return best


def find_coordinator_processes(*, runner=subprocess.run) -> list[dict[str, Any]]:
    """Return one ``{pid, etime, cmdline, workspace}`` dict per running coordinator.

    Processes are discovered by running ``ps`` through *runner* (injectable,
    defaults to ``subprocess.run``) and keeping lines whose command contains
    ``team_agent.coordinator``. The ``ps`` invocation itself and the current
    process are excluded.

    ``workspace`` is ``None`` when the command line has no parseable
    ``--workspace <path>``; such rows are still returned so callers can report
    them. An empty list is returned when ``ps`` cannot be run, times out,
    exits non-zero or prints nothing.
    """
    try:
        proc = runner(
            ["ps", "-Awwo", "pid=,etime=,command="],
            text=True,
            capture_output=True,
            timeout=5,
            check=False,
        )
    except (subprocess.TimeoutExpired, OSError):
        # OSError already covers FileNotFoundError (no `ps` binary).
        return []
    if proc.returncode != 0 or not proc.stdout:
        return []
    rows: list[dict[str, Any]] = []
    own_pid = os.getpid()
    for line in proc.stdout.splitlines():
        # Columns are pid, etime, then the rest of the line as the command.
        parts = line.strip().split(None, 2)
        if len(parts) < 3:
            continue
        pid_s, etime, cmdline = parts
        if "team_agent.coordinator" not in cmdline:
            continue
        if "ps -Awwo" in cmdline:
            continue
        try:
            pid = int(pid_s)
        except ValueError:
            continue
        if pid == own_pid:
            continue
        match = _COORDINATOR_ARGV_RE.search(cmdline)
        workspace = _workspace_from_cmdline(cmdline, match) if match else None
        rows.append({
            "pid": pid,
            "etime": etime,
            "cmdline": cmdline,
            "workspace": workspace,
        })
    return rows
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def classify_orphan(entry: dict[str, Any]) -> tuple[bool, str]:
    """Decide whether a coordinator process entry is an orphan.

    Returns ``(is_orphan, reason)``:

    * ``(False, "cmdline_unparsed")`` when the entry has no workspace;
    * ``(True, "workspace_path_missing")`` when the workspace path does not
      exist on disk;
    * ``(True, "ephemeral_tempdir_pattern:<hint>")`` when the path exists but
      contains one of ``_EPHEMERAL_PATH_HINTS`` (first matching hint wins);
    * ``(False, "workspace_alive")`` otherwise.
    """
    path_text = entry.get("workspace")
    if not path_text:
        return False, "cmdline_unparsed"
    if not Path(path_text).exists():
        return True, "workspace_path_missing"
    # Hints are checked in declaration order; the first one found is reported.
    matched = next((hint for hint in _EPHEMERAL_PATH_HINTS if hint in path_text), None)
    if matched is None:
        return False, "workspace_alive"
    return True, f"ephemeral_tempdir_pattern:{matched}"
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _still_running(pid: int, killer) -> bool:
    """Probe *pid* with signal 0 and report whether it still looks alive.

    ``ProcessLookupError`` means the process has exited. Any other ``OSError``
    is also treated as "gone", as the original implementation did.
    NOTE(review): the likely rationale is that SIGTERM was just delivered
    successfully, so a later permission error suggests the pid was recycled —
    confirm.
    """
    try:
        killer(pid, 0)
    except OSError:
        return False
    return True


def cleanup_orphan_coordinators(
    *,
    confirm: bool = False,
    runner=subprocess.run,
    killer=os.kill,
    sleeper=time.sleep,
) -> dict[str, Any]:
    """Scan for orphan coordinators and, with *confirm*, terminate them.

    Every process returned by ``find_coordinator_processes`` is annotated with
    the ``is_orphan`` / ``reason`` verdict from ``classify_orphan``.

    Without *confirm* this is a dry run: the result has ``dry_run: True``, the
    orphan list and an ``action_required`` hint, and no signal is sent.

    With *confirm*, SIGTERM is sent to every orphan first. All signalled
    processes are then polled together against one shared deadline of
    ``_SIGTERM_WAIT_SECONDS``, so the total wait does not grow with the number
    of orphans. The result lists ``killed`` entries and ``failed`` entries
    (each with an ``error`` string), both in scan order.

    *runner*, *killer* and *sleeper* are injection points for ``ps``,
    ``os.kill`` and ``time.sleep``.
    """
    now = datetime.now(timezone.utc).isoformat()
    classified: list[dict[str, Any]] = []
    for entry in find_coordinator_processes(runner=runner):
        is_orphan, reason = classify_orphan(entry)
        classified.append({**entry, "is_orphan": is_orphan, "reason": reason})
    orphans = [entry for entry in classified if entry["is_orphan"]]
    if not confirm:
        return {
            "ok": True,
            "scanned": len(classified),
            "orphans": orphans,
            "dry_run": True,
            "scanned_at": now,
            "action_required": "re-run with --confirm to send SIGTERM",
        }

    # Phase 1: signal everything up front so the processes shut down in parallel.
    signal_errors: dict[int, str] = {}
    for entry in orphans:
        try:
            killer(entry["pid"], signal.SIGTERM)
        except OSError as exc:  # covers ProcessLookupError and PermissionError
            signal_errors[entry["pid"]] = str(exc)

    # Phase 2: poll the signalled pids against a single deadline. The probe runs
    # before the deadline check, so the final probe happens at or after the
    # deadline and a process that exits during the last sleep is still seen as gone.
    alive = {entry["pid"] for entry in orphans if entry["pid"] not in signal_errors}
    deadline = time.monotonic() + _SIGTERM_WAIT_SECONDS
    while alive:
        alive = {pid for pid in alive if _still_running(pid, killer)}
        if not alive or time.monotonic() >= deadline:
            break
        sleeper(0.1)

    # Phase 3: report per pid, in scan order (caller may want to SIGKILL failures).
    killed: list[dict[str, Any]] = []
    failed: list[dict[str, Any]] = []
    for entry in orphans:
        pid = entry["pid"]
        if pid in signal_errors:
            failed.append({**entry, "error": signal_errors[pid]})
        elif pid in alive:
            failed.append({**entry, "error": "still_alive_after_sigterm"})
        else:
            killed.append(entry)
    return {
        "ok": True,
        "scanned": len(classified),
        "orphans": orphans,
        "killed": killed,
        "failed": failed,
        "dry_run": False,
        "scanned_at": now,
    }
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def format_cleanup_orphans(result: dict[str, Any]) -> str:
    """Render a ``cleanup_orphan_coordinators`` result as a plain-text report.

    The report has a three-line header (scan time, processes scanned, orphan
    count), then either a dry-run notice or the killed/failed counts, then one
    line per orphan. Missing or ``None`` list values are treated as empty, and
    an orphan without a workspace is shown as ``?``.
    """
    orphans = result.get("orphans") or []
    report = [
        f"Coordinator orphan scan @ {result.get('scanned_at')}",
        f" scanned: {result.get('scanned', 0)} coordinator processes",
        f" orphans: {len(orphans)}",
    ]
    if result.get("dry_run"):
        report.append(" mode: DRY-RUN (no SIGTERM sent; re-run with --confirm)")
    else:
        killed_count = len(result.get("killed") or [])
        failed_count = len(result.get("failed") or [])
        report.append(f" killed: {killed_count}")
        report.append(f" failed: {failed_count}")
    report.extend(
        f" PID {orphan['pid']} etime={orphan['etime']} "
        f"workspace={orphan.get('workspace') or '?'} reason={orphan.get('reason')}"
        for orphan in orphans
    )
    return "\n".join(report)
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
__all__ = [
|
|
189
|
+
"cleanup_orphan_coordinators",
|
|
190
|
+
"classify_orphan",
|
|
191
|
+
"find_coordinator_processes",
|
|
192
|
+
"format_cleanup_orphans",
|
|
193
|
+
]
|
package/src/team_agent/events.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import json
|
|
4
|
+
import os
|
|
4
5
|
from datetime import datetime, timezone
|
|
5
6
|
from pathlib import Path
|
|
6
7
|
from typing import Any
|
|
@@ -8,6 +9,15 @@ from typing import Any
|
|
|
8
9
|
from team_agent.paths import logs_dir
|
|
9
10
|
|
|
10
11
|
|
|
12
|
+
# Stage 14 (Gap 36a) — bounded retention. 5 MB cap × 5 archives = 25 MB worst-case.
|
|
13
|
+
# Mac mini 2026-05-26 evidence: events.jsonl grew to 28 MB / 128k lines in one day with
|
|
14
|
+
# unbounded retention; coordinator's tick-time scan over the file was a ~22% CPU hot path.
|
|
15
|
+
# Rotation keeps the current segment small so reads are cheap; archives preserve forensic
|
|
16
|
+
# history but are NOT consulted by hot-path scans.
|
|
17
|
+
EVENT_LOG_ROTATE_BYTES = 5 * 1024 * 1024
|
|
18
|
+
EVENT_LOG_ARCHIVE_KEEP = 5
|
|
19
|
+
|
|
20
|
+
|
|
11
21
|
class EventLog:
|
|
12
22
|
def __init__(self, workspace: Path):
|
|
13
23
|
self.workspace = workspace
|
|
@@ -20,11 +30,14 @@ class EventLog:
|
|
|
20
30
|
"event": event_type,
|
|
21
31
|
**fields,
|
|
22
32
|
}
|
|
33
|
+
self._maybe_rotate()
|
|
23
34
|
with self.path.open("a", encoding="utf-8") as f:
|
|
24
35
|
f.write(json.dumps(event, ensure_ascii=False, sort_keys=True) + "\n")
|
|
25
36
|
return event
|
|
26
37
|
|
|
27
38
|
def tail(self, limit: int = 20) -> list[dict[str, Any]]:
|
|
39
|
+
# Hot-path scan reads only the current segment. Archives are forensic; if a
|
|
40
|
+
# caller genuinely needs longer history it can iterate _archive_paths explicitly.
|
|
28
41
|
if not self.path.exists():
|
|
29
42
|
return []
|
|
30
43
|
lines = self.path.read_text(encoding="utf-8").splitlines()[-limit:]
|
|
@@ -35,3 +48,37 @@ class EventLog:
|
|
|
35
48
|
except json.JSONDecodeError:
|
|
36
49
|
out.append({"raw": line})
|
|
37
50
|
return out
|
|
51
|
+
|
|
52
|
+
def _maybe_rotate(self) -> None:
|
|
53
|
+
try:
|
|
54
|
+
size = self.path.stat().st_size
|
|
55
|
+
except FileNotFoundError:
|
|
56
|
+
return
|
|
57
|
+
if size < EVENT_LOG_ROTATE_BYTES:
|
|
58
|
+
return
|
|
59
|
+
# Shift archives: events.jsonl.4 → .5, .3 → .4, …, .1 → .2, current → .1
|
|
60
|
+
# Drop the oldest if it would overflow the keep budget.
|
|
61
|
+
oldest = self._archive_path(EVENT_LOG_ARCHIVE_KEEP)
|
|
62
|
+
if oldest.exists():
|
|
63
|
+
try:
|
|
64
|
+
oldest.unlink()
|
|
65
|
+
except OSError:
|
|
66
|
+
pass
|
|
67
|
+
for idx in range(EVENT_LOG_ARCHIVE_KEEP - 1, 0, -1):
|
|
68
|
+
src = self._archive_path(idx)
|
|
69
|
+
dst = self._archive_path(idx + 1)
|
|
70
|
+
if src.exists():
|
|
71
|
+
try:
|
|
72
|
+
os.replace(src, dst)
|
|
73
|
+
except OSError:
|
|
74
|
+
pass
|
|
75
|
+
try:
|
|
76
|
+
os.replace(self.path, self._archive_path(1))
|
|
77
|
+
except OSError:
|
|
78
|
+
pass
|
|
79
|
+
|
|
80
|
+
def _archive_path(self, index: int) -> Path:
|
|
81
|
+
return self.path.with_name(f"{self.path.name}.{index}")
|
|
82
|
+
|
|
83
|
+
def _archive_paths(self) -> list[Path]:
    """Return the archive paths for slots 1 through ``EVENT_LOG_ARCHIVE_KEEP``, in that order."""
    slots = range(1, EVENT_LOG_ARCHIVE_KEEP + 1)
    return list(map(self._archive_path, slots))
|