@team-agent/installer 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. package/package.json +1 -1
  2. package/src/team_agent/cli/__init__.py +2 -0
  3. package/src/team_agent/cli/commands.py +22 -3
  4. package/src/team_agent/cli/parser.py +40 -1
  5. package/src/team_agent/coordinator/__main__.py +21 -2
  6. package/src/team_agent/coordinator/lifecycle.py +23 -0
  7. package/src/team_agent/diagnose/orphan_cleanup.py +193 -0
  8. package/src/team_agent/events.py +47 -0
  9. package/src/team_agent/leader/__init__.py +273 -60
  10. package/src/team_agent/lifecycle/agents.py +54 -2
  11. package/src/team_agent/lifecycle/operations.py +86 -9
  12. package/src/team_agent/lifecycle/paste_buffer_hygiene.py +39 -0
  13. package/src/team_agent/lifecycle/start.py +3 -0
  14. package/src/team_agent/message_store/leader_notification_log.py +132 -0
  15. package/src/team_agent/message_store/result_watchers.py +144 -1
  16. package/src/team_agent/message_store/schema.py +23 -0
  17. package/src/team_agent/messaging/delivery.py +10 -0
  18. package/src/team_agent/messaging/idle_alerts.py +227 -21
  19. package/src/team_agent/messaging/leader.py +166 -6
  20. package/src/team_agent/messaging/leader_panes.py +193 -23
  21. package/src/team_agent/messaging/owner_bypass.py +29 -0
  22. package/src/team_agent/messaging/result_delivery.py +219 -4
  23. package/src/team_agent/messaging/results.py +12 -21
  24. package/src/team_agent/messaging/scheduler.py +22 -2
  25. package/src/team_agent/messaging/send.py +9 -2
  26. package/src/team_agent/messaging/session_drift.py +94 -0
  27. package/src/team_agent/runtime.py +22 -14
  28. package/src/team_agent/rust_core.py +157 -3
  29. package/src/team_agent/state.py +167 -10
  30. package/src/team_agent/status/inbox.py +33 -3
@@ -359,36 +359,27 @@ def _refresh_leader_receiver_or_flag_rebind(
359
359
  receiver = state.get("leader_receiver") or {}
360
360
  if receiver.get("mode") != "direct_tmux":
361
361
  return state
362
- validation = _validate_leader_receiver(receiver)
362
+ owner_identity = state.get("team_owner") or None
363
+ receiver_for_validation = dict(receiver)
364
+ if owner_identity and owner_identity.get("leader_session_uuid") and not receiver_for_validation.get("leader_session_uuid"):
365
+ receiver_for_validation["leader_session_uuid"] = owner_identity["leader_session_uuid"]
366
+ validation = _validate_leader_receiver(receiver_for_validation)
363
367
  if validation.get("ok"):
364
368
  return state
365
- owner_identity = state.get("team_owner") or None
366
- rediscovered = _rediscover_leader_receiver(receiver, event_log, owner_identity)
369
+ rediscovered = _rediscover_leader_receiver(
370
+ receiver_for_validation,
371
+ event_log,
372
+ owner_identity,
373
+ invalidation_reason=validation.get("reason"),
374
+ team_id=team_state_key(state),
375
+ )
367
376
  if rediscovered.get("status") == "updated":
368
377
  state["leader_receiver"] = rediscovered["receiver"]
369
378
  if persist:
370
379
  save_runtime_state(workspace, state)
371
380
  else:
372
381
  save_team_scoped_state(workspace, state)
373
- event_log.write(
374
- "leader_receiver.rebind_applied",
375
- old_pane_id=receiver.get("pane_id"),
376
- new_pane_id=rediscovered["receiver"].get("pane_id"),
377
- reason=validation.get("reason"),
378
- source="report_result_notify",
379
- owner_identity=owner_identity,
380
- )
381
382
  return state
382
- event_log.write(
383
- "leader_receiver.rebind_required",
384
- old_pane_id=receiver.get("pane_id"),
385
- reason=validation.get("reason"),
386
- validation_error=validation.get("error"),
387
- rediscovery_status=rediscovered.get("status"),
388
- provider=receiver.get("provider"),
389
- source="report_result_notify",
390
- owner_identity=owner_identity,
391
- )
392
383
  return state
393
384
 
394
385
 
@@ -311,6 +311,16 @@ def _suppression_clear_reason(
311
311
  agent_id: str,
312
312
  entry: dict[str, Any],
313
313
  ) -> str | None:
314
+ if entry.get("manual_acknowledge"):
315
+ try:
316
+ expires_at = datetime.fromisoformat(str(entry.get("expires_at")))
317
+ except ValueError:
318
+ return "invalid_suppression_timestamp"
319
+ if expires_at.tzinfo is None:
320
+ expires_at = expires_at.replace(tzinfo=timezone.utc)
321
+ if datetime.now(timezone.utc) < expires_at:
322
+ return None
323
+ return "manual_acknowledge_expired"
314
324
  previous = entry.get("snapshot") if isinstance(entry.get("snapshot"), dict) else {}
315
325
  current = _agent_alert_snapshot(state, store, agent_id)
316
326
  if current.get("assigned_task_ids") != previous.get("assigned_task_ids"):
@@ -399,8 +409,18 @@ def _recent_restart_or_reset_event(event_log: EventLog, agent_id: str, since: da
399
409
  for event in reversed(event_log.tail(200)):
400
410
  if event.get("event") not in _RESTART_RESET_EVENTS:
401
411
  continue
402
- if event.get("agent_id") != agent_id and agent_id not in set(event.get("agents") or []):
403
- continue
412
+ if event.get("agent_id") != agent_id:
413
+ agents_field = event.get("agents") or []
414
+ agent_ids: set[str] = set()
415
+ for entry in agents_field:
416
+ if isinstance(entry, str):
417
+ agent_ids.add(entry)
418
+ elif isinstance(entry, dict):
419
+ aid = entry.get("agent_id")
420
+ if isinstance(aid, str):
421
+ agent_ids.add(aid)
422
+ if agent_id not in agent_ids:
423
+ continue
404
424
  try:
405
425
  ts = datetime.fromisoformat(str(event.get("ts")))
406
426
  except ValueError:
@@ -85,11 +85,13 @@ def _send_message_unlocked(
85
85
  return ambiguous
86
86
  state = select_runtime_state(workspace, team)
87
87
  gate = check_team_owner(state)
88
- if gate:
89
- return gate
90
88
  spec_path = Path(state.get("spec_path", workspace / "team.spec.yaml"))
91
89
  spec = load_spec(spec_path)
92
90
  event_log = EventLog(workspace)
91
+ if gate:
92
+ from team_agent.messaging.owner_bypass import apply_worker_sender_bypass
93
+ if not apply_worker_sender_bypass(state, sender, target, task_id, event_log):
94
+ return gate
93
95
  owner_team_id = team_state_key(state)
94
96
  leader_id = _leader_id(state, spec)
95
97
 
@@ -174,6 +176,11 @@ def _send_single_message_unlocked(
174
176
  if _is_leader_target(target, leader_id) and not _is_leader_sender(sender, leader_id):
175
177
  return _send_to_leader_receiver(workspace, state, leader_id, content, task_id, sender, requires_ack, event_log)
176
178
 
179
+ from team_agent.messaging.session_drift import session_drift_refusal
180
+ drift = session_drift_refusal(state, target, leader_id, sender, task_id, event_log)
181
+ if drift:
182
+ return drift
183
+
177
184
  if task_id and route_task_id:
178
185
  task = _find_task(state.get("tasks", []), task_id)
179
186
  if task.get("human_confirmation") and not task.get("human_confirmed"):
@@ -0,0 +1,94 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from datetime import datetime, timezone
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ from team_agent.events import EventLog
9
+
10
# Textual UUID in 8-4-4-4-12 hexadecimal form. The compiled pattern below is
# case-insensitive, so upper-case hex digits are accepted as well.
_UUID = r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}"

# Captures the UUID that follows one of the marker phrases
# ("Switched to thread", "resume", "thread") and at least one whitespace char.
_RESUME_THREAD_RE = re.compile(
    r"(?:Switched to thread|resume|thread)\s+(" + _UUID + r")",
    flags=re.IGNORECASE,
)
15
+
16
+
17
def extract_thread_id_from_scrollback(scrollback: str) -> str | None:
    """Return the last thread UUID mentioned in *scrollback*, lower-cased.

    All matches of ``_RESUME_THREAD_RE`` are collected and the final one in
    the text wins. ``None`` is returned when *scrollback* is empty or
    contains no match.
    """
    found = _RESUME_THREAD_RE.findall(scrollback) if scrollback else []
    return found[-1].lower() if found else None
24
+
25
+
26
def detect_session_drift(
    workspace: Path,
    state: dict[str, Any],
    event_log: EventLog,
    *,
    agent_id: str,
    agent_state: dict[str, Any],
    scrollback: str,
) -> dict[str, Any] | None:
    """Mark an agent as drifted when its scrollback shows a different thread.

    The check only applies when ``agent_state`` has provider ``"codex"``
    (case-insensitive), a non-empty ``session_id``, and a status other than
    ``"session_drift"``. The thread id is taken from *scrollback* via
    ``extract_thread_id_from_scrollback`` and compared case-insensitively
    with the stored session id.

    On mismatch a ``coordinator.session_drift_detected`` event is written,
    then ``agent_state`` is mutated in place: ``status`` becomes
    ``"session_drift"`` and a ``session_drift`` record is attached.

    ``workspace`` and ``state`` are accepted but not read here.

    Returns:
        Whatever ``event_log.write`` returned when drift was recorded,
        otherwise ``None``.
    """
    provider = str(agent_state.get("provider") or "").lower()
    if provider != "codex":
        return None

    stored = str(agent_state.get("session_id") or "").strip()
    if not stored:
        return None

    # Already flagged: avoid writing a duplicate event on every scan.
    already_flagged = str(agent_state.get("status") or "").lower() == "session_drift"
    if already_flagged:
        return None

    actual = extract_thread_id_from_scrollback(scrollback)
    if not actual or actual.lower() == stored.lower():
        return None

    remediation = "team-agent reset-agent --discard-session <agent>"
    detected_at = datetime.now(timezone.utc).isoformat()

    # The event is written before the state mutation, so a failing write
    # leaves agent_state untouched.
    event = event_log.write(
        "coordinator.session_drift_detected",
        agent_id=agent_id,
        stored_session_id=stored,
        actual_thread_id=actual,
        status="session_drift",
        provider=provider,
        ts=detected_at,
        remediation=remediation,
    )
    agent_state["status"] = "session_drift"
    agent_state["session_drift"] = {
        "stored_session_id": stored,
        "actual_thread_id": actual,
        "detected_at": detected_at,
        "remediation": remediation,
    }
    return event
67
+
68
+
69
def session_drift_refusal(
    state: dict[str, Any],
    target: str | None,
    leader_id: str | None,
    sender: str | None,
    task_id: str | None,
    event_log: EventLog,
) -> dict[str, Any] | None:
    """Refuse a send when the target agent is flagged with session drift.

    Args:
        state: Runtime state; only ``state["agents"][target]`` is read.
        target: Recipient id. Empty targets, the leader and the ``"*"``
            broadcast target are never refused here.
        leader_id: Id of the leader, compared against *target*.
        sender: Sender id, recorded in the refusal event.
        task_id: Task id, recorded in the refusal event.
        event_log: Log that receives a ``send.refused_session_drift`` event.

    Returns:
        ``None`` when the send may proceed, otherwise a refusal payload
        (``ok=False``, ``reason="session_drift"``) that carries the stored
        drift record and the reset command for *target*.
    """
    if not target or target == leader_id or target == "*":
        return None

    agent_entry = (state.get("agents") or {}).get(target) or {}
    if str(agent_entry.get("status") or "").lower() != "session_drift":
        return None

    drift_info = agent_entry.get("session_drift") or {}
    event_log.write(
        "send.refused_session_drift",
        target=target,
        sender=sender,
        task_id=task_id,
        stored_session_id=drift_info.get("stored_session_id"),
        actual_thread_id=drift_info.get("actual_thread_id"),
    )
    return {
        "ok": False,
        "status": "refused",
        "reason": "session_drift",
        "to": target,
        "action": f"team-agent reset-agent --discard-session {target}",
        "session_drift": drift_info,
    }
92
+
93
+
94
+ __all__ = ["detect_session_drift", "extract_thread_id_from_scrollback", "session_drift_refusal"]
@@ -67,6 +67,8 @@ from team_agent.display import (
67
67
  from team_agent.leader import (
68
68
  attach_leader,
69
69
  attach_leader_to_state as _attach_leader_to_state,
70
+ claim_leader,
71
+ leader_identity,
70
72
  leader_session_name as _leader_session_name,
71
73
  leader_start_plan,
72
74
  start_leader,
@@ -220,6 +222,7 @@ from team_agent.state import (
220
222
  save_runtime_state,
221
223
  save_team_scoped_state,
222
224
  select_runtime_state,
225
+ team_state_key,
223
226
  write_spec,
224
227
  write_team_state,
225
228
  )
@@ -437,12 +440,10 @@ for _name in (
437
440
  assert hasattr(_launch_pkg, _name), f"team_agent.launch missing {_name}"
438
441
  del _launch_pkg, _name
439
442
 
440
- # Leader lane re-exports keep runtime.attach_leader, runtime.start_leader,
441
- # runtime.leader_start_plan, runtime._attach_leader_to_state,
442
- # runtime._leader_session_name resolving for CLI handlers and tests.
443
+ # Leader lane re-exports keep runtime leader helpers resolving for CLI handlers and tests.
443
444
  import team_agent.leader as _leader_pkg
444
445
  assert attach_leader is _leader_pkg.attach_leader
445
- for _name in ("attach_leader", "attach_leader_to_state", "leader_session_name", "leader_start_plan", "start_leader"):
446
+ for _name in ("attach_leader", "attach_leader_to_state", "claim_leader", "leader_identity", "leader_session_name", "leader_start_plan", "start_leader"):
446
447
  assert hasattr(_leader_pkg, _name), f"team_agent.leader missing {_name}"
447
448
  del _leader_pkg, _name
448
449
  from team_agent.task_graph import ready_tasks, update_task_status
@@ -578,20 +579,27 @@ def remove_agent(
578
579
  return lifecycle_remove_agent(workspace, agent_id, from_spec=from_spec, confirm=confirm, force=force, team=team)
579
580
 
580
581
 
581
- def acknowledge_idle(workspace: Path, agent_id: str) -> dict[str, Any]:
582
+ def acknowledge_idle(workspace: Path, agent_id: str | None = None, *, team: str | None = None) -> dict[str, Any]:
582
583
  with _runtime_lock(workspace, "acknowledge-idle"):
583
- state = load_runtime_state(workspace)
584
+ try:
585
+ state = select_runtime_state(workspace, team)
586
+ except Exception as exc:
587
+ return {"ok": False, "status": "refused", "reason": "team_target_unresolved", "team": team, "error": str(exc)}
584
588
  gate = check_team_owner(state)
585
589
  if gate:
586
590
  return gate
587
- now = datetime.now(timezone.utc).isoformat()
588
- coordinator = state.setdefault("coordinator", {})
589
- ack = coordinator.setdefault("idle_acknowledged", {})
590
- ack[agent_id] = {"acknowledged_at": now}
591
- save_runtime_state(workspace, state)
592
- EventLog(workspace).write("coordinator.idle_acknowledged", agent_id=agent_id, acknowledged_at=now)
593
- return {"ok": True, "agent_id": agent_id, "acknowledged_at": now}
594
-
591
+ now_dt = datetime.now(timezone.utc); now = now_dt.isoformat()
592
+ ttl_seconds = 1800
593
+ expires_at = (now_dt + timedelta(seconds=ttl_seconds)).isoformat()
594
+ owner_team_id = team_state_key(state); coordinator = state.setdefault("coordinator", {})
595
+ coordinator.setdefault("idle_acknowledged", {})[owner_team_id] = {"acknowledged_at": now, "expires_at": expires_at, "ttl_seconds": ttl_seconds}
596
+ team_suppressions = coordinator.setdefault("suppressed_idle_alerts", {}).setdefault(owner_team_id, {})
597
+ entry = {"suppressed_at": now, "suppressed_by": "manual_acknowledge", "manual_acknowledge": True, "expires_at": expires_at, "ttl_seconds": ttl_seconds}
598
+ for worker_id in state.get("agents", {}):
599
+ team_suppressions.setdefault(worker_id, {})["idle_fallback"] = dict(entry)
600
+ save_team_scoped_state(workspace, state)
601
+ EventLog(workspace).write("coordinator.idle_acknowledged", agent_id=agent_id, team=owner_team_id, acknowledged_at=now, expires_at=expires_at, ttl_seconds=ttl_seconds)
602
+ return {"ok": True, "team": owner_team_id, "agent_id": agent_id, "acknowledged_at": now, "expires_at": expires_at, "ttl_seconds": ttl_seconds}
595
603
 
596
604
  def takeover(workspace: Path, team: str | None = None, confirm: bool = False) -> dict[str, Any]:
597
605
  if not confirm:
@@ -1,6 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import json
4
+ import platform
4
5
  import re
5
6
  import shutil
6
7
  import subprocess
@@ -10,6 +11,18 @@ from typing import Any
10
11
  from team_agent.paths import repo_root
11
12
 
12
13
 
14
# Environment variable names copied from a pane's process environment into
# the target's "leader_env" mapping.
_LEADER_ENV_KEYS = (
    "TEAM_AGENT_LEADER_SESSION_UUID",
    "TEAM_AGENT_LEADER_PANE_ID",
    "TEAM_AGENT_LEADER_PROVIDER",
    "TEAM_AGENT_MACHINE_FINGERPRINT",
    "TEAM_AGENT_LEADER_SESSION_UUID_OVERRIDE",
)

# Executable basenames whose environment is inspected when the pane's own
# process does not expose a leader session UUID.
_LEADER_SHAPED_COMMANDS = {
    "codex",
    "claude",
    "claude.exe",
    "node",
    "nodejs",
}

# Upper bound, in seconds, for each `ps` invocation used by the env scan.
_PANE_ENV_SCAN_TIMEOUT_SECONDS = 2.0

# Indirection so tests can swap the subprocess runner.
_run_subprocess = subprocess.run
24
+
25
+
13
26
  def core_binary() -> Path | None:
14
27
  configured = shutil.which("team-agent-core")
15
28
  if configured:
@@ -105,13 +118,13 @@ def list_targets() -> dict[str, Any]:
105
118
  result = call_core("list-targets")
106
119
  if result.get("ok"):
107
120
  return result
108
- proc = subprocess.run(
121
+ proc = _run_subprocess(
109
122
  [
110
123
  "tmux",
111
124
  "list-panes",
112
125
  "-a",
113
126
  "-F",
114
- "#{pane_id}\t#{session_name}\t#{window_index}\t#{window_name}\t#{pane_index}\t#{pane_tty}\t#{pane_current_command}\t#{pane_active}",
127
+ "#{pane_id}\t#{session_name}\t#{window_index}\t#{window_name}\t#{pane_index}\t#{pane_tty}\t#{pane_current_command}\t#{pane_active}\t#{pane_pid}",
115
128
  ],
116
129
  text=True,
117
130
  capture_output=True,
@@ -123,7 +136,7 @@ def list_targets() -> dict[str, Any]:
123
136
  targets = []
124
137
  for line in proc.stdout.splitlines():
125
138
  parts = line.split("\t")
126
- if len(parts) != 8:
139
+ if len(parts) not in {8, 9}:
127
140
  continue
128
141
  target = {
129
142
  "pane_id": parts[0],
@@ -135,11 +148,152 @@ def list_targets() -> dict[str, Any]:
135
148
  "pane_current_command": parts[6],
136
149
  "pane_active": parts[7] == "1",
137
150
  }
151
+ pane_pid = parts[8].strip() if len(parts) == 9 else ""
152
+ if pane_pid:
153
+ target["pane_pid"] = pane_pid
138
154
  target["fingerprint"] = f"{target['session_name']}|{target['window_index']}|{target['pane_index']}|{target['pane_tty']}"
155
+ _attach_leader_env(target)
139
156
  targets.append(target)
140
157
  return {"ok": True, "targets": targets, "engine": "python_fallback", "fallback_reason": result.get("error")}
141
158
 
142
159
 
160
def _attach_leader_env(target: dict[str, Any]) -> None:
    """Annotate *target* in place with leader identity read from process env.

    ``target["leader_env"]`` is set to ``None`` when the target has no
    ``pane_pid`` or the pane process environment cannot be read. Otherwise it
    becomes a dict holding whichever ``_LEADER_ENV_KEYS`` were found. When a
    non-empty session UUID is found it is also stored as
    ``target["leader_session_uuid"]``.
    """
    uuid_key = "TEAM_AGENT_LEADER_SESSION_UUID"

    pane_pid = str(target.get("pane_pid") or "").strip()
    pane_env = _read_process_env(pane_pid) if pane_pid else None
    if pane_env is None:
        target["leader_env"] = None
        return

    found = {name: pane_env[name] for name in _LEADER_ENV_KEYS if name in pane_env}

    if uuid_key not in found:
        # The pane's own process lacks the UUID: look at leader-shaped
        # descendants. Values already collected are never overwritten, and the
        # walk stops as soon as the UUID is known.
        for child_pid in _walk_leader_shaped_children(pane_pid):
            child_env = _read_process_env(child_pid)
            if child_env is None:
                continue
            for name in _LEADER_ENV_KEYS:
                if name in child_env:
                    found.setdefault(name, child_env[name])
            if uuid_key in found:
                break

    target["leader_env"] = found
    session_uuid = found.get(uuid_key)
    if session_uuid:
        target["leader_session_uuid"] = session_uuid
184
+
185
+
186
def _read_process_env(pid: str) -> dict[str, str] | None:
    """Read the environment of process *pid* with the platform's reader.

    Linux uses ``_read_proc_environ``; every other platform uses
    ``_read_ps_eww_env``. The chosen reader's result is returned unchanged.
    """
    reader = _read_proc_environ if platform.system() == "Linux" else _read_ps_eww_env
    return reader(pid)
190
+
191
+
192
def _read_proc_environ(pid: str) -> dict[str, str] | None:
    """Parse ``/proc/<pid>/environ`` into a mapping.

    Args:
        pid: Process id used to build the ``/proc`` path.

    Returns:
        A dict of environment variables, or ``None`` when the file cannot be
        read (missing process, insufficient permission, any other OS error).
        Entries without ``=`` are skipped; for a repeated key the last
        occurrence wins.
    """
    try:
        raw = Path(f"/proc/{pid}/environ").read_bytes()
    except OSError:
        # FileNotFoundError and PermissionError are OSError subclasses.
        return None

    # Entries are NUL-separated. errors="replace" never raises, so undecodable
    # bytes become U+FFFD rather than aborting the whole read.
    env: dict[str, str] = {}
    for entry in raw.decode("utf-8", errors="replace").split("\x00"):
        key, sep, value = entry.partition("=")
        if sep:
            env[key] = value
    return env
209
+
210
+
211
def _read_ps_eww_env(pid: str) -> dict[str, str] | None:
    """Read the environment of *pid* by running ``ps -E -ww -p <pid>``.

    Returns ``None`` when ``ps`` cannot be run, times out, exits non-zero or
    prints nothing. Otherwise returns the result of ``_parse_ps_eww_output``
    applied to its stdout.
    """
    argv = ["ps", "-E", "-ww", "-p", str(pid)]
    try:
        completed = _run_subprocess(
            argv,
            text=True,
            capture_output=True,
            timeout=_PANE_ENV_SCAN_TIMEOUT_SECONDS,
            check=False,
        )
    except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
        return None
    if completed.returncode == 0 and completed.stdout:
        return _parse_ps_eww_output(completed.stdout, pid)
    return None
225
+
226
+
227
def _parse_ps_eww_output(text: str, pid: str) -> dict[str, str]:
    """Extract ``KEY=value`` pairs from the ``ps`` row belonging to *pid*.

    The first line of *text* is treated as the header and ignored. Among the
    remaining lines, the first one whose leading field equals *pid* is used.
    It is split on whitespace, and each token of the form ``NAME=value`` is
    kept when ``NAME`` starts with a letter or underscore and contains only
    alphanumerics and underscores. Because tokens are whitespace-delimited, a
    value containing spaces is cut at the first space.

    Returns an empty dict when no row matches *pid*.
    """
    wanted = str(pid)
    candidates = (line.lstrip() for line in text.splitlines()[1:])
    row = next((cand for cand in candidates if cand.split(" ", 1)[0] == wanted), None)
    if row is None:
        # Spark MEDIUM #2 (da436a3): never fall back to the first data row. It
        # may belong to an unrelated process and would leak its env (incl.
        # another team's TEAM_AGENT_LEADER_SESSION_UUID) into this pane's
        # leader_env, corrupting rediscovery.
        return {}

    parsed: dict[str, str] = {}
    for token in row.split():
        name, sep, value = token.partition("=")
        if not sep or not name:
            continue
        head = name[0]
        if not (head.isalpha() or head == "_"):
            continue
        if any(not (ch.isalnum() or ch == "_") for ch in name):
            continue
        parsed[name] = value
    return parsed
255
+
256
+
257
def _walk_leader_shaped_children(parent_pid: str) -> list[str]:
    """List descendant pids of *parent_pid* that run a leader-shaped command.

    Takes a ``ps`` snapshot of pid / parent pid / command and passes it to
    ``_select_leader_shaped_descendants``.

    Returns:
        The selected pids, or an empty list when ``ps`` cannot be run, times
        out, exits non-zero or prints nothing.
    """
    try:
        proc = _run_subprocess(
            # -A selects every process. Without a selection flag, POSIX ps
            # only reports processes sharing the invoker's controlling
            # terminal, so children running on another pane's tty would be
            # missing from the snapshot.
            ["ps", "-A", "-o", "pid=,ppid=,comm="],
            text=True,
            capture_output=True,
            timeout=_PANE_ENV_SCAN_TIMEOUT_SECONDS,
            check=False,
        )
    except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
        return []
    if proc.returncode != 0 or not proc.stdout:
        return []
    return _select_leader_shaped_descendants(proc.stdout, parent_pid)
271
+
272
+
273
def _select_leader_shaped_descendants(ps_output: str, parent_pid: str) -> list[str]:
    """Pick descendants of *parent_pid* whose command is leader-shaped.

    *ps_output* is expected to hold whitespace-separated ``pid ppid command``
    rows; rows with fewer than three fields are ignored. A command is reduced
    to its final path component before being compared with
    ``_LEADER_SHAPED_COMMANDS``. Matching pids are returned in the order the
    rows appear in *ps_output*.
    """
    table: list[tuple[str, str]] = []  # (pid, command basename), in ps order
    children_of: dict[str, list[str]] = {}
    for raw in ps_output.splitlines():
        fields = raw.split()
        if len(fields) < 3:
            continue
        pid, ppid = fields[0], fields[1]
        table.append((pid, Path(" ".join(fields[2:])).name))
        children_of.setdefault(ppid, []).append(pid)

    # Collect every pid reachable from parent_pid through parent->child links.
    reachable: set[str] = set()
    pending = [str(parent_pid)]
    while pending:
        for child in children_of.get(pending.pop(), ()):
            if child not in reachable:
                reachable.add(child)
                pending.append(child)

    return [
        pid
        for pid, name in table
        if pid in reachable and name in _LEADER_SHAPED_COMMANDS
    ]
295
+
296
+
143
297
  def contains_inline_secret(value: str) -> bool:
144
298
  return (
145
299
  _contains_secret_assignment(value)