@team-agent/installer 0.2.2 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/package.json +1 -1
  2. package/schemas/team.schema.json +6 -0
  3. package/src/team_agent/abnormal_track.py +253 -0
  4. package/src/team_agent/approvals/runtime_prompts.py +1 -1
  5. package/src/team_agent/cli/commands.py +104 -3
  6. package/src/team_agent/cli/parser.py +10 -1
  7. package/src/team_agent/compiler.py +1 -1
  8. package/src/team_agent/coordinator/lifecycle.py +23 -2
  9. package/src/team_agent/diagnose/orphan_cleanup.py +199 -28
  10. package/src/team_agent/display/__init__.py +31 -0
  11. package/src/team_agent/display/adaptive.py +425 -0
  12. package/src/team_agent/display/backend.py +46 -0
  13. package/src/team_agent/display/close.py +6 -0
  14. package/src/team_agent/display/rebuild.py +102 -0
  15. package/src/team_agent/display/tiling.py +156 -0
  16. package/src/team_agent/display/worker_window.py +4 -0
  17. package/src/team_agent/display/workspace.py +36 -127
  18. package/src/team_agent/idle_predicate.py +200 -0
  19. package/src/team_agent/idle_takeover.py +59 -0
  20. package/src/team_agent/idle_takeover_wiring.py +111 -0
  21. package/src/team_agent/launch/core.py +14 -4
  22. package/src/team_agent/leader/__init__.py +444 -61
  23. package/src/team_agent/lifecycle/operations.py +1 -0
  24. package/src/team_agent/lifecycle/start.py +1 -1
  25. package/src/team_agent/message_store/core.py +38 -11
  26. package/src/team_agent/message_store/leader_notification_log.py +47 -26
  27. package/src/team_agent/message_store/schema.py +8 -2
  28. package/src/team_agent/messaging/delivery.py +336 -1
  29. package/src/team_agent/messaging/leader.py +13 -4
  30. package/src/team_agent/messaging/leader_api_errors.py +216 -0
  31. package/src/team_agent/messaging/leader_panes.py +294 -0
  32. package/src/team_agent/messaging/scheduler.py +12 -0
  33. package/src/team_agent/messaging/send.py +54 -26
  34. package/src/team_agent/messaging/tmux_io.py +202 -33
  35. package/src/team_agent/messaging/tmux_prompt.py +87 -0
  36. package/src/team_agent/messaging/trust_auto_answer.py +52 -0
  37. package/src/team_agent/provider_state/README.md +78 -0
  38. package/src/team_agent/provider_state/__init__.py +86 -0
  39. package/src/team_agent/provider_state/claude.py +86 -0
  40. package/src/team_agent/provider_state/codex.py +84 -0
  41. package/src/team_agent/provider_state/common.py +207 -0
  42. package/src/team_agent/provider_state/registry.py +118 -0
  43. package/src/team_agent/restart/orchestration.py +215 -12
  44. package/src/team_agent/runtime.py +65 -15
  45. package/src/team_agent/sessions/capture.py +65 -15
  46. package/src/team_agent/spec.py +63 -3
  47. package/src/team_agent/status/queries.py +32 -1
  48. package/src/team_agent/wake.py +58 -0
  49. package/src/team_agent/watch/__init__.py +145 -0
@@ -8,18 +8,19 @@ from typing import Any
8
8
  from team_agent.events import EventLog
9
9
  from team_agent.message_store import MessageStore
10
10
  from team_agent.permissions import resolve_permissions
11
+ from team_agent.display.backend import display_backend_has_worker_views, display_backend_opens_before_leader_rebind, resolve_restart_display_backend
12
+ from team_agent.display.close import close_team_display_backends
13
+ from team_agent.display.rebuild import rebuild_restart_display_after_rebind
11
14
  from team_agent.restart.selection import select_restart_state
12
15
  from team_agent.restart.snapshot import save_team_runtime_snapshot
13
16
  from team_agent.spec import load_spec
14
17
  from team_agent.state import (
15
18
  check_team_owner,
16
- load_runtime_state,
17
19
  populate_team_owner_from_env,
18
20
  save_runtime_state,
19
21
  write_team_state,
20
22
  )
21
23
 
22
-
23
24
  def restart(workspace: Path, allow_fresh: bool = False, team: str | None = None) -> dict[str, Any]:
24
25
  # Lazy-import everything from team_agent.runtime so existing tests that
25
26
  # patch runtime.shell_resume_command_for_agent / runtime.run_cmd /
@@ -27,7 +28,6 @@ def restart(workspace: Path, allow_fresh: bool = False, team: str | None = None)
27
28
  # at call time. Runtime re-exports the provider helpers, so this also
28
29
  # routes through the providers module without binding it directly.
29
30
  from team_agent.runtime import (
30
- GHOSTTY_DISPLAY_BACKENDS,
31
31
  ResumeUnavailable,
32
32
  RuntimeError,
33
33
  _attach_profile_resume_root,
@@ -35,7 +35,6 @@ def restart(workspace: Path, allow_fresh: bool = False, team: str | None = None)
35
35
  _capture_agent_session,
36
36
  _clear_session_capture_fields,
37
37
  _close_ghostty_display,
38
- _close_ghostty_workspace,
39
38
  _compile_team_dir_spec,
40
39
  _effective_runtime_config,
41
40
  _ensure_agent_start_requirements,
@@ -83,16 +82,73 @@ def restart(workspace: Path, allow_fresh: bool = False, team: str | None = None)
83
82
  )
84
83
  raise RuntimeError(_tmux_session_conflict_error(session_name))
85
84
  runtime_cfg = _effective_runtime_config(spec.get("runtime", {}))
86
- display_backend = spec.get("runtime", {}).get("display_backend", state.get("display_backend", "none"))
87
- _close_ghostty_workspace(state, event_log)
88
- for agent_id, agent_state in state.get("agents", {}).items():
89
- _close_ghostty_display(agent_id, agent_state, event_log)
90
- state["display_backend"] = display_backend
85
+ display_backend = resolve_restart_display_backend(spec, state, event_log)
86
+ # Stage 7 S5 — Slice 6 lifecycle atomicity contract: compute restart_agents
87
+ # early so we can pre-validate resumability BEFORE any destructive teardown
88
+ # (ghostty close, tmux session creation). Without --allow-fresh, every
89
+ # non-paused worker MUST be resumable; if any is not, refuse the operation
90
+ # atomically with a structured result and a restart.atomic_refusal event.
91
+ # No rollback path is needed because nothing has been created yet.
91
92
  restart_agents = [
92
93
  agent
93
94
  for agent in spec.get("agents", [])
94
95
  if state.get("agents", {}).get(agent["id"], {}).get("status") != "paused" and not agent.get("paused")
95
96
  ]
97
+ # cr strict-typing (2026-05-27): refuse the operation deterministically
98
+ # before any decision logic if any persisted first_send_at is corrupt
99
+ # (empty string, 0, False, literal "null", any non-ISO garbage). This
100
+ # avoids silent misclassification through Python truthiness and gives the
101
+ # operator a clear audit signal that state.json is damaged.
102
+ invalid_first_send_at = _collect_corrupt_first_send_at(restart_agents, state)
103
+ if invalid_first_send_at:
104
+ for entry in invalid_first_send_at:
105
+ event_log.write(
106
+ "restart.first_send_at_invalid",
107
+ worker_id=entry["worker_id"],
108
+ raw_first_send_at=entry["raw_first_send_at"],
109
+ raw_first_send_at_type=entry["raw_first_send_at_type"],
110
+ )
111
+ invalid_names = [entry["worker_id"] for entry in invalid_first_send_at]
112
+ return {
113
+ "ok": False,
114
+ "status": "refused",
115
+ "reason": "invalid_first_send_at",
116
+ "invalid_first_send_at": invalid_first_send_at,
117
+ "allow_fresh": bool(allow_fresh),
118
+ "error": (
119
+ f"Cannot restart: workers {invalid_names} have a corrupt "
120
+ "first_send_at in state.json (only null/missing or a valid "
121
+ "ISO-8601 UTC timestamp string is accepted). Inspect the "
122
+ "restart.first_send_at_invalid audit events for raw values "
123
+ "and repair state.json before retrying."
124
+ ),
125
+ }
126
+ # cr C2: emit one restart.resume_decision event per non-paused worker so
127
+ # every restart attempt produces an auditable per-worker classification.
128
+ # The function returns only refused workers — populated when
129
+ # allow_fresh=False AND at least one interacted worker cannot be repaired.
130
+ refused = _emit_resume_decisions(
131
+ workspace, restart_agents, state, get_adapter, event_log, allow_fresh,
132
+ )
133
+ if refused:
134
+ event_log.write(
135
+ "restart.atomic_refusal",
136
+ unresumable=refused,
137
+ allow_fresh=bool(allow_fresh),
138
+ reason="resume_atomicity",
139
+ )
140
+ return {
141
+ "ok": False,
142
+ "status": "refused",
143
+ "reason": "resume_atomicity",
144
+ "unresumable": refused,
145
+ "allow_fresh": bool(allow_fresh),
146
+ "error": _format_atomic_refusal_error(refused),
147
+ }
148
+ close_team_display_backends(state, event_log)
149
+ for agent_id, agent_state in state.get("agents", {}).items():
150
+ _close_ghostty_display(agent_id, agent_state, event_log)
151
+ state["display_backend"] = display_backend
96
152
  _ensure_agent_start_requirements(workspace, restart_agents, event_log, "restart")
97
153
  first = True
98
154
  restarted: list[dict[str, Any]] = []
@@ -271,8 +327,9 @@ def restart(workspace: Path, allow_fresh: bool = False, team: str | None = None)
271
327
  event_log,
272
328
  timeout_s=1.5,
273
329
  exclude_session_ids=known_session_ids,
330
+ raise_on_missed=False,
274
331
  )
275
- if display_backend in GHOSTTY_DISPLAY_BACKENDS:
332
+ if display_backend_has_worker_views(display_backend):
276
333
  display_jobs.append((agent["id"], agent))
277
334
  new_agents[agent["id"]] = agent_state
278
335
  restarted.append(
@@ -283,7 +340,7 @@ def restart(workspace: Path, allow_fresh: bool = False, team: str | None = None)
283
340
  "display_target": None,
284
341
  }
285
342
  )
286
- display_results = _open_worker_displays(workspace, session_name, display_jobs, event_log, display_backend)
343
+ display_results = _open_worker_displays(workspace, session_name, display_jobs, event_log, display_backend) if display_backend_opens_before_leader_rebind(display_backend) else {}
287
344
  for agent_id, display in display_results.items():
288
345
  if agent_id in new_agents:
289
346
  new_agents[agent_id]["display"] = display
@@ -309,12 +366,158 @@ def restart(workspace: Path, allow_fresh: bool = False, team: str | None = None)
309
366
  write_team_state(workspace, spec, state)
310
367
  from team_agent.leader import autobind_leader_receiver_from_env
311
368
  leader_provider = str(spec.get("leader", {}).get("provider") or "codex")
312
- autobind_leader_receiver_from_env(workspace, leader_provider, source="restart")
369
+ rebound_receiver = autobind_leader_receiver_from_env(workspace, leader_provider, source="restart")
370
+ rebuild_restart_display_after_rebind(display_backend, workspace, session_name, spec, event_log, restarted, receiver=rebound_receiver)
313
371
  coordinator = start_coordinator(workspace)
314
372
  event_log.write("restart.complete", session=session_name, agents=restarted, coordinator=coordinator)
315
373
  return {"ok": True, "session_name": session_name, "agents": restarted, "coordinator": coordinator}
316
374
 
317
375
 
376
# Classification labels returned by _classify_first_send_at.
_FIRST_SEND_AT_ABSENT = "absent"
_FIRST_SEND_AT_VALID = "valid"
_FIRST_SEND_AT_CORRUPT = "corrupt"


def _classify_first_send_at(value: Any) -> str:
    """Strictly classify a persisted ``first_send_at`` value (cr verdict, 2026-05-27).

    Returns one of:
        "absent"  — ``value`` is None (field null or missing; the worker was
                    never interacted with).
        "valid"   — a non-empty string that ``datetime.fromisoformat`` accepts,
                    either verbatim or after rewriting a trailing ``Z`` UTC
                    designator as ``+00:00``.
        "corrupt" — anything else: non-string values (0, False, ...), the
                    empty string, the literal "null", or unparseable garbage.

    The contract requires corrupt values to be detected deterministically
    before any restart decision, so a worker's interaction state is never
    silently misclassified through Python truthiness.

    NOTE: only parseability is checked; this function does not verify that
    the timestamp actually carries a UTC offset.
    """
    if value is None:
        return _FIRST_SEND_AT_ABSENT
    if not isinstance(value, str) or not value:
        return _FIRST_SEND_AT_CORRUPT
    # Try the raw value first so everything that parsed before still parses.
    candidates = [value]
    if value.endswith("Z"):
        # datetime.fromisoformat only understands the "Z" designator from
        # Python 3.11 onwards. Retrying with an explicit offset keeps the
        # classification identical across interpreter versions.
        candidates.append(value[:-1] + "+00:00")
    for candidate in candidates:
        try:
            datetime.fromisoformat(candidate)
        except (ValueError, TypeError):
            continue
        return _FIRST_SEND_AT_VALID
    return _FIRST_SEND_AT_CORRUPT
404
+
405
+
406
def _collect_corrupt_first_send_at(
    restart_agents: list[dict[str, Any]],
    state: dict[str, Any],
) -> list[dict[str, Any]]:
    """Return an audit record for every restart candidate whose persisted
    ``first_send_at`` is classified as corrupt.

    Args:
        restart_agents: agent spec dicts being considered for restart; each
            must carry an ``"id"`` key.
        state: runtime state; per-agent entries are read from ``state["agents"]``.

    Returns:
        A list of ``{"worker_id", "raw_first_send_at", "raw_first_send_at_type"}``
        dicts, in the order of ``restart_agents``. The caller uses them for the
        ``restart.first_send_at_invalid`` event and the refusal envelope. The
        list is empty when nothing is corrupt.
    """
    corrupt_records: list[dict[str, Any]] = []
    for candidate in restart_agents:
        worker_id = candidate["id"]
        persisted = state.get("agents", {}).get(worker_id, {})
        # A non-dict entry has no readable field, so it is treated like a
        # missing first_send_at rather than a corrupt one.
        raw_value = persisted.get("first_send_at") if isinstance(persisted, dict) else None
        if _classify_first_send_at(raw_value) == _FIRST_SEND_AT_CORRUPT:
            corrupt_records.append(
                {
                    "worker_id": worker_id,
                    "raw_first_send_at": raw_value,
                    "raw_first_send_at_type": type(raw_value).__name__,
                }
            )
    return corrupt_records
426
+
427
+
428
def _emit_resume_decisions(
    workspace: Path,
    restart_agents: list[dict[str, Any]],
    state: dict[str, Any],
    get_adapter_fn: Any,
    event_log: EventLog,
    allow_fresh: bool,
) -> list[dict[str, Any]]:
    """Emit one ``restart.resume_decision`` audit event per restart candidate
    and return the workers that must be refused (Route B, cr C2, 2026-05-27).

    Decision matrix applied to each worker:

        resumable                                   -> "resume"
        not resumable, no valid first_send_at       -> "fresh_start"
        not resumable, interacted, allow_fresh      -> "fresh_start"
        not resumable, interacted, not allow_fresh  -> "refuse"

    A worker is resumable when it has a persisted session_id that the provider
    adapter reports as resumable. Failing that, it is resumable when a session
    can be recovered: first via ``recover_resume_session_from_events``, then
    via ``adapter.recover_session_id``. Per the original author's note this
    mirrors the repair chain in ``sessions.resume.prepare_resume_state``, so
    workers the runtime would repair are not flagged.

    Args:
        workspace: workspace path passed through to the adapter/recovery helpers.
        restart_agents: agent spec dicts; each must carry "id" and "provider".
        state: runtime state; per-agent entries are read from ``state["agents"]``.
        get_adapter_fn: callable that maps a provider name to its adapter.
        event_log: sink for the ``restart.resume_decision`` events.
        allow_fresh: whether losing interaction history is acceptable.

    Returns:
        One ``{"agent_id", "reason", "session_id", "first_send_at"}`` dict per
        refused worker. The list is empty when nothing is refused.
    """
    from team_agent.sessions.resume import recover_resume_session_from_events

    refusals: list[dict[str, Any]] = []
    for worker in restart_agents:
        worker_id = worker["id"]
        persisted = state.get("agents", {}).get(worker_id, {})
        session_id = persisted.get("session_id")
        first_send_at = persisted.get("first_send_at")
        interacted = _classify_first_send_at(first_send_at) == _FIRST_SEND_AT_VALID
        adapter = get_adapter_fn(worker["provider"])

        can_resume = bool(session_id) and adapter.session_is_resumable(persisted, workspace)
        if not can_resume:
            # Session ids owned by other workers are handed to the recovery
            # helpers alongside this worker's persisted entry.
            other_session_ids = {
                str(entry.get("session_id"))
                for other_id, entry in state.get("agents", {}).items()
                if other_id != worker_id and entry.get("session_id")
            }
            # Event-log recovery is tried first; the adapter is only asked
            # when that yields nothing.
            recovered = recover_resume_session_from_events(
                workspace, worker_id, persisted, adapter, other_session_ids,
            ) or adapter.recover_session_id(
                worker_id, persisted, workspace, other_session_ids,
            )
            can_resume = bool(recovered)

        if can_resume:
            decision = "resume"
        elif interacted and not allow_fresh:
            decision = "refuse"
        else:
            decision = "fresh_start"

        event_log.write(
            "restart.resume_decision",
            worker_id=worker_id,
            has_first_send_at=interacted,
            has_session_id=bool(session_id),
            allow_fresh=bool(allow_fresh),
            decision=decision,
            first_send_at=first_send_at if interacted else None,
            session_id=session_id,
        )

        if decision != "refuse":
            continue
        refusals.append(
            {
                "agent_id": worker_id,
                "reason": "session_unresumable" if session_id else "no_persisted_session_id",
                "session_id": session_id,
                "first_send_at": first_send_at,
            }
        )
    return refusals
501
+
502
+
503
def _format_atomic_refusal_error(refused: list[dict[str, Any]]) -> str:
    """Build the operator-facing message for an atomic restart refusal.

    C4 (cr verdict, 2026-05-27): the message names every refused worker and
    its ``first_send_at`` timestamp, so the operator can judge whether passing
    --allow-fresh (and losing that interaction history) is acceptable.

    Args:
        refused: refusal records; each must carry "agent_id" and may carry
            "first_send_at" (rendered as ``None`` when missing).

    Returns:
        The complete human-readable error string.
    """
    worker_names = []
    sentences = []
    for entry in refused:
        worker_names.append(entry["agent_id"])
        sentences.append(
            f"{entry['agent_id']} was first interacted with at {entry.get('first_send_at')}; "
            "its persisted session is missing"
        )
    details = ". ".join(sentences)
    headline = (
        f"Cannot restart: workers {worker_names} have no resumable session despite "
        "previous interaction. "
    )
    advice = "Pass --allow-fresh if you accept losing that interaction history."
    return headline + f"{details}. " + advice
519
+
520
+
318
521
  def rollback_restart_session(session_name: str, event_log: EventLog) -> dict[str, Any]:
319
522
  from team_agent.runtime import run_cmd
320
523
  proc = run_cmd(["tmux", "kill-session", "-t", session_name], timeout=10)
@@ -39,10 +39,12 @@ from team_agent.providers import (
39
39
  shell_resume_command_for_agent,
40
40
  )
41
41
  from team_agent.display import (
42
+ GHOSTTY_DISPLAY_BACKENDS,
42
43
  GHOSTTY_WORKSPACE_PANES_PER_WINDOW,
43
44
  close_ghostty_display as _close_ghostty_display,
44
45
  close_ghostty_workspace as _close_ghostty_workspace,
45
46
  close_ghostty_workspace_slot as _close_ghostty_workspace_slot,
47
+ close_team_display_backends as _close_team_display_backends,
46
48
  ghostty_app_exists as _ghostty_app_exists,
47
49
  ghostty_attach_args as _ghostty_attach_args,
48
50
  ghostty_command as _ghostty_command,
@@ -65,6 +67,7 @@ from team_agent.display import (
65
67
  set_ghostty_workspace_pane_title as _set_ghostty_workspace_pane_title,
66
68
  )
67
69
  from team_agent.leader import (
70
+ LEADER_OWNERSHIP_LOCK,
68
71
  attach_leader,
69
72
  attach_leader_to_state as _attach_leader_to_state,
70
73
  claim_leader,
@@ -456,7 +459,6 @@ TMUX_PANE_FORMAT = (
456
459
  "#{pane_current_path}\t#{session_attached}\t#{pane_in_mode}"
457
460
  )
458
461
  HEALTH_STATUSES = {"RUNNING", "IDLE", "AWAITING_APPROVAL", "BLOCKED", "ERROR", "DONE"}
459
- GHOSTTY_DISPLAY_BACKENDS = {"ghostty", "ghostty_window", "ghostty_workspace"}
460
462
  DELIVERY_CAPTURE_LINES = 40
461
463
  SUBMITTED_DELIVERY_STATUSES = {"injected", "visible", "submitted", "submitted_unverified", "delivered", "acknowledged"}
462
464
  TMUX_STDIN_BUFFER_THRESHOLD = 16 * 1024
@@ -480,7 +482,6 @@ def ensure_workspace_dirs(workspace: Path) -> None:
480
482
  path.mkdir(parents=True, exist_ok=True)
481
483
 
482
484
 
483
-
484
485
  def shutdown(workspace: Path, keep_logs: bool = True, team: str | None = None) -> dict[str, Any]:
485
486
  from team_agent.state import resolve_team_scoped_state
486
487
  state, refusal = resolve_team_scoped_state(workspace, team)
@@ -521,7 +522,7 @@ def shutdown(workspace: Path, keep_logs: bool = True, team: str | None = None) -
521
522
  if proc.returncode == 0:
522
523
  log_path.write_text(proc.stdout, encoding="utf-8")
523
524
  captured.append(str(log_path))
524
- _close_ghostty_workspace(state, event_log)
525
+ _close_team_display_backends(state, event_log)
525
526
  for agent_id, agent_state in state.get("agents", {}).items():
526
527
  _close_ghostty_display(agent_id, agent_state, event_log)
527
528
  closed_displays.add(agent_id)
@@ -535,7 +536,7 @@ def shutdown(workspace: Path, keep_logs: bool = True, team: str | None = None) -
535
536
  event_log.write("shutdown.kill_session", session=session_name, keep_logs=keep_logs, captured=captured)
536
537
  else:
537
538
  event_log.write("shutdown.idempotent", session=session_name, reason="session missing")
538
- _close_ghostty_workspace(state, event_log)
539
+ _close_team_display_backends(state, event_log)
539
540
  for agent_id, agent_state in state.get("agents", {}).items():
540
541
  if agent_id not in closed_displays:
541
542
  _close_ghostty_display(agent_id, agent_state, event_log)
@@ -617,7 +618,7 @@ def takeover(workspace: Path, team: str | None = None, confirm: bool = False) ->
617
618
  "reason": "no_caller_identity",
618
619
  "action": "set TEAM_AGENT_LEADER_PANE_ID/PROVIDER/MACHINE_FINGERPRINT or run from a tmux pane",
619
620
  }
620
- with _runtime_lock(workspace, "send"):
621
+ with _runtime_lock(workspace, LEADER_OWNERSHIP_LOCK):
621
622
  try:
622
623
  team_state = select_runtime_state(workspace, team)
623
624
  except RuntimeError as exc:
@@ -628,23 +629,72 @@ def takeover(workspace: Path, team: str | None = None, confirm: bool = False) ->
628
629
  "team": team,
629
630
  "error": str(exc),
630
631
  }
631
- previous_owner = team_state.get("team_owner")
632
+ previous_owner = team_state.get("team_owner") if isinstance(team_state.get("team_owner"), dict) else {}
633
+ previous_receiver = team_state.get("leader_receiver") if isinstance(team_state.get("leader_receiver"), dict) else {}
634
+ from team_agent.leader import _lease_epoch, _receiver_from_claim_target
635
+ next_epoch = _lease_epoch(previous_owner, previous_receiver) + 1
636
+ leader_uuid = str(previous_owner.get("leader_session_uuid") or "")
632
637
  new_owner = {
633
638
  "pane_id": pane_id,
634
639
  "provider": os.environ.get("TEAM_AGENT_LEADER_PROVIDER", ""),
635
640
  "machine_fingerprint": os.environ.get("TEAM_AGENT_MACHINE_FINGERPRINT", ""),
641
+ "owner_epoch": next_epoch,
636
642
  "claimed_at": datetime.now(timezone.utc).isoformat(),
637
643
  "claimed_via": "takeover",
638
644
  }
645
+ if leader_uuid:
646
+ new_owner["leader_session_uuid"] = leader_uuid
639
647
  team_state["team_owner"] = new_owner
640
- save_team_scoped_state(workspace, team_state)
641
- EventLog(workspace).write(
642
- "team_owner.takeover",
643
- team=team,
644
- previous_owner=previous_owner,
648
+ # C11/C17: takeover converges on the same lease mutation as claim-leader.
649
+ # Rebind the leader receiver to the caller pane and write owner + receiver
650
+ # to both state locations together, so takeover never leaves the receiver
651
+ # pointing at the old (often dead) pane.
652
+ targets_result = core_list_targets()
653
+ targets = targets_result.get("targets", []) if isinstance(targets_result, dict) and targets_result.get("ok") else []
654
+ caller_target = next((item for item in targets if isinstance(item, dict) and str(item.get("pane_id")) == str(pane_id)), None)
655
+ new_receiver = None
656
+ if caller_target:
657
+ new_receiver = _receiver_from_claim_target(
658
+ caller_target,
659
+ previous_receiver,
660
+ leader_uuid or None,
661
+ next_epoch,
662
+ )
663
+ new_receiver["discovery"] = "takeover"
664
+ team_state["leader_receiver"] = new_receiver
665
+ from team_agent.leader import _write_lease_dual_state
666
+ _write_lease_dual_state(workspace, team_state)
667
+ # C11: takeover converges on the same lease audit events as claim-leader
668
+ # instead of a divergent legacy team_owner.takeover record.
669
+ event_log = EventLog(workspace)
670
+ uuid_prefix = leader_uuid[:8]
671
+ old_pane_id = previous_receiver.get("pane_id") or (previous_owner or {}).get("pane_id")
672
+ if new_receiver is not None:
673
+ event_log.write(
674
+ "leader_receiver.rebind_applied",
675
+ reason="takeover_confirmed",
676
+ old_pane_id=old_pane_id,
677
+ new_pane_id=pane_id,
678
+ owner_epoch=next_epoch,
679
+ uuid_prefix=uuid_prefix,
680
+ team_id=team,
681
+ )
682
+ event_log.write(
683
+ "owner_epoch_advanced",
684
+ reason="takeover_confirmed",
685
+ old_pane_id=old_pane_id,
686
+ new_pane_id=pane_id,
687
+ owner_epoch=next_epoch,
688
+ uuid_prefix=uuid_prefix,
689
+ team_id=team,
690
+ previous_owner=previous_owner or None,
645
691
  new_owner=new_owner,
692
+ receiver_rebound=bool(new_receiver),
646
693
  )
647
- return {"ok": True, "status": "claimed", "team": team, "team_owner": new_owner, "previous_owner": previous_owner}
694
+ response = {"ok": True, "status": "claimed", "team": team, "team_owner": new_owner, "previous_owner": previous_owner or None, "owner_epoch": next_epoch}
695
+ if new_receiver is not None:
696
+ response["leader_receiver"] = new_receiver
697
+ return response
648
698
 
649
699
 
650
700
  def _running_agent_state(workspace: Path, agent: dict[str, Any], previous: dict[str, Any]) -> dict[str, Any]:
@@ -674,7 +724,7 @@ def _handle_startup_prompts_and_verify_window(
674
724
  session_name: str,
675
725
  start_mode: str,
676
726
  ) -> bool:
677
- handled_prompts = adapter.handle_startup_prompts(session_name, agent_id, checks=1, sleep_s=0.0)
727
+ handled_prompts = adapter.handle_startup_prompts(session_name, agent_id, checks=20, sleep_s=0.5)
678
728
  for prompt_event in handled_prompts:
679
729
  event_log.write(f"{event_prefix}.startup_prompt_handled", agent_id=agent_id, provider=provider, **prompt_event)
680
730
  deadline = time.monotonic() + 1.0
@@ -840,10 +890,10 @@ def _retry_or_failed(task: dict[str, Any]) -> str:
840
890
  return "failed"
841
891
 
842
892
 
843
- def _deliver_pending_message(workspace: Path, state: dict[str, Any], message_id: str, wait_visible: bool = True, timeout: float = 30.0) -> dict[str, Any]:
893
+ def _deliver_pending_message(workspace: Path, state: dict[str, Any], message_id: str, wait_visible: bool = True, timeout: float = 30.0, *, _trust_retry_attempt: int = 1) -> dict[str, Any]:
844
894
  from team_agent.messaging.delivery import _deliver_pending_message as impl
845
895
 
846
- return impl(workspace, state, message_id, wait_visible, timeout)
896
+ return impl(workspace, state, message_id, wait_visible, timeout, _trust_retry_attempt=_trust_retry_attempt)
847
897
 
848
898
  def _enable_codex_fast_mode(session_name: str, window_name: str) -> dict[str, Any]:
849
899
  from team_agent.messaging.tmux_prompt import _enable_codex_fast_mode as impl
@@ -1,14 +1,25 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import time
3
4
  from datetime import datetime, timezone
4
5
  from pathlib import Path
5
6
  from typing import Any
6
7
 
8
+ from team_agent.errors import RuntimeError as TeamAgentRuntimeError
7
9
  from team_agent.events import EventLog
8
10
  from team_agent.providers import get_adapter
9
11
  from team_agent.state import SESSION_CAPTURE_FIELDS, SESSION_STATE_FIELDS
10
12
 
11
13
 
14
+ # Stage 7 S6 (2026-05-27): capture_agent_session used to do a single adapter
15
+ # call and silently return None on miss, leaving status='running' workers with
16
+ # session_id=null. Slow worker startups (Codex writing the rollout file a few
17
+ # tenths of a second after window creation) raced this check. We now poll on a
18
+ # small interval inside the caller's timeout_s budget so the adapter's own
19
+ # fast-path call doesn't have to absorb all the latency on its own.
20
+ _CAPTURE_POLL_INTERVAL_SECONDS = 0.05
21
+
22
+
12
23
  def capture_missing_sessions(
13
24
  workspace: Path,
14
25
  state: dict[str, Any],
@@ -25,6 +36,10 @@ def capture_missing_sessions(
25
36
  for aid, item in state.get("agents", {}).items()
26
37
  if aid != agent_id and item.get("session_id")
27
38
  }
39
+ # capture_missing_sessions is invoked from coordinator_tick, diagnose,
40
+ # status, etc. with very short timeouts; a transient miss should NOT
41
+ # crash those paths. The loud raise contract belongs to direct callers
42
+ # (e.g. lifecycle start/restart) who own the worker's atomicity.
28
43
  result = capture_agent_session(
29
44
  workspace,
30
45
  agent_id,
@@ -32,6 +47,7 @@ def capture_missing_sessions(
32
47
  event_log,
33
48
  timeout_s=timeout_s,
34
49
  exclude_session_ids=known_session_ids,
50
+ raise_on_missed=False,
35
51
  )
36
52
  if result:
37
53
  captured.append(agent_id)
@@ -53,6 +69,7 @@ def capture_agent_session(
53
69
  event_log: EventLog,
54
70
  timeout_s: float,
55
71
  exclude_session_ids: set[str] | None = None,
72
+ raise_on_missed: bool = True,
56
73
  ) -> dict[str, Any] | None:
57
74
  if agent_state.get("session_id"):
58
75
  return None
@@ -66,21 +83,54 @@ def capture_agent_session(
66
83
  "exclude_session_ids": sorted(exclude_session_ids or set()),
67
84
  "claude_projects_root": agent_state.get("claude_projects_root"),
68
85
  }
69
- result = adapter.capture_session_id(agent_id, spawn_context, timeout_s=timeout_s)
70
- if not isinstance(result, dict) or not result.get("session_id"):
71
- return None
72
- copy_session_metadata(agent_state, result)
73
- agent_state.pop("_pending_session_id", None)
74
- event_log.write(
75
- "session.captured",
76
- agent_id=agent_id,
77
- provider=agent_state.get("provider"),
78
- session_id=agent_state.get("session_id"),
79
- rollout_path=agent_state.get("rollout_path"),
80
- captured_via=agent_state.get("captured_via"),
81
- attribution_confidence=agent_state.get("attribution_confidence"),
82
- )
83
- return result
86
+ deadline = time.monotonic() + max(timeout_s, 0.0)
87
+ while True:
88
+ # Pass timeout_s=0 so the adapter does a single fast-path check; the
89
+ # outer loop owns the polling budget so behaviour stays consistent
90
+ # whether or not the adapter has its own internal sleep.
91
+ result = adapter.capture_session_id(agent_id, spawn_context, timeout_s=0)
92
+ if isinstance(result, dict) and result.get("session_id"):
93
+ copy_session_metadata(agent_state, result)
94
+ agent_state.pop("_pending_session_id", None)
95
+ event_log.write(
96
+ "session.captured",
97
+ agent_id=agent_id,
98
+ provider=agent_state.get("provider"),
99
+ session_id=agent_state.get("session_id"),
100
+ rollout_path=agent_state.get("rollout_path"),
101
+ captured_via=agent_state.get("captured_via"),
102
+ attribution_confidence=agent_state.get("attribution_confidence"),
103
+ )
104
+ return result
105
+ if time.monotonic() >= deadline:
106
+ break
107
+ time.sleep(_CAPTURE_POLL_INTERVAL_SECONDS)
108
+ # Timeout. Slice 1 atomicity contract: a worker whose status is 'running'
109
+ # must NEVER be left with session_id=null — that half-state is what made
110
+ # Mac mini Stage 7 S5/S6 unreproducible and breaks resume on next restart.
111
+ # Emit a structured attention event so the coordinator/operator sees the
112
+ # miss, then raise so callers cannot accidentally treat the None as a
113
+ # silent "no-op". Non-running workers (still starting, paused, stopped)
114
+ # legitimately have no session yet, so they still get the silent-None
115
+ # return that existing callers expect.
116
+ if agent_state.get("status") == "running":
117
+ event_log.write(
118
+ "session.capture_required_attention",
119
+ agent_id=agent_id,
120
+ provider=agent_state.get("provider"),
121
+ timeout_s=timeout_s,
122
+ spawn_cwd=agent_state.get("spawn_cwd"),
123
+ session_name=agent_state.get("session_name"),
124
+ window=agent_state.get("window", agent_id),
125
+ )
126
+ if raise_on_missed:
127
+ raise TeamAgentRuntimeError(
128
+ f"Failed to capture session_id for agent {agent_id}: adapter "
129
+ f"did not produce a session within {timeout_s}s. Worker is "
130
+ "running but unidentifiable; this is a Slice 1 atomicity "
131
+ "violation."
132
+ )
133
+ return None
84
134
 
85
135
 
86
136
  def copy_session_metadata(target: dict[str, Any], source: dict[str, Any]) -> None: