@team-agent/installer 0.2.3 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. package/package.json +1 -1
  2. package/src/team_agent/abnormal_track.py +253 -0
  3. package/src/team_agent/cli/commands.py +17 -1
  4. package/src/team_agent/cli/parser.py +2 -2
  5. package/src/team_agent/compiler.py +1 -1
  6. package/src/team_agent/coordinator/lifecycle.py +20 -2
  7. package/src/team_agent/display/__init__.py +31 -0
  8. package/src/team_agent/display/adaptive.py +425 -0
  9. package/src/team_agent/display/backend.py +46 -0
  10. package/src/team_agent/display/close.py +6 -0
  11. package/src/team_agent/display/rebuild.py +102 -0
  12. package/src/team_agent/display/tiling.py +156 -0
  13. package/src/team_agent/display/worker_window.py +4 -0
  14. package/src/team_agent/display/workspace.py +36 -127
  15. package/src/team_agent/idle_predicate.py +200 -0
  16. package/src/team_agent/idle_takeover.py +59 -0
  17. package/src/team_agent/idle_takeover_wiring.py +111 -0
  18. package/src/team_agent/launch/core.py +13 -4
  19. package/src/team_agent/leader/__init__.py +444 -61
  20. package/src/team_agent/message_store/agent_health.py +6 -2
  21. package/src/team_agent/message_store/core.py +51 -18
  22. package/src/team_agent/message_store/leader_notification_log.py +63 -38
  23. package/src/team_agent/message_store/result_watchers.py +17 -11
  24. package/src/team_agent/message_store/schema.py +19 -2
  25. package/src/team_agent/message_store/schema_migration.py +386 -0
  26. package/src/team_agent/messaging/delivery.py +45 -2
  27. package/src/team_agent/messaging/leader_panes.py +115 -21
  28. package/src/team_agent/messaging/send.py +33 -0
  29. package/src/team_agent/messaging/tmux_io.py +49 -10
  30. package/src/team_agent/messaging/trust_auto_answer.py +11 -3
  31. package/src/team_agent/provider_state/README.md +78 -0
  32. package/src/team_agent/provider_state/__init__.py +86 -0
  33. package/src/team_agent/provider_state/claude.py +86 -0
  34. package/src/team_agent/provider_state/codex.py +84 -0
  35. package/src/team_agent/provider_state/common.py +207 -0
  36. package/src/team_agent/provider_state/registry.py +118 -0
  37. package/src/team_agent/restart/orchestration.py +9 -9
  38. package/src/team_agent/runtime.py +62 -12
  39. package/src/team_agent/spec.py +4 -3
  40. package/src/team_agent/wake.py +58 -0
@@ -18,40 +18,43 @@ from team_agent.state import apply_first_time_leader_binding, derive_leader_sess
18
18
 
19
19
  def attach_leader(workspace: Path, pane: str | None = None, provider: str = "codex") -> dict[str, Any]:
20
20
  from team_agent.message_store import MessageStore
21
- from team_agent.runtime import _attach_leader_to_state, ensure_workspace_dirs
21
+ from team_agent.runtime import _attach_leader_to_state, _runtime_lock, ensure_workspace_dirs
22
22
  ensure_workspace_dirs(workspace)
23
- state = load_runtime_state(workspace)
24
- event_log = EventLog(workspace)
25
- receiver, validation = _attach_leader_to_state(
26
- workspace,
27
- state,
28
- pane=pane,
29
- provider=provider,
30
- event_log=event_log,
31
- source="manual",
32
- )
33
- save_runtime_state(workspace, state)
34
- requeued = MessageStore(workspace).requeue_delivery_exhausted_watchers()
35
- if requeued:
36
- event_log.write(
37
- "leader_receiver.requeued_exhausted_watchers",
38
- watcher_ids=requeued,
39
- count=len(requeued),
40
- trigger="attach_leader",
23
+ # MED1/MED3: attach is a lease mutation; hold the single lease mutex so the state
24
+ # change + event emission + dual-state write happen in one critical section.
25
+ with _runtime_lock(workspace, LEADER_OWNERSHIP_LOCK):
26
+ state = load_runtime_state(workspace)
27
+ event_log = EventLog(workspace)
28
+ receiver, validation = _attach_leader_to_state(
29
+ workspace,
30
+ state,
31
+ pane=pane,
32
+ provider=provider,
33
+ event_log=event_log,
34
+ source="manual",
41
35
  )
42
- for watcher_id in requeued:
36
+ save_runtime_state(workspace, state)
37
+ requeued = MessageStore(workspace).requeue_delivery_exhausted_watchers()
38
+ if requeued:
43
39
  event_log.write(
44
- "result_watcher.requeued",
45
- watcher_id=watcher_id,
40
+ "leader_receiver.requeued_exhausted_watchers",
41
+ watcher_ids=requeued,
42
+ count=len(requeued),
46
43
  trigger="attach_leader",
47
- new_pane_id=receiver.get("pane_id"),
48
44
  )
49
- return {
50
- "ok": True,
51
- "leader_receiver": receiver,
52
- "validation": validation,
53
- "requeued_exhausted_watchers": requeued,
54
- }
45
+ for watcher_id in requeued:
46
+ event_log.write(
47
+ "result_watcher.requeued",
48
+ watcher_id=watcher_id,
49
+ trigger="attach_leader",
50
+ new_pane_id=receiver.get("pane_id"),
51
+ )
52
+ return {
53
+ "ok": True,
54
+ "leader_receiver": receiver,
55
+ "validation": validation,
56
+ "requeued_exhausted_watchers": requeued,
57
+ }
55
58
 
56
59
 
57
60
  def start_leader(
@@ -299,14 +302,21 @@ def attach_leader_to_state(
299
302
  _set_tmux_leader_environment(receiver, identity, event_log, run_cmd)
300
303
  event_log.write("leader_receiver.attached", target=receiver["pane_id"], session_name=receiver["session_name"], window_index=receiver["window_index"], window_name=receiver["window_name"], pane_index=receiver["pane_index"], pane_tty=receiver["pane_tty"], pane_current_command=receiver["pane_current_command"], provider=receiver_provider, requested_provider=provider if receiver_provider != provider else None, discovery=discovery, source=source, first_time=True, uuid_prefix=str(identity.get("leader_session_uuid") or "")[:12], leader_session_uuid_source=identity.get("leader_session_uuid_source"))
301
304
  return receiver, validation
305
+ owner_record = state.get("team_owner") if isinstance(state.get("team_owner"), dict) else {}
302
306
  if receiver_provider != "fake":
303
- receiver["leader_session_uuid"] = identity["leader_session_uuid"]
307
+ # C10/C12: carry the recorded owner's identity rather than re-deriving one
308
+ # that can drift under symlinked/worktree paths.
309
+ receiver["leader_session_uuid"] = str(owner_record.get("leader_session_uuid") or identity["leader_session_uuid"])
304
310
  if receiver_provider != provider:
305
311
  receiver["requested_provider"] = provider
306
- validation = validate_leader_uuid_from_targets(receiver, core_list_targets())
312
+ targets = core_list_targets()
313
+ validation = validate_leader_uuid_from_targets(receiver, targets)
307
314
  if validation["ok"]:
308
315
  validation = _validate_leader_receiver(receiver)
309
316
  if not validation["ok"]:
317
+ readopt = _try_readopt_leader_pane(workspace, state, receiver, pane_info, targets, owner_record, receiver_provider, source, event_log)
318
+ if readopt is not None:
319
+ return readopt
310
320
  event_log.write("leader_receiver.attach_failed", target=pane or pane_info.get("pane_id"), discovery=discovery, provider=provider, reason=validation["reason"], error=validation.get("error"), source=source, uuid_prefix=str(identity.get("leader_session_uuid") or "")[:12])
311
321
  raise RuntimeError(_strict_leader_validation_error(validation))
312
322
  if validation.get("warning"):
@@ -358,17 +368,381 @@ def leader_identity(workspace: Path, team: str | None = None) -> dict[str, Any]:
358
368
  }
359
369
 
360
370
 
371
+ _LEASE_REASON_ENUM = frozenset(
372
+ {
373
+ "vacant_acquired",
374
+ "previous_owner_pane_dead",
375
+ "previous_owner_alive_refused",
376
+ "owner_epoch_advanced",
377
+ "force_confirm_required",
378
+ "caller_not_leader_shaped",
379
+ "caller_cwd_mismatch",
380
+ "not_in_tmux_pane",
381
+ }
382
+ )
383
+ _LEASE_REBIND_REQUIRED_REASONS = frozenset(
384
+ {"not_in_tmux_pane", "caller_not_leader_shaped", "caller_cwd_mismatch"}
385
+ )
386
+
387
+ # MED1/MED3 (spark, 2026-05-27): one lease mutex serializes every lease mutation —
388
+ # takeover, claim-leader, attach-leader, and autobind. It is the "send" lock so that
389
+ # ownership transfer also serializes against the send mutator (a concurrent send by
390
+ # the old owner cannot race a rebind). takeover must stay on this lock for the same
391
+ # reason, so the three verbs share a single named critical section.
392
+ LEADER_OWNERSHIP_LOCK = "send"
393
+
394
+
395
+ def _lease_caller_pane() -> str:
396
+ return os.environ.get("TEAM_AGENT_LEADER_PANE_ID") or os.environ.get("TMUX_PANE") or ""
397
+
398
+
399
+ def _lease_epoch(owner: dict[str, Any] | None, receiver: dict[str, Any] | None) -> int:
400
+ return int((owner or {}).get("owner_epoch") or (receiver or {}).get("owner_epoch") or 0)
401
+
402
+
403
+ def _pane_is_live_leader(target: dict[str, Any] | None) -> bool:
404
+ # C1/C2: liveness is a live tmux probe. A pane is a live leader if the
405
+ # process tree carries the leader session env (set even when a child command
406
+ # is foreground), or the pane's current command is a provider leader host.
407
+ if not isinstance(target, dict):
408
+ return False
409
+ from team_agent.messaging.leader_panes import _leader_command_looks_usable, _leader_command_provider, _target_leader_session_uuid
410
+ if _target_leader_session_uuid(target):
411
+ return True
412
+ command = str(target.get("pane_current_command", ""))
413
+ return _leader_command_looks_usable(command, "") or _leader_command_provider(command) is not None
414
+
415
+
416
+ def _owner_pane_is_live(target: dict[str, Any] | None, owner_record: dict[str, Any] | None) -> bool:
417
+ # MED2 (spark, 2026-05-27): a recorded owner is only "live" when the candidate
418
+ # pane carries the OWNER's identity, not merely a leader-looking command name.
419
+ # When the owner has a recorded leader_session_uuid, that uuid is the identity:
420
+ # a stray node/claude pane without the matching uuid is not the owner (so a
421
+ # dead-owner recover proceeds), and the real owner is still live even with a
422
+ # non-leader foreground command as long as its session uuid is in the tree.
423
+ if not isinstance(target, dict):
424
+ return False
425
+ owner = owner_record or {}
426
+ owner_uuid = str(owner.get("leader_session_uuid") or "")
427
+ if owner_uuid:
428
+ from team_agent.messaging.leader_panes import _target_leader_session_uuid
429
+ return _target_leader_session_uuid(target) == owner_uuid
430
+ # No recorded uuid: fall back to provider identity (process tree / command for
431
+ # the owner's provider) rather than any leader-looking command.
432
+ owner_provider = str(owner.get("provider") or "")
433
+ if owner_provider:
434
+ from team_agent.messaging.leader_panes import _leader_command_looks_usable, _target_leader_session_uuid
435
+ if _target_leader_session_uuid(target):
436
+ return True
437
+ return _leader_command_looks_usable(str(target.get("pane_current_command", "")), owner_provider)
438
+ return _pane_is_live_leader(target)
439
+
440
+
441
+ def _cwd_inside_workspace(cwd: str | None, workspace: Path) -> bool:
442
+ # C7/C8: realpath both sides; membership is subtree containment.
443
+ if not cwd:
444
+ return True
445
+ ws = os.path.realpath(str(workspace.resolve()))
446
+ candidate = os.path.realpath(str(cwd))
447
+ return candidate == ws or candidate.startswith(ws + os.sep)
448
+
449
+
450
+ def _caller_pane_eligibility(target: dict[str, Any] | None, workspace: Path) -> dict[str, Any]:
451
+ # C5: acquire binds the caller pane only when it is leader-shaped and its cwd
452
+ # is inside the workspace. A plain shell / worker pane never self-binds.
453
+ if not _pane_is_live_leader(target):
454
+ return {"ok": False, "reason": "caller_not_leader_shaped", "action": "run team-agent claim-leader from a leader (claude/codex) tmux pane"}
455
+ if not _cwd_inside_workspace((target or {}).get("pane_current_path"), workspace):
456
+ return {"ok": False, "reason": "caller_cwd_mismatch", "action": "run from a leader pane whose cwd is inside this workspace"}
457
+ return {"ok": True}
458
+
459
+
460
+ def _lease_refused(reason: str, *, action: str | None = None, **extra: Any) -> dict[str, Any]:
461
+ result: dict[str, Any] = {"ok": False, "status": "refused", "reason": reason}
462
+ if action:
463
+ result["action"] = action
464
+ result.update(extra)
465
+ return result
466
+
467
+
468
+ def _emit_lease_refusal(
469
+ event_log: EventLog,
470
+ reason: str,
471
+ owner: dict[str, Any] | None,
472
+ old_pane: str | None,
473
+ new_pane: str | None,
474
+ team_id: str | None,
475
+ host: str,
476
+ os_user: str,
477
+ ) -> None:
478
+ # C20/C21/C22: every refusal emits a structured audit event with a closed-enum
479
+ # reason, redacted uuid prefix, old/new pane id, host, and OS user.
480
+ name = "leader_receiver.rebind_required" if reason in _LEASE_REBIND_REQUIRED_REASONS else "leader_receiver.claim_refused"
481
+ event_log.write(
482
+ name,
483
+ reason=reason,
484
+ old_pane_id=old_pane,
485
+ new_pane_id=new_pane,
486
+ uuid_prefix=str((owner or {}).get("leader_session_uuid") or "")[:8],
487
+ team_id=team_id,
488
+ host=host,
489
+ os_user=os_user,
490
+ )
491
+
492
+
493
+ def _try_readopt_leader_pane(
494
+ workspace: Path,
495
+ state: dict[str, Any],
496
+ receiver: dict[str, Any],
497
+ pane_info: dict[str, Any],
498
+ targets: dict[str, Any],
499
+ owner_record: dict[str, Any],
500
+ receiver_provider: str,
501
+ source: str,
502
+ event_log: EventLog,
503
+ ) -> tuple[dict[str, Any], dict[str, Any]] | None:
504
+ # C4/C11/C12: attach-leader converges on the lease claim. When the strict UUID
505
+ # gate would refuse, re-adopt the pane instead IF it is a live workspace leader
506
+ # (real injected uuid + cwd inside the workspace subtree) and the lease is either
507
+ # vacant or already owned by that same identity. A genuinely different live owner
508
+ # still requires explicit takeover.
509
+ from team_agent.messaging.leader_panes import _leader_command_looks_usable, _target_leader_session_uuid
510
+ target_list = targets.get("targets", []) if isinstance(targets, dict) and targets.get("ok") else []
511
+ pane_target = next((item for item in target_list if isinstance(item, dict) and str(item.get("pane_id")) == str(pane_info.get("pane_id"))), None)
512
+ pane_uuid = _target_leader_session_uuid(pane_target or {}) or _target_leader_session_uuid(pane_info)
513
+ if not pane_uuid:
514
+ return None
515
+ if not _cwd_inside_workspace(pane_info.get("pane_current_path"), workspace):
516
+ return None
517
+ if not _leader_command_looks_usable(str(pane_info.get("pane_current_command", "")), receiver_provider):
518
+ return None
519
+ owner_uuid = str(owner_record.get("leader_session_uuid") or "")
520
+ if owner_uuid and owner_uuid != pane_uuid:
521
+ return None
522
+ epoch = _lease_epoch(owner_record, receiver) + (1 if owner_record else 0)
523
+ receiver["leader_session_uuid"] = pane_uuid
524
+ receiver["owner_epoch"] = epoch
525
+ receiver["discovery"] = "attach_readopt"
526
+ receiver.pop("warning", None)
527
+ old_pane = owner_record.get("pane_id") or (state.get("leader_receiver") or {}).get("pane_id")
528
+ state["team_owner"] = {
529
+ "pane_id": pane_info["pane_id"],
530
+ "provider": receiver.get("provider") or receiver_provider,
531
+ "machine_fingerprint": owner_record.get("machine_fingerprint") or pane_info.get("machine_fingerprint") or "",
532
+ "leader_session_uuid": pane_uuid,
533
+ "owner_epoch": epoch,
534
+ "claimed_at": datetime.now(timezone.utc).isoformat(),
535
+ "claimed_via": "attach-leader",
536
+ }
537
+ state["leader_receiver"] = receiver
538
+ _write_lease_dual_state(workspace, state)
539
+ if old_pane and old_pane != pane_info["pane_id"]:
540
+ event_log.write("owner.adopted_on_restart", reason="attach_readopt", old_pane_id=old_pane, new_pane_id=pane_info["pane_id"], owner_epoch=epoch, uuid_prefix=pane_uuid[:8], team_id=team_state_key(state))
541
+ event_log.write("leader_receiver.rebind_applied", reason="attach_readopt", old_pane_id=old_pane, new_pane_id=pane_info["pane_id"], owner_epoch=epoch, uuid_prefix=pane_uuid[:8], team_id=team_state_key(state))
542
+ event_log.write("leader_receiver.attached", target=pane_info["pane_id"], session_name=pane_info.get("session_name"), provider=receiver.get("provider"), discovery="attach_readopt", source=source, owner_epoch=epoch, uuid_prefix=pane_uuid[:8])
543
+ return receiver, {"ok": True, "pane": pane_info, "readopted": True, "warning": None}
544
+
545
+
546
+ def _detect_dual_state_divergence(workspace: Path, state: dict[str, Any]) -> dict[str, Any] | None:
547
+ # C18: the workspace-level state.json and the team-level runtime snapshot must
548
+ # agree on owner_uuid, receiver_pane_id, and owner_epoch. Detect a pre-existing
549
+ # split so the repair can be audited.
550
+ session = state.get("session_name")
551
+ if not session:
552
+ return None
553
+ from team_agent.restart.snapshot import load_snapshot_state, team_runtime_snapshot_dir
554
+ snap_path = team_runtime_snapshot_dir(workspace, str(session)) / "state.json"
555
+ if not snap_path.exists():
556
+ return None
557
+ snap = load_snapshot_state(snap_path) or {}
558
+ ws_owner = state.get("team_owner") if isinstance(state.get("team_owner"), dict) else {}
559
+ snap_owner = snap.get("team_owner") if isinstance(snap.get("team_owner"), dict) else {}
560
+ ws_receiver = state.get("leader_receiver") if isinstance(state.get("leader_receiver"), dict) else {}
561
+ snap_receiver = snap.get("leader_receiver") if isinstance(snap.get("leader_receiver"), dict) else {}
562
+ diverged = (
563
+ ws_owner.get("pane_id") != snap_owner.get("pane_id")
564
+ or ws_owner.get("leader_session_uuid") != snap_owner.get("leader_session_uuid")
565
+ or _lease_epoch(ws_owner, ws_receiver) != _lease_epoch(snap_owner, snap_receiver)
566
+ or ws_receiver.get("pane_id") != snap_receiver.get("pane_id")
567
+ )
568
+ if not diverged:
569
+ return None
570
+ return {
571
+ "workspace_owner_pane": ws_owner.get("pane_id"),
572
+ "team_owner_pane": snap_owner.get("pane_id"),
573
+ "workspace_receiver_pane": ws_receiver.get("pane_id"),
574
+ "team_receiver_pane": snap_receiver.get("pane_id"),
575
+ }
576
+
577
+
578
+ def _write_lease_dual_state(workspace: Path, state: dict[str, Any]) -> None:
579
+ # C17: write team_owner + leader_receiver to both state locations in one lock
580
+ # hold. The workspace-level state.json and the team-level runtime snapshot
581
+ # (teams/<session>/state.json) must never diverge after a lease mutation.
582
+ save_team_scoped_state(workspace, state)
583
+ if state.get("session_name"):
584
+ from team_agent.restart.snapshot import save_team_runtime_snapshot
585
+ save_team_runtime_snapshot(workspace, state)
586
+
587
+
588
+ def _claim_lease_no_incident(
589
+ workspace: Path,
590
+ state: dict[str, Any],
591
+ team: str | None,
592
+ team_id: str,
593
+ caller_pane: str,
594
+ confirm: bool,
595
+ event_log: EventLog,
596
+ ) -> dict[str, Any]:
597
+ # Gap 39 unified lease: no ambiguous incident is recorded, so this is a direct
598
+ # acquire/claim against live evidence (not the Gap 26 broadcast-claim flow).
599
+ from team_agent.runtime import core_list_targets
600
+ owner = state.get("team_owner") if isinstance(state.get("team_owner"), dict) else {}
601
+ receiver = state.get("leader_receiver") if isinstance(state.get("leader_receiver"), dict) else {}
602
+ precheck_epoch = _lease_epoch(owner, receiver)
603
+ host = str(owner.get("machine_fingerprint") or _identity_machine_fingerprint(state))
604
+ os_user = _identity_os_user()
605
+
606
+ if not caller_pane:
607
+ _emit_lease_refusal(event_log, "not_in_tmux_pane", owner, receiver.get("pane_id"), None, team_id, host, os_user)
608
+ return _lease_refused("not_in_tmux_pane", action="run team-agent claim-leader from the leader's tmux pane")
609
+
610
+ targets_result = core_list_targets()
611
+ targets = targets_result.get("targets", []) if isinstance(targets_result, dict) and targets_result.get("ok") else []
612
+ by_pane = {str(item.get("pane_id")): item for item in targets if isinstance(item, dict)}
613
+
614
+ bound_pane = receiver.get("pane_id") or owner.get("pane_id")
615
+ bound_alive = _owner_pane_is_live(by_pane.get(str(bound_pane)), owner) if bound_pane else False
616
+
617
+ if bound_pane and str(bound_pane) == str(caller_pane):
618
+ return {
619
+ "ok": True,
620
+ "status": "already_bound",
621
+ "leader_receiver": receiver or None,
622
+ "team_owner": owner or None,
623
+ "owner_epoch": precheck_epoch,
624
+ }
625
+
626
+ caller_target = by_pane.get(str(caller_pane))
627
+ eligibility = _caller_pane_eligibility(caller_target, workspace)
628
+ if not eligibility["ok"]:
629
+ _emit_lease_refusal(event_log, eligibility["reason"], owner, bound_pane, caller_pane, team_id, host, os_user)
630
+ return _lease_refused(eligibility["reason"], action=eligibility.get("action"))
631
+
632
+ if bound_alive and not confirm:
633
+ # C4/C13: a live recorded owner is never stolen without --confirm. The audit
634
+ # reason classifies the WHY (owner alive); the result hint tells the operator
635
+ # the action (rerun with --confirm).
636
+ _emit_lease_refusal(event_log, "previous_owner_alive_refused", owner, bound_pane, caller_pane, team_id, host, os_user)
637
+ return _lease_refused(
638
+ "force_confirm_required",
639
+ action="rerun with --confirm to take over the live leader pane",
640
+ bound_pane_id=bound_pane,
641
+ owner_epoch=precheck_epoch,
642
+ )
643
+
644
+ # C3/C15: revalidate under the lock. Re-read both the persisted epoch and live
645
+ # liveness; if the epoch advanced or a previously-dead owner revived since the
646
+ # precheck, abort the claim without double-binding (lost the epoch race).
647
+ locked_state = select_runtime_state(workspace, team)
648
+ locked_owner = locked_state.get("team_owner") if isinstance(locked_state.get("team_owner"), dict) else {}
649
+ locked_receiver = locked_state.get("leader_receiver") if isinstance(locked_state.get("leader_receiver"), dict) else {}
650
+ locked_epoch = _lease_epoch(locked_owner, locked_receiver)
651
+ recheck_result = core_list_targets()
652
+ recheck_targets = recheck_result.get("targets", []) if isinstance(recheck_result, dict) and recheck_result.get("ok") else []
653
+ recheck_by_pane = {str(item.get("pane_id")): item for item in recheck_targets if isinstance(item, dict)}
654
+ revived = bool(bound_pane) and not bound_alive and _owner_pane_is_live(recheck_by_pane.get(str(bound_pane)), owner)
655
+ if locked_epoch != precheck_epoch or (revived and not confirm):
656
+ _emit_lease_refusal(event_log, "owner_epoch_advanced", locked_owner or owner, bound_pane, caller_pane, team_id, host, os_user)
657
+ return _lease_refused(
658
+ "owner_epoch_advanced",
659
+ bound_pane_id=bound_pane,
660
+ owner_epoch=max(locked_epoch, precheck_epoch),
661
+ )
662
+
663
+ divergence = _detect_dual_state_divergence(workspace, state)
664
+ # C10/C12: the caller pane's injected TEAM_AGENT_LEADER_SESSION_UUID is the
665
+ # authoritative identity for the bind; fall back to the recorded owner/receiver
666
+ # uuid, then to the deterministic derivation.
667
+ from team_agent.messaging.leader_panes import _target_leader_session_uuid
668
+ next_epoch = precheck_epoch + 1
669
+ leader_uuid = str(
670
+ _target_leader_session_uuid(caller_target or {})
671
+ or owner.get("leader_session_uuid")
672
+ or receiver.get("leader_session_uuid")
673
+ or _leader_identity_context(workspace, team=team, state=state)["leader_session_uuid"]
674
+ )
675
+ new_receiver = _receiver_from_claim_target(caller_target, receiver, leader_uuid, next_epoch)
676
+ new_owner = {
677
+ "pane_id": caller_pane,
678
+ "provider": new_receiver.get("provider") or owner.get("provider") or "codex",
679
+ "machine_fingerprint": host,
680
+ "leader_session_uuid": leader_uuid,
681
+ "owner_epoch": next_epoch,
682
+ "claimed_at": datetime.now(timezone.utc).isoformat(),
683
+ "claimed_via": "claim-leader",
684
+ }
685
+ state["team_owner"] = new_owner
686
+ state["leader_receiver"] = new_receiver
687
+ _write_lease_dual_state(workspace, state)
688
+ dead_owner = bool(bound_pane) and not bound_alive
689
+ reason = "previous_owner_pane_dead" if dead_owner else "vacant_acquired"
690
+ if dead_owner:
691
+ event_log.write(
692
+ "owner.adopted_on_restart",
693
+ reason=reason,
694
+ old_pane_id=bound_pane,
695
+ new_pane_id=caller_pane,
696
+ owner_epoch=next_epoch,
697
+ uuid_prefix=leader_uuid[:8],
698
+ team_id=team_id,
699
+ host=host,
700
+ os_user=os_user,
701
+ )
702
+ event_log.write(
703
+ "leader_receiver.rebind_applied",
704
+ reason=reason,
705
+ old_pane_id=bound_pane,
706
+ new_pane_id=caller_pane,
707
+ owner_epoch=next_epoch,
708
+ uuid_prefix=leader_uuid[:8],
709
+ team_id=team_id,
710
+ )
711
+ event_log.write(
712
+ "owner_epoch_advanced",
713
+ reason=reason,
714
+ old_pane_id=bound_pane,
715
+ new_pane_id=caller_pane,
716
+ owner_epoch=next_epoch,
717
+ uuid_prefix=leader_uuid[:8],
718
+ team_id=team_id,
719
+ )
720
+ if divergence:
721
+ # C18/C19: the workspace-level and team-level state had diverged before this
722
+ # mutation; the single dual-write above re-converged them.
723
+ event_log.write("leader_receiver.state_divergence_repaired", team_id=team_id, owner_epoch=next_epoch, new_pane_id=caller_pane, **divergence)
724
+ return {
725
+ "ok": True,
726
+ "status": "claimed",
727
+ "leader_receiver": new_receiver,
728
+ "team_owner": new_owner,
729
+ "owner_epoch": next_epoch,
730
+ "reason": reason,
731
+ }
732
+
733
+
361
734
  def claim_leader(workspace: Path, team: str | None = None, confirm: bool = False) -> dict[str, Any]:
362
735
  from team_agent.runtime import RuntimeError, _runtime_lock, core_list_targets
363
- current_pane = os.environ.get("TEAM_AGENT_LEADER_PANE_ID") or os.environ.get("TMUX_PANE")
364
- if not current_pane:
365
- return {"ok": False, "status": "refused", "reason": "no_caller_pane", "action": "run from a tmux leader pane"}
366
- with _runtime_lock(workspace, "leader_receiver"):
736
+ current_pane = _lease_caller_pane()
737
+ with _runtime_lock(workspace, LEADER_OWNERSHIP_LOCK):
367
738
  state = select_runtime_state(workspace, team)
368
739
  event_log = EventLog(workspace)
369
- incident = _latest_ambiguous_incident(event_log, team_state_key(state))
740
+ team_id = team_state_key(state)
741
+ incident = _latest_ambiguous_incident(event_log, team_id)
370
742
  if not incident:
371
- return {"ok": False, "status": "refused", "reason": "no_ambiguous_candidates"}
743
+ return _claim_lease_no_incident(workspace, state, team, team_id, current_pane, confirm, event_log)
744
+ if not current_pane:
745
+ return {"ok": False, "status": "refused", "reason": "no_caller_pane", "action": "run from a tmux leader pane"}
372
746
  candidates = [str(item) for item in incident.get("candidates", [])]
373
747
  if current_pane not in candidates:
374
748
  return {"ok": False, "status": "refused", "reason": "caller_not_candidate", "candidates": candidates}
@@ -395,7 +769,13 @@ def claim_leader(workspace: Path, team: str | None = None, confirm: bool = False
395
769
  epoch = int(owner.get("owner_epoch") or receiver.get("owner_epoch") or 0) + 1
396
770
  owner.update({"pane_id": current_pane, "owner_epoch": epoch, "claimed_at": datetime.now(timezone.utc).isoformat(), "claimed_via": "claim-leader"})
397
771
  state["leader_receiver"] = _receiver_from_claim_target(target, receiver, expected_uuid, epoch)
398
- save_team_scoped_state(workspace, state)
772
+ # HIGH (spark, 2026-05-27): the multi-candidate claim branch must write both
773
+ # state locations atomically (workspace state.json + team/<session> snapshot),
774
+ # exactly like the no-incident lease path, so the branches never split state.
775
+ divergence = _detect_dual_state_divergence(workspace, state)
776
+ _write_lease_dual_state(workspace, state)
777
+ if divergence:
778
+ event_log.write("leader_receiver.state_divergence_repaired", team_id=team_id, owner_epoch=epoch, new_pane_id=current_pane, **divergence)
399
779
  losers = [pane for pane in candidates if pane != current_pane]
400
780
  event_log.write(
401
781
  "leader_receiver.claim_applied",
@@ -495,30 +875,33 @@ def autobind_leader_receiver_from_env(
495
875
  tmux_pane = os.environ.get("TMUX_PANE")
496
876
  if not tmux_pane:
497
877
  return None
498
- from team_agent.runtime import ensure_workspace_dirs
878
+ from team_agent.runtime import _runtime_lock, ensure_workspace_dirs
499
879
  ensure_workspace_dirs(workspace)
500
- state = load_runtime_state(workspace)
501
- event_log = EventLog(workspace)
502
- try:
503
- receiver, _validation = attach_leader_to_state(
504
- workspace,
505
- state,
506
- pane=tmux_pane,
507
- provider=provider,
508
- event_log=event_log,
509
- source=source,
510
- )
511
- except Exception as exc:
512
- event_log.write(
513
- "leader_receiver.autobind_skipped",
514
- pane=tmux_pane,
515
- provider=provider,
516
- source=source,
517
- error=str(exc),
518
- )
519
- return None
520
- save_runtime_state(workspace, state)
521
- return receiver
880
+ # MED1/MED3: the startup autobind is a lease mutation; hold the single lease
881
+ # mutex so it cannot interleave with takeover / claim / attach / send.
882
+ with _runtime_lock(workspace, LEADER_OWNERSHIP_LOCK):
883
+ state = load_runtime_state(workspace)
884
+ event_log = EventLog(workspace)
885
+ try:
886
+ receiver, _validation = attach_leader_to_state(
887
+ workspace,
888
+ state,
889
+ pane=tmux_pane,
890
+ provider=provider,
891
+ event_log=event_log,
892
+ source=source,
893
+ )
894
+ except Exception as exc:
895
+ event_log.write(
896
+ "leader_receiver.autobind_skipped",
897
+ pane=tmux_pane,
898
+ provider=provider,
899
+ source=source,
900
+ error=str(exc),
901
+ )
902
+ return None
903
+ save_runtime_state(workspace, state)
904
+ return receiver
522
905
 
523
906
 
524
907
  __all__ = [
@@ -3,9 +3,13 @@ from __future__ import annotations
3
3
  from contextlib import closing
4
4
  from typing import Any
5
5
 
6
+ from team_agent.message_store.schema_migration import MANAGED_TABLE_LAYOUTS
6
7
  from team_agent.message_store.schema import utcnow
7
8
 
8
9
 
10
+ AGENT_HEALTH_SELECT = ", ".join(MANAGED_TABLE_LAYOUTS["agent_health"])
11
+
12
+
9
13
  def upsert_agent_health(
10
14
  self,
11
15
  agent_id: str,
@@ -50,10 +54,10 @@ def upsert_agent_health(
50
54
  def agent_health(self, owner_team_id: str | None = None) -> dict[str, dict[str, Any]]:
51
55
  with closing(self.connect()) as conn:
52
56
  if owner_team_id is None:
53
- rows = conn.execute("select * from agent_health order by agent_id").fetchall()
57
+ rows = conn.execute(f"select {AGENT_HEALTH_SELECT} from agent_health order by agent_id").fetchall()
54
58
  else:
55
59
  rows = conn.execute(
56
- "select * from agent_health where owner_team_id = ? or owner_team_id is null order by agent_id",
60
+ f"select {AGENT_HEALTH_SELECT} from agent_health where owner_team_id = ? or owner_team_id is null order by agent_id",
57
61
  (owner_team_id,),
58
62
  ).fetchall()
59
63
  return {row["agent_id"]: dict(row) for row in rows}