npm - @team-agent/installer - Versions diffs - 0.2.2 → 0.2.4 - Mend

@team-agent/installer 0.2.2 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49)

package/package.json +1 -1
package/schemas/team.schema.json +6 -0
package/src/team_agent/abnormal_track.py +253 -0
package/src/team_agent/approvals/runtime_prompts.py +1 -1
package/src/team_agent/cli/commands.py +104 -3
package/src/team_agent/cli/parser.py +10 -1
package/src/team_agent/compiler.py +1 -1
package/src/team_agent/coordinator/lifecycle.py +23 -2
package/src/team_agent/diagnose/orphan_cleanup.py +199 -28
package/src/team_agent/display/__init__.py +31 -0
package/src/team_agent/display/adaptive.py +425 -0
package/src/team_agent/display/backend.py +46 -0
package/src/team_agent/display/close.py +6 -0
package/src/team_agent/display/rebuild.py +102 -0
package/src/team_agent/display/tiling.py +156 -0
package/src/team_agent/display/worker_window.py +4 -0
package/src/team_agent/display/workspace.py +36 -127
package/src/team_agent/idle_predicate.py +200 -0
package/src/team_agent/idle_takeover.py +59 -0
package/src/team_agent/idle_takeover_wiring.py +111 -0
package/src/team_agent/launch/core.py +14 -4
package/src/team_agent/leader/__init__.py +444 -61
package/src/team_agent/lifecycle/operations.py +1 -0
package/src/team_agent/lifecycle/start.py +1 -1
package/src/team_agent/message_store/core.py +38 -11
package/src/team_agent/message_store/leader_notification_log.py +47 -26
package/src/team_agent/message_store/schema.py +8 -2
package/src/team_agent/messaging/delivery.py +336 -1
package/src/team_agent/messaging/leader.py +13 -4
package/src/team_agent/messaging/leader_api_errors.py +216 -0
package/src/team_agent/messaging/leader_panes.py +294 -0
package/src/team_agent/messaging/scheduler.py +12 -0
package/src/team_agent/messaging/send.py +54 -26
package/src/team_agent/messaging/tmux_io.py +202 -33
package/src/team_agent/messaging/tmux_prompt.py +87 -0
package/src/team_agent/messaging/trust_auto_answer.py +52 -0
package/src/team_agent/provider_state/README.md +78 -0
package/src/team_agent/provider_state/__init__.py +86 -0
package/src/team_agent/provider_state/claude.py +86 -0
package/src/team_agent/provider_state/codex.py +84 -0
package/src/team_agent/provider_state/common.py +207 -0
package/src/team_agent/provider_state/registry.py +118 -0
package/src/team_agent/restart/orchestration.py +215 -12
package/src/team_agent/runtime.py +65 -15
package/src/team_agent/sessions/capture.py +65 -15
package/src/team_agent/spec.py +63 -3
package/src/team_agent/status/queries.py +32 -1
package/src/team_agent/wake.py +58 -0
package/src/team_agent/watch/__init__.py +145 -0

package/src/team_agent/restart/orchestration.py CHANGED

@@ -8,18 +8,19 @@ from typing import Any
 from team_agent.events import EventLog
 from team_agent.message_store import MessageStore
 from team_agent.permissions import resolve_permissions
+from team_agent.display.backend import display_backend_has_worker_views, display_backend_opens_before_leader_rebind, resolve_restart_display_backend
+from team_agent.display.close import close_team_display_backends
+from team_agent.display.rebuild import rebuild_restart_display_after_rebind
 from team_agent.restart.selection import select_restart_state
 from team_agent.restart.snapshot import save_team_runtime_snapshot
 from team_agent.spec import load_spec
 from team_agent.state import (
     check_team_owner,
-    load_runtime_state,
     populate_team_owner_from_env,
     save_runtime_state,
     write_team_state,
 )
 def restart(workspace: Path, allow_fresh: bool = False, team: str | None = None) -> dict[str, Any]:
     # Lazy-import everything from team_agent.runtime so existing tests that
     # patch runtime.shell_resume_command_for_agent / runtime.run_cmd /
@@ -27,7 +28,6 @@ def restart(workspace: Path, allow_fresh: bool = False, team: str | None = None)
     # at call time. Runtime re-exports the provider helpers, so this also
     # routes through the providers module without binding it directly.
     from team_agent.runtime import (
-        GHOSTTY_DISPLAY_BACKENDS,
         ResumeUnavailable,
         RuntimeError,
         _attach_profile_resume_root,
@@ -35,7 +35,6 @@ def restart(workspace: Path, allow_fresh: bool = False, team: str | None = None)
         _capture_agent_session,
         _clear_session_capture_fields,
         _close_ghostty_display,
-        _close_ghostty_workspace,
         _compile_team_dir_spec,
         _effective_runtime_config,
         _ensure_agent_start_requirements,
@@ -83,16 +82,73 @@ def restart(workspace: Path, allow_fresh: bool = False, team: str | None = None)
         )
         raise RuntimeError(_tmux_session_conflict_error(session_name))
     runtime_cfg = _effective_runtime_config(spec.get("runtime", {}))
-    display_backend = spec.get("runtime", {}).get("display_backend", state.get("display_backend", "none"))
-    _close_ghostty_workspace(state, event_log)
-    for agent_id, agent_state in state.get("agents", {}).items():
-        _close_ghostty_display(agent_id, agent_state, event_log)
-    state["display_backend"] = display_backend
+    display_backend = resolve_restart_display_backend(spec, state, event_log)
+    # Stage 7 S5 — Slice 6 lifecycle atomicity contract: compute restart_agents
+    # early so we can pre-validate resumability BEFORE any destructive teardown
+    # (ghostty close, tmux session creation). Without --allow-fresh, every
+    # non-paused worker MUST be resumable; if any is not, refuse the operation
+    # atomically with a structured result and a restart.atomic_refusal event.
+    # No rollback path is needed because nothing has been created yet.
     restart_agents = [
         agent
         for agent in spec.get("agents", [])
         if state.get("agents", {}).get(agent["id"], {}).get("status") != "paused" and not agent.get("paused")
     ]
+    # cr strict-typing (2026-05-27): refuse the operation deterministically
+    # before any decision logic if any persisted first_send_at is corrupt
+    # (empty string, 0, False, literal "null", any non-ISO garbage). This
+    # avoids silent misclassification through Python truthiness and gives the
+    # operator a clear audit signal that state.json is damaged.
+    invalid_first_send_at = _collect_corrupt_first_send_at(restart_agents, state)
+    if invalid_first_send_at:
+        for entry in invalid_first_send_at:
+            event_log.write(
+                "restart.first_send_at_invalid",
+                worker_id=entry["worker_id"],
+                raw_first_send_at=entry["raw_first_send_at"],
+                raw_first_send_at_type=entry["raw_first_send_at_type"],
+            )
+        invalid_names = [entry["worker_id"] for entry in invalid_first_send_at]
+        return {
+            "ok": False,
+            "status": "refused",
+            "reason": "invalid_first_send_at",
+            "invalid_first_send_at": invalid_first_send_at,
+            "allow_fresh": bool(allow_fresh),
+            "error": (
+                f"Cannot restart: workers {invalid_names} have a corrupt "
+                "first_send_at in state.json (only null/missing or a valid "
+                "ISO-8601 UTC timestamp string is accepted). Inspect the "
+                "restart.first_send_at_invalid audit events for raw values "
+                "and repair state.json before retrying."
+            ),
+        }
+    # cr C2: emit one restart.resume_decision event per non-paused worker so
+    # every restart attempt produces an auditable per-worker classification.
+    # The function returns only refused workers — populated when
+    # allow_fresh=False AND at least one interacted worker cannot be repaired.
+    refused = _emit_resume_decisions(
+        workspace, restart_agents, state, get_adapter, event_log, allow_fresh,
+    )
+    if refused:
+        event_log.write(
+            "restart.atomic_refusal",
+            unresumable=refused,
+            allow_fresh=bool(allow_fresh),
+            reason="resume_atomicity",
+        )
+        return {
+            "ok": False,
+            "status": "refused",
+            "reason": "resume_atomicity",
+            "unresumable": refused,
+            "allow_fresh": bool(allow_fresh),
+            "error": _format_atomic_refusal_error(refused),
+        }
+    close_team_display_backends(state, event_log)
+    for agent_id, agent_state in state.get("agents", {}).items():
+        _close_ghostty_display(agent_id, agent_state, event_log)
+    state["display_backend"] = display_backend
     _ensure_agent_start_requirements(workspace, restart_agents, event_log, "restart")
     first = True
     restarted: list[dict[str, Any]] = []
@@ -271,8 +327,9 @@ def restart(workspace: Path, allow_fresh: bool = False, team: str | None = None)
                 event_log,
                 timeout_s=1.5,
                 exclude_session_ids=known_session_ids,
+                raise_on_missed=False,
             )
-        if display_backend in GHOSTTY_DISPLAY_BACKENDS:
+        if display_backend_has_worker_views(display_backend):
             display_jobs.append((agent["id"], agent))
         new_agents[agent["id"]] = agent_state
         restarted.append(
@@ -283,7 +340,7 @@ def restart(workspace: Path, allow_fresh: bool = False, team: str | None = None)
                 "display_target": None,
             }
         )
-    display_results = _open_worker_displays(workspace, session_name, display_jobs, event_log, display_backend)
+    display_results = _open_worker_displays(workspace, session_name, display_jobs, event_log, display_backend) if display_backend_opens_before_leader_rebind(display_backend) else {}
     for agent_id, display in display_results.items():
         if agent_id in new_agents:
             new_agents[agent_id]["display"] = display
@@ -309,12 +366,158 @@ def restart(workspace: Path, allow_fresh: bool = False, team: str | None = None)
     write_team_state(workspace, spec, state)
     from team_agent.leader import autobind_leader_receiver_from_env
     leader_provider = str(spec.get("leader", {}).get("provider") or "codex")
-    autobind_leader_receiver_from_env(workspace, leader_provider, source="restart")
+    rebound_receiver = autobind_leader_receiver_from_env(workspace, leader_provider, source="restart")
+    rebuild_restart_display_after_rebind(display_backend, workspace, session_name, spec, event_log, restarted, receiver=rebound_receiver)
     coordinator = start_coordinator(workspace)
     event_log.write("restart.complete", session=session_name, agents=restarted, coordinator=coordinator)
     return {"ok": True, "session_name": session_name, "agents": restarted, "coordinator": coordinator}
+# Classification labels returned by _classify_first_send_at.
+_FIRST_SEND_AT_ABSENT = "absent"
+_FIRST_SEND_AT_VALID = "valid"
+_FIRST_SEND_AT_CORRUPT = "corrupt"
+def _classify_first_send_at(value: Any) -> str:
+    """Classify a persisted first_send_at value (cr verdict, 2026-05-27).
+    Returns one of:
+      "absent"  — None or missing field (worker never-interacted).
+      "valid"   — non-empty string that datetime.fromisoformat can parse; a
+                  trailing "Z" UTC designator is accepted on every Python
+                  version.
+      "corrupt" — anything else: empty string, 0, False, literal "null", garbage.
+    The contract requires that corrupt values be detected deterministically
+    before any restart decision so a worker's interaction state is never
+    silently misclassified via Python truthiness.
+    """
+    if value is None:
+        return _FIRST_SEND_AT_ABSENT
+    # bool/int/float/list/... are never acceptable, and neither is "".
+    if not isinstance(value, str) or not value:
+        return _FIRST_SEND_AT_CORRUPT
+    # datetime.fromisoformat only understands the "Z" suffix from Python 3.11
+    # onwards. Rewrite it to the equivalent "+00:00" offset so a legitimate
+    # UTC timestamp is not reported as corrupt on older interpreters.
+    candidate = f"{value[:-1]}+00:00" if value.endswith("Z") else value
+    try:
+        datetime.fromisoformat(candidate)
+    except ValueError:
+        return _FIRST_SEND_AT_CORRUPT
+    return _FIRST_SEND_AT_VALID
+def _collect_corrupt_first_send_at(
+    restart_agents: list[dict[str, Any]],
+    state: dict[str, Any],
+) -> list[dict[str, Any]]:
+    """Return one record per restart candidate whose first_send_at is corrupt.
+    Each record carries the worker id, the raw persisted value and the name
+    of its Python type, i.e. the fields used by the
+    `restart.first_send_at_invalid` event and the refusal envelope. Workers
+    whose value is absent or valid are omitted; a persisted agent entry that
+    is not a dict is treated as having no first_send_at."""
+    def _persisted_value(worker_id: Any) -> Any:
+        record = state.get("agents", {}).get(worker_id, {})
+        return record.get("first_send_at") if isinstance(record, dict) else None
+    observed = ((agent["id"], _persisted_value(agent["id"])) for agent in restart_agents)
+    return [
+        {
+            "worker_id": worker_id,
+            "raw_first_send_at": raw_value,
+            "raw_first_send_at_type": type(raw_value).__name__,
+        }
+        for worker_id, raw_value in observed
+        if _classify_first_send_at(raw_value) == _FIRST_SEND_AT_CORRUPT
+    ]
+def _emit_resume_decisions(
+    workspace: Path,
+    restart_agents: list[dict[str, Any]],
+    state: dict[str, Any],
+    get_adapter_fn: Any,
+    event_log: EventLog,
+    allow_fresh: bool,
+) -> list[dict[str, Any]]:
+    """Route B audit-events contract (cr C2, 2026-05-27). For every non-paused
+    worker considered by restart, derive the resume decision per the Route B
+    matrix and emit ONE `restart.resume_decision` event:
+      resumable AND ...                     -> decision = "resume"
+      not resumable AND not interacted      -> decision = "fresh_start"
+      not resumable AND interacted AND fresh -> decision = "fresh_start"
+      not resumable AND interacted AND not fresh -> decision = "refuse"
+    "Interacted" means the persisted first_send_at classifies as valid.
+    Resumability mirrors sessions.resume.prepare_resume_state's repair chain
+    so workers the runtime would legitimately repair are NOT flagged. Returns
+    the subset of refused workers — populated only when allow_fresh=False AND
+    some interacted worker cannot be repaired — for use by atomic_refusal.
+    Each refused record holds agent_id, reason, session_id and first_send_at.
+    """
+    from team_agent.sessions.resume import recover_resume_session_from_events
+    refused: list[dict[str, Any]] = []
+    for agent in restart_agents:
+        agent_id = agent["id"]
+        previous = state.get("agents", {}).get(agent_id, {})
+        if not isinstance(previous, dict):
+            # _collect_corrupt_first_send_at lets a non-dict entry (e.g. a
+            # JSON null) through as "no first_send_at"; treat it the same way
+            # here instead of crashing on previous.get().
+            previous = {}
+        session_id = previous.get("session_id")
+        first_send_at = previous.get("first_send_at")
+        has_first_send_at = _classify_first_send_at(first_send_at) == _FIRST_SEND_AT_VALID
+        has_session_id = bool(session_id)
+        adapter = get_adapter_fn(agent["provider"])
+        resumable = has_session_id and adapter.session_is_resumable(previous, workspace)
+        if not resumable:
+            # Session ids already owned by other workers must not be handed
+            # to this one by either recovery step.
+            known_session_ids = {
+                str(item.get("session_id"))
+                for aid, item in state.get("agents", {}).items()
+                if aid != agent_id and isinstance(item, dict) and item.get("session_id")
+            }
+            repaired = recover_resume_session_from_events(
+                workspace, agent_id, previous, adapter, known_session_ids,
+            )
+            if not repaired:
+                repaired = adapter.recover_session_id(
+                    agent_id, previous, workspace, known_session_ids,
+                )
+            resumable = bool(repaired)
+        if resumable:
+            decision = "resume"
+        elif not has_first_send_at:
+            decision = "fresh_start"
+        elif allow_fresh:
+            decision = "fresh_start"
+        else:
+            decision = "refuse"
+        event_log.write(
+            "restart.resume_decision",
+            worker_id=agent_id,
+            has_first_send_at=has_first_send_at,
+            has_session_id=has_session_id,
+            allow_fresh=bool(allow_fresh),
+            decision=decision,
+            first_send_at=first_send_at if has_first_send_at else None,
+            session_id=session_id,
+        )
+        if decision == "refuse":
+            refused.append({
+                "agent_id": agent_id,
+                "reason": "no_persisted_session_id" if not session_id else "session_unresumable",
+                "session_id": session_id,
+                "first_send_at": first_send_at,
+            })
+    return refused
+def _format_atomic_refusal_error(refused: list[dict[str, Any]]) -> str:
+    """Build the operator-facing message for an atomic restart refusal.
+    C4 (cr verdict, 2026-05-27): the text names every refused worker together
+    with its first_send_at timestamp, so the operator can decide whether
+    passing --allow-fresh (and losing that interaction history) is
+    acceptable."""
+    worker_ids = [entry["agent_id"] for entry in refused]
+    sentences = []
+    for entry in refused:
+        first_contact = entry.get("first_send_at")
+        sentences.append(
+            f"{entry['agent_id']} was first interacted with at {first_contact}; "
+            "its persisted session is missing"
+        )
+    details = ". ".join(sentences)
+    headline = (
+        f"Cannot restart: workers {worker_ids} have no resumable session despite "
+        "previous interaction. "
+    )
+    advice = "Pass --allow-fresh if you accept losing that interaction history."
+    return headline + f"{details}. " + advice
 def rollback_restart_session(session_name: str, event_log: EventLog) -> dict[str, Any]:
     from team_agent.runtime import run_cmd
     proc = run_cmd(["tmux", "kill-session", "-t", session_name], timeout=10)

package/src/team_agent/runtime.py CHANGED

@@ -39,10 +39,12 @@ from team_agent.providers import (
     shell_resume_command_for_agent,
 )
 from team_agent.display import (
+    GHOSTTY_DISPLAY_BACKENDS,
     GHOSTTY_WORKSPACE_PANES_PER_WINDOW,
     close_ghostty_display as _close_ghostty_display,
     close_ghostty_workspace as _close_ghostty_workspace,
     close_ghostty_workspace_slot as _close_ghostty_workspace_slot,
+    close_team_display_backends as _close_team_display_backends,
     ghostty_app_exists as _ghostty_app_exists,
     ghostty_attach_args as _ghostty_attach_args,
     ghostty_command as _ghostty_command,
@@ -65,6 +67,7 @@ from team_agent.display import (
     set_ghostty_workspace_pane_title as _set_ghostty_workspace_pane_title,
 )
 from team_agent.leader import (
+    LEADER_OWNERSHIP_LOCK,
     attach_leader,
     attach_leader_to_state as _attach_leader_to_state,
     claim_leader,
@@ -456,7 +459,6 @@ TMUX_PANE_FORMAT = (
     "#{pane_current_path}\t#{session_attached}\t#{pane_in_mode}"
 )
 HEALTH_STATUSES = {"RUNNING", "IDLE", "AWAITING_APPROVAL", "BLOCKED", "ERROR", "DONE"}
-GHOSTTY_DISPLAY_BACKENDS = {"ghostty", "ghostty_window", "ghostty_workspace"}
 DELIVERY_CAPTURE_LINES = 40
 SUBMITTED_DELIVERY_STATUSES = {"injected", "visible", "submitted", "submitted_unverified", "delivered", "acknowledged"}
 TMUX_STDIN_BUFFER_THRESHOLD = 16 * 1024
@@ -480,7 +482,6 @@ def ensure_workspace_dirs(workspace: Path) -> None:
         path.mkdir(parents=True, exist_ok=True)
 def shutdown(workspace: Path, keep_logs: bool = True, team: str | None = None) -> dict[str, Any]:
     from team_agent.state import resolve_team_scoped_state
     state, refusal = resolve_team_scoped_state(workspace, team)
@@ -521,7 +522,7 @@ def shutdown(workspace: Path, keep_logs: bool = True, team: str | None = None) -
             if proc.returncode == 0:
                 log_path.write_text(proc.stdout, encoding="utf-8")
                 captured.append(str(log_path))
-        _close_ghostty_workspace(state, event_log)
+        _close_team_display_backends(state, event_log)
         for agent_id, agent_state in state.get("agents", {}).items():
             _close_ghostty_display(agent_id, agent_state, event_log)
             closed_displays.add(agent_id)
@@ -535,7 +536,7 @@ def shutdown(workspace: Path, keep_logs: bool = True, team: str | None = None) -
             event_log.write("shutdown.kill_session", session=session_name, keep_logs=keep_logs, captured=captured)
     else:
         event_log.write("shutdown.idempotent", session=session_name, reason="session missing")
-        _close_ghostty_workspace(state, event_log)
+        _close_team_display_backends(state, event_log)
     for agent_id, agent_state in state.get("agents", {}).items():
         if agent_id not in closed_displays:
             _close_ghostty_display(agent_id, agent_state, event_log)
@@ -617,7 +618,7 @@ def takeover(workspace: Path, team: str | None = None, confirm: bool = False) ->
             "reason": "no_caller_identity",
             "action": "set TEAM_AGENT_LEADER_PANE_ID/PROVIDER/MACHINE_FINGERPRINT or run from a tmux pane",
         }
-    with _runtime_lock(workspace, "send"):
+    with _runtime_lock(workspace, LEADER_OWNERSHIP_LOCK):
         try:
             team_state = select_runtime_state(workspace, team)
         except RuntimeError as exc:
@@ -628,23 +629,72 @@ def takeover(workspace: Path, team: str | None = None, confirm: bool = False) ->
                 "team": team,
                 "error": str(exc),
             }
-        previous_owner = team_state.get("team_owner")
+        previous_owner = team_state.get("team_owner") if isinstance(team_state.get("team_owner"), dict) else {}
+        previous_receiver = team_state.get("leader_receiver") if isinstance(team_state.get("leader_receiver"), dict) else {}
+        from team_agent.leader import _lease_epoch, _receiver_from_claim_target
+        next_epoch = _lease_epoch(previous_owner, previous_receiver) + 1
+        leader_uuid = str(previous_owner.get("leader_session_uuid") or "")
         new_owner = {
             "pane_id": pane_id,
             "provider": os.environ.get("TEAM_AGENT_LEADER_PROVIDER", ""),
             "machine_fingerprint": os.environ.get("TEAM_AGENT_MACHINE_FINGERPRINT", ""),
+            "owner_epoch": next_epoch,
             "claimed_at": datetime.now(timezone.utc).isoformat(),
             "claimed_via": "takeover",
         }
+        if leader_uuid:
+            new_owner["leader_session_uuid"] = leader_uuid
         team_state["team_owner"] = new_owner
-        save_team_scoped_state(workspace, team_state)
-        EventLog(workspace).write(
-            "team_owner.takeover",
-            team=team,
-            previous_owner=previous_owner,
+        # C11/C17: takeover converges on the same lease mutation as claim-leader.
+        # Rebind the leader receiver to the caller pane and write owner + receiver
+        # to both state locations together, so takeover never leaves the receiver
+        # pointing at the old (often dead) pane.
+        targets_result = core_list_targets()
+        targets = targets_result.get("targets", []) if isinstance(targets_result, dict) and targets_result.get("ok") else []
+        caller_target = next((item for item in targets if isinstance(item, dict) and str(item.get("pane_id")) == str(pane_id)), None)
+        new_receiver = None
+        if caller_target:
+            new_receiver = _receiver_from_claim_target(
+                caller_target,
+                previous_receiver,
+                leader_uuid or None,
+                next_epoch,
+            )
+            new_receiver["discovery"] = "takeover"
+            team_state["leader_receiver"] = new_receiver
+        from team_agent.leader import _write_lease_dual_state
+        _write_lease_dual_state(workspace, team_state)
+        # C11: takeover converges on the same lease audit events as claim-leader
+        # instead of a divergent legacy team_owner.takeover record.
+        event_log = EventLog(workspace)
+        uuid_prefix = leader_uuid[:8]
+        old_pane_id = previous_receiver.get("pane_id") or (previous_owner or {}).get("pane_id")
+        if new_receiver is not None:
+            event_log.write(
+                "leader_receiver.rebind_applied",
+                reason="takeover_confirmed",
+                old_pane_id=old_pane_id,
+                new_pane_id=pane_id,
+                owner_epoch=next_epoch,
+                uuid_prefix=uuid_prefix,
+                team_id=team,
+            )
+        event_log.write(
+            "owner_epoch_advanced",
+            reason="takeover_confirmed",
+            old_pane_id=old_pane_id,
+            new_pane_id=pane_id,
+            owner_epoch=next_epoch,
+            uuid_prefix=uuid_prefix,
+            team_id=team,
+            previous_owner=previous_owner or None,
             new_owner=new_owner,
+            receiver_rebound=bool(new_receiver),
         )
-        return {"ok": True, "status": "claimed", "team": team, "team_owner": new_owner, "previous_owner": previous_owner}
+        response = {"ok": True, "status": "claimed", "team": team, "team_owner": new_owner, "previous_owner": previous_owner or None, "owner_epoch": next_epoch}
+        if new_receiver is not None:
+            response["leader_receiver"] = new_receiver
+        return response
 def _running_agent_state(workspace: Path, agent: dict[str, Any], previous: dict[str, Any]) -> dict[str, Any]:
@@ -674,7 +724,7 @@ def _handle_startup_prompts_and_verify_window(
     session_name: str,
     start_mode: str,
 ) -> bool:
-    handled_prompts = adapter.handle_startup_prompts(session_name, agent_id, checks=1, sleep_s=0.0)
+    handled_prompts = adapter.handle_startup_prompts(session_name, agent_id, checks=20, sleep_s=0.5)
     for prompt_event in handled_prompts:
         event_log.write(f"{event_prefix}.startup_prompt_handled", agent_id=agent_id, provider=provider, **prompt_event)
     deadline = time.monotonic() + 1.0
@@ -840,10 +890,10 @@ def _retry_or_failed(task: dict[str, Any]) -> str:
     return "failed"
-def _deliver_pending_message(workspace: Path, state: dict[str, Any], message_id: str, wait_visible: bool = True, timeout: float = 30.0) -> dict[str, Any]:
+def _deliver_pending_message(workspace: Path, state: dict[str, Any], message_id: str, wait_visible: bool = True, timeout: float = 30.0, *, _trust_retry_attempt: int = 1) -> dict[str, Any]:
     from team_agent.messaging.delivery import _deliver_pending_message as impl
-    return impl(workspace, state, message_id, wait_visible, timeout)
+    return impl(workspace, state, message_id, wait_visible, timeout, _trust_retry_attempt=_trust_retry_attempt)
 def _enable_codex_fast_mode(session_name: str, window_name: str) -> dict[str, Any]:
     from team_agent.messaging.tmux_prompt import _enable_codex_fast_mode as impl

package/src/team_agent/sessions/capture.py CHANGED

@@ -1,14 +1,25 @@
 from __future__ import annotations
+import time
 from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any
+from team_agent.errors import RuntimeError as TeamAgentRuntimeError
 from team_agent.events import EventLog
 from team_agent.providers import get_adapter
 from team_agent.state import SESSION_CAPTURE_FIELDS, SESSION_STATE_FIELDS
+# Stage 7 S6 (2026-05-27): capture_agent_session used to do a single adapter
+# call and silently return None on miss, leaving status='running' workers with
+# session_id=null. Slow worker startups (Codex writing the rollout file a few
+# tenths of a second after window creation) raced this check. We now poll on a
+# small interval inside the caller's timeout_s budget so the adapter's own
+# fast-path call doesn't have to absorb all the latency on its own.
+_CAPTURE_POLL_INTERVAL_SECONDS = 0.05
 def capture_missing_sessions(
     workspace: Path,
     state: dict[str, Any],
@@ -25,6 +36,10 @@ def capture_missing_sessions(
             for aid, item in state.get("agents", {}).items()
             if aid != agent_id and item.get("session_id")
         }
+        # capture_missing_sessions is invoked from coordinator_tick, diagnose,
+        # status, etc. with very short timeouts; a transient miss should NOT
+        # crash those paths. The loud raise contract belongs to direct callers
+        # (e.g. lifecycle start/restart) who own the worker's atomicity.
         result = capture_agent_session(
             workspace,
             agent_id,
@@ -32,6 +47,7 @@ def capture_missing_sessions(
             event_log,
             timeout_s=timeout_s,
             exclude_session_ids=known_session_ids,
+            raise_on_missed=False,
         )
         if result:
             captured.append(agent_id)
@@ -53,6 +69,7 @@ def capture_agent_session(
     event_log: EventLog,
     timeout_s: float,
     exclude_session_ids: set[str] | None = None,
+    raise_on_missed: bool = True,
 ) -> dict[str, Any] | None:
     if agent_state.get("session_id"):
         return None
@@ -66,21 +83,54 @@ def capture_agent_session(
         "exclude_session_ids": sorted(exclude_session_ids or set()),
         "claude_projects_root": agent_state.get("claude_projects_root"),
     }
-    result = adapter.capture_session_id(agent_id, spawn_context, timeout_s=timeout_s)
-    if not isinstance(result, dict) or not result.get("session_id"):
-        return None
-    copy_session_metadata(agent_state, result)
-    agent_state.pop("_pending_session_id", None)
-    event_log.write(
-        "session.captured",
-        agent_id=agent_id,
-        provider=agent_state.get("provider"),
-        session_id=agent_state.get("session_id"),
-        rollout_path=agent_state.get("rollout_path"),
-        captured_via=agent_state.get("captured_via"),
-        attribution_confidence=agent_state.get("attribution_confidence"),
-    )
-    return result
+    deadline = time.monotonic() + max(timeout_s, 0.0)
+    while True:
+        # Pass timeout_s=0 so the adapter does a single fast-path check; the
+        # outer loop owns the polling budget so behaviour stays consistent
+        # whether or not the adapter has its own internal sleep.
+        result = adapter.capture_session_id(agent_id, spawn_context, timeout_s=0)
+        if isinstance(result, dict) and result.get("session_id"):
+            copy_session_metadata(agent_state, result)
+            agent_state.pop("_pending_session_id", None)
+            event_log.write(
+                "session.captured",
+                agent_id=agent_id,
+                provider=agent_state.get("provider"),
+                session_id=agent_state.get("session_id"),
+                rollout_path=agent_state.get("rollout_path"),
+                captured_via=agent_state.get("captured_via"),
+                attribution_confidence=agent_state.get("attribution_confidence"),
+            )
+            return result
+        if time.monotonic() >= deadline:
+            break
+        time.sleep(_CAPTURE_POLL_INTERVAL_SECONDS)
+    # Timeout. Slice 1 atomicity contract: a worker whose status is 'running'
+    # must NEVER be left with session_id=null — that half-state is what made
+    # Mac mini Stage 7 S5/S6 unreproducible and breaks resume on next restart.
+    # Emit a structured attention event so the coordinator/operator sees the
+    # miss, then raise so callers cannot accidentally treat the None as a
+    # silent "no-op". Non-running workers (still starting, paused, stopped)
+    # legitimately have no session yet, so they still get the silent-None
+    # return that existing callers expect.
+    if agent_state.get("status") == "running":
+        event_log.write(
+            "session.capture_required_attention",
+            agent_id=agent_id,
+            provider=agent_state.get("provider"),
+            timeout_s=timeout_s,
+            spawn_cwd=agent_state.get("spawn_cwd"),
+            session_name=agent_state.get("session_name"),
+            window=agent_state.get("window", agent_id),
+        )
+        if raise_on_missed:
+            raise TeamAgentRuntimeError(
+                f"Failed to capture session_id for agent {agent_id}: adapter "
+                f"did not produce a session within {timeout_s}s. Worker is "
+                "running but unidentifiable; this is a Slice 1 atomicity "
+                "violation."
+            )
+    return None
 def copy_session_metadata(target: dict[str, Any], source: dict[str, Any]) -> None: