npm - @team-agent/installer - Versions diffs - 0.2.1 → 0.2.3 - Mend

@team-agent/installer 0.2.1 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39)

package/package.json +1 -1
package/schemas/team.schema.json +6 -0
package/src/team_agent/approvals/runtime_prompts.py +1 -1
package/src/team_agent/cli/commands.py +122 -6
package/src/team_agent/cli/parser.py +42 -1
package/src/team_agent/coordinator/__main__.py +21 -2
package/src/team_agent/coordinator/lifecycle.py +11 -0
package/src/team_agent/diagnose/orphan_cleanup.py +364 -0
package/src/team_agent/events.py +47 -0
package/src/team_agent/launch/core.py +2 -1
package/src/team_agent/leader/__init__.py +273 -60
package/src/team_agent/lifecycle/agents.py +54 -2
package/src/team_agent/lifecycle/operations.py +87 -9
package/src/team_agent/lifecycle/start.py +1 -1
package/src/team_agent/message_store/core.py +8 -7
package/src/team_agent/message_store/leader_notification_log.py +132 -0
package/src/team_agent/message_store/result_watchers.py +144 -1
package/src/team_agent/message_store/schema.py +31 -2
package/src/team_agent/messaging/delivery.py +293 -1
package/src/team_agent/messaging/idle_alerts.py +109 -9
package/src/team_agent/messaging/leader.py +179 -10
package/src/team_agent/messaging/leader_api_errors.py +216 -0
package/src/team_agent/messaging/leader_panes.py +393 -23
package/src/team_agent/messaging/result_delivery.py +219 -4
package/src/team_agent/messaging/results.py +12 -21
package/src/team_agent/messaging/scheduler.py +24 -2
package/src/team_agent/messaging/send.py +21 -26
package/src/team_agent/messaging/tmux_io.py +153 -23
package/src/team_agent/messaging/tmux_prompt.py +87 -0
package/src/team_agent/messaging/trust_auto_answer.py +44 -0
package/src/team_agent/restart/orchestration.py +207 -4
package/src/team_agent/runtime.py +7 -7
package/src/team_agent/rust_core.py +157 -3
package/src/team_agent/sessions/capture.py +65 -15
package/src/team_agent/spec.py +59 -0
package/src/team_agent/state.py +153 -10
package/src/team_agent/status/inbox.py +33 -3
package/src/team_agent/status/queries.py +32 -1
package/src/team_agent/watch/__init__.py +145 -0

package/src/team_agent/messaging/tmux_io.py CHANGED Viewed

@@ -20,6 +20,7 @@ from team_agent.messaging.deps import (
 from pathlib import Path
 from typing import Any
+from team_agent.messaging.tmux_prompt import detect_non_input_scrollback, non_input_scrollback_window
 def _tmux_inject_text(
     target: str,
@@ -28,6 +29,8 @@ def _tmux_inject_text(
     buffer_name: str,
     attempts: int = 3,
     provider: str = "fake",
+    *,
+    bypass_non_input_gate: bool = False,
 ) -> dict[str, Any]:
     token_match = re.search(r"\[team-agent-token:([^\]]+)\]", text)
     token = token_match.group(1) if token_match else ""
@@ -37,15 +40,25 @@ def _tmux_inject_text(
     submit_settle_timeout = _tmux_submit_settle_timeout(text)
     text_bytes = _tmux_text_size(text)
     for attempt in range(1, max(attempts, 1) + 1):
-        prepared = _prepare_tmux_pane_for_input(target)
+        prepared = (
+            {"ok": True, "verification": "non_input_gate_bypassed"}
+            if bypass_non_input_gate
+            else _prepare_tmux_pane_for_input(target)
+        )
         if not prepared["ok"]:
-            attempt_log.append({"attempt": attempt, "visible": False, "verification": prepared["verification"]})
+            attempt_log.append(_prepare_failure_attempt(attempt, prepared))
             return {
                 "ok": False,
+                "status": "failed",
                 "stage": prepared["stage"],
+                "reason": prepared.get("reason"),
                 "error": prepared.get("error"),
                 "attempts": attempt_log,
                 "verification": prepared["verification"],
+                "detected": prepared.get("detected"),
+                "pane_id": prepared.get("pane_id"),
+                "pane_mode": prepared.get("pane_mode"),
+                "pane_capture_tail": prepared.get("pane_capture_tail"),
             }
         baseline = _capture_tmux_pane_text(target)
         if not baseline["ok"]:
@@ -97,6 +110,9 @@ def _tmux_inject_text(
             attempt_entry["buffer_delete_error"] = deleted.get("error")
         if prepared.get("recovered_from_mode"):
             attempt_entry["recovered_from_mode"] = True
+            attempt_entry["recovered_from_pane_mode"] = prepared.get("pane_mode")
+        if prepared.get("warning_event"):
+            attempt_entry["warning_event"] = prepared["warning_event"]
         attempt_log.append(attempt_entry)
         if not visible:
             time.sleep(0.2)
@@ -276,50 +292,164 @@ def _tmux_load_buffer_stdin(buffer_name: str, text: str) -> subprocess.Completed
 def _prepare_tmux_pane_for_input(target: str) -> dict[str, Any]:
-    mode = run_cmd(["tmux", "display-message", "-p", "-t", target, "#{pane_in_mode}"], timeout=5)
-    if mode.returncode != 0:
+    mode_result = _pane_mode(target)
+    if not mode_result["ok"]:
         return {
             "ok": False,
             "stage": "pane-mode-check",
             "verification": "pane_mode_check_failed",
-            "error": mode.stderr.strip() or "tmux pane mode check failed",
+            "error": mode_result.get("error") or "tmux pane mode check failed",
         }
-    if mode.stdout.strip() != "1":
-        return {"ok": True, "verification": "pane_input_ready"}
-    cancel = run_cmd(["tmux", "send-keys", "-t", target, "-X", "cancel"], timeout=10)
-    if cancel.returncode != 0:
+    capture_result = _pane_capture_tail(target, lines=30)
+    if not capture_result["ok"]:
         return {
             "ok": False,
-            "stage": "pane-mode-cancel",
-            "verification": "pane_mode_cancel_failed",
-            "error": cancel.stderr.strip() or "tmux copy-mode cancel failed",
+            "stage": "pane-tail-capture",
+            "verification": "pane_tail_capture_failed",
+            "error": capture_result.get("error") or "tmux capture-pane failed",
         }
+    pane_mode = _normalize_pane_mode(mode_result.get("pane_mode"))
+    capture_tail = str(capture_result.get("capture") or "")
+    detected = detect_non_input_scrollback(capture_tail)
+    if detected:
+        return _non_input_refusal(target, pane_mode, capture_tail, detected)
+    if not pane_mode:
+        return {"ok": True, "verification": "pane_input_ready"}
+    cancel = _pane_mode_cancel(target, pane_mode)
+    if not cancel["ok"]:
+        return _non_input_refusal(
+            target,
+            pane_mode,
+            capture_tail,
+            f"tmux_{pane_mode}",
+            error=cancel.get("error") or "tmux pane mode cancel failed",
+            verification="pane_mode_cancel_failed",
+            warning_event=cancel.get("warning_event"),
+        )
+    warning_event = cancel.get("warning_event")
     deadline = time.monotonic() + 1.5
     while True:
-        check = run_cmd(["tmux", "display-message", "-p", "-t", target, "#{pane_in_mode}"], timeout=5)
-        if check.returncode != 0:
+        check = _pane_mode(target)
+        if not check["ok"]:
             return {
                 "ok": False,
                 "stage": "pane-mode-check",
                 "verification": "pane_mode_recheck_failed",
-                "error": check.stderr.strip() or "tmux pane mode recheck failed",
+                "error": check.get("error") or "tmux pane mode recheck failed",
             }
-        if check.stdout.strip() != "1":
-            return {"ok": True, "verification": "pane_input_ready_after_mode_cancel", "recovered_from_mode": True}
-        if time.monotonic() >= deadline:
-            return {
-                "ok": False,
-                "stage": "pane-mode-cancel",
-                "verification": "pane_mode_still_active_after_cancel",
-                "error": "tmux pane stayed in copy-mode after cancel",
+        if not _normalize_pane_mode(check.get("pane_mode")):
+            result = {
+                "ok": True,
+                "verification": "pane_input_ready_after_mode_cancel",
+                "recovered_from_mode": True,
+                "pane_mode": pane_mode,
             }
+            if warning_event:
+                result["warning_event"] = warning_event
+            return result
+        if time.monotonic() >= deadline:
+            return _non_input_refusal(
+                target,
+                pane_mode,
+                capture_tail,
+                f"tmux_{pane_mode}",
+                error=f"tmux pane stayed in {pane_mode} after cancel",
+                verification="pane_mode_still_active_after_cancel",
+                warning_event=warning_event,
+            )
         time.sleep(0.1)
+def _pane_mode(target: str) -> dict[str, Any]:
+    """Ask tmux which mode the target pane is currently in.
+
+    Runs ``tmux display-message -p -t <target> '#{pane_mode}'``.
+
+    Returns ``{"ok": True, "pane_mode": <stripped stdout>}`` when tmux exits
+    0, otherwise ``{"ok": False, "error": <stderr or fallback message>}``.
+    """
+    query = ["tmux", "display-message", "-p", "-t", target, "#{pane_mode}"]
+    completed = run_cmd(query, timeout=5)
+    if completed.returncode == 0:
+        return {"ok": True, "pane_mode": completed.stdout.strip()}
+    failure = completed.stderr.strip() or "tmux pane mode check failed"
+    return {"ok": False, "error": failure}
+def _pane_capture_tail(target: str, lines: int = 30) -> dict[str, Any]:
+    """Capture the tail of the target pane as plain text.
+
+    Runs ``tmux capture-pane -p -S -<lines> -t <target>``.
+
+    Returns ``{"ok": True, "capture": <stdout>}`` when tmux exits 0,
+    otherwise ``{"ok": False, "capture": "", "error": <stderr or fallback>}``.
+    """
+    command = ["tmux", "capture-pane", "-p", "-S", f"-{lines}", "-t", target]
+    completed = run_cmd(command, timeout=5)
+    if completed.returncode == 0:
+        return {"ok": True, "capture": completed.stdout}
+    failure = completed.stderr.strip() or "tmux capture-pane failed"
+    return {"ok": False, "capture": "", "error": failure}
+def _pane_mode_cancel(target: str, pane_mode: str) -> dict[str, Any]:
+    """Send the key sequence that should make the pane leave ``pane_mode``.
+
+    The mode is normalised first. Known modes get a dedicated sequence; any
+    other value falls back to ``-X cancel`` and is tagged with the
+    ``pane_mode_unknown_cancel_attempted`` warning event.
+
+    Returns ``{"ok": True, "mode", "args"}`` (plus ``warning_event`` when
+    set) if ``tmux send-keys`` exits 0, otherwise
+    ``{"ok": False, "error", "warning_event"}``.
+    """
+    mode = _normalize_pane_mode(pane_mode)
+    # Keys appended to ``tmux send-keys -t <target>`` for each recognised mode.
+    exit_keys = {
+        "copy-mode": ["-X", "cancel"],
+        "tree-mode": ["q"],
+        "view-mode": ["q"],
+        "client-mode": ["d"],
+    }
+    if mode in exit_keys:
+        warning_event = None
+        keys = exit_keys[mode]
+    else:
+        warning_event = "pane_mode_unknown_cancel_attempted"
+        keys = ["-X", "cancel"]
+    args = ["tmux", "send-keys", "-t", target, *keys]
+    cancel = run_cmd(args, timeout=10)
+    if cancel.returncode == 0:
+        outcome = {"ok": True, "mode": mode, "args": args}
+        if warning_event:
+            outcome["warning_event"] = warning_event
+        return outcome
+    return {
+        "ok": False,
+        "error": cancel.stderr.strip() or f"tmux {mode or 'unknown'} cancel failed",
+        "warning_event": warning_event,
+    }
+def _normalize_pane_mode(mode: Any) -> str:
+    """Map a raw pane-mode value onto a mode name.
+
+    Falsy input and ``"0"`` become ``""`` (no mode), ``"1"`` becomes
+    ``"copy-mode"``, and any other value is returned stripped of
+    surrounding whitespace.
+    """
+    text = str(mode or "").strip()
+    legacy_flags = {"0": "", "1": "copy-mode"}
+    return legacy_flags.get(text, text)
+def _non_input_refusal(
+    target: str,
+    pane_mode: str,
+    capture_tail: str,
+    detected: str,
+    *,
+    error: str | None = None,
+    verification: str = "recipient_pane_in_non_input_mode",
+    warning_event: str | None = None,
+) -> dict[str, Any]:
+    """Build the failure payload for a pane that cannot accept pasted input.
+
+    ``error`` defaults to the fixed reason string. ``pane_capture_tail`` is
+    the cleaned scrollback window, or the last 10 raw lines of
+    ``capture_tail`` when that window is empty. ``warning_event`` is included
+    only when it is truthy.
+    """
+    reason = "recipient_pane_in_non_input_mode"
+    tail_excerpt = non_input_scrollback_window(capture_tail) or _last_lines(capture_tail, 10)
+    refusal = {
+        "ok": False,
+        "status": "failed",
+        "stage": "pre-paste-pane-state",
+        "reason": reason,
+        "error": error or reason,
+        "verification": verification,
+        "detected": detected,
+        "pane_id": target,
+        "pane_mode": pane_mode,
+        "pane_capture_tail": tail_excerpt,
+    }
+    if not warning_event:
+        return refusal
+    refusal["warning_event"] = warning_event
+    return refusal
+def _prepare_failure_attempt(attempt: int, prepared: dict[str, Any]) -> dict[str, Any]:
+    """Build the attempt-log entry for a pane-preparation failure.
+
+    The entry always carries ``attempt``, ``visible=False`` and the
+    ``verification`` value from ``prepared``. The optional diagnostic keys
+    are copied over only when ``prepared`` contains them.
+    """
+    optional_keys = ("reason", "detected", "pane_id", "pane_mode", "pane_capture_tail", "warning_event")
+    entry = {"attempt": attempt, "visible": False, "verification": prepared["verification"]}
+    entry.update({name: prepared[name] for name in optional_keys if name in prepared})
+    return entry
+def _last_lines(text: str, count: int) -> str:
+    """Return the last ``count`` lines of ``text`` joined with ``\\n``.
+
+    A non-positive ``count`` yields an empty string.
+    """
+    if count <= 0:
+        # ``lines[-0:]`` is the whole list, so zero must not reach the slice.
+        return ""
+    return "\n".join(text.splitlines()[-count:])

package/src/team_agent/messaging/tmux_prompt.py CHANGED Viewed

@@ -12,6 +12,93 @@ from team_agent.messaging.deps import (
 from pathlib import Path
 from typing import Any
+# Matches an ESC "[" sequence: optional bytes in 0-?, optional bytes in
+# space-/, then one final byte in @-~. Substituted with "" to strip terminal
+# escape codes from captured pane lines.
+_ANSI_ESCAPE_RE = re.compile(r"\x1b\[[0-?]*[ -/]*[@-~]")
+def detect_non_input_scrollback(capture_tail: str) -> str | None:
+    """Classify what, if anything, stops the pane from accepting input.
+
+    The cleaned tail is checked against the known blocking prompts in a fixed
+    order and the first match wins. A matched blocker is suppressed (``None``)
+    when the stale check says an input-ready prompt appears after the latest
+    blocker line it tracks. If no blocker matches, a last line ending in
+    ``$`` or ``%`` is reported as ``"shell_prompt_cli_dead"``.
+
+    Returns the blocker label, or ``None`` when nothing blocks input.
+    """
+    visible = _non_input_scrollback_lines(capture_tail)
+    joined = "\n".join(visible)
+    folded = joined.lower()
+    is_stale = _stale_non_input_before_ready_prompt(visible)
+    blocker = None
+    if re.search(r"do\s+you\s+trust\s+the\s+contents\s+of\s+this\s+directory", folded):
+        blocker = "codex_trust_prompt"
+    elif "press enter to log in" in folded or "press enter to login" in folded:
+        blocker = "codex_first_run_auth"
+    elif "capability may degrade" in folded:
+        blocker = "codex_compaction_warning"
+    elif re.search(r"press\s+(enter|return)\s+to\s+continue", folded):
+        blocker = "generic_press_enter"
+    elif re.search(r"press\s+any\s+key", folded):
+        blocker = "generic_press_enter"
+    elif re.search(r"(\(y/n\)|\([yY]/n\)|\[y/N\]|\[Y/n\]|\[y/n\])", joined):
+        # Deliberately matched against the case-preserved text.
+        blocker = "y_n_confirm"
+    elif any(
+        _starts_numbered_choice(upper, "1") and _starts_numbered_choice(below, "2")
+        for upper, below in zip(visible, visible[1:])
+    ):
+        blocker = "numbered_menu"
+    if blocker is not None:
+        return None if is_stale else blocker
+    # The shell-prompt check looks only at the final line and is never
+    # subject to the stale check.
+    if visible and re.search(r"(^|[\s~/.\w-])[$%]\s*$", visible[-1]):
+        return "shell_prompt_cli_dead"
+    return None
+def non_input_scrollback_window(capture_tail: str, limit: int = 15) -> str:
+    """Return the cleaned, non-blank tail lines of the capture as one string."""
+    kept = _non_input_scrollback_lines(capture_tail, limit=limit)
+    return "\n".join(kept)
+def _non_input_scrollback_lines(capture_tail: str, limit: int = 15) -> list[str]:
+    """Return up to ``limit`` trailing non-blank lines of a pane capture.
+
+    Each line has escape sequences removed and trailing whitespace stripped;
+    blank lines are dropped before the tail is taken. A non-positive
+    ``limit`` yields an empty list.
+    """
+    if limit <= 0:
+        # ``seq[-0:]`` is the whole list, so zero must not reach the slice.
+        return []
+    cleaned = (_ANSI_ESCAPE_RE.sub("", line).rstrip() for line in capture_tail.splitlines())
+    # Filtering blanks everywhere makes a separate trailing-blank trim unnecessary.
+    return [line for line in cleaned if line.strip()][-limit:]
+def _starts_numbered_choice(line: str, number: str) -> bool:
+    """Return True when ``line`` starts a menu entry such as ``"1. Yes"``.
+
+    Leading whitespace and one optional selection marker (``›``, ``❯`` or
+    ``>``) are allowed before the number. The number must be followed by a
+    dot and at least one whitespace character. ``number`` is matched
+    literally.
+    """
+    # Escape so a caller-supplied value can never act as regex syntax.
+    pattern = rf"^\s*(?:[›❯>]\s*)?{re.escape(number)}\.\s+"
+    return re.match(pattern, line) is not None
+def _stale_non_input_before_ready_prompt(lines: list[str]) -> bool:
+    """Return True when every blocking prompt is older than a ready prompt.
+
+    Records the index of the last line that looks like a blocking prompt and
+    the index of the last input-ready line. The blockers count as stale only
+    if at least one exists and a ready prompt appears strictly after it.
+
+    The blocker patterns are the same ones ``detect_non_input_scrollback``
+    reports, applied per line, so the two functions agree on what a blocker
+    is.
+    """
+    latest_non_input = -1
+    latest_ready = -1
+    for index, line in enumerate(lines):
+        lower = line.lower()
+        if (
+            re.search(r"do\s+you\s+trust\s+the\s+contents\s+of\s+this\s+directory", lower)
+            or "press enter to log in" in lower
+            or "press enter to login" in lower
+            or "capability may degrade" in lower
+            or re.search(r"press\s+(enter|return)\s+to\s+continue", lower)
+            or re.search(r"press\s+any\s+key", lower)
+            # The y/n markers are case-sensitive, so test the original line.
+            or re.search(r"(\(y/n\)|\([yY]/n\)|\[y/N\]|\[Y/n\]|\[y/n\])", line)
+            or _starts_numbered_choice(line, "1")
+            or _starts_numbered_choice(line, "2")
+        ):
+            latest_non_input = index
+        if _is_input_ready_prompt(line):
+            latest_ready = index
+    return latest_non_input >= 0 and latest_ready > latest_non_input
+def _is_input_ready_prompt(line: str) -> bool:
+    """Return True when ``line`` looks like a prompt waiting for typed input.
+
+    Menu entries numbered 1 or 2 never count. Otherwise the stripped line
+    qualifies if it begins with a prompt marker followed by whitespace and
+    text, or ends with ``codex``/``claude`` (any case) followed by a marker.
+    """
+    if any(_starts_numbered_choice(line, digit) for digit in ("1", "2")):
+        return False
+    stripped = line.strip()
+    if re.match(r"^[›❯>]\s+\S", stripped) is not None:
+        return True
+    named_prompt = re.search(r"\b(codex|claude)\s*[>›❯]\s*$", stripped, re.IGNORECASE)
+    return named_prompt is not None
 def _enable_codex_fast_mode(session_name: str, window_name: str) -> dict[str, Any]:
     target = f"{session_name}:{window_name}"
     proc = run_cmd(["tmux", "send-keys", "-t", target, "/fast", "Enter"], timeout=10)

package/src/team_agent/messaging/trust_auto_answer.py ADDED Viewed

@@ -0,0 +1,44 @@
+from __future__ import annotations
+from pathlib import Path
+from typing import Any
+from team_agent.events import EventLog
+from team_agent.messaging.deps import _tmux_inject_text
+def retry_injection_after_trust_auto_answer(
+    workspace: Path,
+    state: dict[str, Any],
+    event_log: EventLog,
+    injection: dict[str, Any],
+    target: str,
+    text: str,
+    submit_key: str,
+    buffer_name: str,
+    provider: str,
+) -> dict[str, Any]:
+    """Try to auto-answer a trust prompt, then retry the failed injection.
+
+    Returns ``injection`` unchanged when the prompt was not answered. If it
+    was answered but is not dismissed within 3 seconds, returns a copy of
+    ``injection`` with ``error``, ``verification`` and ``stage`` overwritten.
+    Otherwise returns the result of a fresh ``_tmux_inject_text`` call.
+    """
+    from team_agent.messaging.delivery import _wait_for_trust_prompt_dismissal
+    from team_agent.messaging.leader_panes import attempt_trust_auto_answer
+    # Prefer the pane recorded on the failed injection over the raw target.
+    pane = injection.get("pane_id") or target
+    captured_tail = injection.get("pane_capture_tail") or ""
+    answer = attempt_trust_auto_answer(workspace, pane, captured_tail, event_log, state=state)
+    if not answer.get("answered"):
+        return injection
+    if _wait_for_trust_prompt_dismissal(pane, timeout=3.0):
+        return _tmux_inject_text(target, text, submit_key, buffer_name, provider=provider)
+    return {
+        **injection,
+        "error": "trust_prompt_not_dismissed_after_answer",
+        "verification": "trust_prompt_not_dismissed_after_answer",
+        "stage": "trust_auto_answer_dismissal_wait",
+    }

package/src/team_agent/restart/orchestration.py CHANGED Viewed

@@ -84,15 +84,72 @@ def restart(workspace: Path, allow_fresh: bool = False, team: str | None = None)
         raise RuntimeError(_tmux_session_conflict_error(session_name))
     runtime_cfg = _effective_runtime_config(spec.get("runtime", {}))
     display_backend = spec.get("runtime", {}).get("display_backend", state.get("display_backend", "none"))
-    _close_ghostty_workspace(state, event_log)
-    for agent_id, agent_state in state.get("agents", {}).items():
-        _close_ghostty_display(agent_id, agent_state, event_log)
-    state["display_backend"] = display_backend
+    # Stage 7 S5 — Slice 6 lifecycle atomicity contract: compute restart_agents
+    # early so we can pre-validate resumability BEFORE any destructive teardown
+    # (ghostty close, tmux session creation). Without --allow-fresh, every
+    # non-paused worker MUST be resumable; if any is not, refuse the operation
+    # atomically with a structured result and a restart.atomic_refusal event.
+    # No rollback path is needed because nothing has been created yet.
     restart_agents = [
         agent
         for agent in spec.get("agents", [])
         if state.get("agents", {}).get(agent["id"], {}).get("status") != "paused" and not agent.get("paused")
     ]
+    # cr strict-typing (2026-05-27): refuse the operation deterministically
+    # before any decision logic if any persisted first_send_at is corrupt
+    # (empty string, 0, False, literal "null", any non-ISO garbage). This
+    # avoids silent misclassification through Python truthiness and gives the
+    # operator a clear audit signal that state.json is damaged.
+    invalid_first_send_at = _collect_corrupt_first_send_at(restart_agents, state)
+    if invalid_first_send_at:
+        for entry in invalid_first_send_at:
+            event_log.write(
+                "restart.first_send_at_invalid",
+                worker_id=entry["worker_id"],
+                raw_first_send_at=entry["raw_first_send_at"],
+                raw_first_send_at_type=entry["raw_first_send_at_type"],
+            )
+        invalid_names = [entry["worker_id"] for entry in invalid_first_send_at]
+        return {
+            "ok": False,
+            "status": "refused",
+            "reason": "invalid_first_send_at",
+            "invalid_first_send_at": invalid_first_send_at,
+            "allow_fresh": bool(allow_fresh),
+            "error": (
+                f"Cannot restart: workers {invalid_names} have a corrupt "
+                "first_send_at in state.json (only null/missing or a valid "
+                "ISO-8601 UTC timestamp string is accepted). Inspect the "
+                "restart.first_send_at_invalid audit events for raw values "
+                "and repair state.json before retrying."
+            ),
+        }
+    # cr C2: emit one restart.resume_decision event per non-paused worker so
+    # every restart attempt produces an auditable per-worker classification.
+    # The function returns only refused workers — populated when
+    # allow_fresh=False AND at least one interacted worker cannot be repaired.
+    refused = _emit_resume_decisions(
+        workspace, restart_agents, state, get_adapter, event_log, allow_fresh,
+    )
+    if refused:
+        event_log.write(
+            "restart.atomic_refusal",
+            unresumable=refused,
+            allow_fresh=bool(allow_fresh),
+            reason="resume_atomicity",
+        )
+        return {
+            "ok": False,
+            "status": "refused",
+            "reason": "resume_atomicity",
+            "unresumable": refused,
+            "allow_fresh": bool(allow_fresh),
+            "error": _format_atomic_refusal_error(refused),
+        }
+    _close_ghostty_workspace(state, event_log)
+    for agent_id, agent_state in state.get("agents", {}).items():
+        _close_ghostty_display(agent_id, agent_state, event_log)
+    state["display_backend"] = display_backend
     _ensure_agent_start_requirements(workspace, restart_agents, event_log, "restart")
     first = True
     restarted: list[dict[str, Any]] = []
@@ -271,6 +328,7 @@ def restart(workspace: Path, allow_fresh: bool = False, team: str | None = None)
                 event_log,
                 timeout_s=1.5,
                 exclude_session_ids=known_session_ids,
+                raise_on_missed=False,
             )
         if display_backend in GHOSTTY_DISPLAY_BACKENDS:
             display_jobs.append((agent["id"], agent))
@@ -315,6 +373,151 @@ def restart(workspace: Path, allow_fresh: bool = False, team: str | None = None)
     return {"ok": True, "session_name": session_name, "agents": restarted, "coordinator": coordinator}
+# Classification labels returned by _classify_first_send_at.
+_FIRST_SEND_AT_ABSENT = "absent"
+_FIRST_SEND_AT_VALID = "valid"
+_FIRST_SEND_AT_CORRUPT = "corrupt"
+def _classify_first_send_at(value: Any) -> str:
+    """Strict first_send_at typing (cr verdict, 2026-05-27).
+
+    Returns one of:
+      "absent"  — ``value`` is None (callers pass None for a missing field).
+      "valid"   — non-empty string accepted by ``datetime.fromisoformat``; a
+                  trailing "Z" is also accepted on every Python version.
+      "corrupt" — anything else: empty string, 0, False, literal "null", garbage.
+
+    Corrupt values are detected by explicit type and parse checks rather than
+    truthiness, so a worker's interaction state is never silently
+    misclassified.
+    """
+    if value is None:
+        return _FIRST_SEND_AT_ABSENT
+    if not isinstance(value, str) or not value:
+        return _FIRST_SEND_AT_CORRUPT
+    candidates = [value]
+    if value.endswith("Z"):
+        # fromisoformat only understands the "Z" suffix from Python 3.11 on;
+        # fall back to the equivalent explicit offset for older interpreters.
+        candidates.append(value[:-1] + "+00:00")
+    for candidate in candidates:
+        try:
+            datetime.fromisoformat(candidate)
+        except (ValueError, TypeError):
+            continue
+        return _FIRST_SEND_AT_VALID
+    return _FIRST_SEND_AT_CORRUPT
+def _collect_corrupt_first_send_at(
+    restart_agents: list[dict[str, Any]],
+    state: dict[str, Any],
+) -> list[dict[str, Any]]:
+    """Collect every restart candidate whose persisted first_send_at is corrupt.
+
+    A non-dict agent state is treated as having no first_send_at. Each record
+    holds the worker id, the raw value and its type name, ready for the
+    `restart.first_send_at_invalid` event and the refusal envelope.
+    """
+    invalid: list[dict[str, Any]] = []
+    for agent in restart_agents:
+        worker_id = agent["id"]
+        record = state.get("agents", {}).get(worker_id, {})
+        raw = record.get("first_send_at") if isinstance(record, dict) else None
+        if _classify_first_send_at(raw) == _FIRST_SEND_AT_CORRUPT:
+            invalid.append(
+                {
+                    "worker_id": worker_id,
+                    "raw_first_send_at": raw,
+                    "raw_first_send_at_type": type(raw).__name__,
+                }
+            )
+    return invalid
+def _emit_resume_decisions(
+    workspace: Path,
+    restart_agents: list[dict[str, Any]],
+    state: dict[str, Any],
+    get_adapter_fn: Any,
+    event_log: EventLog,
+    allow_fresh: bool,
+) -> list[dict[str, Any]]:
+    """Route B audit-events contract (cr C2, 2026-05-27). For every non-paused
+    worker considered by restart, derive the resume decision per the Route B
+    matrix and emit ONE `restart.resume_decision` event:
+      resumable AND ...                     -> decision = "resume"
+      not resumable AND not interacted      -> decision = "fresh_start"
+      not resumable AND interacted AND fresh -> decision = "fresh_start"
+      not resumable AND interacted AND not fresh -> decision = "refuse"
+    Resumability mirrors sessions.resume.prepare_resume_state's repair chain
+    so workers the runtime would legitimately repair are NOT flagged. Returns
+    the subset of refused workers — populated only when allow_fresh=False AND
+    some interacted worker cannot be repaired — for use by atomic_refusal.
+
+    "Interacted" here means the persisted first_send_at classifies as valid.
+    Each refused record carries agent_id, reason, session_id and first_send_at.
+    """
+    from team_agent.sessions.resume import recover_resume_session_from_events
+    refused: list[dict[str, Any]] = []
+    for agent in restart_agents:
+        agent_id = agent["id"]
+        # NOTE(review): no isinstance(previous, dict) guard here, unlike
+        # _collect_corrupt_first_send_at — a non-dict agent state would raise
+        # on .get(); confirm state.json can never hold one.
+        previous = state.get("agents", {}).get(agent_id, {})
+        session_id = previous.get("session_id")
+        first_send_at = previous.get("first_send_at")
+        has_first_send_at = _classify_first_send_at(first_send_at) == _FIRST_SEND_AT_VALID
+        has_session_id = bool(session_id)
+        adapter = get_adapter_fn(agent["provider"])
+        # First try: the persisted session id, as judged by the provider adapter.
+        resumable = bool(session_id) and adapter.session_is_resumable(previous, workspace)
+        if not resumable:
+            # Session ids owned by the other agents, passed to both recovery helpers.
+            known_session_ids = {
+                str(item.get("session_id"))
+                for aid, item in state.get("agents", {}).items()
+                if aid != agent_id and item.get("session_id")
+            }
+            # Repair chain: event-log recovery first, adapter recovery second.
+            repaired = recover_resume_session_from_events(
+                workspace, agent_id, previous, adapter, known_session_ids,
+            )
+            if not repaired:
+                repaired = adapter.recover_session_id(
+                    agent_id, previous, workspace, known_session_ids,
+                )
+            resumable = bool(repaired)
+        if resumable:
+            decision = "resume"
+        elif not has_first_send_at:
+            decision = "fresh_start"
+        elif allow_fresh:
+            decision = "fresh_start"
+        else:
+            decision = "refuse"
+        # Exactly one audit event per worker, whatever the decision.
+        event_log.write(
+            "restart.resume_decision",
+            worker_id=agent_id,
+            has_first_send_at=has_first_send_at,
+            has_session_id=has_session_id,
+            allow_fresh=bool(allow_fresh),
+            decision=decision,
+            first_send_at=first_send_at if has_first_send_at else None,
+            session_id=session_id,
+        )
+        if decision == "refuse":
+            refused.append({
+                "agent_id": agent_id,
+                "reason": "no_persisted_session_id" if not session_id else "session_unresumable",
+                "session_id": session_id,
+                "first_send_at": first_send_at,
+            })
+    return refused
+def _format_atomic_refusal_error(refused: list[dict[str, Any]]) -> str:
+    """Render the operator-facing message for an atomic restart refusal.
+
+    C4 (cr verdict, 2026-05-27): the message names every refused worker and
+    its first_send_at timestamp so the operator can decide whether losing
+    that interaction history via --allow-fresh is acceptable.
+    """
+    worker_ids = [item["agent_id"] for item in refused]
+    sentences = []
+    for item in refused:
+        sentences.append(
+            f"{item['agent_id']} was first interacted with at {item.get('first_send_at')}; "
+            "its persisted session is missing"
+        )
+    details = ". ".join(sentences)
+    headline = (
+        f"Cannot restart: workers {worker_ids} have no resumable session despite "
+        f"previous interaction. {details}. "
+    )
+    return headline + "Pass --allow-fresh if you accept losing that interaction history."
 def rollback_restart_session(session_name: str, event_log: EventLog) -> dict[str, Any]:
     from team_agent.runtime import run_cmd
     proc = run_cmd(["tmux", "kill-session", "-t", session_name], timeout=10)

package/src/team_agent/runtime.py CHANGED Viewed

@@ -67,6 +67,8 @@ from team_agent.display import (
 from team_agent.leader import (
     attach_leader,
     attach_leader_to_state as _attach_leader_to_state,
+    claim_leader,
+    leader_identity,
     leader_session_name as _leader_session_name,
     leader_start_plan,
     start_leader,
@@ -438,12 +440,10 @@ for _name in (
     assert hasattr(_launch_pkg, _name), f"team_agent.launch missing {_name}"
 del _launch_pkg, _name
-# Leader lane re-exports keep runtime.attach_leader, runtime.start_leader,
-# runtime.leader_start_plan, runtime._attach_leader_to_state,
-# runtime._leader_session_name resolving for CLI handlers and tests.
+# Leader lane re-exports keep runtime leader helpers resolving for CLI handlers and tests.
 import team_agent.leader as _leader_pkg
 assert attach_leader is _leader_pkg.attach_leader
-for _name in ("attach_leader", "attach_leader_to_state", "leader_session_name", "leader_start_plan", "start_leader"):
+for _name in ("attach_leader", "attach_leader_to_state", "claim_leader", "leader_identity", "leader_session_name", "leader_start_plan", "start_leader"):
     assert hasattr(_leader_pkg, _name), f"team_agent.leader missing {_name}"
 del _leader_pkg, _name
 from team_agent.task_graph import ready_tasks, update_task_status
@@ -674,7 +674,7 @@ def _handle_startup_prompts_and_verify_window(
     session_name: str,
     start_mode: str,
 ) -> bool:
-    handled_prompts = adapter.handle_startup_prompts(session_name, agent_id, checks=1, sleep_s=0.0)
+    handled_prompts = adapter.handle_startup_prompts(session_name, agent_id, checks=20, sleep_s=0.5)
     for prompt_event in handled_prompts:
         event_log.write(f"{event_prefix}.startup_prompt_handled", agent_id=agent_id, provider=provider, **prompt_event)
     deadline = time.monotonic() + 1.0
@@ -840,10 +840,10 @@ def _retry_or_failed(task: dict[str, Any]) -> str:
     return "failed"
-def _deliver_pending_message(workspace: Path, state: dict[str, Any], message_id: str, wait_visible: bool = True, timeout: float = 30.0) -> dict[str, Any]:
+def _deliver_pending_message(workspace: Path, state: dict[str, Any], message_id: str, wait_visible: bool = True, timeout: float = 30.0, *, _trust_retry_attempt: int = 1) -> dict[str, Any]:
     from team_agent.messaging.delivery import _deliver_pending_message as impl
-    return impl(workspace, state, message_id, wait_visible, timeout)
+    return impl(workspace, state, message_id, wait_visible, timeout, _trust_retry_attempt=_trust_retry_attempt)
 def _enable_codex_fast_mode(session_name: str, window_name: str) -> dict[str, Any]:
     from team_agent.messaging.tmux_prompt import _enable_codex_fast_mode as impl