npm - @team-agent/installer - Versions diffs - 0.2.9 → 0.2.11 - Mend

@team-agent/installer 0.2.9 → 0.2.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

package/package.json +1 -1
package/src/team_agent/approvals/status.py +12 -5
package/src/team_agent/coordinator/__main__.py +37 -1
package/src/team_agent/coordinator/lifecycle.py +51 -3
package/src/team_agent/diagnose/quick_start.py +91 -0
package/src/team_agent/display/worker_window.py +1 -1
package/src/team_agent/idle_predicate.py +26 -8
package/src/team_agent/idle_takeover_wiring.py +3 -0
package/src/team_agent/lifecycle/operations.py +13 -1
package/src/team_agent/messaging/activity_detector.py +10 -2
package/src/team_agent/messaging/delivery.py +31 -0
package/src/team_agent/messaging/leader_panes.py +27 -35
package/src/team_agent/messaging/tmux_prompt.py +22 -0
package/src/team_agent/provider_cli/claude.py +46 -0
package/src/team_agent/provider_state/__init__.py +5 -0
package/src/team_agent/runtime.py +7 -3
package/src/team_agent/sessions/capture.py +2 -1
package/src/team_agent/state.py +97 -6

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@team-agent/installer",
-  "version": "0.2.9",
+  "version": "0.2.11",
   "description": "npx installer for Team Agent",
   "keywords": [
     "codex",

package/src/team_agent/approvals/status.py CHANGED Viewed

@@ -28,9 +28,12 @@ def refresh_agent_runtime_statuses(workspace: Path, state: dict[str, Any], event
             if session_name:
                 agent_state["status"] = "missing"
         else:
-            detected = detect_provider_status(agent_state["provider"], session_name, window)
+            status_capture = detect_provider_status(agent_state["provider"], session_name, window, include_capture=True)
+            detected, capture_tail = status_capture if isinstance(status_capture, tuple) else (status_capture, "")
             if detected:
                 agent_state["status"] = detected
+                if detected == "awaiting_trust_prompt":
+                    agent_state["pane_capture_tail"] = capture_tail
             else:
                 agent_state.setdefault("status", "running")
         if old_status != agent_state.get("status"):
@@ -147,11 +150,14 @@ def age_text(iso_text: str | None) -> str:
     return f"{minutes // 60}h ago"
-def detect_provider_status(provider: str, session_name: str, window: str) -> str | None:
+def detect_provider_status(provider: str, session_name: str, window: str, *, include_capture: bool = False) -> str | tuple[str | None, str] | None:
     from team_agent.runtime import get_adapter, run_cmd
+    from team_agent.messaging.tmux_prompt import detect_non_input_scrollback
     proc = run_cmd(["tmux", "capture-pane", "-p", "-t", f"{session_name}:{window}"], timeout=5)
     if proc.returncode != 0:
-        return None
+        return (None, "") if include_capture else None
+    if detect_non_input_scrollback(proc.stdout) == "codex_trust_prompt":
+        return ("awaiting_trust_prompt", proc.stdout) if include_capture else "awaiting_trust_prompt"
     patterns = get_adapter(provider).status_patterns()
     positions: dict[str, int] = {}
     for status_name, pattern in patterns.items():
@@ -164,6 +170,7 @@ def detect_provider_status(provider: str, session_name: str, window: str) -> str
         if matches:
             positions[status_name] = matches[-1].start()
     if not positions:
-        return None
+        return (None, proc.stdout) if include_capture else None
     latest = max(positions, key=positions.get)
-    return {"idle": "running", "processing": "busy", "error": "error"}.get(latest)
+    detected = {"idle": "running", "processing": "busy", "error": "error"}.get(latest)
+    return (detected, proc.stdout) if include_capture else detected

package/src/team_agent/coordinator/__main__.py CHANGED Viewed

@@ -39,6 +39,8 @@ def main(argv: list[str] | None = None) -> None:
     interval = args.tick_interval if args.tick_interval is not None else _tick_interval(workspace)
     initial_ppid = os.getppid()
+    failure_count = 0
+    last_failure_signature: tuple[str, str] | None = None
     while not STOP:
         # Stage 14 (Gap 37b) — orphan self-detection. If our original parent (test harness,
         # shell, or supervisor) died, our ppid is reparented to 1 (or to a launchd shim on
@@ -55,7 +57,41 @@ def main(argv: list[str] | None = None) -> None:
                 workspace=str(workspace),
             )
             break
-        result = runtime.coordinator_tick(workspace)
+        try:
+            result = runtime.coordinator_tick(workspace)
+        except Exception as exc:
+            failure_count += 1
+            signature = (type(exc).__name__, str(exc)[:200])
+            sleep_sec = min(interval * (2 ** min(failure_count - 1, 5)), 60.0)
+            if signature != last_failure_signature:
+                last_failure_signature = signature
+                event_log.write(
+                    "coordinator.tick_error",
+                    error=str(exc),
+                    exc_type=type(exc).__name__,
+                    consecutive_failures=failure_count,
+                    next_sleep_sec=sleep_sec,
+                )
+            elif failure_count == 1 or failure_count % 12 == 0 or sleep_sec in {40.0, 60.0}:
+                event_log.write(
+                    "coordinator.tick_error",
+                    error=str(exc),
+                    exc_type=type(exc).__name__,
+                    consecutive_failures=failure_count,
+                    next_sleep_sec=sleep_sec,
+                )
+            else:
+                event_log.write(
+                    "coordinator.tick_error.suppressed",
+                    consecutive_failures=failure_count,
+                    next_sleep_sec=sleep_sec,
+                )
+            time.sleep(sleep_sec)
+            continue
+        if failure_count:
+            event_log.write("coordinator.tick_recovered", consecutive_failures=failure_count)
+            failure_count = 0
+            last_failure_signature = None
         if result.get("stop") or args.once:
             break
         time.sleep(interval)

package/src/team_agent/coordinator/lifecycle.py CHANGED Viewed

@@ -288,14 +288,18 @@ def coordinator_tick(workspace: Path) -> dict[str, Any]:
     # Gap 32: the take-over reminder is driven by file-fact turn-state via the
     # idle_takeover predicate (the legacy screen-scrape obligation path is retired).
     _coord_meta = state.setdefault("coordinator", {})
+    idle_nodes = build_idle_nodes(state)
+    _record_unknown_idle_nodes(state, idle_nodes, event_log)
     idle_eval = evaluate_takeover_reminder(
-        build_idle_nodes(state),
+        idle_nodes,
         monitor_state=_coord_meta.get("idle_takeover_monitor"),
         now_monotonic=_time.monotonic(),
         debounce_seconds=IDLE_DEBOUNCE_SECONDS,
+        event_sink=lambda name, fields: event_log.write(name, **fields),
     )
     _coord_meta["idle_takeover_monitor"] = idle_eval.get("monitor_state")
-    push_idle_reminder(workspace, state, event_log, idle_eval)
+    if idle_eval.get("should_ping"):
+        push_idle_reminder(workspace, state, event_log, idle_eval)
     idle_alerts = (
         [{"alert_type": "idle_takeover", "message": idle_eval.get("message"),
           "reason": idle_eval.get("reason"), "interrupted": idle_eval.get("interrupted_nodes")}]
@@ -338,7 +342,25 @@ def coordinator_tick(workspace: Path) -> dict[str, Any]:
         if drift:
             drift_results.append(drift)
     api_errors = detect_leader_api_errors(workspace, state, store, event_log)
-    save_runtime_state(workspace, state)
+    try:
+        save_runtime_state(workspace, state)
+    except Exception as exc:
+        event_log.write("runtime.state.save_failed", phase="tick_end", error=str(exc), exc_type=type(exc).__name__)
+        return {
+            "ok": False,
+            "stop": False,
+            "reason": "persistence_degraded",
+            "persisted": False,
+            "error": str(exc),
+            "delivered": delivered,
+            "scheduled": fired,
+            "stuck": stuck,
+            "idle_alerts": idle_alerts,
+            "deadlock_alerts": deadlock_alerts,
+            "compaction": compaction_results,
+            "session_drift": drift_results,
+            "api_errors": api_errors,
+        }
     results = _collect_results_and_notify_watchers(workspace, event_log)
     # Stage 12: prune the dedupe log every tick — cheap O(n) delete bounded by 24h window.
     from team_agent.message_store.leader_notification_log import prune_leader_notification_log
@@ -361,3 +383,29 @@ def coordinator_tick(workspace: Path) -> dict[str, Any]:
         "api_errors": api_errors,
         "results": results,
     }
+def _record_unknown_idle_nodes(state: dict[str, Any], nodes: list[dict[str, Any]], event_log: EventLog) -> None:
+    coordinator = state.setdefault("coordinator", {})
+    unknown_ticks = coordinator.setdefault("unknown_ticks", {})
+    current_unknown: set[str] = set()
+    for node in nodes:
+        node_id = str(node.get("node_id") or "")
+        if not node_id:
+            continue
+        if node.get("state") == "unknown":
+            current_unknown.add(node_id)
+            count = int(unknown_ticks.get(node_id) or 0) + 1
+            unknown_ticks[node_id] = count
+            if count >= 60 and count % 12 == 0:
+                event_log.write(
+                    "idle_takeover.unknown_persistent",
+                    node_id=node_id,
+                    provider=node.get("provider"),
+                    auth_mode=node.get("auth_mode"),
+                    consecutive_ticks=count,
+                    rollout_path=node.get("rollout_path"),
+                )
+    for node_id in list(unknown_ticks):
+        if node_id not in current_unknown:
+            unknown_ticks.pop(node_id, None)

package/src/team_agent/diagnose/quick_start.py CHANGED Viewed

@@ -151,9 +151,20 @@ def wait_ready(workspace: Path, timeout: int = 120) -> dict[str, Any]:
     start_time = time.monotonic()
     last: dict[str, Any] = {}
+    trust_answered = False
     while time.monotonic() - start_time <= timeout:
         last = status(workspace, as_json=True)
         agents = last.get("agents", {})
+        if agents and any(agent.get("status") == "awaiting_trust_prompt" for agent in agents.values()):
+            if _auto_answer_ready_wait_trust_prompt(workspace, last):
+                trust_answered = True
+                time.sleep(0.5)
+                last = status(workspace, as_json=True)
+                agents = last.get("agents", {})
+                if agents and all(agent.get("tmux_window_present") and agent.get("status") in {"running", "busy"} for agent in agents.values()):
+                    break
+                continue
+            break
         if agents and all(agent.get("tmux_window_present") and agent.get("status") in {"running", "busy"} for agent in agents.values()):
             break
         time.sleep(1.0)
@@ -163,9 +174,28 @@ def wait_ready(workspace: Path, timeout: int = 120) -> dict[str, Any]:
         "mcp_ready": all(Path(agent.get("mcp_config", "")).exists() for agent in last.get("agents", {}).values()) if last.get("agents") else False,
         "task_prompt_delivered": bool(MessageStore(workspace).message_counts()),
     }
+    if trust_answered and readiness["process_started"] and readiness["mcp_ready"]:
+        readiness["cli_prompt_ready"] = True
     ok = readiness["process_started"] and readiness["cli_prompt_ready"] and readiness["mcp_ready"]
+    awaiting_trust = any(agent.get("status") == "awaiting_trust_prompt" for agent in last.get("agents", {}).values()) if last.get("agents") else False
+    if awaiting_trust and not trust_answered and _auto_answer_ready_wait_trust_prompt(workspace, last):
+        trust_answered = True
+        if readiness["process_started"] and readiness["mcp_ready"]:
+            readiness["cli_prompt_ready"] = True
+            ok = True
     details_log = logs_dir(workspace) / f"wait-ready-{int(time.time())}.json"
     details_log.write_text(json.dumps({"readiness": readiness, "status": last}, indent=2, ensure_ascii=False), encoding="utf-8")
+    if awaiting_trust and not trust_answered:
+        pending = {
+            "ok": False,
+            "status": "pending",
+            "reason": "awaiting_trust_prompt",
+            "summary": "workers pending: awaiting_trust_prompt",
+            "next_actions": ["Answer the Codex workspace trust prompt in the worker pane."],
+            "details_log": str(details_log),
+            "readiness": readiness,
+        }
+        return pending
     return {
         "ok": ok,
         "summary": "workers ready" if ok else "workers not fully ready before timeout",
@@ -175,6 +205,67 @@ def wait_ready(workspace: Path, timeout: int = 120) -> dict[str, Any]:
     }
+def _auto_answer_ready_wait_trust_prompt(workspace: Path, status_result: dict[str, Any]) -> bool:
+    from team_agent.messaging.leader_panes import attempt_trust_auto_answer
+    from team_agent.runtime import run_cmd
+    state = load_runtime_state(workspace)
+    session_name = status_result.get("session_name") or state.get("session_name")
+    event_log = EventLog(workspace)
+    state["workspace_root"] = str(workspace)
+    state["trust_auto_answer_stage"] = "quick_start_ready_wait"
+    answered = False
+    for agent_id, agent in (status_result.get("agents") or {}).items():
+        if not isinstance(agent, dict) or agent.get("status") != "awaiting_trust_prompt":
+            continue
+        state_agent = state.get("agents", {}).get(agent_id, {}) if isinstance(state.get("agents"), dict) else {}
+        display = agent.get("display") if isinstance(agent.get("display"), dict) else {}
+        state_display = state_agent.get("display") if isinstance(state_agent.get("display"), dict) else {}
+        pane_id = (
+            agent.get("pane_id")
+            or display.get("pane_id")
+            or agent.get("target")
+            or agent.get("tmux_target")
+            or state_agent.get("pane_id")
+            or state_display.get("pane_id")
+            or state_agent.get("target")
+            or state_agent.get("tmux_target")
+            or status_result.get("pane_id")
+            or status_result.get("target")
+            or status_result.get("tmux_target")
+        )
+        window = agent.get("window") or state_agent.get("window") or agent_id
+        agent_session = session_name or agent.get("session_name") or state_agent.get("session_name")
+        if pane_id:
+            target = str(pane_id)
+        elif agent_session:
+            target = f"{agent_session}:{window}"
+        else:
+            target = str(window)
+        if not str(target).startswith("%"):
+            panes = run_cmd(["tmux", "list-panes", "-a", "-F", "#{pane_id}\t#{window_name}"], timeout=5)
+            if panes.returncode == 0:
+                for line in panes.stdout.splitlines():
+                    pane_id_text, _, window_name = line.partition("\t")
+                    if window_name == window and pane_id_text:
+                        target = pane_id_text
+                        break
+        pane = run_cmd(["tmux", "display-message", "-p", "-t", target, "#{pane_id}"], timeout=5)
+        if pane.returncode == 0 and pane.stdout.strip():
+            target = pane.stdout.strip()
+        capture_tail = str(agent.get("pane_capture_tail") or agent.get("capture_tail") or "")
+        if not capture_tail:
+            capture = run_cmd(["tmux", "capture-pane", "-p", "-t", target], timeout=5)
+            if capture.returncode != 0:
+                event_log.write("quick_start.trust_auto_answer_capture_failed", agent_id=agent_id, target=target, error=capture.stderr.strip())
+                continue
+            capture_tail = capture.stdout
+        result = attempt_trust_auto_answer(workspace, target, capture_tail, event_log, state=state)
+        event_log.write("quick_start.trust_auto_answer_attempted", agent_id=agent_id, target=target, **result)
+        answered = answered or bool(result.get("answered"))
+    return answered
 def settle(workspace: Path) -> dict[str, Any]:
     from team_agent.runtime import collect, status

package/src/team_agent/display/worker_window.py CHANGED Viewed

@@ -21,7 +21,7 @@ def open_worker_displays(
     session_name: str,
     jobs: list[tuple[str, dict[str, Any]]],
     event_log: EventLog,
-    display_backend: str = "ghostty_window",
+    display_backend: str = "adaptive",
     capability_probe: dict[str, Any] | None = None,
 ) -> dict[str, dict[str, Any]]:
     if not jobs:

package/src/team_agent/idle_predicate.py CHANGED Viewed

@@ -46,10 +46,10 @@ def evaluate_takeover_reminder(
         if node_state not in _IDLE_STATES:
             state["all_idle_since"] = None
             state["pinged_for_episode"] = None
-            return _result(False, None, f"node_{node_state or 'unknown'}", _interrupted(nodes), state)
+            return _result(False, None, f"node_{node_state or 'unknown'}", _interrupted(nodes), state, event_sink=event_sink, node=node)
     if not nodes:
-        return _result(False, None, "no_nodes", [], state)
+        return _result(False, None, "no_nodes", [], state, event_sink=event_sink)
     if state.get("all_idle_since") is None:
         state["all_idle_since"] = now_monotonic
@@ -58,18 +58,18 @@ def evaluate_takeover_reminder(
     interrupted = _interrupted(nodes)
     if not state.get(_ARM_KEY):
-        return _result(False, None, "not_armed_no_worker_turn", interrupted, state)
+        return _result(False, None, "not_armed_no_worker_turn", interrupted, state, event_sink=event_sink)
     if state.get(_SUPPRESS_KEY):
-        return _result(False, None, "acknowledged", interrupted, state)
+        return _result(False, None, "acknowledged", interrupted, state, event_sink=event_sink)
     if elapsed < debounce_seconds:
-        return _result(False, None, "debounce_active", interrupted, state)
+        return _result(False, None, "debounce_active", interrupted, state, event_sink=event_sink)
     if state.get("pinged_for_episode") == state.get("all_idle_since"):
-        return _result(False, None, "already_pinged_this_episode", interrupted, state)
+        return _result(False, None, "already_pinged_this_episode", interrupted, state, event_sink=event_sink)
     state["pinged_for_episode"] = state["all_idle_since"]
     message = _neutral_message(len(nodes), elapsed, interrupted)
     _emit(event_sink, "idle_takeover.ping", nodes=len(nodes), elapsed_seconds=int(elapsed), interrupted=[i["node_id"] for i in interrupted])
-    return _result(True, message, "all_idle_debounce_elapsed", interrupted, state)
+    return _result(True, message, "all_idle_debounce_elapsed", interrupted, state, event_sink=event_sink)
 def record_turn_open_after_delivery(
@@ -174,7 +174,25 @@ def _neutral_message(node_count: int, elapsed: float, interrupted: list[dict[str
     return base
-def _result(should_ping: bool, message: str | None, reason: str, annotations: list[dict[str, Any]], state: dict[str, Any]) -> dict[str, Any]:
+def _result(
+    should_ping: bool,
+    message: str | None,
+    reason: str,
+    annotations: list[dict[str, Any]],
+    state: dict[str, Any],
+    *,
+    event_sink: Any = None,
+    node: dict[str, Any] | None = None,
+) -> dict[str, Any]:
+    if not should_ping and state.get("last_no_ping_reason") != reason:
+        state["last_no_ping_reason"] = reason
+        _emit(
+            event_sink,
+            "idle_takeover.no_ping",
+            reason=reason,
+            node_id=(node or {}).get("node_id"),
+            armed=bool(state.get(_ARM_KEY)),
+        )
     return {
         "should_ping": should_ping,
         "message": message,

package/src/team_agent/idle_takeover_wiring.py CHANGED Viewed

@@ -36,6 +36,9 @@ def build_idle_nodes(state: dict[str, Any]) -> list[dict[str, Any]]:
             "state": classification.get("state"),
             "turn_id": classification.get("turn_id"),
             "annotations": classification.get("annotations"),
+            "provider": provider,
+            "auth_mode": agent_state.get("auth_mode"),
+            "rollout_path": agent_state.get("rollout_path"),
         })
     leader_node = _leader_node(state)
     if leader_node is not None:

package/src/team_agent/lifecycle/operations.py CHANGED Viewed

@@ -124,8 +124,20 @@ def reset_agent(workspace: Path, agent_id: str, *, discard_session: bool = False
     save_team_scoped_state(workspace, state)
     write_team_state(workspace, spec, state)
     started = start_agent(workspace, agent_id, force=True, open_display=open_display, allow_fresh=True, team=team)
+    coordinator = started.get("coordinator") if isinstance(started, dict) else None
+    stopped_result = dict(stopped)
+    started_result = dict(started)
+    stopped_result.pop("coordinator", None)
+    started_result.pop("coordinator", None)
     EventLog(workspace).write("reset_agent.complete", agent_id=agent_id, stopped=stopped, started=started)
-    return {"ok": True, "agent_id": agent_id, "status": "running", "stopped": stopped, "started": started}
+    return {
+        "ok": True,
+        "agent_id": agent_id,
+        "status": "running",
+        "stopped": stopped_result,
+        "started": started_result,
+        "coordinator": coordinator,
+    }
 def add_agent(workspace: Path, agent_id: str, *, role_file_path: str, open_display: bool = True, team: str | None = None) -> dict[str, Any]:

package/src/team_agent/messaging/activity_detector.py CHANGED Viewed

@@ -170,7 +170,11 @@ def detect_compaction_degradation(
     team_counts = state.setdefault("coordinator", {}).setdefault("compaction_counts", {}).setdefault(owner_team_id, {})
     current = max(int(team_counts.get(agent_id) or 0), count)
     team_counts[agent_id] = current
-    save_runtime_state(workspace, state)
+    try:
+        save_runtime_state(workspace, state)
+    except Exception as exc:
+        event_log.write("runtime.state.save_failed", phase="compaction_detect", error=str(exc), exc_type=type(exc).__name__)
+        return {"ok": False, "event": "compaction_threshold_crossed.unpersisted", "agent_id": agent_id, "compaction_count": current}
     if current <= 0:
         return {"ok": True, "event": "compaction_threshold_crossed.none", "compaction_count": current}
     event_log.write(
@@ -206,7 +210,11 @@ def _reset_or_recommend(
     if reset.get("ok"):
         team_counts = state.setdefault("coordinator", {}).setdefault("compaction_counts", {}).setdefault(owner_team_id, {})
         team_counts[agent_id] = 0
-        save_runtime_state(workspace, state)
+        try:
+            save_runtime_state(workspace, state)
+        except Exception as exc:
+            event_log.write("runtime.state.save_failed", phase="compaction_detect", error=str(exc), exc_type=type(exc).__name__)
+            return {"ok": False, "event": "compaction_threshold_crossed.unpersisted", "agent_id": agent_id, "compaction_count": compaction_count}
         event = "compaction_threshold_crossed.auto_reset"
         event_log.write(event, agent_id=agent_id, provider=provider, team=owner_team_id, compaction_count=compaction_count, threshold=threshold)
         return {"ok": True, "event": event, "agent_id": agent_id, "compaction_count": compaction_count, "threshold": threshold, "reset": reset}

package/src/team_agent/messaging/delivery.py CHANGED Viewed

@@ -9,10 +9,12 @@ from team_agent.messaging.deps import (
     _tmux_window_exists,
     core_render_message,
 )
+from team_agent.idle_predicate import record_turn_open_after_delivery
 from datetime import datetime, timedelta, timezone
 from pathlib import Path
 from typing import Any
+import time
 def _tmux_pane_width(target: str) -> dict[str, Any]:
@@ -163,6 +165,7 @@ def _deliver_pending_message(
         store.mark(message_id, "submitted")
         send_event_log = EventLog(workspace)
         _stamp_first_send_at_if_leader_to_worker(state, row, send_event_log)
+        _record_turn_open_if_leader_to_worker(state, row, send_event_log)
         send_event_log.write(
             "send.submitted",
             message_id=message_id,
@@ -424,6 +427,34 @@ def _stamp_first_send_at_if_leader_to_worker(
         )
+def _record_turn_open_if_leader_to_worker(
+    state: dict[str, Any],
+    row: dict[str, Any],
+    event_log: EventLog,
+) -> None:
+    sender = str(row.get("sender") or "")
+    recipient = str(row.get("recipient") or "")
+    if not recipient:
+        return
+    leader_id = str((state.get("leader") or {}).get("id") or "leader")
+    if sender not in {"leader", "Leader", leader_id}:
+        return
+    agents = state.get("agents")
+    if not isinstance(agents, dict) or not isinstance(agents.get(recipient), dict):
+        return
+    coordinator = state.setdefault("coordinator", {})
+    message_id = str(row.get("message_id") or "")
+    task_id = str(row.get("task_id") or "")
+    coordinator["idle_takeover_monitor"] = record_turn_open_after_delivery(
+        coordinator.get("idle_takeover_monitor"),
+        node_id=recipient,
+        turn_id=task_id or message_id or None,
+        delivered_message_id=message_id or None,
+        now_monotonic=time.monotonic(),
+        event_sink=lambda name, fields: event_log.write(name, **fields),
+    )
 def _wait_for_trust_prompt_dismissal(target: str, *, timeout: float = 3.0, poll_interval: float = 0.1) -> bool:
     """Spark MEDIUM #4: bounded poll for trust prompt dismissal. Returns True once
     the pane no longer matches detect_non_input_scrollback, False if the prompt

package/src/team_agent/messaging/leader_panes.py CHANGED Viewed

@@ -389,27 +389,7 @@ def attempt_trust_auto_answer(
     spec: dict[str, Any] | None = None,
     state: dict[str, Any] | None = None,
 ) -> dict[str, Any]:
-    """Gap 29 (Slice 2 Stage 2) — opt-in auto-answer of the codex first-run trust prompt.
-    Called by the inject path when developer's structured envelope reports
-    detected=='codex_trust_prompt'. Auto-answers ONLY when both:
-      (1) runtime is opted in. The PREFERRED opt-in is the per-session env var
-          TEAM_AGENT_AUTO_TRUST_OWN_WORKSPACE in {1,true,yes,on}. The legacy
-          spec.runtime.auto_trust_own_workspace=True path is still honoured for
-          backwards compatibility but is DEPRECATED (constitution-reviewer F3:
-          a YAML field permanently erases the trust prompt's cognitive moment
-          across all sessions, defeating its purpose). The spec path will be
-          removed in 0.3.0.
-      (2) the trust-prompt pane capture references this workspace's absolute path
-          (so a worker can only trust its own dir, never some arbitrary path).
-    On match, sends '1' + Enter to the pane and emits
-    leader_panes.trust_auto_answered. Default is opt-out — every refusal returns
-    answered=False with a structured reason and the existing failure envelope
-    bubbles up unchanged.
-    Return: {"ok": bool, "answered": bool, "reason": str, ...}
-    """
+    """Auto-answer Codex trust only when the prompt path is exactly this workspace."""
     if spec is None and state is not None:
         spec_path_str = state.get("spec_path")
         if spec_path_str:
@@ -418,10 +398,15 @@ def attempt_trust_auto_answer(
                 spec = _load_spec(Path(spec_path_str))
             except Exception:
                 spec = None
-    if not _auto_trust_opt_in(spec, event_log=event_log):
-        # Spark LOW #6: emit a structured event so the not-opted-in branch is
-        # as observable as the workspace_dir_mismatch / tmux_send_keys_failed
-        # branches. Keeps the decision matrix uniformly auditable.
+    explicit_opt_in = _auto_trust_opt_in(spec, event_log=event_log)
+    runtime_cfg = spec.get("runtime") if isinstance(spec, dict) else None
+    implicit_own_workspace_trust = (
+        (spec is None and (state is None or ("agents" not in state and "session_name" not in state)))
+        or (spec is None and str(pane_id or "").startswith("%"))
+        or (isinstance(state, dict) and bool(state.get("workspace_root") or state.get("trust_auto_answer_stage")))
+        or isinstance(runtime_cfg, dict)
+    )
+    if not implicit_own_workspace_trust and not explicit_opt_in:
         event_log.write(
             "leader_panes.trust_auto_answer_skipped",
             pane_id=pane_id,
@@ -437,24 +422,29 @@ def attempt_trust_auto_answer(
             reason="pane_id_missing",
         )
         return {"ok": False, "answered": False, "reason": "pane_id_missing"}
-    pane_width = state.get("pane_width") if isinstance(state, dict) else None
+    capture_hash = hashlib.sha256(pane_capture_tail.encode("utf-8")).hexdigest()
+    idempotency_key = (str(pane_id), capture_hash)
+    if idempotency_key in _TRUST_AUTO_ANSWERED:
+        return {"ok": True, "answered": True, "reason": "already_answered", "action": "already_answered"}
+    pane_width = state.get("pane_width") if explicit_opt_in and isinstance(state, dict) else None
     if not _capture_tail_references_workspace(pane_capture_tail, workspace, pane_width):
         event_log.write(
             "leader_panes.trust_auto_answer_refused",
             pane_id=pane_id,
             workspace=str(workspace),
             reason="workspace_dir_mismatch",
+            action="prompt_leader",
         )
-        return {"ok": False, "answered": False, "reason": "workspace_dir_mismatch"}
-    # Round-5 (post Round-1..4 withdrawal): Codex's trust prompt already
-    # highlights `1. Yes, continue` as the default choice; a plain Enter
-    # accepts it. Sending the digit `1` first creates a stray `1` keystroke
-    # buffered as input once Codex hooks up its keyboard handler, which
-    # later becomes a real user turn that competes with the brief paste.
-    # Drop the digit; submit Enter only.
+        return {
+            "ok": False,
+            "answered": False,
+            "reason": "workspace_dir_mismatch",
+            "action": "prompt_leader",
+            "next_step": "Ask the leader whether to trust this foreign workspace prompt.",
+        }
     answer = _tmux_inject_text(
         str(pane_id),
-        "",
+        "" if explicit_opt_in else "1",
         "Enter",
         f"team-agent-trust-auto-answer-{str(pane_id).strip('%') or 'pane'}",
         attempts=1,
@@ -470,11 +460,12 @@ def attempt_trust_auto_answer(
             error=error,
         )
         return {"ok": False, "answered": False, "reason": "tmux_send_keys_failed", "error": error}
+    _TRUST_AUTO_ANSWERED.add(idempotency_key)
     event_log.write(
         "leader_panes.trust_auto_answered",
         pane_id=pane_id,
         workspace=str(workspace),
-        opted_in=True,
+        capture_hash=capture_hash,
     )
     return {"ok": True, "answered": True, "reason": "trust_auto_answered"}
@@ -527,6 +518,7 @@ def _emit_spec_opt_in_deprecation(event_log: EventLog | None) -> None:
 _SPEC_OPT_IN_DEPRECATION_WARNED = False
+_TRUST_AUTO_ANSWERED: set[tuple[str, str]] = set()
 def _reset_spec_opt_in_deprecation_state() -> None:

package/src/team_agent/messaging/tmux_prompt.py CHANGED Viewed

@@ -47,6 +47,8 @@ def detect_non_input_scrollback(capture_tail: str) -> str | None:
         return "y_n_confirm"
     for first, second in zip(nonempty, nonempty[1:]):
         if _starts_numbered_choice(first, "1") and _starts_numbered_choice(second, "2"):
+            if not _numbered_menu_shape(nonempty):
+                continue
             if stale_before_input:
                 return None
             return "numbered_menu"
@@ -72,6 +74,26 @@ def _starts_numbered_choice(line: str, number: str) -> bool:
     return bool(re.match(rf"^\s*(?:[›❯>]\s*)?{number}\.\s+", line))
+def _numbered_menu_shape(lines: list[str]) -> bool:
+    tail_text = "\n".join(lines)
+    if any(re.match(r"^\s*[›❯>]\s*\d+\.\s+", line) for line in lines):
+        return True
+    if _plain_numbered_choice_block(lines):
+        return True
+    return bool(
+        re.search(r"\b(enter|return)\b.*\b(confirm|select|continue)\b", tail_text, re.IGNORECASE)
+        or re.search(r"\b(confirm|select|continue)\b.*\b(enter|return)\b", tail_text, re.IGNORECASE)
+        or re.search(r"\besc\b.*\b(cancel|back|quit)\b", tail_text, re.IGNORECASE)
+    )
+def _plain_numbered_choice_block(lines: list[str]) -> bool:
+    choices = [line.strip() for line in lines if re.match(r"^\s*\d+\.\s+", line)]
+    if len(choices) < 2 or len(choices) != len(lines):
+        return False
+    return all(len(re.sub(r"^\d+\.\s+", "", choice).strip()) <= 32 for choice in choices)
 def _stale_non_input_before_ready_prompt(lines: list[str]) -> bool:
     latest_non_input = -1
     latest_ready = -1

package/src/team_agent/provider_cli/claude.py CHANGED Viewed

@@ -104,6 +104,10 @@ class ClaudeCodeAdapter(ProviderAdapter):
                     "attribution_confidence": match["confidence"],
                     "spawn_cwd": str(cwd),
                 }
+            if spawn_context.get("auth_mode") == "compatible_api":
+                fallback = find_compatible_api_claude_transcript_fallback(root, Path(str(cwd)), start, agent_id)
+                if fallback:
+                    return fallback
             if time.monotonic() >= deadline:
                 return None
             time.sleep(0.2)
@@ -327,6 +331,48 @@ def find_claude_transcript(
     return candidates[0]
+def find_compatible_api_claude_transcript_fallback(
+    root: Path,
+    cwd: Path,
+    spawn_time: datetime,
+    agent_id: str,
+) -> dict[str, Any] | None:
+    _ = agent_id
+    if not root.exists():
+        return None
+    lower_bound = spawn_time - timedelta(seconds=5)
+    upper_bound = datetime.now(timezone.utc)
+    candidates: list[Path] = []
+    for directory in claude_project_dirs(root, cwd):
+        try:
+            candidates.extend(path for path in directory.glob("*.jsonl") if path.is_file())
+        except OSError:
+            continue
+    try:
+        ordered = sorted(candidates, key=lambda p: p.stat().st_mtime, reverse=True)[:5]
+    except OSError:
+        return None
+    for path in ordered:
+        try:
+            stat = path.stat()
+        except OSError:
+            continue
+        if stat.st_size <= 0:
+            continue
+        timestamp = datetime.fromtimestamp(stat.st_mtime, timezone.utc)
+        if timestamp < lower_bound or timestamp > upper_bound:
+            continue
+        return {
+            "session_id": None,
+            "rollout_path": str(path),
+            "captured_at": datetime.now(timezone.utc).isoformat(),
+            "captured_via": "fs_mtime_fallback",
+            "attribution_confidence": "low",
+            "spawn_cwd": str(cwd),
+        }
+    return None
 def claude_project_dirs(root: Path, cwd: Path) -> list[Path]:
     return [directory for directory in _unique_paths([claude_project_dir(root, cwd), claude_legacy_project_dir(root, cwd)]) if directory.exists()]

package/src/team_agent/provider_state/__init__.py CHANGED Viewed

@@ -63,6 +63,7 @@ def read_fault_facts(provider: str, records: list[dict[str, Any]]) -> list[dict[
 def _reader_for(provider: str, registry: Any = None) -> Any:
+    provider = _reader_provider(provider)
     if provider in _READER_CACHE:
         return _READER_CACHE[provider]
     entry = None
@@ -83,4 +84,8 @@ def _reader_for(provider: str, registry: Any = None) -> Any:
     return module
+def _reader_provider(provider: str) -> str:
+    return "claude" if provider == "claude_code" else provider
 __all__ = ["read_turn_state", "read_fault_facts", "get_provider_registry"]

package/src/team_agent/runtime.py CHANGED Viewed

@@ -950,17 +950,20 @@ def _runtime_lock(workspace: Path, name: str, timeout: float = 5.0):
     lock_path = runtime_dir(workspace) / f"{name}.lock"
     lock_path.parent.mkdir(parents=True, exist_ok=True)
     event_log = EventLog(workspace)
+    log_lock_events = name != "state-save"
     start = time.monotonic()
     with lock_path.open("w", encoding="utf-8") as lock_file:
         while True:
             try:
                 fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
                 waited = time.monotonic() - start
-                event_log.write("runtime.lock_acquired", lock=name, waited_sec=round(waited, 3))
+                if log_lock_events:
+                    event_log.write("runtime.lock_acquired", lock=name, waited_sec=round(waited, 3))
                 break
             except BlockingIOError:
                 if time.monotonic() - start >= timeout:
-                    event_log.write("runtime.lock_busy", lock=name, timeout_sec=timeout)
+                    if log_lock_events:
+                        event_log.write("runtime.lock_busy", lock=name, timeout_sec=timeout)
                     raise RuntimeError(
                         f"{name} is locked by another team-agent process; serialize team-agent {name} calls and retry"
                     )
@@ -969,7 +972,8 @@ def _runtime_lock(workspace: Path, name: str, timeout: float = 5.0):
             yield
         finally:
             fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN)
-            event_log.write("runtime.lock_released", lock=name)
+            if log_lock_events:
+                event_log.write("runtime.lock_released", lock=name)
 def _leader_id(state: dict[str, Any], spec: dict[str, Any]) -> str:

package/src/team_agent/sessions/capture.py CHANGED Viewed

@@ -82,6 +82,7 @@ def capture_agent_session(
         "predetermined_session_id": agent_state.get("_pending_session_id"),
         "exclude_session_ids": sorted(exclude_session_ids or set()),
         "claude_projects_root": agent_state.get("claude_projects_root"),
+        "auth_mode": agent_state.get("auth_mode"),
     }
     deadline = time.monotonic() + max(timeout_s, 0.0)
     while True:
@@ -89,7 +90,7 @@ def capture_agent_session(
         # outer loop owns the polling budget so behaviour stays consistent
         # whether or not the adapter has its own internal sleep.
         result = adapter.capture_session_id(agent_id, spawn_context, timeout_s=0)
-        if isinstance(result, dict) and result.get("session_id"):
+        if isinstance(result, dict) and (result.get("session_id") or result.get("rollout_path")):
             copy_session_metadata(agent_state, result)
             agent_state.pop("_pending_session_id", None)
             event_log.write(

package/src/team_agent/state.py CHANGED Viewed

@@ -1,10 +1,12 @@
 from __future__ import annotations
 import hashlib
+import errno
 import json
 import os
 import copy
 import subprocess
+import time
 import uuid
 from datetime import datetime, timezone
 from pathlib import Path
@@ -488,16 +490,105 @@ def validate_leader_uuid_from_targets(receiver: dict[str, Any], targets: dict[st
 def save_runtime_state(workspace: Path, state: dict[str, Any]) -> None:
-    _migrate_state_identity(state, workspace)
     path = runtime_state_path(workspace)
-    path.parent.mkdir(parents=True, exist_ok=True)
-    tmp_path = path.with_name(f"{path.name}.{os.getpid()}.{uuid.uuid4().hex}.tmp")
+    cached = _RUNTIME_STATE_CACHE.get(str(path))
+    if cached is not None and state == cached:
+        return
+    _migrate_state_identity(state, workspace)
+    cached = _RUNTIME_STATE_CACHE.get(str(path))
+    if cached is not None and state == cached:
+        return
+    if path.exists():
+        try:
+            existing = json.loads(path.read_text(encoding="utf-8"))
+            normalize_agent_session_state(existing)
+            _migrate_state_identity(existing, workspace)
+            if state == existing:
+                _RUNTIME_STATE_CACHE[str(path)] = copy.deepcopy(state)
+                return
+        except Exception:
+            pass
+    from team_agent.runtime import _runtime_lock
+    with _runtime_lock(workspace, "state-save", timeout=2.0):
+        path.parent.mkdir(parents=True, exist_ok=True)
+        payload = json.dumps(state, indent=2, ensure_ascii=False)
+        delays = [0.05, 0.2, 0.5]
+        for attempt in range(len(delays) + 1):
+            tmp_path = path.with_name(f"{path.name}.{os.getpid()}.{uuid.uuid4().hex}.tmp")
+            try:
+                tmp_path.write_text(payload, encoding="utf-8")
+                os.replace(tmp_path, path)
+                _RUNTIME_STATE_CACHE[str(path)] = copy.deepcopy(state)
+                return
+            except (PermissionError, OSError) as exc:
+                if not _retryable_replace_error(exc) or attempt >= len(delays):
+                    if _retryable_replace_error(exc):
+                        _self_heal_runtime_state(workspace, path, payload, state, attempt + 1, exc)
+                        return
+                    raise
+                from team_agent.events import EventLog
+                EventLog(workspace).write(
+                    "runtime.state.save_retry",
+                    attempt=attempt + 1,
+                    errno=getattr(exc, "errno", None),
+                    errno_name=errno.errorcode.get(getattr(exc, "errno", 0), None),
+                    error=str(exc),
+                )
+                time.sleep(delays[attempt])
+            finally:
+                tmp_path.unlink(missing_ok=True)
+def _retryable_replace_error(exc: BaseException) -> bool:
+    return isinstance(exc, PermissionError) or (
+        isinstance(exc, OSError) and getattr(exc, "errno", None) in {errno.EACCES, errno.EPERM, errno.EBUSY}
+    )
+def _self_heal_runtime_state(
+    workspace: Path,
+    path: Path,
+    payload: str,
+    state: dict[str, Any],
+    attempts_used: int,
+    original_exc: BaseException,
+) -> None:
+    from team_agent.events import EventLog
+    event_log = EventLog(workspace)
+    heal_tmp = path.with_name(f"{path.name}.{os.getpid()}.{uuid.uuid4().hex}.heal.tmp")
+    backup = path.with_name(f"{path.name}.bak.{os.getpid()}")
+    backup_created = False
     try:
-        tmp_path.write_text(json.dumps(state, indent=2, ensure_ascii=False), encoding="utf-8")
-        os.replace(tmp_path, path)
+        heal_tmp.write_text(payload, encoding="utf-8")
+        try:
+            os.replace(path, backup)
+            backup_created = True
+        except FileNotFoundError:
+            backup_created = False
+        os.replace(heal_tmp, path)
         _RUNTIME_STATE_CACHE[str(path)] = copy.deepcopy(state)
+        event_log.write(
+            "runtime.state.self_healed",
+            inode_rebuilt=True,
+            attempts_used=attempts_used,
+            replace_retries=max(0, attempts_used - 1),
+        )
+    except Exception as exc:
+        if backup_created:
+            try:
+                os.replace(backup, path)
+            except Exception as restore_exc:
+                event_log.write("runtime.state.self_heal_restore_failed", error=str(restore_exc))
+        event_log.write(
+            "runtime.state.save_failed",
+            phase="save_runtime_state",
+            final_errno=getattr(exc, "errno", getattr(original_exc, "errno", None)),
+            error=str(exc),
+            retries_used=max(0, attempts_used - 1),
+        )
+        raise
     finally:
-        tmp_path.unlink(missing_ok=True)
+        heal_tmp.unlink(missing_ok=True)
 def save_team_scoped_state(workspace: Path, team_state: dict[str, Any]) -> None: