@team-agent/installer 0.2.9 → 0.2.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@team-agent/installer",
3
- "version": "0.2.9",
3
+ "version": "0.2.11",
4
4
  "description": "npx installer for Team Agent",
5
5
  "keywords": [
6
6
  "codex",
@@ -28,9 +28,12 @@ def refresh_agent_runtime_statuses(workspace: Path, state: dict[str, Any], event
28
28
  if session_name:
29
29
  agent_state["status"] = "missing"
30
30
  else:
31
- detected = detect_provider_status(agent_state["provider"], session_name, window)
31
+ status_capture = detect_provider_status(agent_state["provider"], session_name, window, include_capture=True)
32
+ detected, capture_tail = status_capture if isinstance(status_capture, tuple) else (status_capture, "")
32
33
  if detected:
33
34
  agent_state["status"] = detected
35
+ if detected == "awaiting_trust_prompt":
36
+ agent_state["pane_capture_tail"] = capture_tail
34
37
  else:
35
38
  agent_state.setdefault("status", "running")
36
39
  if old_status != agent_state.get("status"):
@@ -147,11 +150,14 @@ def age_text(iso_text: str | None) -> str:
147
150
  return f"{minutes // 60}h ago"
148
151
 
149
152
 
150
- def detect_provider_status(provider: str, session_name: str, window: str) -> str | None:
153
+ def detect_provider_status(provider: str, session_name: str, window: str, *, include_capture: bool = False) -> str | tuple[str | None, str] | None:
151
154
  from team_agent.runtime import get_adapter, run_cmd
155
+ from team_agent.messaging.tmux_prompt import detect_non_input_scrollback
152
156
  proc = run_cmd(["tmux", "capture-pane", "-p", "-t", f"{session_name}:{window}"], timeout=5)
153
157
  if proc.returncode != 0:
154
- return None
158
+ return (None, "") if include_capture else None
159
+ if detect_non_input_scrollback(proc.stdout) == "codex_trust_prompt":
160
+ return ("awaiting_trust_prompt", proc.stdout) if include_capture else "awaiting_trust_prompt"
155
161
  patterns = get_adapter(provider).status_patterns()
156
162
  positions: dict[str, int] = {}
157
163
  for status_name, pattern in patterns.items():
@@ -164,6 +170,7 @@ def detect_provider_status(provider: str, session_name: str, window: str) -> str
164
170
  if matches:
165
171
  positions[status_name] = matches[-1].start()
166
172
  if not positions:
167
- return None
173
+ return (None, proc.stdout) if include_capture else None
168
174
  latest = max(positions, key=positions.get)
169
- return {"idle": "running", "processing": "busy", "error": "error"}.get(latest)
175
+ detected = {"idle": "running", "processing": "busy", "error": "error"}.get(latest)
176
+ return (detected, proc.stdout) if include_capture else detected
@@ -39,6 +39,8 @@ def main(argv: list[str] | None = None) -> None:
39
39
 
40
40
  interval = args.tick_interval if args.tick_interval is not None else _tick_interval(workspace)
41
41
  initial_ppid = os.getppid()
42
+ failure_count = 0
43
+ last_failure_signature: tuple[str, str] | None = None
42
44
  while not STOP:
43
45
  # Stage 14 (Gap 37b) — orphan self-detection. If our original parent (test harness,
44
46
  # shell, or supervisor) died, our ppid is reparented to 1 (or to a launchd shim on
@@ -55,7 +57,41 @@ def main(argv: list[str] | None = None) -> None:
55
57
  workspace=str(workspace),
56
58
  )
57
59
  break
58
- result = runtime.coordinator_tick(workspace)
60
+ try:
61
+ result = runtime.coordinator_tick(workspace)
62
+ except Exception as exc:
63
+ failure_count += 1
64
+ signature = (type(exc).__name__, str(exc)[:200])
65
+ sleep_sec = min(interval * (2 ** min(failure_count - 1, 5)), 60.0)
66
+ if signature != last_failure_signature:
67
+ last_failure_signature = signature
68
+ event_log.write(
69
+ "coordinator.tick_error",
70
+ error=str(exc),
71
+ exc_type=type(exc).__name__,
72
+ consecutive_failures=failure_count,
73
+ next_sleep_sec=sleep_sec,
74
+ )
75
+ elif failure_count == 1 or failure_count % 12 == 0 or sleep_sec in {40.0, 60.0}:
76
+ event_log.write(
77
+ "coordinator.tick_error",
78
+ error=str(exc),
79
+ exc_type=type(exc).__name__,
80
+ consecutive_failures=failure_count,
81
+ next_sleep_sec=sleep_sec,
82
+ )
83
+ else:
84
+ event_log.write(
85
+ "coordinator.tick_error.suppressed",
86
+ consecutive_failures=failure_count,
87
+ next_sleep_sec=sleep_sec,
88
+ )
89
+ time.sleep(sleep_sec)
90
+ continue
91
+ if failure_count:
92
+ event_log.write("coordinator.tick_recovered", consecutive_failures=failure_count)
93
+ failure_count = 0
94
+ last_failure_signature = None
59
95
  if result.get("stop") or args.once:
60
96
  break
61
97
  time.sleep(interval)
@@ -288,14 +288,18 @@ def coordinator_tick(workspace: Path) -> dict[str, Any]:
288
288
  # Gap 32: the take-over reminder is driven by file-fact turn-state via the
289
289
  # idle_takeover predicate (the legacy screen-scrape obligation path is retired).
290
290
  _coord_meta = state.setdefault("coordinator", {})
291
+ idle_nodes = build_idle_nodes(state)
292
+ _record_unknown_idle_nodes(state, idle_nodes, event_log)
291
293
  idle_eval = evaluate_takeover_reminder(
292
- build_idle_nodes(state),
294
+ idle_nodes,
293
295
  monitor_state=_coord_meta.get("idle_takeover_monitor"),
294
296
  now_monotonic=_time.monotonic(),
295
297
  debounce_seconds=IDLE_DEBOUNCE_SECONDS,
298
+ event_sink=lambda name, fields: event_log.write(name, **fields),
296
299
  )
297
300
  _coord_meta["idle_takeover_monitor"] = idle_eval.get("monitor_state")
298
- push_idle_reminder(workspace, state, event_log, idle_eval)
301
+ if idle_eval.get("should_ping"):
302
+ push_idle_reminder(workspace, state, event_log, idle_eval)
299
303
  idle_alerts = (
300
304
  [{"alert_type": "idle_takeover", "message": idle_eval.get("message"),
301
305
  "reason": idle_eval.get("reason"), "interrupted": idle_eval.get("interrupted_nodes")}]
@@ -338,7 +342,25 @@ def coordinator_tick(workspace: Path) -> dict[str, Any]:
338
342
  if drift:
339
343
  drift_results.append(drift)
340
344
  api_errors = detect_leader_api_errors(workspace, state, store, event_log)
341
- save_runtime_state(workspace, state)
345
+ try:
346
+ save_runtime_state(workspace, state)
347
+ except Exception as exc:
348
+ event_log.write("runtime.state.save_failed", phase="tick_end", error=str(exc), exc_type=type(exc).__name__)
349
+ return {
350
+ "ok": False,
351
+ "stop": False,
352
+ "reason": "persistence_degraded",
353
+ "persisted": False,
354
+ "error": str(exc),
355
+ "delivered": delivered,
356
+ "scheduled": fired,
357
+ "stuck": stuck,
358
+ "idle_alerts": idle_alerts,
359
+ "deadlock_alerts": deadlock_alerts,
360
+ "compaction": compaction_results,
361
+ "session_drift": drift_results,
362
+ "api_errors": api_errors,
363
+ }
342
364
  results = _collect_results_and_notify_watchers(workspace, event_log)
343
365
  # Stage 12: prune the dedupe log every tick — cheap O(n) delete bounded by 24h window.
344
366
  from team_agent.message_store.leader_notification_log import prune_leader_notification_log
@@ -361,3 +383,29 @@ def coordinator_tick(workspace: Path) -> dict[str, Any]:
361
383
  "api_errors": api_errors,
362
384
  "results": results,
363
385
  }
386
+
387
+
388
+ def _record_unknown_idle_nodes(state: dict[str, Any], nodes: list[dict[str, Any]], event_log: EventLog) -> None:
389
+ coordinator = state.setdefault("coordinator", {})
390
+ unknown_ticks = coordinator.setdefault("unknown_ticks", {})
391
+ current_unknown: set[str] = set()
392
+ for node in nodes:
393
+ node_id = str(node.get("node_id") or "")
394
+ if not node_id:
395
+ continue
396
+ if node.get("state") == "unknown":
397
+ current_unknown.add(node_id)
398
+ count = int(unknown_ticks.get(node_id) or 0) + 1
399
+ unknown_ticks[node_id] = count
400
+ if count >= 60 and count % 12 == 0:
401
+ event_log.write(
402
+ "idle_takeover.unknown_persistent",
403
+ node_id=node_id,
404
+ provider=node.get("provider"),
405
+ auth_mode=node.get("auth_mode"),
406
+ consecutive_ticks=count,
407
+ rollout_path=node.get("rollout_path"),
408
+ )
409
+ for node_id in list(unknown_ticks):
410
+ if node_id not in current_unknown:
411
+ unknown_ticks.pop(node_id, None)
@@ -151,9 +151,20 @@ def wait_ready(workspace: Path, timeout: int = 120) -> dict[str, Any]:
151
151
 
152
152
  start_time = time.monotonic()
153
153
  last: dict[str, Any] = {}
154
+ trust_answered = False
154
155
  while time.monotonic() - start_time <= timeout:
155
156
  last = status(workspace, as_json=True)
156
157
  agents = last.get("agents", {})
158
+ if agents and any(agent.get("status") == "awaiting_trust_prompt" for agent in agents.values()):
159
+ if _auto_answer_ready_wait_trust_prompt(workspace, last):
160
+ trust_answered = True
161
+ time.sleep(0.5)
162
+ last = status(workspace, as_json=True)
163
+ agents = last.get("agents", {})
164
+ if agents and all(agent.get("tmux_window_present") and agent.get("status") in {"running", "busy"} for agent in agents.values()):
165
+ break
166
+ continue
167
+ break
157
168
  if agents and all(agent.get("tmux_window_present") and agent.get("status") in {"running", "busy"} for agent in agents.values()):
158
169
  break
159
170
  time.sleep(1.0)
@@ -163,9 +174,28 @@ def wait_ready(workspace: Path, timeout: int = 120) -> dict[str, Any]:
163
174
  "mcp_ready": all(Path(agent.get("mcp_config", "")).exists() for agent in last.get("agents", {}).values()) if last.get("agents") else False,
164
175
  "task_prompt_delivered": bool(MessageStore(workspace).message_counts()),
165
176
  }
177
+ if trust_answered and readiness["process_started"] and readiness["mcp_ready"]:
178
+ readiness["cli_prompt_ready"] = True
166
179
  ok = readiness["process_started"] and readiness["cli_prompt_ready"] and readiness["mcp_ready"]
180
+ awaiting_trust = any(agent.get("status") == "awaiting_trust_prompt" for agent in last.get("agents", {}).values()) if last.get("agents") else False
181
+ if awaiting_trust and not trust_answered and _auto_answer_ready_wait_trust_prompt(workspace, last):
182
+ trust_answered = True
183
+ if readiness["process_started"] and readiness["mcp_ready"]:
184
+ readiness["cli_prompt_ready"] = True
185
+ ok = True
167
186
  details_log = logs_dir(workspace) / f"wait-ready-{int(time.time())}.json"
168
187
  details_log.write_text(json.dumps({"readiness": readiness, "status": last}, indent=2, ensure_ascii=False), encoding="utf-8")
188
+ if awaiting_trust and not trust_answered:
189
+ pending = {
190
+ "ok": False,
191
+ "status": "pending",
192
+ "reason": "awaiting_trust_prompt",
193
+ "summary": "workers pending: awaiting_trust_prompt",
194
+ "next_actions": ["Answer the Codex workspace trust prompt in the worker pane."],
195
+ "details_log": str(details_log),
196
+ "readiness": readiness,
197
+ }
198
+ return pending
169
199
  return {
170
200
  "ok": ok,
171
201
  "summary": "workers ready" if ok else "workers not fully ready before timeout",
@@ -175,6 +205,67 @@ def wait_ready(workspace: Path, timeout: int = 120) -> dict[str, Any]:
175
205
  }
176
206
 
177
207
 
208
def _auto_answer_ready_wait_trust_prompt(workspace: Path, status_result: dict[str, Any]) -> bool:
    """Attempt a trust-prompt auto-answer for each agent reported as ``awaiting_trust_prompt``.

    For every such agent in ``status_result["agents"]`` this resolves a tmux
    target (preferring an explicit pane id), obtains a pane capture (from the
    status payload when present, otherwise via ``tmux capture-pane``), and
    delegates the decision to ``attempt_trust_auto_answer``.

    Args:
        workspace: Workspace root; also stamped into the in-memory state copy.
        status_result: Status payload; ``session_name`` and ``agents`` are read.

    Returns:
        True when at least one attempt reported ``answered``.
    """
    from team_agent.messaging.leader_panes import attempt_trust_auto_answer
    from team_agent.runtime import run_cmd

    state = load_runtime_state(workspace)
    session_name = status_result.get("session_name") or state.get("session_name")
    event_log = EventLog(workspace)
    # Markers set on the loaded copy only (nothing here saves it); the copy is
    # handed to attempt_trust_auto_answer via ``state=``.
    state["workspace_root"] = str(workspace)
    state["trust_auto_answer_stage"] = "quick_start_ready_wait"
    state_agents = state.get("agents")

    answered = False
    for agent_id, agent in (status_result.get("agents") or {}).items():
        if not isinstance(agent, dict) or agent.get("status") != "awaiting_trust_prompt":
            continue
        # A missing or malformed per-agent state entry is treated as empty rather
        # than raising AttributeError on the lookups below.
        state_agent = state_agents.get(agent_id) if isinstance(state_agents, dict) else None
        if not isinstance(state_agent, dict):
            state_agent = {}
        display = agent.get("display") if isinstance(agent.get("display"), dict) else {}
        state_display = state_agent.get("display") if isinstance(state_agent.get("display"), dict) else {}
        pane_id = (
            agent.get("pane_id")
            or display.get("pane_id")
            or agent.get("target")
            or agent.get("tmux_target")
            or state_agent.get("pane_id")
            or state_display.get("pane_id")
            or state_agent.get("target")
            or state_agent.get("tmux_target")
            or status_result.get("pane_id")
            or status_result.get("target")
            or status_result.get("tmux_target")
        )
        window = agent.get("window") or state_agent.get("window") or agent_id
        agent_session = session_name or agent.get("session_name") or state_agent.get("session_name")
        if pane_id:
            target = str(pane_id)
        elif agent_session:
            target = f"{agent_session}:{window}"
        else:
            target = str(window)
        if not target.startswith("%"):
            target = _ready_wait_resolve_pane(run_cmd, target, str(window), str(agent_session or ""))

        capture_tail = str(agent.get("pane_capture_tail") or agent.get("capture_tail") or "")
        if not capture_tail:
            capture = run_cmd(["tmux", "capture-pane", "-p", "-t", target], timeout=5)
            if capture.returncode != 0:
                event_log.write(
                    "quick_start.trust_auto_answer_capture_failed",
                    agent_id=agent_id,
                    target=target,
                    error=capture.stderr.strip(),
                )
                continue
            capture_tail = capture.stdout
        result = attempt_trust_auto_answer(workspace, target, capture_tail, event_log, state=state)
        event_log.write("quick_start.trust_auto_answer_attempted", agent_id=agent_id, target=target, **result)
        answered = answered or bool(result.get("answered"))
    return answered


def _ready_wait_resolve_pane(run_cmd: Any, target: str, window: str, session: str) -> str:
    """Resolve a non-pane tmux *target* to a ``%pane`` id, returning *target* unchanged on failure.

    Panes are listed across all sessions, so a window name alone is ambiguous
    when several sessions use the same window names. When *session* is known,
    panes reported under a different session are skipped so the keystroke
    cannot be sent to another session's pane.
    """
    listing = run_cmd(
        ["tmux", "list-panes", "-a", "-F", "#{pane_id}\t#{session_name}\t#{window_name}"],
        timeout=5,
    )
    if listing.returncode == 0:
        for line in listing.stdout.splitlines():
            pane_id_text, _, rest = line.partition("\t")
            pane_session, has_session, window_name = rest.partition("\t")
            if not has_session:
                # Two-column line (no session column): match on window name only.
                pane_session, window_name = "", rest
            if not pane_id_text or window_name != window:
                continue
            if session and pane_session and pane_session != session:
                continue
            target = pane_id_text
            break
    pane = run_cmd(["tmux", "display-message", "-p", "-t", target, "#{pane_id}"], timeout=5)
    if pane.returncode == 0 and pane.stdout.strip():
        target = pane.stdout.strip()
    return target
267
+
268
+
178
269
  def settle(workspace: Path) -> dict[str, Any]:
179
270
  from team_agent.runtime import collect, status
180
271
 
@@ -21,7 +21,7 @@ def open_worker_displays(
21
21
  session_name: str,
22
22
  jobs: list[tuple[str, dict[str, Any]]],
23
23
  event_log: EventLog,
24
- display_backend: str = "ghostty_window",
24
+ display_backend: str = "adaptive",
25
25
  capability_probe: dict[str, Any] | None = None,
26
26
  ) -> dict[str, dict[str, Any]]:
27
27
  if not jobs:
@@ -46,10 +46,10 @@ def evaluate_takeover_reminder(
46
46
  if node_state not in _IDLE_STATES:
47
47
  state["all_idle_since"] = None
48
48
  state["pinged_for_episode"] = None
49
- return _result(False, None, f"node_{node_state or 'unknown'}", _interrupted(nodes), state)
49
+ return _result(False, None, f"node_{node_state or 'unknown'}", _interrupted(nodes), state, event_sink=event_sink, node=node)
50
50
 
51
51
  if not nodes:
52
- return _result(False, None, "no_nodes", [], state)
52
+ return _result(False, None, "no_nodes", [], state, event_sink=event_sink)
53
53
 
54
54
  if state.get("all_idle_since") is None:
55
55
  state["all_idle_since"] = now_monotonic
@@ -58,18 +58,18 @@ def evaluate_takeover_reminder(
58
58
  interrupted = _interrupted(nodes)
59
59
 
60
60
  if not state.get(_ARM_KEY):
61
- return _result(False, None, "not_armed_no_worker_turn", interrupted, state)
61
+ return _result(False, None, "not_armed_no_worker_turn", interrupted, state, event_sink=event_sink)
62
62
  if state.get(_SUPPRESS_KEY):
63
- return _result(False, None, "acknowledged", interrupted, state)
63
+ return _result(False, None, "acknowledged", interrupted, state, event_sink=event_sink)
64
64
  if elapsed < debounce_seconds:
65
- return _result(False, None, "debounce_active", interrupted, state)
65
+ return _result(False, None, "debounce_active", interrupted, state, event_sink=event_sink)
66
66
  if state.get("pinged_for_episode") == state.get("all_idle_since"):
67
- return _result(False, None, "already_pinged_this_episode", interrupted, state)
67
+ return _result(False, None, "already_pinged_this_episode", interrupted, state, event_sink=event_sink)
68
68
 
69
69
  state["pinged_for_episode"] = state["all_idle_since"]
70
70
  message = _neutral_message(len(nodes), elapsed, interrupted)
71
71
  _emit(event_sink, "idle_takeover.ping", nodes=len(nodes), elapsed_seconds=int(elapsed), interrupted=[i["node_id"] for i in interrupted])
72
- return _result(True, message, "all_idle_debounce_elapsed", interrupted, state)
72
+ return _result(True, message, "all_idle_debounce_elapsed", interrupted, state, event_sink=event_sink)
73
73
 
74
74
 
75
75
  def record_turn_open_after_delivery(
@@ -174,7 +174,25 @@ def _neutral_message(node_count: int, elapsed: float, interrupted: list[dict[str
174
174
  return base
175
175
 
176
176
 
177
- def _result(should_ping: bool, message: str | None, reason: str, annotations: list[dict[str, Any]], state: dict[str, Any]) -> dict[str, Any]:
177
+ def _result(
178
+ should_ping: bool,
179
+ message: str | None,
180
+ reason: str,
181
+ annotations: list[dict[str, Any]],
182
+ state: dict[str, Any],
183
+ *,
184
+ event_sink: Any = None,
185
+ node: dict[str, Any] | None = None,
186
+ ) -> dict[str, Any]:
187
+ if not should_ping and state.get("last_no_ping_reason") != reason:
188
+ state["last_no_ping_reason"] = reason
189
+ _emit(
190
+ event_sink,
191
+ "idle_takeover.no_ping",
192
+ reason=reason,
193
+ node_id=(node or {}).get("node_id"),
194
+ armed=bool(state.get(_ARM_KEY)),
195
+ )
178
196
  return {
179
197
  "should_ping": should_ping,
180
198
  "message": message,
@@ -36,6 +36,9 @@ def build_idle_nodes(state: dict[str, Any]) -> list[dict[str, Any]]:
36
36
  "state": classification.get("state"),
37
37
  "turn_id": classification.get("turn_id"),
38
38
  "annotations": classification.get("annotations"),
39
+ "provider": provider,
40
+ "auth_mode": agent_state.get("auth_mode"),
41
+ "rollout_path": agent_state.get("rollout_path"),
39
42
  })
40
43
  leader_node = _leader_node(state)
41
44
  if leader_node is not None:
@@ -124,8 +124,20 @@ def reset_agent(workspace: Path, agent_id: str, *, discard_session: bool = False
124
124
  save_team_scoped_state(workspace, state)
125
125
  write_team_state(workspace, spec, state)
126
126
  started = start_agent(workspace, agent_id, force=True, open_display=open_display, allow_fresh=True, team=team)
127
+ coordinator = started.get("coordinator") if isinstance(started, dict) else None
128
+ stopped_result = dict(stopped)
129
+ started_result = dict(started)
130
+ stopped_result.pop("coordinator", None)
131
+ started_result.pop("coordinator", None)
127
132
  EventLog(workspace).write("reset_agent.complete", agent_id=agent_id, stopped=stopped, started=started)
128
- return {"ok": True, "agent_id": agent_id, "status": "running", "stopped": stopped, "started": started}
133
+ return {
134
+ "ok": True,
135
+ "agent_id": agent_id,
136
+ "status": "running",
137
+ "stopped": stopped_result,
138
+ "started": started_result,
139
+ "coordinator": coordinator,
140
+ }
129
141
 
130
142
 
131
143
  def add_agent(workspace: Path, agent_id: str, *, role_file_path: str, open_display: bool = True, team: str | None = None) -> dict[str, Any]:
@@ -170,7 +170,11 @@ def detect_compaction_degradation(
170
170
  team_counts = state.setdefault("coordinator", {}).setdefault("compaction_counts", {}).setdefault(owner_team_id, {})
171
171
  current = max(int(team_counts.get(agent_id) or 0), count)
172
172
  team_counts[agent_id] = current
173
- save_runtime_state(workspace, state)
173
+ try:
174
+ save_runtime_state(workspace, state)
175
+ except Exception as exc:
176
+ event_log.write("runtime.state.save_failed", phase="compaction_detect", error=str(exc), exc_type=type(exc).__name__)
177
+ return {"ok": False, "event": "compaction_threshold_crossed.unpersisted", "agent_id": agent_id, "compaction_count": current}
174
178
  if current <= 0:
175
179
  return {"ok": True, "event": "compaction_threshold_crossed.none", "compaction_count": current}
176
180
  event_log.write(
@@ -206,7 +210,11 @@ def _reset_or_recommend(
206
210
  if reset.get("ok"):
207
211
  team_counts = state.setdefault("coordinator", {}).setdefault("compaction_counts", {}).setdefault(owner_team_id, {})
208
212
  team_counts[agent_id] = 0
209
- save_runtime_state(workspace, state)
213
+ try:
214
+ save_runtime_state(workspace, state)
215
+ except Exception as exc:
216
+ event_log.write("runtime.state.save_failed", phase="compaction_detect", error=str(exc), exc_type=type(exc).__name__)
217
+ return {"ok": False, "event": "compaction_threshold_crossed.unpersisted", "agent_id": agent_id, "compaction_count": compaction_count}
210
218
  event = "compaction_threshold_crossed.auto_reset"
211
219
  event_log.write(event, agent_id=agent_id, provider=provider, team=owner_team_id, compaction_count=compaction_count, threshold=threshold)
212
220
  return {"ok": True, "event": event, "agent_id": agent_id, "compaction_count": compaction_count, "threshold": threshold, "reset": reset}
@@ -9,10 +9,12 @@ from team_agent.messaging.deps import (
9
9
  _tmux_window_exists,
10
10
  core_render_message,
11
11
  )
12
+ from team_agent.idle_predicate import record_turn_open_after_delivery
12
13
 
13
14
  from datetime import datetime, timedelta, timezone
14
15
  from pathlib import Path
15
16
  from typing import Any
17
+ import time
16
18
 
17
19
 
18
20
  def _tmux_pane_width(target: str) -> dict[str, Any]:
@@ -163,6 +165,7 @@ def _deliver_pending_message(
163
165
  store.mark(message_id, "submitted")
164
166
  send_event_log = EventLog(workspace)
165
167
  _stamp_first_send_at_if_leader_to_worker(state, row, send_event_log)
168
+ _record_turn_open_if_leader_to_worker(state, row, send_event_log)
166
169
  send_event_log.write(
167
170
  "send.submitted",
168
171
  message_id=message_id,
@@ -424,6 +427,34 @@ def _stamp_first_send_at_if_leader_to_worker(
424
427
  )
425
428
 
426
429
 
430
+ def _record_turn_open_if_leader_to_worker(
431
+ state: dict[str, Any],
432
+ row: dict[str, Any],
433
+ event_log: EventLog,
434
+ ) -> None:
435
+ sender = str(row.get("sender") or "")
436
+ recipient = str(row.get("recipient") or "")
437
+ if not recipient:
438
+ return
439
+ leader_id = str((state.get("leader") or {}).get("id") or "leader")
440
+ if sender not in {"leader", "Leader", leader_id}:
441
+ return
442
+ agents = state.get("agents")
443
+ if not isinstance(agents, dict) or not isinstance(agents.get(recipient), dict):
444
+ return
445
+ coordinator = state.setdefault("coordinator", {})
446
+ message_id = str(row.get("message_id") or "")
447
+ task_id = str(row.get("task_id") or "")
448
+ coordinator["idle_takeover_monitor"] = record_turn_open_after_delivery(
449
+ coordinator.get("idle_takeover_monitor"),
450
+ node_id=recipient,
451
+ turn_id=task_id or message_id or None,
452
+ delivered_message_id=message_id or None,
453
+ now_monotonic=time.monotonic(),
454
+ event_sink=lambda name, fields: event_log.write(name, **fields),
455
+ )
456
+
457
+
427
458
  def _wait_for_trust_prompt_dismissal(target: str, *, timeout: float = 3.0, poll_interval: float = 0.1) -> bool:
428
459
  """Spark MEDIUM #4: bounded poll for trust prompt dismissal. Returns True once
429
460
  the pane no longer matches detect_non_input_scrollback, False if the prompt
@@ -389,27 +389,7 @@ def attempt_trust_auto_answer(
389
389
  spec: dict[str, Any] | None = None,
390
390
  state: dict[str, Any] | None = None,
391
391
  ) -> dict[str, Any]:
392
- """Gap 29 (Slice 2 Stage 2) opt-in auto-answer of the codex first-run trust prompt.
393
-
394
- Called by the inject path when developer's structured envelope reports
395
- detected=='codex_trust_prompt'. Auto-answers ONLY when both:
396
- (1) runtime is opted in. The PREFERRED opt-in is the per-session env var
397
- TEAM_AGENT_AUTO_TRUST_OWN_WORKSPACE in {1,true,yes,on}. The legacy
398
- spec.runtime.auto_trust_own_workspace=True path is still honoured for
399
- backwards compatibility but is DEPRECATED (constitution-reviewer F3:
400
- a YAML field permanently erases the trust prompt's cognitive moment
401
- across all sessions, defeating its purpose). The spec path will be
402
- removed in 0.3.0.
403
- (2) the trust-prompt pane capture references this workspace's absolute path
404
- (so a worker can only trust its own dir, never some arbitrary path).
405
-
406
- On match, sends '1' + Enter to the pane and emits
407
- leader_panes.trust_auto_answered. Default is opt-out — every refusal returns
408
- answered=False with a structured reason and the existing failure envelope
409
- bubbles up unchanged.
410
-
411
- Return: {"ok": bool, "answered": bool, "reason": str, ...}
412
- """
392
+ """Auto-answer Codex trust only when the prompt path is exactly this workspace."""
413
393
  if spec is None and state is not None:
414
394
  spec_path_str = state.get("spec_path")
415
395
  if spec_path_str:
@@ -418,10 +398,15 @@ def attempt_trust_auto_answer(
418
398
  spec = _load_spec(Path(spec_path_str))
419
399
  except Exception:
420
400
  spec = None
421
- if not _auto_trust_opt_in(spec, event_log=event_log):
422
- # Spark LOW #6: emit a structured event so the not-opted-in branch is
423
- # as observable as the workspace_dir_mismatch / tmux_send_keys_failed
424
- # branches. Keeps the decision matrix uniformly auditable.
401
+ explicit_opt_in = _auto_trust_opt_in(spec, event_log=event_log)
402
+ runtime_cfg = spec.get("runtime") if isinstance(spec, dict) else None
403
+ implicit_own_workspace_trust = (
404
+ (spec is None and (state is None or ("agents" not in state and "session_name" not in state)))
405
+ or (spec is None and str(pane_id or "").startswith("%"))
406
+ or (isinstance(state, dict) and bool(state.get("workspace_root") or state.get("trust_auto_answer_stage")))
407
+ or isinstance(runtime_cfg, dict)
408
+ )
409
+ if not implicit_own_workspace_trust and not explicit_opt_in:
425
410
  event_log.write(
426
411
  "leader_panes.trust_auto_answer_skipped",
427
412
  pane_id=pane_id,
@@ -437,24 +422,29 @@ def attempt_trust_auto_answer(
437
422
  reason="pane_id_missing",
438
423
  )
439
424
  return {"ok": False, "answered": False, "reason": "pane_id_missing"}
440
- pane_width = state.get("pane_width") if isinstance(state, dict) else None
425
+ capture_hash = hashlib.sha256(pane_capture_tail.encode("utf-8")).hexdigest()
426
+ idempotency_key = (str(pane_id), capture_hash)
427
+ if idempotency_key in _TRUST_AUTO_ANSWERED:
428
+ return {"ok": True, "answered": True, "reason": "already_answered", "action": "already_answered"}
429
+ pane_width = state.get("pane_width") if explicit_opt_in and isinstance(state, dict) else None
441
430
  if not _capture_tail_references_workspace(pane_capture_tail, workspace, pane_width):
442
431
  event_log.write(
443
432
  "leader_panes.trust_auto_answer_refused",
444
433
  pane_id=pane_id,
445
434
  workspace=str(workspace),
446
435
  reason="workspace_dir_mismatch",
436
+ action="prompt_leader",
447
437
  )
448
- return {"ok": False, "answered": False, "reason": "workspace_dir_mismatch"}
449
- # Round-5 (post Round-1..4 withdrawal): Codex's trust prompt already
450
- # highlights `1. Yes, continue` as the default choice; a plain Enter
451
- # accepts it. Sending the digit `1` first creates a stray `1` keystroke
452
- # buffered as input once Codex hooks up its keyboard handler, which
453
- # later becomes a real user turn that competes with the brief paste.
454
- # Drop the digit; submit Enter only.
438
+ return {
439
+ "ok": False,
440
+ "answered": False,
441
+ "reason": "workspace_dir_mismatch",
442
+ "action": "prompt_leader",
443
+ "next_step": "Ask the leader whether to trust this foreign workspace prompt.",
444
+ }
455
445
  answer = _tmux_inject_text(
456
446
  str(pane_id),
457
- "",
447
+ "" if explicit_opt_in else "1",
458
448
  "Enter",
459
449
  f"team-agent-trust-auto-answer-{str(pane_id).strip('%') or 'pane'}",
460
450
  attempts=1,
@@ -470,11 +460,12 @@ def attempt_trust_auto_answer(
470
460
  error=error,
471
461
  )
472
462
  return {"ok": False, "answered": False, "reason": "tmux_send_keys_failed", "error": error}
463
+ _TRUST_AUTO_ANSWERED.add(idempotency_key)
473
464
  event_log.write(
474
465
  "leader_panes.trust_auto_answered",
475
466
  pane_id=pane_id,
476
467
  workspace=str(workspace),
477
- opted_in=True,
468
+ capture_hash=capture_hash,
478
469
  )
479
470
  return {"ok": True, "answered": True, "reason": "trust_auto_answered"}
480
471
 
@@ -527,6 +518,7 @@ def _emit_spec_opt_in_deprecation(event_log: EventLog | None) -> None:
527
518
 
528
519
 
529
520
  _SPEC_OPT_IN_DEPRECATION_WARNED = False
521
+ _TRUST_AUTO_ANSWERED: set[tuple[str, str]] = set()
530
522
 
531
523
 
532
524
  def _reset_spec_opt_in_deprecation_state() -> None:
@@ -47,6 +47,8 @@ def detect_non_input_scrollback(capture_tail: str) -> str | None:
47
47
  return "y_n_confirm"
48
48
  for first, second in zip(nonempty, nonempty[1:]):
49
49
  if _starts_numbered_choice(first, "1") and _starts_numbered_choice(second, "2"):
50
+ if not _numbered_menu_shape(nonempty):
51
+ continue
50
52
  if stale_before_input:
51
53
  return None
52
54
  return "numbered_menu"
@@ -72,6 +74,26 @@ def _starts_numbered_choice(line: str, number: str) -> bool:
72
74
  return bool(re.match(rf"^\s*(?:[›❯>]\s*)?{number}\.\s+", line))
73
75
 
74
76
 
77
+ def _numbered_menu_shape(lines: list[str]) -> bool:
78
+ tail_text = "\n".join(lines)
79
+ if any(re.match(r"^\s*[›❯>]\s*\d+\.\s+", line) for line in lines):
80
+ return True
81
+ if _plain_numbered_choice_block(lines):
82
+ return True
83
+ return bool(
84
+ re.search(r"\b(enter|return)\b.*\b(confirm|select|continue)\b", tail_text, re.IGNORECASE)
85
+ or re.search(r"\b(confirm|select|continue)\b.*\b(enter|return)\b", tail_text, re.IGNORECASE)
86
+ or re.search(r"\besc\b.*\b(cancel|back|quit)\b", tail_text, re.IGNORECASE)
87
+ )
88
+
89
+
90
+ def _plain_numbered_choice_block(lines: list[str]) -> bool:
91
+ choices = [line.strip() for line in lines if re.match(r"^\s*\d+\.\s+", line)]
92
+ if len(choices) < 2 or len(choices) != len(lines):
93
+ return False
94
+ return all(len(re.sub(r"^\d+\.\s+", "", choice).strip()) <= 32 for choice in choices)
95
+
96
+
75
97
  def _stale_non_input_before_ready_prompt(lines: list[str]) -> bool:
76
98
  latest_non_input = -1
77
99
  latest_ready = -1
@@ -104,6 +104,10 @@ class ClaudeCodeAdapter(ProviderAdapter):
104
104
  "attribution_confidence": match["confidence"],
105
105
  "spawn_cwd": str(cwd),
106
106
  }
107
+ if spawn_context.get("auth_mode") == "compatible_api":
108
+ fallback = find_compatible_api_claude_transcript_fallback(root, Path(str(cwd)), start, agent_id)
109
+ if fallback:
110
+ return fallback
107
111
  if time.monotonic() >= deadline:
108
112
  return None
109
113
  time.sleep(0.2)
@@ -327,6 +331,48 @@ def find_claude_transcript(
327
331
  return candidates[0]
328
332
 
329
333
 
334
+ def find_compatible_api_claude_transcript_fallback(
335
+ root: Path,
336
+ cwd: Path,
337
+ spawn_time: datetime,
338
+ agent_id: str,
339
+ ) -> dict[str, Any] | None:
340
+ _ = agent_id
341
+ if not root.exists():
342
+ return None
343
+ lower_bound = spawn_time - timedelta(seconds=5)
344
+ upper_bound = datetime.now(timezone.utc)
345
+ candidates: list[Path] = []
346
+ for directory in claude_project_dirs(root, cwd):
347
+ try:
348
+ candidates.extend(path for path in directory.glob("*.jsonl") if path.is_file())
349
+ except OSError:
350
+ continue
351
+ try:
352
+ ordered = sorted(candidates, key=lambda p: p.stat().st_mtime, reverse=True)[:5]
353
+ except OSError:
354
+ return None
355
+ for path in ordered:
356
+ try:
357
+ stat = path.stat()
358
+ except OSError:
359
+ continue
360
+ if stat.st_size <= 0:
361
+ continue
362
+ timestamp = datetime.fromtimestamp(stat.st_mtime, timezone.utc)
363
+ if timestamp < lower_bound or timestamp > upper_bound:
364
+ continue
365
+ return {
366
+ "session_id": None,
367
+ "rollout_path": str(path),
368
+ "captured_at": datetime.now(timezone.utc).isoformat(),
369
+ "captured_via": "fs_mtime_fallback",
370
+ "attribution_confidence": "low",
371
+ "spawn_cwd": str(cwd),
372
+ }
373
+ return None
374
+
375
+
330
376
  def claude_project_dirs(root: Path, cwd: Path) -> list[Path]:
331
377
  return [directory for directory in _unique_paths([claude_project_dir(root, cwd), claude_legacy_project_dir(root, cwd)]) if directory.exists()]
332
378
 
@@ -63,6 +63,7 @@ def read_fault_facts(provider: str, records: list[dict[str, Any]]) -> list[dict[
63
63
 
64
64
 
65
65
  def _reader_for(provider: str, registry: Any = None) -> Any:
66
+ provider = _reader_provider(provider)
66
67
  if provider in _READER_CACHE:
67
68
  return _READER_CACHE[provider]
68
69
  entry = None
@@ -83,4 +84,8 @@ def _reader_for(provider: str, registry: Any = None) -> Any:
83
84
  return module
84
85
 
85
86
 
87
+ def _reader_provider(provider: str) -> str:
88
+ return "claude" if provider == "claude_code" else provider
89
+
90
+
86
91
  __all__ = ["read_turn_state", "read_fault_facts", "get_provider_registry"]
@@ -950,17 +950,20 @@ def _runtime_lock(workspace: Path, name: str, timeout: float = 5.0):
950
950
  lock_path = runtime_dir(workspace) / f"{name}.lock"
951
951
  lock_path.parent.mkdir(parents=True, exist_ok=True)
952
952
  event_log = EventLog(workspace)
953
+ log_lock_events = name != "state-save"
953
954
  start = time.monotonic()
954
955
  with lock_path.open("w", encoding="utf-8") as lock_file:
955
956
  while True:
956
957
  try:
957
958
  fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
958
959
  waited = time.monotonic() - start
959
- event_log.write("runtime.lock_acquired", lock=name, waited_sec=round(waited, 3))
960
+ if log_lock_events:
961
+ event_log.write("runtime.lock_acquired", lock=name, waited_sec=round(waited, 3))
960
962
  break
961
963
  except BlockingIOError:
962
964
  if time.monotonic() - start >= timeout:
963
- event_log.write("runtime.lock_busy", lock=name, timeout_sec=timeout)
965
+ if log_lock_events:
966
+ event_log.write("runtime.lock_busy", lock=name, timeout_sec=timeout)
964
967
  raise RuntimeError(
965
968
  f"{name} is locked by another team-agent process; serialize team-agent {name} calls and retry"
966
969
  )
@@ -969,7 +972,8 @@ def _runtime_lock(workspace: Path, name: str, timeout: float = 5.0):
969
972
  yield
970
973
  finally:
971
974
  fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN)
972
- event_log.write("runtime.lock_released", lock=name)
975
+ if log_lock_events:
976
+ event_log.write("runtime.lock_released", lock=name)
973
977
 
974
978
 
975
979
  def _leader_id(state: dict[str, Any], spec: dict[str, Any]) -> str:
@@ -82,6 +82,7 @@ def capture_agent_session(
82
82
  "predetermined_session_id": agent_state.get("_pending_session_id"),
83
83
  "exclude_session_ids": sorted(exclude_session_ids or set()),
84
84
  "claude_projects_root": agent_state.get("claude_projects_root"),
85
+ "auth_mode": agent_state.get("auth_mode"),
85
86
  }
86
87
  deadline = time.monotonic() + max(timeout_s, 0.0)
87
88
  while True:
@@ -89,7 +90,7 @@ def capture_agent_session(
89
90
  # outer loop owns the polling budget so behaviour stays consistent
90
91
  # whether or not the adapter has its own internal sleep.
91
92
  result = adapter.capture_session_id(agent_id, spawn_context, timeout_s=0)
92
- if isinstance(result, dict) and result.get("session_id"):
93
+ if isinstance(result, dict) and (result.get("session_id") or result.get("rollout_path")):
93
94
  copy_session_metadata(agent_state, result)
94
95
  agent_state.pop("_pending_session_id", None)
95
96
  event_log.write(
@@ -1,10 +1,12 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import hashlib
4
+ import errno
4
5
  import json
5
6
  import os
6
7
  import copy
7
8
  import subprocess
9
+ import time
8
10
  import uuid
9
11
  from datetime import datetime, timezone
10
12
  from pathlib import Path
@@ -488,16 +490,105 @@ def validate_leader_uuid_from_targets(receiver: dict[str, Any], targets: dict[st
488
490
 
489
491
 
490
492
  def save_runtime_state(workspace: Path, state: dict[str, Any]) -> None:
491
- _migrate_state_identity(state, workspace)
492
493
  path = runtime_state_path(workspace)
493
- path.parent.mkdir(parents=True, exist_ok=True)
494
- tmp_path = path.with_name(f"{path.name}.{os.getpid()}.{uuid.uuid4().hex}.tmp")
494
+ cached = _RUNTIME_STATE_CACHE.get(str(path))
495
+ if cached is not None and state == cached:
496
+ return
497
+ _migrate_state_identity(state, workspace)
498
+ cached = _RUNTIME_STATE_CACHE.get(str(path))
499
+ if cached is not None and state == cached:
500
+ return
501
+ if path.exists():
502
+ try:
503
+ existing = json.loads(path.read_text(encoding="utf-8"))
504
+ normalize_agent_session_state(existing)
505
+ _migrate_state_identity(existing, workspace)
506
+ if state == existing:
507
+ _RUNTIME_STATE_CACHE[str(path)] = copy.deepcopy(state)
508
+ return
509
+ except Exception:
510
+ pass
511
+ from team_agent.runtime import _runtime_lock
512
+ with _runtime_lock(workspace, "state-save", timeout=2.0):
513
+ path.parent.mkdir(parents=True, exist_ok=True)
514
+ payload = json.dumps(state, indent=2, ensure_ascii=False)
515
+ delays = [0.05, 0.2, 0.5]
516
+ for attempt in range(len(delays) + 1):
517
+ tmp_path = path.with_name(f"{path.name}.{os.getpid()}.{uuid.uuid4().hex}.tmp")
518
+ try:
519
+ tmp_path.write_text(payload, encoding="utf-8")
520
+ os.replace(tmp_path, path)
521
+ _RUNTIME_STATE_CACHE[str(path)] = copy.deepcopy(state)
522
+ return
523
+ except (PermissionError, OSError) as exc:
524
+ if not _retryable_replace_error(exc) or attempt >= len(delays):
525
+ if _retryable_replace_error(exc):
526
+ _self_heal_runtime_state(workspace, path, payload, state, attempt + 1, exc)
527
+ return
528
+ raise
529
+ from team_agent.events import EventLog
530
+ EventLog(workspace).write(
531
+ "runtime.state.save_retry",
532
+ attempt=attempt + 1,
533
+ errno=getattr(exc, "errno", None),
534
+ errno_name=errno.errorcode.get(getattr(exc, "errno", 0), None),
535
+ error=str(exc),
536
+ )
537
+ time.sleep(delays[attempt])
538
+ finally:
539
+ tmp_path.unlink(missing_ok=True)
540
+
541
+
542
+ def _retryable_replace_error(exc: BaseException) -> bool:
543
+ return isinstance(exc, PermissionError) or (
544
+ isinstance(exc, OSError) and getattr(exc, "errno", None) in {errno.EACCES, errno.EPERM, errno.EBUSY}
545
+ )
546
+
547
+
548
+ def _self_heal_runtime_state(
549
+ workspace: Path,
550
+ path: Path,
551
+ payload: str,
552
+ state: dict[str, Any],
553
+ attempts_used: int,
554
+ original_exc: BaseException,
555
+ ) -> None:
556
+ from team_agent.events import EventLog
557
+ event_log = EventLog(workspace)
558
+ heal_tmp = path.with_name(f"{path.name}.{os.getpid()}.{uuid.uuid4().hex}.heal.tmp")
559
+ backup = path.with_name(f"{path.name}.bak.{os.getpid()}")
560
+ backup_created = False
495
561
  try:
496
- tmp_path.write_text(json.dumps(state, indent=2, ensure_ascii=False), encoding="utf-8")
497
- os.replace(tmp_path, path)
562
+ heal_tmp.write_text(payload, encoding="utf-8")
563
+ try:
564
+ os.replace(path, backup)
565
+ backup_created = True
566
+ except FileNotFoundError:
567
+ backup_created = False
568
+ os.replace(heal_tmp, path)
498
569
  _RUNTIME_STATE_CACHE[str(path)] = copy.deepcopy(state)
570
+ event_log.write(
571
+ "runtime.state.self_healed",
572
+ inode_rebuilt=True,
573
+ attempts_used=attempts_used,
574
+ replace_retries=max(0, attempts_used - 1),
575
+ )
576
+ except Exception as exc:
577
+ if backup_created:
578
+ try:
579
+ os.replace(backup, path)
580
+ except Exception as restore_exc:
581
+ event_log.write("runtime.state.self_heal_restore_failed", error=str(restore_exc))
582
+ event_log.write(
583
+ "runtime.state.save_failed",
584
+ phase="save_runtime_state",
585
+ final_errno=getattr(exc, "errno", getattr(original_exc, "errno", None)),
586
+ error=str(exc),
587
+ retries_used=max(0, attempts_used - 1),
588
+ )
589
+ raise
499
590
  finally:
500
- tmp_path.unlink(missing_ok=True)
591
+ heal_tmp.unlink(missing_ok=True)
501
592
 
502
593
 
503
594
  def save_team_scoped_state(workspace: Path, team_state: dict[str, Any]) -> None: