@team-agent/installer 0.2.2 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,15 +10,28 @@ from team_agent.messaging.deps import (
10
10
  core_render_message,
11
11
  )
12
12
 
13
+ from datetime import datetime, timedelta, timezone
13
14
  from pathlib import Path
14
15
  from typing import Any
15
16
 
17
+
18
+ # Spark MEDIUM sweep #3 (2026-05-26): retry_needed bounded backoff. Each entry is
19
+ # the delay (seconds) BEFORE the attempt with that number runs; attempt 1 was the
20
+ # original delivery, attempt 2 fires 5s after retry_needed, attempt 3 fires 15s
21
+ # after the previous, attempt 4 fires 30s after the previous. _TRUST_RETRY_MAX_ATTEMPTS
22
+ # bounds the total — the 4th retry_needed is terminal and emits
23
+ # leader_panes.trust_auto_answer_exhausted.
24
+ _TRUST_RETRY_BACKOFF_SECONDS = {2: 5, 3: 15, 4: 30}
25
+ _TRUST_RETRY_MAX_ATTEMPTS = 4
26
+
16
27
  def _deliver_pending_message(
17
28
  workspace: Path,
18
29
  state: dict[str, Any],
19
30
  message_id: str,
20
31
  wait_visible: bool = True,
21
32
  timeout: float = 30.0,
33
+ *,
34
+ _trust_retry_attempt: int = 1,
22
35
  ) -> dict[str, Any]:
23
36
  store = MessageStore(workspace)
24
37
  row = next((m for m in store.messages() if m["message_id"] == message_id), None)
@@ -65,9 +78,49 @@ def _deliver_pending_message(
65
78
  attempts=3 if wait_visible else 1,
66
79
  provider=agent_state.get("provider", "fake"),
67
80
  )
81
+ if not injection.get("ok") and injection.get("detected") == "codex_trust_prompt":
82
+ # Gap 29 (Stage 2): opt-in trust auto-answer. The helper enforces both the
83
+ # opt-in flag and a workspace-dir match before sending '1'+Enter, then we
84
+ # retry the original paste once the prompt has actually been dismissed.
85
+ # Bypassed entirely when opt-out (default) — the existing failed envelope
86
+ # is preserved.
87
+ from team_agent.messaging.leader_panes import attempt_trust_auto_answer
88
+ answer = attempt_trust_auto_answer(
89
+ workspace,
90
+ injection.get("pane_id") or target,
91
+ injection.get("pane_capture_tail") or "",
92
+ EventLog(workspace),
93
+ state=state,
94
+ )
95
+ if answer.get("answered"):
96
+ # Spark MEDIUM #4 (2026-05-26): replace the fixed 0.3s sleep with a
97
+ # bounded poll. Slow terminals can take well over a second to clear
98
+ # the trust prompt; sleeping a fixed amount races dismissal and
99
+ # leaves the retry hitting the same codex_trust_prompt state. We
100
+ # poll for prompt dismissal up to 3s; if still present, return a
101
+ # retry_needed envelope and let the upstream scheduler decide
102
+ # whether to back off and try again later.
103
+ dismissed = _wait_for_trust_prompt_dismissal(
104
+ injection.get("pane_id") or target, timeout=3.0,
105
+ )
106
+ if not dismissed:
107
+ return _handle_trust_retry_needed(
108
+ workspace, state, store, message_id, target, injection,
109
+ attempt=_trust_retry_attempt,
110
+ )
111
+ injection = _tmux_inject_text(
112
+ target,
113
+ text,
114
+ "Enter",
115
+ f"team-agent-send-{message_id}-trust-retry",
116
+ attempts=3 if wait_visible else 1,
117
+ provider=agent_state.get("provider", "fake"),
118
+ )
68
119
  if injection["ok"]:
69
120
  store.mark(message_id, "submitted")
70
- EventLog(workspace).write(
121
+ send_event_log = EventLog(workspace)
122
+ _stamp_first_send_at_if_leader_to_worker(state, row, send_event_log)
123
+ send_event_log.write(
71
124
  "send.submitted",
72
125
  message_id=message_id,
73
126
  target=target,
@@ -112,9 +165,248 @@ def _deliver_pending_message(
112
165
  "turn_verification": injection.get("turn_verification"),
113
166
  "paste_attempts": injection.get("attempts"),
114
167
  "submit_attempts": injection.get("submit_attempts"),
168
+ "detected": injection.get("detected"),
169
+ "pane_id": injection.get("pane_id"),
170
+ "pane_mode": injection.get("pane_mode"),
171
+ "pane_capture_tail": injection.get("pane_capture_tail"),
115
172
  }
116
173
 
117
174
 
175
def _handle_trust_retry_needed(
    workspace: Path,
    state: dict[str, Any],
    store: MessageStore,
    message_id: str,
    target: str,
    injection: dict[str, Any],
    *,
    attempt: int,
) -> dict[str, Any]:
    """Schedule a bounded-backoff trust retry, or give up once attempts run out.

    ``attempt`` is the number of the delivery that JUST failed (1 = the
    original delivery; 2..4 = the scheduler-fired retries). ``state`` is not
    read by this function.

    Behaviour:
      * ``attempt + 1`` still within ``_TRUST_RETRY_MAX_ATTEMPTS``: add a
        ``trust_retry`` scheduled event for the message, mark the message
        ``failed`` with reason ``trust_retry_scheduled``, write the
        ``leader_panes.trust_auto_answer_retry_needed`` and
        ``leader_panes.trust_auto_answer_retry_scheduled`` events, and return
        an envelope with ``status='retry_scheduled'``.
      * otherwise (terminal): mark the message ``failed`` with reason
        ``trust_auto_answer_exhausted``, write
        ``leader_panes.trust_auto_answer_exhausted``, and return an envelope
        with ``status='trust_auto_answer_exhausted'``.

    Both envelopes carry ``ok=False`` plus the ``detected`` / ``pane_id`` /
    ``pane_mode`` / ``pane_capture_tail`` values copied from ``injection``.
    """
    event_log = EventLog(workspace)
    # Diagnostic fields from the failed injection, appended to either envelope.
    pane_details = {
        key: injection.get(key)
        for key in ("detected", "pane_id", "pane_mode", "pane_capture_tail")
    }
    upcoming_attempt = attempt + 1

    if upcoming_attempt > _TRUST_RETRY_MAX_ATTEMPTS:
        # Terminal branch: no further retry is scheduled.
        store.mark(message_id, "failed", "trust_auto_answer_exhausted")
        event_log.write(
            "leader_panes.trust_auto_answer_exhausted",
            message_id=message_id,
            workspace=str(workspace),
            attempts=attempt,
            target=target,
            pane_id=pane_details["pane_id"],
            reason="trust_auto_answer_exhausted",
        )
        return {
            "ok": False,
            "status": "trust_auto_answer_exhausted",
            "reason": "trust_auto_answer_exhausted",
            "attempts": attempt,
            **pane_details,
        }

    # Attempts without an explicit table entry fall back to the delay
    # configured for the final attempt.
    delay_seconds = _TRUST_RETRY_BACKOFF_SECONDS.get(
        upcoming_attempt,
        _TRUST_RETRY_BACKOFF_SECONDS[_TRUST_RETRY_MAX_ATTEMPTS],
    )
    retry_at = (datetime.now(timezone.utc) + timedelta(seconds=delay_seconds)).isoformat()
    retry_payload = {
        "message_id": message_id,
        "attempt": upcoming_attempt,
        "max_attempts": _TRUST_RETRY_MAX_ATTEMPTS,
        "first_target": target,
    }
    scheduled_id = store.add_scheduled_event(
        retry_at,
        message_id,
        "trust_retry",
        retry_payload,
        owner_team_id=_message_owner_team_id(store, message_id),
    )

    # Hold the message in 'failed' so _deliver_pending_messages does not race
    # the scheduled retry. The scheduler consumer resets it to 'accepted' just
    # before re-delivery.
    store.mark(message_id, "failed", "trust_retry_scheduled")

    not_dismissed = "trust_prompt_not_dismissed_after_answer"
    event_log.write(
        "leader_panes.trust_auto_answer_retry_needed",
        message_id=message_id,
        workspace=str(workspace),
        pane_id=pane_details["pane_id"] or target,
        target=target,
        reason=not_dismissed,
        attempt=attempt,
    )
    event_log.write(
        "leader_panes.trust_auto_answer_retry_scheduled",
        message_id=message_id,
        workspace=str(workspace),
        scheduled_event_id=scheduled_id,
        due_at=retry_at,
        next_attempt=upcoming_attempt,
        max_attempts=_TRUST_RETRY_MAX_ATTEMPTS,
        backoff_seconds=delay_seconds,
    )
    return {
        "ok": False,
        "status": "retry_scheduled",
        "reason": not_dismissed,
        "stage": "trust_auto_answer_dismissal_wait",
        "verification": not_dismissed,
        "scheduled_event_id": scheduled_id,
        "scheduled_retry_at": retry_at,
        "next_attempt": upcoming_attempt,
        "max_attempts": _TRUST_RETRY_MAX_ATTEMPTS,
        **pane_details,
    }
274
+
275
+
276
def _message_owner_team_id(store: MessageStore, message_id: str) -> str | None:
    """Return the message's ``owner_team_id`` as a string, or ``None``.

    ``None`` is returned both when ``_message_by_id`` finds no row and when
    the row's ``owner_team_id`` value is missing or falsy.
    """
    record = _message_by_id(store, message_id)
    owner = record.get("owner_team_id") if record else None
    return str(owner) if owner else None
282
+
283
+
284
def _execute_trust_retry(
    workspace: Path,
    store: MessageStore,
    event_log: EventLog,
    payload: dict[str, Any],
    *,
    owner_team_id: str | None = None,
) -> dict[str, Any]:
    """Scheduler-side consumer for scheduled events of kind ``trust_retry``.

    Reads ``message_id`` / ``attempt`` / ``max_attempts`` from ``payload``,
    marks the message ``accepted`` again, writes
    ``leader_panes.trust_auto_answer_retry_attempted`` and re-runs
    ``_deliver_pending_message`` with ``_trust_retry_attempt`` set to the
    payload's attempt number. The delivery result is returned unchanged.

    Early exits (both return ``ok=False``):
      * no ``message_id`` in the payload -> reason
        ``trust_retry_missing_message_id`` (nothing is logged);
      * the message row cannot be found -> reason ``message_missing`` after
        writing ``leader_panes.trust_auto_answer_retry_skipped``.

    When ``owner_team_id`` is given and the runtime state holds a dict for it
    under ``state['teams']``, delivery runs against that team-scoped dict
    instead of the whole runtime state.
    """
    from team_agent.state import load_runtime_state

    message_id = str(payload.get("message_id") or "")
    if not message_id:
        return {"ok": False, "reason": "trust_retry_missing_message_id"}

    attempt = int(payload.get("attempt") or 1)
    if not _message_by_id(store, message_id):
        event_log.write(
            "leader_panes.trust_auto_answer_retry_skipped",
            message_id=message_id,
            reason="message_missing",
            attempt=attempt,
        )
        return {"ok": False, "reason": "message_missing"}

    # Reset to accepted so claim_for_delivery succeeds. The previous attempt
    # left the row in 'failed' status with reason='trust_retry_scheduled'.
    store.mark(message_id, "accepted", "trust_retry_resuming")
    event_log.write(
        "leader_panes.trust_auto_answer_retry_attempted",
        message_id=message_id,
        workspace=str(workspace),
        attempt=attempt,
        max_attempts=int(payload.get("max_attempts") or _TRUST_RETRY_MAX_ATTEMPTS),
    )

    delivery_state = load_runtime_state(workspace)
    teams = delivery_state.get("teams") if owner_team_id else None
    if isinstance(teams, dict):
        team_state = teams.get(owner_team_id)
        if isinstance(team_state, dict):
            # Narrow to the owning team's slice of the runtime state.
            delivery_state = team_state

    return _deliver_pending_message(
        workspace,
        delivery_state,
        message_id,
        wait_visible=True,
        timeout=30.0,
        _trust_retry_attempt=attempt,
    )
332
+
333
+
334
+ def _stamp_first_send_at_if_leader_to_worker(
335
+ state: dict[str, Any],
336
+ row: dict[str, Any],
337
+ event_log: EventLog | None = None,
338
+ ) -> None:
339
+ """Route B atomicity (2026-05-27): record the first time the leader
340
+ successfully sends work to each worker. The presence of this stamp drives
341
+ restart's resumability decision — a worker the leader has interacted with
342
+ has accumulated conversation state, so a missing session_id at restart
343
+ time IS an atomicity violation. A worker that has never received work
344
+ legitimately fresh-starts during restart.
345
+
346
+ Only stamped once per worker (idempotent across re-sends). Only fires on
347
+ leader -> worker sends; worker-to-worker peer messages do not count.
348
+ The mutation lives on the state dict the caller already saves
349
+ (`save_team_scoped_state` in send.py, or `save_runtime_state` after
350
+ coordinator_tick), so persistence is automatic.
351
+
352
+ C1 (cr verdict, 2026-05-27): when the stamp transitions null -> ts (the
353
+ one-time write), emit a `worker.first_interaction` audit event with
354
+ worker_id, first_send_at, message_id. Re-sends to the same worker hit the
355
+ idempotency guard above and do NOT re-emit. Worker-to-worker peer sends
356
+ short-circuit at the sender check and do NOT emit.
357
+ """
358
+ sender = str(row.get("sender") or "")
359
+ recipient = str(row.get("recipient") or "")
360
+ if not recipient:
361
+ return
362
+ leader_id = str((state.get("leader") or {}).get("id") or "leader")
363
+ if sender not in {"leader", "Leader", leader_id}:
364
+ return
365
+ agents = state.get("agents")
366
+ if not isinstance(agents, dict):
367
+ return
368
+ agent_state = agents.get(recipient)
369
+ if not isinstance(agent_state, dict):
370
+ return
371
+ if agent_state.get("first_send_at"):
372
+ return
373
+ stamp = datetime.now(timezone.utc).isoformat()
374
+ agent_state["first_send_at"] = stamp
375
+ if event_log is not None:
376
+ event_log.write(
377
+ "worker.first_interaction",
378
+ worker_id=recipient,
379
+ first_send_at=stamp,
380
+ message_id=str(row.get("message_id") or ""),
381
+ )
382
+
383
+
384
def _wait_for_trust_prompt_dismissal(target: str, *, timeout: float = 3.0, poll_interval: float = 0.1) -> bool:
    """Poll the pane until the trust prompt is no longer detected.

    Each iteration captures the pane via ``_capture_pane_tail`` and runs
    ``detect_non_input_scrollback`` on it. Returns ``True`` as soon as the
    detector reports anything other than ``"codex_trust_prompt"``; returns
    ``False`` when the prompt is still reported once ``timeout`` seconds
    (negative values are treated as 0) have elapsed. The pane is always
    checked at least once, and ``poll_interval`` seconds are slept between
    checks.

    NOTE(review): a failed capture reaches the detector as an empty string;
    whether that counts as "dismissed" depends on the detector — confirm.
    """
    import time as _time
    from team_agent.messaging.tmux_prompt import detect_non_input_scrollback

    give_up_at = _time.monotonic() + max(timeout, 0.0)
    while detect_non_input_scrollback(_capture_pane_tail(target)) == "codex_trust_prompt":
        if _time.monotonic() >= give_up_at:
            return False
        _time.sleep(poll_interval)
    return True
400
+
401
+
402
def _capture_pane_tail(target: str) -> str:
    """Return the captured pane text for ``target``, or ``""`` on failure.

    Delegates to ``_capture_tmux_pane_text``; a result without a truthy
    ``ok`` or without a ``capture`` value yields the empty string.
    """
    from team_agent.messaging.deps import _capture_tmux_pane_text

    result = _capture_tmux_pane_text(target)
    return str(result.get("capture") or "") if result.get("ok") else ""
408
+
409
+
118
410
  def _deliver_pending_messages(workspace: Path, state: dict[str, Any], event_log: EventLog) -> list[str]:
119
411
  store = MessageStore(workspace)
120
412
  delivered: list[str] = []
@@ -251,6 +251,19 @@ def _send_to_leader_receiver(
251
251
  f"team-agent-leader-receiver-{message_id}",
252
252
  provider=receiver.get("provider", "codex"),
253
253
  )
254
+ if not injection.get("ok") and injection.get("detected") == "codex_trust_prompt":
255
+ from team_agent.messaging.trust_auto_answer import retry_injection_after_trust_auto_answer
256
+ injection = retry_injection_after_trust_auto_answer(
257
+ workspace,
258
+ state,
259
+ event_log,
260
+ injection,
261
+ target,
262
+ text,
263
+ submit_key,
264
+ f"team-agent-leader-receiver-{message_id}-trust-retry",
265
+ receiver.get("provider", "codex"),
266
+ )
254
267
  if injection["ok"]:
255
268
  store.mark(message_id, "submitted")
256
269
  event_log.write(
@@ -466,10 +479,6 @@ def _format_team_agent_message(payload: dict[str, Any]) -> str:
466
479
 
467
480
 
468
481
 
469
-
470
-
471
-
472
-
473
482
 
474
483
 
475
484
 
@@ -0,0 +1,216 @@
1
+ """Gap 28 (Slice 2 Stage 2): observe-only detection of leader-pane API errors.
2
+
3
+ The coordinator tick captures the leader pane scrollback once per cycle, scans it for
4
+ known upstream-API error patterns (Claude/Codex CLI errors that occur mid-turn), and
5
+ emits a structured `leader.api_error` audit event. The intent is observability — auto-
6
+ retry belongs to the upstream CLI; this module never touches the pane.
7
+
8
+ Event schema (logged via EventLog.write):
9
+
10
+ event: 'leader.api_error'
11
+ ts: ISO-8601 UTC (added by EventLog)
12
+ leader_session_uuid: str | None
13
+ error_class: 'Overloaded' | 'RateLimit' | 'Timeout' |
14
+ 'NetworkError' | 'Unknown'
15
+ provider: 'claude' | 'codex' | 'claude_code' | str | None
16
+ partial_response_streamed: bool (heuristic: assistant text before the error)
17
+ worker_dispatch_just_before: list[str] (leader→worker msg_ids in the prior 60s)
18
+ retry_count: int (always 0 — the framework does not retry today)
19
+ matched_pattern_snippet: str (the captured error line, ≤160 chars)
20
+
21
+ Detection dedupes within the coordinator state via a (error_class, snippet-tail)
22
+ fingerprint stored under `state['coordinator']['last_api_error_fingerprint']`. A
23
+ clean tick (no error pattern present) clears the fingerprint so the next genuine
24
+ error re-emits. This keeps event volume bounded while still catching distinct
25
+ errors as they occur.
26
+ """
27
+ from __future__ import annotations
28
+
29
+ import re
30
+ from datetime import datetime, timedelta, timezone
31
+ from pathlib import Path
32
+ from typing import Any, Callable
33
+
34
+ from team_agent.events import EventLog
35
+ from team_agent.message_store import MessageStore
36
+
37
+
38
+ # Spark MEDIUM sweeps (2026-05-26):
39
+ # (#3) Require an API/provider context marker near the error keyword. Bare '503' /
40
+ # 'fetch failed' / 'timed out' in user text used to false-fire.
41
+ # (#7) Match across short sliding windows of 1-3 adjacent lines so wrapped tmux
42
+ # output ("claude:\n request timed out") still resolves to a single
43
+ # detection. Window joined with a single space; capped at _WINDOW_MAX_CHARS
44
+ # so the scan stays bounded.
45
+ _API_CONTEXT = (
46
+ r"(?:API\s+Error|HTTP\s*Error|HTTPError|request\s+failed|"
47
+ r"codex|claude|Anthropic|OpenAI|TypeError)"
48
+ )
49
+
50
+ # Patterns operate against a sliding window of up to 3 joined lines. The window
51
+ # never contains '\n' (lines are joined with a single space), so `[^\n]` and `.`
52
+ # behave the same; we use `[^\n]` for self-documentation.
53
+ _ERROR_PATTERNS: list[tuple[re.Pattern[str], str]] = [
54
+ # Overloaded — keyword itself already includes the "API Error:" prefix.
55
+ (re.compile(r"API\s+Error:\s*Overloaded", re.IGNORECASE), "Overloaded"),
56
+ # RateLimit — a bare 429 must appear AFTER an API context marker; the exact
57
+ # phrase "429 Too Many Requests" is specific enough to match on its own.
58
+ (re.compile(rf"(?:{_API_CONTEXT}[^\n]*\b429\b|\b429\s+Too\s+Many\s+Requests)", re.IGNORECASE), "RateLimit"),
59
+ # 5xx — must share a window with an API-context marker on either side.
60
+ (re.compile(rf"{_API_CONTEXT}[^\n]{{0,200}}\b5(?:00|02|03|04)\b", re.IGNORECASE), "NetworkError"),
61
+ (re.compile(rf"\b5(?:00|02|03|04)\b[^\n]{{0,200}}{_API_CONTEXT}", re.IGNORECASE), "NetworkError"),
62
+ # fetch failed — needs an API-context marker in the same window. The TypeError
63
+ # marker on its own counts (Node fetch frames the error this way).
64
+ (re.compile(rf"{_API_CONTEXT}[^\n]{{0,200}}fetch\s+failed", re.IGNORECASE), "NetworkError"),
65
+ (re.compile(rf"fetch\s+failed[^\n]{{0,200}}{_API_CONTEXT}", re.IGNORECASE), "NetworkError"),
66
+ # Timeout — likewise requires an API-context marker in the window, except for
67
+ # the unambiguous syscall token ETIMEDOUT.
68
+ (re.compile(rf"{_API_CONTEXT}[^\n]{{0,200}}(?:request|connection)\s+(?:timed\s+out|timeout)", re.IGNORECASE), "Timeout"),
69
+ (re.compile(rf"(?:request|connection)\s+(?:timed\s+out|timeout)[^\n]{{0,200}}{_API_CONTEXT}", re.IGNORECASE), "Timeout"),
70
+ (re.compile(r"\bETIMEDOUT\b", re.IGNORECASE), "Timeout"),
71
+ ]
72
+
73
+ _RECENT_LINE_WINDOW = 100 # scan only the most recent N lines
74
+ _SLIDING_WINDOW_LINES = 3 # join up to 3 adjacent lines per scan window
75
+ _WINDOW_MAX_CHARS = 400 # discard windows beyond this length to bound work
76
+ _DISPATCH_WINDOW_SECONDS = 60 # leader→worker sends counted within this lookback
77
+ _PARTIAL_RESPONSE_HEAD_BYTES = 4000
78
+
79
+ _PARTIAL_RESPONSE_HINT = re.compile(
80
+ r"(?:^|\n)\s*(?:Assistant|⏺|●|> |I'll |I will |I'm |I am |Let me )",
81
+ re.IGNORECASE,
82
+ )
83
+
84
+
85
def detect_leader_api_errors(
    workspace: Path,
    state: dict[str, Any],
    store: MessageStore,
    event_log: EventLog,
    *,
    capture_fn: Callable[[str], dict[str, Any]] | None = None,
    now_fn: Callable[[], datetime] | None = None,
) -> list[dict[str, Any]]:
    """Coordinator-tick entry point. Returns a list of emitted events (0 or 1).

    Captures the leader pane (only when ``state['leader_receiver']`` is in
    ``direct_tmux`` mode and has a ``pane_id``), scans the capture with
    ``_match_first_error`` and, for a newly seen error, writes one
    ``leader.api_error`` event and returns it in a list.

    Deduplication uses ``state['coordinator']['last_api_error_fingerprint']``:
    an error whose fingerprint equals the stored one is not re-emitted, and a
    capture with no error resets a stored fingerprint to ``None``.

    ``capture_fn`` and ``now_fn`` replace the default pane capture and clock.
    ``workspace`` is not read by this function.
    """
    receiver = state.get("leader_receiver") or {}
    if receiver.get("mode") != "direct_tmux":
        return []
    pane = receiver.get("pane_id")
    if not pane:
        return []

    grab = capture_fn or _default_capture_fn()
    captured = grab(str(pane))
    if not captured.get("ok"):
        return []
    scrollback = str(captured.get("capture") or "")

    coordinator = state.setdefault("coordinator", {})
    hit = _match_first_error(scrollback)
    if not hit:
        # Clean tick: forget the last fingerprint so the next error re-emits.
        if coordinator.get("last_api_error_fingerprint"):
            coordinator["last_api_error_fingerprint"] = None
        return []

    error_class, snippet = hit
    fingerprint = f"{error_class}::{snippet[-120:]}"
    if coordinator.get("last_api_error_fingerprint") == fingerprint:
        return []
    coordinator["last_api_error_fingerprint"] = fingerprint

    now = now_fn() if now_fn else datetime.now(timezone.utc)
    cutoff_iso = (now - timedelta(seconds=_DISPATCH_WINDOW_SECONDS)).isoformat()

    # Prefer the team owner's session uuid, then the receiver's, else None.
    owner_uuid = str((state.get("team_owner") or {}).get("leader_session_uuid") or "")
    leader_uuid = owner_uuid or str(receiver.get("leader_session_uuid") or "") or None
    provider = str(receiver.get("provider") or "") or None

    streamed = _scrollback_has_partial_response(scrollback, snippet)
    dispatched = _recent_leader_dispatches(store, cutoff_iso)
    emitted = event_log.write(
        "leader.api_error",
        leader_session_uuid=leader_uuid,
        error_class=error_class,
        provider=provider,
        partial_response_streamed=streamed,
        worker_dispatch_just_before=dispatched,
        retry_count=0,
        matched_pattern_snippet=snippet[:160],
    )
    return [emitted]
134
+
135
+
136
def _default_capture_fn() -> Callable[[str], dict[str, Any]]:
    """Return the tmux pane-capture function used when no ``capture_fn`` is given.

    The import happens at call time rather than at module import.
    """
    from team_agent.messaging.deps import _capture_tmux_pane_text as pane_capture

    return pane_capture
139
+
140
+
141
def _match_first_error(scrollback: str) -> tuple[str, str] | None:
    """Find the most recent API-error match in the tail of ``scrollback``.

    Only the last ``_RECENT_LINE_WINDOW`` lines are scanned. Each scan window
    is 1.._SLIDING_WINDOW_LINES adjacent stripped lines joined with a single
    space (blank lines dropped), so a wrapped pair such as
        claude:
          request timed out
    is detected as one event without permitting unbounded cross-line matches.
    A window longer than ``_WINDOW_MAX_CHARS`` keeps only its tail, because
    errors tend to land at the END of verbose diagnostics.

    Returns ``(error_class, snippet)`` for the window with the latest starting
    line (the shortest matching window for that start), or ``None`` when no
    pattern matches. Within a window the first pattern in ``_ERROR_PATTERNS``
    that matches decides the class.

    The snippet is at most 240 characters. It is the head of the window when
    the match lies entirely inside that head; otherwise it starts at the
    match, so a tail-preserved window cannot yield a snippet that omits the
    text that triggered the detection.
    """
    if not scrollback:
        return None
    lines = [line.strip() for line in scrollback.splitlines()[-_RECENT_LINE_WINDOW:]]
    snippet_limit = 240
    # Latest start wins, so scan starts newest-first and stop at the first hit.
    for start in range(len(lines) - 1, -1, -1):
        for size in range(1, _SLIDING_WINDOW_LINES + 1):
            end = start + size
            if end > len(lines):
                break
            window = " ".join(line for line in lines[start:end] if line)
            if not window:
                continue
            if len(window) > _WINDOW_MAX_CHARS:
                # Bound regex cost while keeping the freshest context.
                window = window[-_WINDOW_MAX_CHARS:]
            for pattern, error_class in _ERROR_PATTERNS:
                match = pattern.search(window)
                if not match:
                    continue
                if match.end() <= snippet_limit:
                    snippet = window[:snippet_limit]
                else:
                    # The match sits past the head: anchor the snippet on it.
                    snippet = window[match.start(): match.start() + snippet_limit]
                return error_class, snippet
    return None
183
+
184
+
185
def _scrollback_has_partial_response(scrollback: str, error_snippet: str) -> bool:
    """Heuristic: did assistant output appear shortly before the error?

    Locates the last occurrence of ``error_snippet`` in ``scrollback`` and
    searches the preceding ``_PARTIAL_RESPONSE_HEAD_BYTES`` characters for
    ``_PARTIAL_RESPONSE_HINT``. Returns ``False`` when the snippet cannot be
    located at all.

    The snippet produced by the error scan is whitespace-normalised (stripped
    lines joined with single spaces), whereas the raw scrollback keeps its
    newlines and indentation. An exact ``rfind`` therefore misses wrapped,
    multi-line errors, so a whitespace-tolerant search is used as a fallback.
    """
    idx = scrollback.rfind(error_snippet)
    if idx == -1:
        idx = _rfind_ignoring_whitespace_layout(scrollback, error_snippet)
    if idx == -1:
        return False
    head = scrollback[max(0, idx - _PARTIAL_RESPONSE_HEAD_BYTES): idx]
    return bool(_PARTIAL_RESPONSE_HINT.search(head))


def _rfind_ignoring_whitespace_layout(haystack: str, needle: str) -> int:
    """Return the start of the last match of ``needle`` in ``haystack``,
    treating every whitespace run in ``needle`` as matching any whitespace run
    (including newlines). Returns -1 when there is no match or ``needle``
    contains no non-whitespace text."""
    tokens = needle.split()
    if not tokens:
        return -1
    flexible = re.compile(r"\s+".join(re.escape(token) for token in tokens))
    position = -1
    for found in flexible.finditer(haystack):
        position = found.start()
    return position
191
+
192
+
193
def _recent_leader_dispatches(store: MessageStore, cutoff_iso: str) -> list[str]:
    """Return ids of leader-sent messages created at or after ``cutoff_iso``.

    A row counts when its ``sender`` is ``"leader"`` / ``"Leader"`` or passes
    ``_looks_like_leader_sender``, its ``created_at`` is non-empty and does
    not sort before ``cutoff_iso`` (plain string comparison), and it has a
    non-empty ``message_id``. Best-effort: if ``store.messages()`` raises, an
    empty list is returned.
    """
    try:
        rows = store.messages()
    except Exception:
        return []

    recent_ids: list[str] = []
    for row in rows:
        sender = str(row.get("sender") or "")
        from_leader = sender in {"leader", "Leader"} or _looks_like_leader_sender(sender)
        created_at = str(row.get("created_at") or "")
        message_id = str(row.get("message_id") or "")
        if from_leader and created_at and created_at >= cutoff_iso and message_id:
            recent_ids.append(message_id)
    return recent_ids
210
+
211
+
212
+ def _looks_like_leader_sender(sender: str) -> bool:
213
+ return sender.startswith("leader") or sender.lower() == "leader"
214
+
215
+
216
+ __all__ = ["detect_leader_api_errors"]