@team-agent/installer 0.2.1 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. package/package.json +1 -1
  2. package/schemas/team.schema.json +6 -0
  3. package/src/team_agent/approvals/runtime_prompts.py +1 -1
  4. package/src/team_agent/cli/commands.py +122 -6
  5. package/src/team_agent/cli/parser.py +42 -1
  6. package/src/team_agent/coordinator/__main__.py +21 -2
  7. package/src/team_agent/coordinator/lifecycle.py +11 -0
  8. package/src/team_agent/diagnose/orphan_cleanup.py +364 -0
  9. package/src/team_agent/events.py +47 -0
  10. package/src/team_agent/launch/core.py +2 -1
  11. package/src/team_agent/leader/__init__.py +273 -60
  12. package/src/team_agent/lifecycle/agents.py +54 -2
  13. package/src/team_agent/lifecycle/operations.py +87 -9
  14. package/src/team_agent/lifecycle/start.py +1 -1
  15. package/src/team_agent/message_store/core.py +8 -7
  16. package/src/team_agent/message_store/leader_notification_log.py +132 -0
  17. package/src/team_agent/message_store/result_watchers.py +144 -1
  18. package/src/team_agent/message_store/schema.py +31 -2
  19. package/src/team_agent/messaging/delivery.py +293 -1
  20. package/src/team_agent/messaging/idle_alerts.py +109 -9
  21. package/src/team_agent/messaging/leader.py +179 -10
  22. package/src/team_agent/messaging/leader_api_errors.py +216 -0
  23. package/src/team_agent/messaging/leader_panes.py +393 -23
  24. package/src/team_agent/messaging/result_delivery.py +219 -4
  25. package/src/team_agent/messaging/results.py +12 -21
  26. package/src/team_agent/messaging/scheduler.py +24 -2
  27. package/src/team_agent/messaging/send.py +21 -26
  28. package/src/team_agent/messaging/tmux_io.py +153 -23
  29. package/src/team_agent/messaging/tmux_prompt.py +87 -0
  30. package/src/team_agent/messaging/trust_auto_answer.py +44 -0
  31. package/src/team_agent/restart/orchestration.py +207 -4
  32. package/src/team_agent/runtime.py +7 -7
  33. package/src/team_agent/rust_core.py +157 -3
  34. package/src/team_agent/sessions/capture.py +65 -15
  35. package/src/team_agent/spec.py +59 -0
  36. package/src/team_agent/state.py +153 -10
  37. package/src/team_agent/status/inbox.py +33 -3
  38. package/src/team_agent/status/queries.py +32 -1
  39. package/src/team_agent/watch/__init__.py +145 -0
@@ -1,5 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import hashlib
4
+
3
5
  from team_agent.messaging.deps import (
4
6
  EventLog,
5
7
  MessageStore,
@@ -10,8 +12,10 @@ from team_agent.messaging.deps import (
10
12
  _validate_leader_receiver,
11
13
  core_render_message,
12
14
  json,
15
+ os,
13
16
  runtime_dir,
14
17
  save_runtime_state,
18
+ team_state_key,
15
19
  time,
16
20
  )
17
21
 
@@ -49,6 +53,19 @@ def _leader_inbox_path(workspace: Path) -> Path:
49
53
  return runtime_dir(workspace) / "leader-inbox.log"
50
54
 
51
55
 
56
+ def _extract_result_id_from_content(content: str) -> str | None:
57
+ """Stage 12: result-notification messages embed a `Result id: <id>` line; the gate
58
+ parses it from content so callers that did NOT plumb the result_id kwarg through
59
+ still consult the dedupe gate. Format mirrors _format_report_result_notification and
60
+ format_result_watcher_notification."""
61
+ if not content:
62
+ return None
63
+ for line in content.splitlines():
64
+ if line.startswith("Result id: "):
65
+ return line.removeprefix("Result id: ").strip() or None
66
+ return None
67
+
68
+
52
69
  def _send_to_leader_receiver(
53
70
  workspace: Path,
54
71
  state: dict[str, Any],
@@ -58,6 +75,8 @@ def _send_to_leader_receiver(
58
75
  sender: str,
59
76
  requires_ack: bool,
60
77
  event_log: EventLog,
78
+ *,
79
+ result_id: str | None = None,
61
80
  ) -> dict[str, Any]:
62
81
  store = MessageStore(workspace)
63
82
  message_id = store.create_message(task_id, sender, leader_id, content, requires_ack=False)
@@ -94,10 +113,30 @@ def _send_to_leader_receiver(
94
113
  error="No direct leader tmux pane is attached. Run team-agent attach-leader.",
95
114
  )
96
115
 
97
- validation = _validate_leader_receiver(receiver)
116
+ owner_identity = state.get("team_owner") or None
117
+ side_pane_refusal = _side_pane_owner_refusal(state, owner_identity)
118
+ if side_pane_refusal:
119
+ event_log.write("leader_receiver.side_pane_refused", **side_pane_refusal)
120
+ return {
121
+ "ok": False,
122
+ "message_id": message_id,
123
+ "status": "refused",
124
+ "to": leader_id,
125
+ "channel": "direct_tmux",
126
+ **side_pane_refusal,
127
+ }
128
+ receiver_for_validation = dict(receiver)
129
+ if owner_identity and owner_identity.get("leader_session_uuid") and not receiver_for_validation.get("leader_session_uuid"):
130
+ receiver_for_validation["leader_session_uuid"] = owner_identity["leader_session_uuid"]
131
+ validation = _validate_leader_receiver(receiver_for_validation)
98
132
  if not validation["ok"]:
99
- owner_identity = state.get("team_owner") or None
100
- rediscovery = _rediscover_leader_receiver(receiver, event_log, owner_identity)
133
+ rediscovery = _rediscover_leader_receiver(
134
+ receiver_for_validation,
135
+ event_log,
136
+ owner_identity,
137
+ invalidation_reason=validation.get("reason"),
138
+ team_id=team_state_key(state),
139
+ )
101
140
  if rediscovery.get("status") == "updated":
102
141
  state["leader_receiver"].update(rediscovery["receiver"])
103
142
  receiver = state["leader_receiver"]
@@ -111,7 +150,7 @@ def _send_to_leader_receiver(
111
150
  payload,
112
151
  event_log,
113
152
  reason="ambiguous",
114
- error="multiple possible leader panes found; rerun team-agent attach-leader --pane <pane_id>",
153
+ error="multiple possible leader panes found; run team-agent claim-leader --confirm from the intended pane",
115
154
  message_status="ambiguous",
116
155
  )
117
156
  if not validation["ok"]:
@@ -128,6 +167,69 @@ def _send_to_leader_receiver(
128
167
  state["leader_receiver"].update(validation["pane"])
129
168
  submit_key, submit_reason = _choose_leader_submit_key(receiver.get("provider", "codex"), validation.get("capture", ""))
130
169
  target = receiver["pane_id"]
170
+ # Stage 12 (Gap 26 ∩ Gap 32 roundtable 2026-05-26) — injection-boundary dedupe gate.
171
+ # Result-notification injections route through claim_leader_notification_delivery; the
172
+ # gate suppresses a second inject for the same (result_id, leader_session_uuid).
173
+ # Non-result messages (peer mirror, idle reminder, ambiguous-prompt) lack a "Result id:"
174
+ # line in their text and bypass the gate.
175
+ effective_result_id = result_id or _extract_result_id_from_content(content)
176
+ leader_uuid_for_gate = str(
177
+ (state.get("team_owner") or {}).get("leader_session_uuid")
178
+ or (state.get("leader_receiver") or {}).get("leader_session_uuid")
179
+ or ""
180
+ )
181
+ if effective_result_id and leader_uuid_for_gate:
182
+ from team_agent.message_store.leader_notification_log import claim_leader_notification_delivery
183
+ envelope_hash = hashlib.sha256(content.encode("utf-8", errors="ignore")).hexdigest()[:16]
184
+ claim = claim_leader_notification_delivery(
185
+ store,
186
+ result_id=effective_result_id,
187
+ leader_session_uuid=leader_uuid_for_gate,
188
+ proposed_message_id=message_id,
189
+ envelope_hash=envelope_hash,
190
+ owner_team_id=team_state_key(state),
191
+ pane_id=target,
192
+ )
193
+ if claim["status"] == "already_notified_by":
194
+ prev_msg = claim.get("notified_message_id")
195
+ prev_hash = claim.get("envelope_content_hash")
196
+ if envelope_hash == prev_hash:
197
+ event_log.write(
198
+ "leader_notification.dedupe_skip",
199
+ result_id=effective_result_id,
200
+ leader_session_uuid=leader_uuid_for_gate,
201
+ prev_message_id=prev_msg,
202
+ this_message_id=message_id,
203
+ prev_ts=claim.get("notified_at"),
204
+ pane_id=target,
205
+ team_id=team_state_key(state),
206
+ )
207
+ else:
208
+ event_log.write(
209
+ "leader_notification.legitimate_duplicate_suspected",
210
+ result_id=effective_result_id,
211
+ leader_session_uuid=leader_uuid_for_gate,
212
+ prev_message_id=prev_msg,
213
+ this_message_id=message_id,
214
+ prev_envelope_hash=prev_hash,
215
+ this_envelope_hash=envelope_hash,
216
+ pane_id=target,
217
+ team_id=team_state_key(state),
218
+ )
219
+ store.mark(message_id, "submitted", "dedupe_suppressed_by_leader_notification_log")
220
+ save_runtime_state(workspace, state)
221
+ return {
222
+ "ok": True,
223
+ "message_id": message_id,
224
+ "status": "submitted",
225
+ "to": leader_id,
226
+ "channel": "direct_tmux",
227
+ "leader_receiver": state["leader_receiver"],
228
+ "visible": False,
229
+ "submitted": False,
230
+ "deduped": True,
231
+ "canonical_message_id": prev_msg,
232
+ }
131
233
  event_log.write(
132
234
  "leader_receiver.deliver_attempt",
133
235
  message_id=message_id,
@@ -139,6 +241,8 @@ def _send_to_leader_receiver(
139
241
  visible_token=rendered.get("token"),
140
242
  payload=payload,
141
243
  warning=validation.get("warning"),
244
+ result_id=effective_result_id,
245
+ leader_session_uuid=leader_uuid_for_gate or None,
142
246
  )
143
247
  injection = _tmux_inject_text(
144
248
  target,
@@ -147,6 +251,19 @@ def _send_to_leader_receiver(
147
251
  f"team-agent-leader-receiver-{message_id}",
148
252
  provider=receiver.get("provider", "codex"),
149
253
  )
254
+ if not injection.get("ok") and injection.get("detected") == "codex_trust_prompt":
255
+ from team_agent.messaging.trust_auto_answer import retry_injection_after_trust_auto_answer
256
+ injection = retry_injection_after_trust_auto_answer(
257
+ workspace,
258
+ state,
259
+ event_log,
260
+ injection,
261
+ target,
262
+ text,
263
+ submit_key,
264
+ f"team-agent-leader-receiver-{message_id}-trust-retry",
265
+ receiver.get("provider", "codex"),
266
+ )
150
267
  if injection["ok"]:
151
268
  store.mark(message_id, "submitted")
152
269
  event_log.write(
@@ -201,6 +318,64 @@ def _send_to_leader_receiver(
201
318
  )
202
319
 
203
320
 
321
+ def _side_pane_owner_refusal(state: dict[str, Any], owner_identity: dict[str, Any] | None) -> dict[str, Any] | None:
322
+ owner_uuid = str((owner_identity or {}).get("leader_session_uuid") or "")
323
+ caller_uuid = os.environ.get("TEAM_AGENT_LEADER_SESSION_UUID") or os.environ.get("TEAM_AGENT_LEADER_SESSION_UUID_OVERRIDE") or ""
324
+ if not owner_uuid or not caller_uuid or caller_uuid == owner_uuid:
325
+ return None
326
+ bound_pane = (state.get("leader_receiver") or {}).get("pane_id") or (owner_identity or {}).get("pane_id")
327
+ team_id = team_state_key(state)
328
+ return {
329
+ "reason": "team_owner_mismatch",
330
+ "error": (
331
+ f"This workspace's team `{team_id}` is already bound to pane `{bound_pane}`. "
332
+ "To work in this window either start a new team with a different team_id, operate through the bound pane, "
333
+ "or run `team-agent claim-leader --confirm` only if you intend to forcibly take over."
334
+ ),
335
+ "bound_pane_id": bound_pane,
336
+ "caller_uuid_prefix": caller_uuid[:8],
337
+ "uuid_prefix": owner_uuid[:8],
338
+ "action": "team-agent claim-leader --confirm",
339
+ }
340
+
341
+
342
def claim_leader_receiver(
    workspace: Path,
    state: dict[str, Any],
    candidate: dict[str, Any],
    event_log: EventLog,
    *,
    confirm: bool,
    expected_epoch: int | None = None,
) -> dict[str, Any]:
    """Rebind the team's leader receiver to ``candidate`` after explicit confirmation.

    Args:
        workspace: Workspace whose runtime state is saved on a successful claim.
        state: Runtime state dict; ``team_owner`` and ``leader_receiver`` are
            read and, on success, updated in place.
        candidate: Description of the pane that wants to become the receiver
            (``pane_id``, ``provider``, ``pane_current_command`` are read here).
        event_log: Receives ``leader_receiver.claim_refused`` /
            ``leader_receiver.claimed`` audit events.
        confirm: Must be true, otherwise the claim is refused immediately.
        expected_epoch: Optional optimistic check; the claim is refused when the
            current owner epoch differs from this value.

    Returns:
        A result dict with ``ok`` and ``status`` (``refused``, ``already_bound``
        or ``claimed``). Refusals carry a ``reason``; successes carry the
        ``leader_receiver`` and ``owner_epoch``.
    """
    if not confirm:
        return {"ok": False, "status": "refused", "reason": "confirm_required", "action": "team-agent claim-leader --confirm"}

    # A stored ``"team_owner": None`` must not crash the claim: ``setdefault`` would
    # hand back that None. Other readers in this module already use
    # ``state.get("team_owner") or ...`` for the same reason.
    owner = state.get("team_owner")
    if not isinstance(owner, dict):
        owner = {}
        state["team_owner"] = owner
    receiver = state.get("leader_receiver") or {}
    current_epoch = int(owner.get("owner_epoch") or receiver.get("owner_epoch") or 0)

    if expected_epoch is not None and current_epoch != expected_epoch:
        event_log.write("leader_receiver.claim_refused", reason="owner_epoch_advanced", owner_epoch=current_epoch, bound_pane_id=receiver.get("pane_id"))
        return {"ok": False, "status": "refused", "reason": "owner_epoch_advanced", "owner_epoch": current_epoch, "bound_pane_id": receiver.get("pane_id")}
    if receiver.get("pane_id") == candidate.get("pane_id"):
        # Same pane as the current binding: nothing to change, epoch is not bumped.
        return {"ok": True, "status": "already_bound", "leader_receiver": receiver, "owner_epoch": current_epoch}

    # Function-scope import, deferred until the pane helpers are actually needed so
    # the cheap refusal / already-bound paths above do not depend on it.
    from team_agent.messaging.leader_panes import (
        _leader_command_looks_usable,
        _receiver_from_target,
        _target_matches_owner_identity,
        _uuid_prefix,
    )

    if not _target_matches_owner_identity(candidate, owner):
        event_log.write("leader_receiver.claim_refused", reason="uuid_mismatch", candidate_pane_id=candidate.get("pane_id"))
        return {"ok": False, "status": "refused", "reason": "uuid_mismatch"}
    provider = str(candidate.get("provider") or receiver.get("provider") or "codex")
    if not _leader_command_looks_usable(str(candidate.get("pane_current_command", "")), provider):
        return {"ok": False, "status": "refused", "reason": "wrong_command", "candidate_pane_id": candidate.get("pane_id")}

    next_epoch = current_epoch + 1
    new_receiver = _receiver_from_target(candidate, provider, owner.get("leader_session_uuid"), next_epoch)
    owner["owner_epoch"] = next_epoch
    state["leader_receiver"] = new_receiver

    # NOTE(review): only the save is serialized by the lock; the epoch comparison and
    # the in-memory mutation above run before it is taken. Confirm callers load
    # ``state`` under the same lock if a strict compare-and-swap is intended.
    from team_agent.runtime import _runtime_lock, save_runtime_state

    with _runtime_lock(workspace, "leader_receiver"):
        save_runtime_state(workspace, state)
    event_log.write("leader_receiver.claimed", pane_id=new_receiver["pane_id"], owner_epoch=next_epoch, uuid_prefix=_uuid_prefix(owner))
    return {"ok": True, "status": "claimed", "leader_receiver": new_receiver, "owner_epoch": next_epoch}
377
+
378
+
204
379
  def _fail_leader_delivery(
205
380
  workspace: Path,
206
381
  state: dict[str, Any],
@@ -302,12 +477,6 @@ def _format_team_agent_message(payload: dict[str, Any]) -> str:
302
477
 
303
478
 
304
479
 
305
-
306
-
307
-
308
-
309
-
310
-
311
480
 
312
481
 
313
482
 
@@ -0,0 +1,216 @@
1
+ """Gap 28 (Slice 2 Stage 2): observe-only detection of leader-pane API errors.
2
+
3
+ The coordinator tick captures the leader pane scrollback once per cycle, scans it for
4
+ known upstream-API error patterns (Claude/Codex CLI errors that occur mid-turn), and
5
+ emits a structured `leader.api_error` audit event. The intent is observability — auto-
6
+ retry belongs to the upstream CLI; this module never touches the pane.
7
+
8
+ Event schema (logged via EventLog.write):
9
+
10
+ event: 'leader.api_error'
11
+ ts: ISO-8601 UTC (added by EventLog)
12
+ leader_session_uuid: str | None
13
+ error_class: 'Overloaded' | 'RateLimit' | 'Timeout' |
14
+ 'NetworkError' | 'Unknown'
15
+ provider: 'claude' | 'codex' | 'claude_code' | str | None
16
+ partial_response_streamed: bool (heuristic: assistant text before the error)
17
+ worker_dispatch_just_before: list[str] (leader→worker msg_ids in the prior 60s)
18
+ retry_count: int (always 0 — the framework does not retry today)
19
+ matched_pattern_snippet: str (the captured error line, ≤160 chars)
20
+
21
+ Detection dedupes within the coordinator state via a (error_class, snippet-tail)
22
+ fingerprint stored under `state['coordinator']['last_api_error_fingerprint']`. A
23
+ clean tick (no error pattern present) clears the fingerprint so the next genuine
24
+ error re-emits. This keeps event volume bounded while still catching distinct
25
+ errors as they occur.
26
+ """
27
+ from __future__ import annotations
28
+
29
+ import re
30
+ from datetime import datetime, timedelta, timezone
31
+ from pathlib import Path
32
+ from typing import Any, Callable
33
+
34
+ from team_agent.events import EventLog
35
+ from team_agent.message_store import MessageStore
36
+
37
+
38
# Spark MEDIUM sweeps (2026-05-26):
#   (#3) Require an API/provider context marker near the error keyword. Bare '503' /
#        'fetch failed' / 'timed out' in user text used to false-fire.
#   (#7) Match across short sliding windows of 1-3 adjacent lines so wrapped tmux
#        output ("claude:\n  request timed out") still resolves to a single
#        detection. Window joined with a single space; capped at _WINDOW_MAX_CHARS
#        so the scan stays bounded.
#
# Regex fragment interpolated into the patterns below. The markers carry no word
# boundaries and every pattern is compiled with re.IGNORECASE, so e.g. "Claude" or
# "codex-cli" anywhere in the window counts as context.
_API_CONTEXT = (
    r"(?:API\s+Error|HTTP\s*Error|HTTPError|request\s+failed|"
    r"codex|claude|Anthropic|OpenAI|TypeError)"
)

# Patterns operate against a sliding window of up to 3 joined lines. The window
# never contains '\n' (lines are joined with a single space), so `[^\n]` and `.`
# behave the same; we use `[^\n]` for self-documentation.
#
# Each entry is (compiled pattern, error_class). _match_first_error tries them in
# list order and takes the first one that matches a window, so order decides the
# class when several patterns could match the same text.
_ERROR_PATTERNS: list[tuple[re.Pattern[str], str]] = [
    # Overloaded — keyword itself already includes the "API Error:" prefix.
    (re.compile(r"API\s+Error:\s*Overloaded", re.IGNORECASE), "Overloaded"),
    # RateLimit — either a 429 that appears AFTER an API context marker, or a 429
    # immediately followed by "Too Many Requests" (specific enough on its own).
    # Unlike the patterns below, the gap after the context marker is unbounded.
    (re.compile(rf"(?:{_API_CONTEXT}[^\n]*\b429\b|\b429\s+Too\s+Many\s+Requests)", re.IGNORECASE), "RateLimit"),
    # 5xx (only 500/502/503/504) — must share a window with an API-context marker,
    # at most 200 characters before or after the status code.
    (re.compile(rf"{_API_CONTEXT}[^\n]{{0,200}}\b5(?:00|02|03|04)\b", re.IGNORECASE), "NetworkError"),
    (re.compile(rf"\b5(?:00|02|03|04)\b[^\n]{{0,200}}{_API_CONTEXT}", re.IGNORECASE), "NetworkError"),
    # fetch failed — needs an API-context marker in the same window. The TypeError
    # marker on its own counts (Node fetch frames the error this way).
    (re.compile(rf"{_API_CONTEXT}[^\n]{{0,200}}fetch\s+failed", re.IGNORECASE), "NetworkError"),
    (re.compile(rf"fetch\s+failed[^\n]{{0,200}}{_API_CONTEXT}", re.IGNORECASE), "NetworkError"),
    # Timeout — likewise requires an API-context marker in the window, except for
    # the unambiguous syscall token ETIMEDOUT.
    (re.compile(rf"{_API_CONTEXT}[^\n]{{0,200}}(?:request|connection)\s+(?:timed\s+out|timeout)", re.IGNORECASE), "Timeout"),
    (re.compile(rf"(?:request|connection)\s+(?:timed\s+out|timeout)[^\n]{{0,200}}{_API_CONTEXT}", re.IGNORECASE), "Timeout"),
    (re.compile(r"\bETIMEDOUT\b", re.IGNORECASE), "Timeout"),
]

_RECENT_LINE_WINDOW = 100  # scan only the most recent N lines
_SLIDING_WINDOW_LINES = 3  # join up to 3 adjacent lines per scan window
_WINDOW_MAX_CHARS = 400  # windows longer than this are cut to their last N chars to bound regex work
_DISPATCH_WINDOW_SECONDS = 60  # leader→worker sends counted within this lookback
# How much scrollback immediately before the error is inspected for assistant text.
# Applied as a str slice, so the unit is characters despite the "BYTES" name.
_PARTIAL_RESPONSE_HEAD_BYTES = 4000

# Heuristic markers for "the assistant had already started answering": a line that
# begins (after optional whitespace) with one of these tokens. Case-insensitive.
_PARTIAL_RESPONSE_HINT = re.compile(
    r"(?:^|\n)\s*(?:Assistant|⏺|●|> |I'll |I will |I'm |I am |Let me )",
    re.IGNORECASE,
)
83
+
84
+
85
def detect_leader_api_errors(
    workspace: Path,
    state: dict[str, Any],
    store: MessageStore,
    event_log: EventLog,
    *,
    capture_fn: Callable[[str], dict[str, Any]] | None = None,
    now_fn: Callable[[], datetime] | None = None,
) -> list[dict[str, Any]]:
    """Coordinator-tick entry point: emit at most one ``leader.api_error`` event.

    Captures the leader pane (only when the receiver mode is ``direct_tmux``),
    looks for a known API error pattern, and writes an audit event unless the
    same error fingerprint was already reported on a previous tick.

    Args:
        workspace: Accepted as part of the tick signature; not read by this function.
        state: Runtime state. ``state["coordinator"]["last_api_error_fingerprint"]``
            is updated in place (set on a new error, reset to ``None`` on a clean tick).
        store: Message store used to list recent leader dispatches.
        event_log: Destination for the ``leader.api_error`` event.
        capture_fn: Optional pane-capture callable (pane id -> capture dict);
            defaults to the tmux capture helper.
        now_fn: Optional clock; defaults to the current UTC time.

    Returns:
        ``[event]`` when an event was written, otherwise ``[]``.
    """
    receiver = state.get("leader_receiver") or {}
    if receiver.get("mode") != "direct_tmux":
        return []
    pane = receiver.get("pane_id")
    if not pane:
        return []

    grab_pane = capture_fn if capture_fn else _default_capture_fn()
    captured = grab_pane(str(pane))
    if not captured.get("ok"):
        return []
    scrollback = str(captured.get("capture") or "")

    coordinator_state = state.setdefault("coordinator", {})
    previous_fingerprint = coordinator_state.get("last_api_error_fingerprint")
    hit = _match_first_error(scrollback)
    if not hit:
        # Clean tick: forget the last fingerprint so the next genuine error re-emits.
        if previous_fingerprint:
            coordinator_state["last_api_error_fingerprint"] = None
        return []

    error_class, snippet = hit
    fingerprint = f"{error_class}::{snippet[-120:]}"
    if previous_fingerprint == fingerprint:
        return []
    coordinator_state["last_api_error_fingerprint"] = fingerprint

    now = now_fn() if now_fn else datetime.now(timezone.utc)
    lookback = timedelta(seconds=_DISPATCH_WINDOW_SECONDS)
    cutoff_iso = (now - lookback).isoformat()

    # Prefer the owner's session uuid, then the receiver's; None when neither is known.
    owner_uuid = str((state.get("team_owner") or {}).get("leader_session_uuid") or "")
    receiver_uuid = str(receiver.get("leader_session_uuid") or "")
    leader_uuid = owner_uuid or receiver_uuid or None
    provider = str(receiver.get("provider") or "") or None

    streamed = _scrollback_has_partial_response(scrollback, snippet)
    dispatches = _recent_leader_dispatches(store, cutoff_iso)
    event = event_log.write(
        "leader.api_error",
        leader_session_uuid=leader_uuid,
        error_class=error_class,
        provider=provider,
        partial_response_streamed=streamed,
        worker_dispatch_just_before=dispatches,
        retry_count=0,
        matched_pattern_snippet=snippet[:160],
    )
    return [event]
134
+
135
+
136
def _default_capture_fn() -> Callable[[str], dict[str, Any]]:
    """Return the tmux pane-capture helper used when no ``capture_fn`` is injected.

    The helper is imported inside the function rather than at module import time
    (presumably to keep this module importable without the messaging deps — confirm).
    """
    from team_agent.messaging.deps import _capture_tmux_pane_text as capture_pane_text

    return capture_pane_text
139
+
140
+
141
_SNIPPET_MAX_CHARS = 240  # upper bound on the snippet returned by _match_first_error


def _match_first_error(scrollback: str) -> tuple[str, str] | None:
    """Find the freshest known API error in the tail of ``scrollback``.

    Spark MEDIUM #7: only the last ``_RECENT_LINE_WINDOW`` lines are scanned,
    using sliding windows of 1..``_SLIDING_WINDOW_LINES`` adjacent lines. Lines
    are stripped and the non-empty ones joined with a single space, so a wrapped
    pair such as

        claude:
          request timed out

    is detected as one event without permitting unbounded cross-line matches.

    The window that starts on the latest line wins; for that start line the
    smallest matching window is used, and within a window the first entry of
    ``_ERROR_PATTERNS`` that matches decides the class.

    Returns:
        ``(error_class, snippet)`` or ``None`` when nothing matches. The snippet
        is at most ``_SNIPPET_MAX_CHARS`` characters of the matching window.
    """
    if not scrollback:
        return None
    lines = [line.strip() for line in scrollback.splitlines()[-_RECENT_LINE_WINDOW:]]
    # Walk start positions newest-first. Because the greatest start index always
    # wins, the first hit found in this order is the final answer and older lines
    # never need to be scanned.
    for start in range(len(lines) - 1, -1, -1):
        for size in range(1, _SLIDING_WINDOW_LINES + 1):
            end = start + size
            if end > len(lines):
                break
            window = " ".join(line for line in lines[start:end] if line)
            if not window:
                continue
            # Spark MEDIUM sweep #3 (2026-05-26): tail-preserve instead of dropping
            # an over-long window. Errors land at the END of verbose diagnostics
            # (stack traces, retry chatter, etc.); scanning only the last
            # _WINDOW_MAX_CHARS bounds regex cost while keeping that context.
            if len(window) > _WINDOW_MAX_CHARS:
                window = window[-_WINDOW_MAX_CHARS:]
            for pattern, error_class in _ERROR_PATTERNS:
                match = pattern.search(window)
                if match:
                    return error_class, _snippet_for_match(window, match)
    return None


def _snippet_for_match(window: str, match: re.Match[str]) -> str:
    """Return up to ``_SNIPPET_MAX_CHARS`` of ``window`` that include the match.

    When the match ends inside the head of the window the head is returned
    unchanged. Otherwise the snippet starts at the match, so a window longer than
    the snippet cap can no longer yield a snippet that omits the error text.
    """
    if match.end() <= _SNIPPET_MAX_CHARS:
        return window[:_SNIPPET_MAX_CHARS]
    return window[match.start(): match.start() + _SNIPPET_MAX_CHARS]
183
+
184
+
185
def _scrollback_has_partial_response(scrollback: str, error_snippet: str) -> bool:
    """Heuristically report whether assistant text precedes the error.

    Locates the last occurrence of ``error_snippet`` in ``scrollback`` and checks
    the ``_PARTIAL_RESPONSE_HEAD_BYTES`` characters before it against
    ``_PARTIAL_RESPONSE_HINT``.

    The snippet is built from stripped lines joined by single spaces, so for a
    wrapped (multi-line) or indented error it does not occur verbatim in the raw
    capture. When the exact search fails, the snippet is located again with any
    run of whitespace allowed between its tokens; previously such detections
    always reported ``False``.

    Returns:
        ``True`` if a hint is found before the error, ``False`` otherwise
        (including when the snippet cannot be located at all).
    """
    idx = scrollback.rfind(error_snippet)
    if idx == -1:
        idx = _rfind_ignoring_whitespace_layout(scrollback, error_snippet)
    if idx == -1:
        return False
    head = scrollback[max(0, idx - _PARTIAL_RESPONSE_HEAD_BYTES): idx]
    return bool(_PARTIAL_RESPONSE_HINT.search(head))


def _rfind_ignoring_whitespace_layout(haystack: str, needle: str) -> int:
    """Return the start of the last match of ``needle`` in ``haystack``, or -1.

    Whitespace in ``needle`` matches any non-empty whitespace run (including
    newlines) in ``haystack``; every other character is matched literally.
    """
    tokens = needle.split()
    if not tokens:
        return -1
    pattern = re.compile(r"\s+".join(re.escape(token) for token in tokens))
    last_start = -1
    for found in pattern.finditer(haystack):
        last_start = found.start()
    return last_start
191
+
192
+
193
+ def _recent_leader_dispatches(store: MessageStore, cutoff_iso: str) -> list[str]:
194
+ out: list[str] = []
195
+ try:
196
+ rows = store.messages()
197
+ except Exception:
198
+ return out
199
+ for row in rows:
200
+ sender = str(row.get("sender") or "")
201
+ if sender not in {"leader", "Leader"} and not _looks_like_leader_sender(sender):
202
+ continue
203
+ created = str(row.get("created_at") or "")
204
+ if not created or created < cutoff_iso:
205
+ continue
206
+ msg_id = str(row.get("message_id") or "")
207
+ if msg_id:
208
+ out.append(msg_id)
209
+ return out
210
+
211
+
212
+ def _looks_like_leader_sender(sender: str) -> bool:
213
+ return sender.startswith("leader") or sender.lower() == "leader"
214
+
215
+
216
+ __all__ = ["detect_leader_api_errors"]