@team-agent/installer 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/schemas/team.schema.json +6 -0
- package/src/team_agent/approvals/runtime_prompts.py +1 -1
- package/src/team_agent/cli/commands.py +104 -3
- package/src/team_agent/cli/parser.py +10 -1
- package/src/team_agent/coordinator/lifecycle.py +3 -0
- package/src/team_agent/diagnose/orphan_cleanup.py +199 -28
- package/src/team_agent/launch/core.py +2 -1
- package/src/team_agent/lifecycle/operations.py +1 -0
- package/src/team_agent/lifecycle/start.py +1 -1
- package/src/team_agent/message_store/core.py +8 -7
- package/src/team_agent/message_store/schema.py +8 -2
- package/src/team_agent/messaging/delivery.py +293 -1
- package/src/team_agent/messaging/leader.py +13 -4
- package/src/team_agent/messaging/leader_api_errors.py +216 -0
- package/src/team_agent/messaging/leader_panes.py +200 -0
- package/src/team_agent/messaging/scheduler.py +12 -0
- package/src/team_agent/messaging/send.py +21 -26
- package/src/team_agent/messaging/tmux_io.py +153 -23
- package/src/team_agent/messaging/tmux_prompt.py +87 -0
- package/src/team_agent/messaging/trust_auto_answer.py +44 -0
- package/src/team_agent/restart/orchestration.py +207 -4
- package/src/team_agent/runtime.py +3 -3
- package/src/team_agent/sessions/capture.py +65 -15
- package/src/team_agent/spec.py +59 -0
- package/src/team_agent/status/queries.py +32 -1
- package/src/team_agent/watch/__init__.py +145 -0
|
@@ -10,15 +10,28 @@ from team_agent.messaging.deps import (
|
|
|
10
10
|
core_render_message,
|
|
11
11
|
)
|
|
12
12
|
|
|
13
|
+
from datetime import datetime, timedelta, timezone
|
|
13
14
|
from pathlib import Path
|
|
14
15
|
from typing import Any
|
|
15
16
|
|
|
17
|
+
|
|
18
|
+
# Spark MEDIUM sweep #3 (2026-05-26): retry_needed bounded backoff. Each entry is
|
|
19
|
+
# the delay (seconds) BEFORE the attempt with that number runs; attempt 1 was the
|
|
20
|
+
# original delivery, attempt 2 fires 5s after retry_needed, attempt 3 fires 15s
|
|
21
|
+
# after the previous, attempt 4 fires 30s after the previous. _TRUST_RETRY_MAX_ATTEMPTS
|
|
22
|
+
# bounds the total — the 4th retry_needed is terminal and emits
|
|
23
|
+
# leader_panes.trust_auto_answer_exhausted.
|
|
24
|
+
_TRUST_RETRY_BACKOFF_SECONDS = {2: 5, 3: 15, 4: 30}
|
|
25
|
+
_TRUST_RETRY_MAX_ATTEMPTS = 4
|
|
26
|
+
|
|
16
27
|
def _deliver_pending_message(
|
|
17
28
|
workspace: Path,
|
|
18
29
|
state: dict[str, Any],
|
|
19
30
|
message_id: str,
|
|
20
31
|
wait_visible: bool = True,
|
|
21
32
|
timeout: float = 30.0,
|
|
33
|
+
*,
|
|
34
|
+
_trust_retry_attempt: int = 1,
|
|
22
35
|
) -> dict[str, Any]:
|
|
23
36
|
store = MessageStore(workspace)
|
|
24
37
|
row = next((m for m in store.messages() if m["message_id"] == message_id), None)
|
|
@@ -65,9 +78,49 @@ def _deliver_pending_message(
|
|
|
65
78
|
attempts=3 if wait_visible else 1,
|
|
66
79
|
provider=agent_state.get("provider", "fake"),
|
|
67
80
|
)
|
|
81
|
+
if not injection.get("ok") and injection.get("detected") == "codex_trust_prompt":
|
|
82
|
+
# Gap 29 (Stage 2): opt-in trust auto-answer. The helper enforces both the
|
|
83
|
+
# opt-in flag and a workspace-dir match before sending '1'+Enter, then we
|
|
84
|
+
# retry the original paste once the prompt has actually been dismissed.
|
|
85
|
+
# Bypassed entirely when opt-out (default) — the existing failed envelope
|
|
86
|
+
# is preserved.
|
|
87
|
+
from team_agent.messaging.leader_panes import attempt_trust_auto_answer
|
|
88
|
+
answer = attempt_trust_auto_answer(
|
|
89
|
+
workspace,
|
|
90
|
+
injection.get("pane_id") or target,
|
|
91
|
+
injection.get("pane_capture_tail") or "",
|
|
92
|
+
EventLog(workspace),
|
|
93
|
+
state=state,
|
|
94
|
+
)
|
|
95
|
+
if answer.get("answered"):
|
|
96
|
+
# Spark MEDIUM #4 (2026-05-26): replace the fixed 0.3s sleep with a
|
|
97
|
+
# bounded poll. Slow terminals can take well over a second to clear
|
|
98
|
+
# the trust prompt; sleeping a fixed amount races dismissal and
|
|
99
|
+
# leaves the retry hitting the same codex_trust_prompt state. We
|
|
100
|
+
# poll for prompt dismissal up to 3s; if still present, return a
|
|
101
|
+
# retry_needed envelope and let the upstream scheduler decide
|
|
102
|
+
# whether to back off and try again later.
|
|
103
|
+
dismissed = _wait_for_trust_prompt_dismissal(
|
|
104
|
+
injection.get("pane_id") or target, timeout=3.0,
|
|
105
|
+
)
|
|
106
|
+
if not dismissed:
|
|
107
|
+
return _handle_trust_retry_needed(
|
|
108
|
+
workspace, state, store, message_id, target, injection,
|
|
109
|
+
attempt=_trust_retry_attempt,
|
|
110
|
+
)
|
|
111
|
+
injection = _tmux_inject_text(
|
|
112
|
+
target,
|
|
113
|
+
text,
|
|
114
|
+
"Enter",
|
|
115
|
+
f"team-agent-send-{message_id}-trust-retry",
|
|
116
|
+
attempts=3 if wait_visible else 1,
|
|
117
|
+
provider=agent_state.get("provider", "fake"),
|
|
118
|
+
)
|
|
68
119
|
if injection["ok"]:
|
|
69
120
|
store.mark(message_id, "submitted")
|
|
70
|
-
EventLog(workspace)
|
|
121
|
+
send_event_log = EventLog(workspace)
|
|
122
|
+
_stamp_first_send_at_if_leader_to_worker(state, row, send_event_log)
|
|
123
|
+
send_event_log.write(
|
|
71
124
|
"send.submitted",
|
|
72
125
|
message_id=message_id,
|
|
73
126
|
target=target,
|
|
@@ -112,9 +165,248 @@ def _deliver_pending_message(
|
|
|
112
165
|
"turn_verification": injection.get("turn_verification"),
|
|
113
166
|
"paste_attempts": injection.get("attempts"),
|
|
114
167
|
"submit_attempts": injection.get("submit_attempts"),
|
|
168
|
+
"detected": injection.get("detected"),
|
|
169
|
+
"pane_id": injection.get("pane_id"),
|
|
170
|
+
"pane_mode": injection.get("pane_mode"),
|
|
171
|
+
"pane_capture_tail": injection.get("pane_capture_tail"),
|
|
115
172
|
}
|
|
116
173
|
|
|
117
174
|
|
|
175
|
+
def _handle_trust_retry_needed(
    workspace: Path,
    state: dict[str, Any],
    store: MessageStore,
    message_id: str,
    target: str,
    injection: dict[str, Any],
    *,
    attempt: int,
) -> dict[str, Any]:
    """Decide what happens after a trust prompt survived the auto-answer.

    ``attempt`` is the number of the delivery that JUST failed (1 is the
    original delivery, 2..4 are the scheduler-fired retries). ``state`` is
    accepted for signature parity with the caller and is not read here.

    Two outcomes:

    * ``attempt + 1`` still fits within ``_TRUST_RETRY_MAX_ATTEMPTS``: a
      ``trust_retry`` scheduled event is stored for the message, the message is
      held in ``failed`` status (reason ``trust_retry_scheduled``) so the
      regular pending-delivery pass does not race the scheduler, and both
      ``leader_panes.trust_auto_answer_retry_needed`` and
      ``leader_panes.trust_auto_answer_retry_scheduled`` are logged. The
      returned envelope has ``status='retry_scheduled'``.
    * otherwise the failure is terminal: the message is marked ``failed`` with
      reason ``trust_auto_answer_exhausted``, the matching event is logged and
      the returned envelope has ``status='trust_auto_answer_exhausted'``.

    Both envelopes carry ``ok=False`` plus the pane diagnostics copied from
    ``injection``.
    """
    log = EventLog(workspace)
    pane_id = injection.get("pane_id")
    # Pane diagnostics appended, in this order, to whichever envelope we return.
    pane_fields = {
        "detected": injection.get("detected"),
        "pane_id": pane_id,
        "pane_mode": injection.get("pane_mode"),
        "pane_capture_tail": injection.get("pane_capture_tail"),
    }
    upcoming = attempt + 1

    if upcoming > _TRUST_RETRY_MAX_ATTEMPTS:
        # Terminal branch: no further retry may be scheduled.
        exhausted = "trust_auto_answer_exhausted"
        store.mark(message_id, "failed", exhausted)
        log.write(
            "leader_panes.trust_auto_answer_exhausted",
            message_id=message_id,
            workspace=str(workspace),
            attempts=attempt,
            target=target,
            pane_id=pane_id,
            reason=exhausted,
        )
        return {
            "ok": False,
            "status": exhausted,
            "reason": exhausted,
            "attempts": attempt,
            **pane_fields,
        }

    # Attempts without an explicit entry fall back to the delay of the last
    # configured attempt.
    fallback_delay = _TRUST_RETRY_BACKOFF_SECONDS[_TRUST_RETRY_MAX_ATTEMPTS]
    delay = _TRUST_RETRY_BACKOFF_SECONDS.get(upcoming, fallback_delay)
    due_at = (datetime.now(timezone.utc) + timedelta(seconds=delay)).isoformat()
    owner = _message_owner_team_id(store, message_id)
    retry_payload = {
        "message_id": message_id,
        "attempt": upcoming,
        "max_attempts": _TRUST_RETRY_MAX_ATTEMPTS,
        "first_target": target,
    }
    scheduled_id = store.add_scheduled_event(
        due_at,
        message_id,
        "trust_retry",
        retry_payload,
        owner_team_id=owner,
    )

    # Hold the message in 'failed' so _deliver_pending_messages does not race
    # the scheduled retry. The scheduler consumer resets it to 'accepted' just
    # before re-delivery.
    store.mark(message_id, "failed", "trust_retry_scheduled")

    not_dismissed = "trust_prompt_not_dismissed_after_answer"
    log.write(
        "leader_panes.trust_auto_answer_retry_needed",
        message_id=message_id,
        workspace=str(workspace),
        pane_id=pane_id or target,
        target=target,
        reason=not_dismissed,
        attempt=attempt,
    )
    log.write(
        "leader_panes.trust_auto_answer_retry_scheduled",
        message_id=message_id,
        workspace=str(workspace),
        scheduled_event_id=scheduled_id,
        due_at=due_at,
        next_attempt=upcoming,
        max_attempts=_TRUST_RETRY_MAX_ATTEMPTS,
        backoff_seconds=delay,
    )
    return {
        "ok": False,
        "status": "retry_scheduled",
        "reason": not_dismissed,
        "stage": "trust_auto_answer_dismissal_wait",
        "verification": not_dismissed,
        "scheduled_event_id": scheduled_id,
        "scheduled_retry_at": due_at,
        "next_attempt": upcoming,
        "max_attempts": _TRUST_RETRY_MAX_ATTEMPTS,
        **pane_fields,
    }
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def _message_owner_team_id(store: MessageStore, message_id: str) -> str | None:
    """Return the message's ``owner_team_id`` as a string.

    Yields ``None`` when the message cannot be found or when the stored
    ``owner_team_id`` is missing or falsy.
    """
    message = _message_by_id(store, message_id)
    owner = message.get("owner_team_id") if message else None
    return str(owner) if owner else None
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def _execute_trust_retry(
    workspace: Path,
    store: MessageStore,
    event_log: EventLog,
    payload: dict[str, Any],
    *,
    owner_team_id: str | None = None,
) -> dict[str, Any]:
    """Run one scheduler-fired ``trust_retry`` event.

    Reads ``message_id``, ``attempt`` and ``max_attempts`` from ``payload``,
    flips the message back to ``accepted`` (reason ``trust_retry_resuming``),
    logs ``leader_panes.trust_auto_answer_retry_attempted`` and re-runs
    ``_deliver_pending_message`` with the attempt number threaded through.
    The delivery result is returned unchanged, so the caller sees a success, a
    further scheduled retry, or the terminal exhausted envelope.

    Short-circuits with ``ok=False`` when the payload carries no message id or
    the message can no longer be found; the latter also logs
    ``leader_panes.trust_auto_answer_retry_skipped``.
    """
    from team_agent.state import load_runtime_state

    message_id = str(payload.get("message_id") or "")
    if not message_id:
        return {"ok": False, "reason": "trust_retry_missing_message_id"}

    attempt = int(payload.get("attempt") or 1)
    if not _message_by_id(store, message_id):
        event_log.write(
            "leader_panes.trust_auto_answer_retry_skipped",
            message_id=message_id,
            reason="message_missing",
            attempt=attempt,
        )
        return {"ok": False, "reason": "message_missing"}

    # The previous attempt parked the row in 'failed' with
    # reason='trust_retry_scheduled'; it has to be 'accepted' again so that
    # claim_for_delivery succeeds.
    store.mark(message_id, "accepted", "trust_retry_resuming")
    event_log.write(
        "leader_panes.trust_auto_answer_retry_attempted",
        message_id=message_id,
        workspace=str(workspace),
        attempt=attempt,
        max_attempts=int(payload.get("max_attempts") or _TRUST_RETRY_MAX_ATTEMPTS),
    )

    state = load_runtime_state(workspace)
    if owner_team_id:
        # Narrow to the owning team's sub-state when the runtime state has one.
        teams = state.get("teams")
        team_state = teams.get(owner_team_id) if isinstance(teams, dict) else None
        if isinstance(team_state, dict):
            state = team_state

    return _deliver_pending_message(
        workspace,
        state,
        message_id,
        wait_visible=True,
        timeout=30.0,
        _trust_retry_attempt=attempt,
    )
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def _stamp_first_send_at_if_leader_to_worker(
|
|
335
|
+
state: dict[str, Any],
|
|
336
|
+
row: dict[str, Any],
|
|
337
|
+
event_log: EventLog | None = None,
|
|
338
|
+
) -> None:
|
|
339
|
+
"""Route B atomicity (2026-05-27): record the first time the leader
|
|
340
|
+
successfully sends work to each worker. The presence of this stamp drives
|
|
341
|
+
restart's resumability decision — a worker the leader has interacted with
|
|
342
|
+
has accumulated conversation state, so a missing session_id at restart
|
|
343
|
+
time IS an atomicity violation. A worker that has never received work
|
|
344
|
+
legitimately fresh-starts during restart.
|
|
345
|
+
|
|
346
|
+
Only stamped once per worker (idempotent across re-sends). Only fires on
|
|
347
|
+
leader -> worker sends; worker-to-worker peer messages do not count.
|
|
348
|
+
The mutation lives on the state dict the caller already saves
|
|
349
|
+
(`save_team_scoped_state` in send.py, or `save_runtime_state` after
|
|
350
|
+
coordinator_tick), so persistence is automatic.
|
|
351
|
+
|
|
352
|
+
C1 (cr verdict, 2026-05-27): when the stamp transitions null -> ts (the
|
|
353
|
+
one-time write), emit a `worker.first_interaction` audit event with
|
|
354
|
+
worker_id, first_send_at, message_id. Re-sends to the same worker hit the
|
|
355
|
+
idempotency guard above and do NOT re-emit. Worker-to-worker peer sends
|
|
356
|
+
short-circuit at the sender check and do NOT emit.
|
|
357
|
+
"""
|
|
358
|
+
sender = str(row.get("sender") or "")
|
|
359
|
+
recipient = str(row.get("recipient") or "")
|
|
360
|
+
if not recipient:
|
|
361
|
+
return
|
|
362
|
+
leader_id = str((state.get("leader") or {}).get("id") or "leader")
|
|
363
|
+
if sender not in {"leader", "Leader", leader_id}:
|
|
364
|
+
return
|
|
365
|
+
agents = state.get("agents")
|
|
366
|
+
if not isinstance(agents, dict):
|
|
367
|
+
return
|
|
368
|
+
agent_state = agents.get(recipient)
|
|
369
|
+
if not isinstance(agent_state, dict):
|
|
370
|
+
return
|
|
371
|
+
if agent_state.get("first_send_at"):
|
|
372
|
+
return
|
|
373
|
+
stamp = datetime.now(timezone.utc).isoformat()
|
|
374
|
+
agent_state["first_send_at"] = stamp
|
|
375
|
+
if event_log is not None:
|
|
376
|
+
event_log.write(
|
|
377
|
+
"worker.first_interaction",
|
|
378
|
+
worker_id=recipient,
|
|
379
|
+
first_send_at=stamp,
|
|
380
|
+
message_id=str(row.get("message_id") or ""),
|
|
381
|
+
)
|
|
382
|
+
|
|
383
|
+
|
|
384
|
+
def _wait_for_trust_prompt_dismissal(target: str, *, timeout: float = 3.0, poll_interval: float = 0.1) -> bool:
    """Poll the pane until the codex trust prompt is no longer detected.

    Spark MEDIUM #4: bounded poll that replaces a fixed sleep. The pane tail is
    captured and classified with ``detect_non_input_scrollback`` (the same
    detector the inject path uses) on every iteration.

    Args:
        target: tmux target/pane to capture.
        timeout: maximum number of seconds to keep polling; negative values
            are treated as 0, which still performs one check.
        poll_interval: pause between checks in seconds; negative values are
            treated as 0.

    Returns:
        True as soon as a capture is not classified as ``codex_trust_prompt``;
        False if the prompt is still detected once ``timeout`` has elapsed.

    NOTE(review): ``_capture_pane_tail`` returns an empty string when the
    capture fails; whether the detector treats that as "dismissed" is not
    visible from here — confirm.
    """
    import time as _time
    from team_agent.messaging.tmux_prompt import detect_non_input_scrollback

    deadline = _time.monotonic() + max(timeout, 0.0)
    # time.sleep() raises ValueError for negative durations, so clamp once.
    pause = max(poll_interval, 0.0)
    while True:
        if detect_non_input_scrollback(_capture_pane_tail(target)) != "codex_trust_prompt":
            return True
        remaining = deadline - _time.monotonic()
        if remaining <= 0:
            return False
        # Never sleep past the deadline; the last check lands right on it.
        _time.sleep(min(pause, remaining))
|
|
400
|
+
|
|
401
|
+
|
|
402
|
+
def _capture_pane_tail(target: str) -> str:
    """Return the captured text of ``target``, or ``""`` if the capture failed.

    A capture counts as failed when the result's ``ok`` flag is falsy; a
    missing or empty ``capture`` field also yields the empty string.
    """
    from team_agent.messaging.deps import _capture_tmux_pane_text

    result = _capture_tmux_pane_text(target)
    return str(result.get("capture") or "") if result.get("ok") else ""
|
|
408
|
+
|
|
409
|
+
|
|
118
410
|
def _deliver_pending_messages(workspace: Path, state: dict[str, Any], event_log: EventLog) -> list[str]:
|
|
119
411
|
store = MessageStore(workspace)
|
|
120
412
|
delivered: list[str] = []
|
|
@@ -251,6 +251,19 @@ def _send_to_leader_receiver(
|
|
|
251
251
|
f"team-agent-leader-receiver-{message_id}",
|
|
252
252
|
provider=receiver.get("provider", "codex"),
|
|
253
253
|
)
|
|
254
|
+
if not injection.get("ok") and injection.get("detected") == "codex_trust_prompt":
|
|
255
|
+
from team_agent.messaging.trust_auto_answer import retry_injection_after_trust_auto_answer
|
|
256
|
+
injection = retry_injection_after_trust_auto_answer(
|
|
257
|
+
workspace,
|
|
258
|
+
state,
|
|
259
|
+
event_log,
|
|
260
|
+
injection,
|
|
261
|
+
target,
|
|
262
|
+
text,
|
|
263
|
+
submit_key,
|
|
264
|
+
f"team-agent-leader-receiver-{message_id}-trust-retry",
|
|
265
|
+
receiver.get("provider", "codex"),
|
|
266
|
+
)
|
|
254
267
|
if injection["ok"]:
|
|
255
268
|
store.mark(message_id, "submitted")
|
|
256
269
|
event_log.write(
|
|
@@ -466,10 +479,6 @@ def _format_team_agent_message(payload: dict[str, Any]) -> str:
|
|
|
466
479
|
|
|
467
480
|
|
|
468
481
|
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
482
|
|
|
474
483
|
|
|
475
484
|
|
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
"""Gap 28 (Slice 2 Stage 2): observe-only detection of leader-pane API errors.
|
|
2
|
+
|
|
3
|
+
The coordinator tick captures the leader pane scrollback once per cycle, scans it for
|
|
4
|
+
known upstream-API error patterns (Claude/Codex CLI errors that occur mid-turn), and
|
|
5
|
+
emits a structured `leader.api_error` audit event. The intent is observability — auto-
|
|
6
|
+
retry belongs to the upstream CLI; this module never touches the pane.
|
|
7
|
+
|
|
8
|
+
Event schema (logged via EventLog.write):
|
|
9
|
+
|
|
10
|
+
event: 'leader.api_error'
|
|
11
|
+
ts: ISO-8601 UTC (added by EventLog)
|
|
12
|
+
leader_session_uuid: str | None
|
|
13
|
+
error_class: 'Overloaded' | 'RateLimit' | 'Timeout' |
|
|
14
|
+
'NetworkError' | 'Unknown'
|
|
15
|
+
provider: 'claude' | 'codex' | 'claude_code' | str | None
|
|
16
|
+
partial_response_streamed: bool (heuristic: assistant text before the error)
|
|
17
|
+
worker_dispatch_just_before: list[str] (leader→worker msg_ids in the prior 60s)
|
|
18
|
+
retry_count: int (always 0 — the framework does not retry today)
|
|
19
|
+
matched_pattern_snippet: str (the captured error line, ≤160 chars)
|
|
20
|
+
|
|
21
|
+
Detection dedupes within the coordinator state via a (error_class, snippet-tail)
|
|
22
|
+
fingerprint stored under `state['coordinator']['last_api_error_fingerprint']`. A
|
|
23
|
+
clean tick (no error pattern present) clears the fingerprint so the next genuine
|
|
24
|
+
error re-emits. This keeps event volume bounded while still catching distinct
|
|
25
|
+
errors as they occur.
|
|
26
|
+
"""
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
import re
|
|
30
|
+
from datetime import datetime, timedelta, timezone
|
|
31
|
+
from pathlib import Path
|
|
32
|
+
from typing import Any, Callable
|
|
33
|
+
|
|
34
|
+
from team_agent.events import EventLog
|
|
35
|
+
from team_agent.message_store import MessageStore
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# Spark MEDIUM sweeps (2026-05-26):
|
|
39
|
+
# (#3) Require an API/provider context marker near the error keyword. Bare '503' /
|
|
40
|
+
# 'fetch failed' / 'timed out' in user text used to false-fire.
|
|
41
|
+
# (#7) Match across short sliding windows of 1-3 adjacent lines so wrapped tmux
|
|
42
|
+
# output ("claude:\n request timed out") still resolves to a single
|
|
43
|
+
# detection. Window joined with a single space; capped at _WINDOW_MAX_CHARS
|
|
44
|
+
# so the scan stays bounded.
|
|
45
|
+
_API_CONTEXT = (
|
|
46
|
+
r"(?:API\s+Error|HTTP\s*Error|HTTPError|request\s+failed|"
|
|
47
|
+
r"codex|claude|Anthropic|OpenAI|TypeError)"
|
|
48
|
+
)
|
|
49
|
+
|
|
50
|
+
# Patterns operate against a sliding window of up to 3 joined lines. The window
|
|
51
|
+
# never contains '\n' (lines are joined with a single space), so `[^\n]` and `.`
|
|
52
|
+
# behave the same; we use `[^\n]` for self-documentation.
|
|
53
|
+
_ERROR_PATTERNS: list[tuple[re.Pattern[str], str]] = [
|
|
54
|
+
# Overloaded — keyword itself already includes the "API Error:" prefix.
|
|
55
|
+
(re.compile(r"API\s+Error:\s*Overloaded", re.IGNORECASE), "Overloaded"),
|
|
56
|
+
# RateLimit — 429 with "Too Many Requests" is sufficiently specific; require it
|
|
57
|
+
# appear AFTER an API context marker OR before "Too Many Requests" tightly.
|
|
58
|
+
(re.compile(rf"(?:{_API_CONTEXT}[^\n]*\b429\b|\b429\s+Too\s+Many\s+Requests)", re.IGNORECASE), "RateLimit"),
|
|
59
|
+
# 5xx — must share a window with an API-context marker on either side.
|
|
60
|
+
(re.compile(rf"{_API_CONTEXT}[^\n]{{0,200}}\b5(?:00|02|03|04)\b", re.IGNORECASE), "NetworkError"),
|
|
61
|
+
(re.compile(rf"\b5(?:00|02|03|04)\b[^\n]{{0,200}}{_API_CONTEXT}", re.IGNORECASE), "NetworkError"),
|
|
62
|
+
# fetch failed — needs an API-context marker in the same window. The TypeError
|
|
63
|
+
# marker on its own counts (Node fetch frames the error this way).
|
|
64
|
+
(re.compile(rf"{_API_CONTEXT}[^\n]{{0,200}}fetch\s+failed", re.IGNORECASE), "NetworkError"),
|
|
65
|
+
(re.compile(rf"fetch\s+failed[^\n]{{0,200}}{_API_CONTEXT}", re.IGNORECASE), "NetworkError"),
|
|
66
|
+
# Timeout — likewise requires an API-context marker in the window, except for
|
|
67
|
+
# the unambiguous syscall token ETIMEDOUT.
|
|
68
|
+
(re.compile(rf"{_API_CONTEXT}[^\n]{{0,200}}(?:request|connection)\s+(?:timed\s+out|timeout)", re.IGNORECASE), "Timeout"),
|
|
69
|
+
(re.compile(rf"(?:request|connection)\s+(?:timed\s+out|timeout)[^\n]{{0,200}}{_API_CONTEXT}", re.IGNORECASE), "Timeout"),
|
|
70
|
+
(re.compile(r"\bETIMEDOUT\b", re.IGNORECASE), "Timeout"),
|
|
71
|
+
]
|
|
72
|
+
|
|
73
|
+
_RECENT_LINE_WINDOW = 100 # scan only the most recent N lines
|
|
74
|
+
_SLIDING_WINDOW_LINES = 3 # join up to 3 adjacent lines per scan window
|
|
75
|
+
_WINDOW_MAX_CHARS = 400 # discard windows beyond this length to bound work
|
|
76
|
+
_DISPATCH_WINDOW_SECONDS = 60 # leader→worker sends counted within this lookback
|
|
77
|
+
_PARTIAL_RESPONSE_HEAD_BYTES = 4000
|
|
78
|
+
|
|
79
|
+
_PARTIAL_RESPONSE_HINT = re.compile(
|
|
80
|
+
r"(?:^|\n)\s*(?:Assistant|⏺|●|> |I'll |I will |I'm |I am |Let me )",
|
|
81
|
+
re.IGNORECASE,
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def detect_leader_api_errors(
    workspace: Path,
    state: dict[str, Any],
    store: MessageStore,
    event_log: EventLog,
    *,
    capture_fn: Callable[[str], dict[str, Any]] | None = None,
    now_fn: Callable[[], datetime] | None = None,
) -> list[dict[str, Any]]:
    """Coordinator-tick entry point: report a new leader-pane API error.

    Captures the leader pane (only when ``state["leader_receiver"]`` is in
    ``direct_tmux`` mode and has a ``pane_id``), scans it for a known error
    pattern and writes a single ``leader.api_error`` event when the detection
    differs from the last one reported.

    Dedupe state lives in ``state["coordinator"]["last_api_error_fingerprint"]``:
    an identical fingerprint suppresses the event, and a scan without any
    match clears the fingerprint so the next error re-emits.

    Args:
        workspace: not read by this function.
        state: runtime state; mutated in place for the dedupe fingerprint.
        store: message store used to list recent leader dispatches.
        event_log: sink for the ``leader.api_error`` event.
        capture_fn: override for the pane capture callable.
        now_fn: override for the clock used to compute the dispatch cutoff.

    Returns:
        A list holding the written event, or an empty list when nothing was
        emitted.
    """
    fingerprint_key = "last_api_error_fingerprint"

    receiver = state.get("leader_receiver") or {}
    if receiver.get("mode") != "direct_tmux":
        return []
    pane = receiver.get("pane_id")
    if not pane:
        return []

    grab = capture_fn or _default_capture_fn()
    captured = grab(str(pane))
    if not captured.get("ok"):
        return []
    scrollback = str(captured.get("capture") or "")

    coordinator_state = state.setdefault("coordinator", {})
    hit = _match_first_error(scrollback)
    if not hit:
        # Clean tick: forget the last error so a recurrence is reported again.
        if coordinator_state.get(fingerprint_key):
            coordinator_state[fingerprint_key] = None
        return []

    error_class, snippet = hit
    fingerprint = f"{error_class}::{snippet[-120:]}"
    if coordinator_state.get(fingerprint_key) == fingerprint:
        return []
    coordinator_state[fingerprint_key] = fingerprint

    now = now_fn() if now_fn else datetime.now(timezone.utc)
    cutoff_iso = (now - timedelta(seconds=_DISPATCH_WINDOW_SECONDS)).isoformat()

    # Prefer the team owner's session uuid, then the receiver's, else None.
    owner = state.get("team_owner") or {}
    leader_uuid = (
        str(owner.get("leader_session_uuid") or "")
        or str(receiver.get("leader_session_uuid") or "")
        or None
    )
    provider = str(receiver.get("provider") or "") or None

    streamed = _scrollback_has_partial_response(scrollback, snippet)
    dispatches = _recent_leader_dispatches(store, cutoff_iso)
    event = event_log.write(
        "leader.api_error",
        leader_session_uuid=leader_uuid,
        error_class=error_class,
        provider=provider,
        partial_response_streamed=streamed,
        worker_dispatch_just_before=dispatches,
        retry_count=0,
        matched_pattern_snippet=snippet[:160],
    )
    return [event]
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _default_capture_fn() -> Callable[[str], dict[str, Any]]:
    """Return the pane-capture callable used when the caller supplies none.

    The import happens at call time rather than at module import time.
    """
    from team_agent.messaging.deps import _capture_tmux_pane_text as capture

    return capture
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
# Longest snippet handed back to the caller.
_SNIPPET_MAX_CHARS = 240
# The emitted leader.api_error event keeps only the first 160 characters of the
# snippet, so the matched text has to fall inside that prefix to be visible.
_SNIPPET_EMITTED_CHARS = 160


def _snippet_around_match(window: str, match_start: int, match_end: int) -> str:
    """Cut a snippet of ``window`` whose emitted prefix contains the match."""
    # Slide right only as far as needed; offset 0 (the plain leading slice) is
    # kept whenever the match already ends within the emitted prefix. A match
    # longer than the prefix is anchored at its own start instead.
    offset = max(0, min(match_start, match_end - _SNIPPET_EMITTED_CHARS))
    return window[offset:offset + _SNIPPET_MAX_CHARS]


def _match_first_error(scrollback: str) -> tuple[str, str] | None:
    """Find the freshest known API error in ``scrollback``.

    Spark MEDIUM #7: the last ``_RECENT_LINE_WINDOW`` lines are scanned through
    sliding windows of 1..``_SLIDING_WINDOW_LINES`` adjacent lines. Lines in a
    window are stripped and joined with a single space (blank lines dropped),
    so a wrapped pair such as
        claude:
          request timed out
    is detected as one event without permitting unbounded cross-line matches.

    The window that starts latest wins; for that start the smallest matching
    window and the first matching pattern in ``_ERROR_PATTERNS`` are used.

    Returns:
        ``(error_class, snippet)`` or ``None`` when nothing matches. The
        snippet is at most 240 characters of the window, positioned so that
        the matched text is not cut off when the window is long.
    """
    if not scrollback:
        return None
    lines = [line.strip() for line in scrollback.splitlines()[-_RECENT_LINE_WINDOW:]]
    # Walk start positions newest-first. The first hit is then by construction
    # the latest-starting one, so the scan can stop there.
    for start in range(len(lines) - 1, -1, -1):
        for size in range(1, _SLIDING_WINDOW_LINES + 1):
            end = start + size
            if end > len(lines):
                break
            window = " ".join(line for line in lines[start:end] if line)
            if not window:
                continue
            # Spark MEDIUM sweep #3 (2026-05-26): tail-preserve instead of
            # dropping an over-long window. Errors land at the END of verbose
            # diagnostics, and scanning only the last _WINDOW_MAX_CHARS still
            # bounds regex cost.
            if len(window) > _WINDOW_MAX_CHARS:
                window = window[-_WINDOW_MAX_CHARS:]
            for pattern, error_class in _ERROR_PATTERNS:
                match = pattern.search(window)
                if match:
                    return error_class, _snippet_around_match(window, match.start(), match.end())
    return None
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def _scrollback_has_partial_response(scrollback: str, error_snippet: str) -> bool:
    """Heuristically tell whether assistant text was streamed before the error.

    Locates the last occurrence of ``error_snippet`` in ``scrollback`` and
    searches the preceding ``_PARTIAL_RESPONSE_HEAD_BYTES`` characters for an
    assistant marker (``_PARTIAL_RESPONSE_HINT``).

    The snippet is built from stripped lines joined by single spaces, so an
    error that was wrapped over several pane lines does not occur verbatim in
    the raw scrollback. When the exact lookup fails, the snippet is therefore
    matched again with any run of whitespace allowed between its tokens.

    Returns:
        True if a marker precedes the error; False if there is none or the
        snippet cannot be located at all.
    """
    idx = scrollback.rfind(error_snippet)
    if idx == -1:
        tokens = error_snippet.split()
        if not tokens:
            return False
        # Whitespace-tolerant retry: newlines and indentation in the pane are
        # equivalent to the single spaces of the joined snippet.
        loose = re.compile(r"\s+".join(re.escape(token) for token in tokens))
        last_hit = None
        for last_hit in loose.finditer(scrollback):
            pass
        if last_hit is None:
            return False
        idx = last_hit.start()
    head = scrollback[max(0, idx - _PARTIAL_RESPONSE_HEAD_BYTES): idx]
    return bool(_PARTIAL_RESPONSE_HINT.search(head))
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def _parse_iso_utc(value: str) -> datetime | None:
    """Parse an ISO-8601 timestamp into an aware datetime, or return None."""
    text = value.strip()
    if text[-1:] in ("Z", "z"):
        # datetime.fromisoformat only accepts the 'Z' suffix on newer Pythons.
        text = text[:-1] + "+00:00"
    try:
        parsed = datetime.fromisoformat(text)
    except ValueError:
        return None
    if parsed.tzinfo is None:
        # NOTE(review): naive timestamps are assumed to be UTC — confirm
        # against whatever writes created_at.
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed


def _recent_leader_dispatches(store: MessageStore, cutoff_iso: str) -> list[str]:
    """List ids of leader-sent messages created at or after ``cutoff_iso``.

    A row counts as leader-sent when its ``sender`` is ``"leader"`` /
    ``"Leader"`` or passes ``_looks_like_leader_sender``. Rows without a
    ``created_at`` or without a ``message_id`` are ignored.

    Timestamps are compared as datetimes, so values that differ in UTC offset
    or use a ``Z`` suffix still order correctly. If either side cannot be
    parsed, the plain string comparison is used for that row instead.

    Returns:
        Matching message ids in store order. An empty list is also returned
        when the store cannot be read; this lookup is best-effort and must
        not break error reporting.
    """
    try:
        rows = store.messages()
    except Exception:
        # Deliberate best-effort: the dispatch list is optional event context.
        return []

    cutoff = _parse_iso_utc(cutoff_iso)
    recent: list[str] = []
    for row in rows:
        sender = str(row.get("sender") or "")
        if sender not in {"leader", "Leader"} and not _looks_like_leader_sender(sender):
            continue
        created = str(row.get("created_at") or "")
        if not created:
            continue
        created_at = _parse_iso_utc(created) if cutoff is not None else None
        if created_at is not None:
            too_old = created_at < cutoff
        else:
            too_old = created < cutoff_iso
        if too_old:
            continue
        msg_id = str(row.get("message_id") or "")
        if msg_id:
            recent.append(msg_id)
    return recent
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def _looks_like_leader_sender(sender: str) -> bool:
|
|
213
|
+
return sender.startswith("leader") or sender.lower() == "leader"
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
__all__ = ["detect_leader_api_errors"]
|