@team-agent/installer 0.2.1 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/schemas/team.schema.json +6 -0
- package/src/team_agent/approvals/runtime_prompts.py +1 -1
- package/src/team_agent/cli/commands.py +122 -6
- package/src/team_agent/cli/parser.py +42 -1
- package/src/team_agent/coordinator/__main__.py +21 -2
- package/src/team_agent/coordinator/lifecycle.py +11 -0
- package/src/team_agent/diagnose/orphan_cleanup.py +364 -0
- package/src/team_agent/events.py +47 -0
- package/src/team_agent/launch/core.py +2 -1
- package/src/team_agent/leader/__init__.py +273 -60
- package/src/team_agent/lifecycle/agents.py +54 -2
- package/src/team_agent/lifecycle/operations.py +87 -9
- package/src/team_agent/lifecycle/start.py +1 -1
- package/src/team_agent/message_store/core.py +8 -7
- package/src/team_agent/message_store/leader_notification_log.py +132 -0
- package/src/team_agent/message_store/result_watchers.py +144 -1
- package/src/team_agent/message_store/schema.py +31 -2
- package/src/team_agent/messaging/delivery.py +293 -1
- package/src/team_agent/messaging/idle_alerts.py +109 -9
- package/src/team_agent/messaging/leader.py +179 -10
- package/src/team_agent/messaging/leader_api_errors.py +216 -0
- package/src/team_agent/messaging/leader_panes.py +393 -23
- package/src/team_agent/messaging/result_delivery.py +219 -4
- package/src/team_agent/messaging/results.py +12 -21
- package/src/team_agent/messaging/scheduler.py +24 -2
- package/src/team_agent/messaging/send.py +21 -26
- package/src/team_agent/messaging/tmux_io.py +153 -23
- package/src/team_agent/messaging/tmux_prompt.py +87 -0
- package/src/team_agent/messaging/trust_auto_answer.py +44 -0
- package/src/team_agent/restart/orchestration.py +207 -4
- package/src/team_agent/runtime.py +7 -7
- package/src/team_agent/rust_core.py +157 -3
- package/src/team_agent/sessions/capture.py +65 -15
- package/src/team_agent/spec.py +59 -0
- package/src/team_agent/state.py +153 -10
- package/src/team_agent/status/inbox.py +33 -3
- package/src/team_agent/status/queries.py +32 -1
- package/src/team_agent/watch/__init__.py +145 -0
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import hashlib
|
|
4
|
+
|
|
3
5
|
from team_agent.messaging.deps import (
|
|
4
6
|
EventLog,
|
|
5
7
|
MessageStore,
|
|
@@ -10,8 +12,10 @@ from team_agent.messaging.deps import (
|
|
|
10
12
|
_validate_leader_receiver,
|
|
11
13
|
core_render_message,
|
|
12
14
|
json,
|
|
15
|
+
os,
|
|
13
16
|
runtime_dir,
|
|
14
17
|
save_runtime_state,
|
|
18
|
+
team_state_key,
|
|
15
19
|
time,
|
|
16
20
|
)
|
|
17
21
|
|
|
@@ -49,6 +53,19 @@ def _leader_inbox_path(workspace: Path) -> Path:
|
|
|
49
53
|
return runtime_dir(workspace) / "leader-inbox.log"
|
|
50
54
|
|
|
51
55
|
|
|
56
|
+
def _extract_result_id_from_content(content: str) -> str | None:
    """Pull the result id out of a result-notification message body.

    Stage 12: result-notification messages embed a ``Result id: <id>`` line, so
    callers that did not plumb the ``result_id`` kwarg through can still consult
    the dedupe gate. The format mirrors ``_format_report_result_notification``
    and ``format_result_watcher_notification``.

    Only the first line that starts with the marker is considered. Returns the
    stripped id, or ``None`` when the content is empty, no line carries the
    marker, or the first marker line has an empty id.
    """
    marker = "Result id: "
    candidates = (row for row in (content or "").splitlines() if row.startswith(marker))
    first_hit = next(candidates, None)
    if first_hit is None:
        return None
    result_id = first_hit[len(marker):].strip()
    return result_id if result_id else None
|
|
67
|
+
|
|
68
|
+
|
|
52
69
|
def _send_to_leader_receiver(
|
|
53
70
|
workspace: Path,
|
|
54
71
|
state: dict[str, Any],
|
|
@@ -58,6 +75,8 @@ def _send_to_leader_receiver(
|
|
|
58
75
|
sender: str,
|
|
59
76
|
requires_ack: bool,
|
|
60
77
|
event_log: EventLog,
|
|
78
|
+
*,
|
|
79
|
+
result_id: str | None = None,
|
|
61
80
|
) -> dict[str, Any]:
|
|
62
81
|
store = MessageStore(workspace)
|
|
63
82
|
message_id = store.create_message(task_id, sender, leader_id, content, requires_ack=False)
|
|
@@ -94,10 +113,30 @@ def _send_to_leader_receiver(
|
|
|
94
113
|
error="No direct leader tmux pane is attached. Run team-agent attach-leader.",
|
|
95
114
|
)
|
|
96
115
|
|
|
97
|
-
|
|
116
|
+
owner_identity = state.get("team_owner") or None
|
|
117
|
+
side_pane_refusal = _side_pane_owner_refusal(state, owner_identity)
|
|
118
|
+
if side_pane_refusal:
|
|
119
|
+
event_log.write("leader_receiver.side_pane_refused", **side_pane_refusal)
|
|
120
|
+
return {
|
|
121
|
+
"ok": False,
|
|
122
|
+
"message_id": message_id,
|
|
123
|
+
"status": "refused",
|
|
124
|
+
"to": leader_id,
|
|
125
|
+
"channel": "direct_tmux",
|
|
126
|
+
**side_pane_refusal,
|
|
127
|
+
}
|
|
128
|
+
receiver_for_validation = dict(receiver)
|
|
129
|
+
if owner_identity and owner_identity.get("leader_session_uuid") and not receiver_for_validation.get("leader_session_uuid"):
|
|
130
|
+
receiver_for_validation["leader_session_uuid"] = owner_identity["leader_session_uuid"]
|
|
131
|
+
validation = _validate_leader_receiver(receiver_for_validation)
|
|
98
132
|
if not validation["ok"]:
|
|
99
|
-
|
|
100
|
-
|
|
133
|
+
rediscovery = _rediscover_leader_receiver(
|
|
134
|
+
receiver_for_validation,
|
|
135
|
+
event_log,
|
|
136
|
+
owner_identity,
|
|
137
|
+
invalidation_reason=validation.get("reason"),
|
|
138
|
+
team_id=team_state_key(state),
|
|
139
|
+
)
|
|
101
140
|
if rediscovery.get("status") == "updated":
|
|
102
141
|
state["leader_receiver"].update(rediscovery["receiver"])
|
|
103
142
|
receiver = state["leader_receiver"]
|
|
@@ -111,7 +150,7 @@ def _send_to_leader_receiver(
|
|
|
111
150
|
payload,
|
|
112
151
|
event_log,
|
|
113
152
|
reason="ambiguous",
|
|
114
|
-
error="multiple possible leader panes found;
|
|
153
|
+
error="multiple possible leader panes found; run team-agent claim-leader --confirm from the intended pane",
|
|
115
154
|
message_status="ambiguous",
|
|
116
155
|
)
|
|
117
156
|
if not validation["ok"]:
|
|
@@ -128,6 +167,69 @@ def _send_to_leader_receiver(
|
|
|
128
167
|
state["leader_receiver"].update(validation["pane"])
|
|
129
168
|
submit_key, submit_reason = _choose_leader_submit_key(receiver.get("provider", "codex"), validation.get("capture", ""))
|
|
130
169
|
target = receiver["pane_id"]
|
|
170
|
+
# Stage 12 (Gap 26 ∩ Gap 32 roundtable 2026-05-26) — injection-boundary dedupe gate.
|
|
171
|
+
# Result-notification injections route through claim_leader_notification_delivery; the
|
|
172
|
+
# gate suppresses a second inject for the same (result_id, leader_session_uuid).
|
|
173
|
+
# Non-result messages (peer mirror, idle reminder, ambiguous-prompt) lack a "Result id:"
|
|
174
|
+
# line in their text and bypass the gate.
|
|
175
|
+
effective_result_id = result_id or _extract_result_id_from_content(content)
|
|
176
|
+
leader_uuid_for_gate = str(
|
|
177
|
+
(state.get("team_owner") or {}).get("leader_session_uuid")
|
|
178
|
+
or (state.get("leader_receiver") or {}).get("leader_session_uuid")
|
|
179
|
+
or ""
|
|
180
|
+
)
|
|
181
|
+
if effective_result_id and leader_uuid_for_gate:
|
|
182
|
+
from team_agent.message_store.leader_notification_log import claim_leader_notification_delivery
|
|
183
|
+
envelope_hash = hashlib.sha256(content.encode("utf-8", errors="ignore")).hexdigest()[:16]
|
|
184
|
+
claim = claim_leader_notification_delivery(
|
|
185
|
+
store,
|
|
186
|
+
result_id=effective_result_id,
|
|
187
|
+
leader_session_uuid=leader_uuid_for_gate,
|
|
188
|
+
proposed_message_id=message_id,
|
|
189
|
+
envelope_hash=envelope_hash,
|
|
190
|
+
owner_team_id=team_state_key(state),
|
|
191
|
+
pane_id=target,
|
|
192
|
+
)
|
|
193
|
+
if claim["status"] == "already_notified_by":
|
|
194
|
+
prev_msg = claim.get("notified_message_id")
|
|
195
|
+
prev_hash = claim.get("envelope_content_hash")
|
|
196
|
+
if envelope_hash == prev_hash:
|
|
197
|
+
event_log.write(
|
|
198
|
+
"leader_notification.dedupe_skip",
|
|
199
|
+
result_id=effective_result_id,
|
|
200
|
+
leader_session_uuid=leader_uuid_for_gate,
|
|
201
|
+
prev_message_id=prev_msg,
|
|
202
|
+
this_message_id=message_id,
|
|
203
|
+
prev_ts=claim.get("notified_at"),
|
|
204
|
+
pane_id=target,
|
|
205
|
+
team_id=team_state_key(state),
|
|
206
|
+
)
|
|
207
|
+
else:
|
|
208
|
+
event_log.write(
|
|
209
|
+
"leader_notification.legitimate_duplicate_suspected",
|
|
210
|
+
result_id=effective_result_id,
|
|
211
|
+
leader_session_uuid=leader_uuid_for_gate,
|
|
212
|
+
prev_message_id=prev_msg,
|
|
213
|
+
this_message_id=message_id,
|
|
214
|
+
prev_envelope_hash=prev_hash,
|
|
215
|
+
this_envelope_hash=envelope_hash,
|
|
216
|
+
pane_id=target,
|
|
217
|
+
team_id=team_state_key(state),
|
|
218
|
+
)
|
|
219
|
+
store.mark(message_id, "submitted", "dedupe_suppressed_by_leader_notification_log")
|
|
220
|
+
save_runtime_state(workspace, state)
|
|
221
|
+
return {
|
|
222
|
+
"ok": True,
|
|
223
|
+
"message_id": message_id,
|
|
224
|
+
"status": "submitted",
|
|
225
|
+
"to": leader_id,
|
|
226
|
+
"channel": "direct_tmux",
|
|
227
|
+
"leader_receiver": state["leader_receiver"],
|
|
228
|
+
"visible": False,
|
|
229
|
+
"submitted": False,
|
|
230
|
+
"deduped": True,
|
|
231
|
+
"canonical_message_id": prev_msg,
|
|
232
|
+
}
|
|
131
233
|
event_log.write(
|
|
132
234
|
"leader_receiver.deliver_attempt",
|
|
133
235
|
message_id=message_id,
|
|
@@ -139,6 +241,8 @@ def _send_to_leader_receiver(
|
|
|
139
241
|
visible_token=rendered.get("token"),
|
|
140
242
|
payload=payload,
|
|
141
243
|
warning=validation.get("warning"),
|
|
244
|
+
result_id=effective_result_id,
|
|
245
|
+
leader_session_uuid=leader_uuid_for_gate or None,
|
|
142
246
|
)
|
|
143
247
|
injection = _tmux_inject_text(
|
|
144
248
|
target,
|
|
@@ -147,6 +251,19 @@ def _send_to_leader_receiver(
|
|
|
147
251
|
f"team-agent-leader-receiver-{message_id}",
|
|
148
252
|
provider=receiver.get("provider", "codex"),
|
|
149
253
|
)
|
|
254
|
+
if not injection.get("ok") and injection.get("detected") == "codex_trust_prompt":
|
|
255
|
+
from team_agent.messaging.trust_auto_answer import retry_injection_after_trust_auto_answer
|
|
256
|
+
injection = retry_injection_after_trust_auto_answer(
|
|
257
|
+
workspace,
|
|
258
|
+
state,
|
|
259
|
+
event_log,
|
|
260
|
+
injection,
|
|
261
|
+
target,
|
|
262
|
+
text,
|
|
263
|
+
submit_key,
|
|
264
|
+
f"team-agent-leader-receiver-{message_id}-trust-retry",
|
|
265
|
+
receiver.get("provider", "codex"),
|
|
266
|
+
)
|
|
150
267
|
if injection["ok"]:
|
|
151
268
|
store.mark(message_id, "submitted")
|
|
152
269
|
event_log.write(
|
|
@@ -201,6 +318,64 @@ def _send_to_leader_receiver(
|
|
|
201
318
|
)
|
|
202
319
|
|
|
203
320
|
|
|
321
|
+
def _side_pane_owner_refusal(state: dict[str, Any], owner_identity: dict[str, Any] | None) -> dict[str, Any] | None:
    """Build a refusal payload when the caller is not the team's bound leader session.

    The caller's session uuid is read from ``TEAM_AGENT_LEADER_SESSION_UUID``,
    falling back to ``TEAM_AGENT_LEADER_SESSION_UUID_OVERRIDE``. Returns ``None``
    (no refusal) when either uuid is unknown or when they match; otherwise
    returns a dict describing the ``team_owner_mismatch`` with the bound pane,
    both uuid prefixes and the suggested recovery action.
    """
    identity = owner_identity or {}
    owner_uuid = str(identity.get("leader_session_uuid") or "")

    caller_uuid = ""
    for env_name in ("TEAM_AGENT_LEADER_SESSION_UUID", "TEAM_AGENT_LEADER_SESSION_UUID_OVERRIDE"):
        caller_uuid = os.environ.get(env_name) or ""
        if caller_uuid:
            break

    # Nothing to refuse unless both sides are identified and they disagree.
    if not (owner_uuid and caller_uuid) or caller_uuid == owner_uuid:
        return None

    receiver = state.get("leader_receiver") or {}
    bound_pane = receiver.get("pane_id") or identity.get("pane_id")
    team_id = team_state_key(state)
    message = (
        f"This workspace's team `{team_id}` is already bound to pane `{bound_pane}`. "
        "To work in this window either start a new team with a different team_id, operate through the bound pane, "
        "or run `team-agent claim-leader --confirm` only if you intend to forcibly take over."
    )
    refusal: dict[str, Any] = {}
    refusal["reason"] = "team_owner_mismatch"
    refusal["error"] = message
    refusal["bound_pane_id"] = bound_pane
    # Only short prefixes are recorded, never the full session uuids.
    refusal["caller_uuid_prefix"] = caller_uuid[:8]
    refusal["uuid_prefix"] = owner_uuid[:8]
    refusal["action"] = "team-agent claim-leader --confirm"
    return refusal
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
def claim_leader_receiver(
    workspace: Path,
    state: dict[str, Any],
    candidate: dict[str, Any],
    event_log: EventLog,
    *,
    confirm: bool,
    expected_epoch: int | None = None,
) -> dict[str, Any]:
    """Rebind the team's leader receiver to ``candidate`` after explicit confirmation.

    Args:
        workspace: Workspace whose runtime state is persisted on a successful claim.
        state: Runtime state dict; ``team_owner`` and ``leader_receiver`` are
            read and, on success, updated in place.
        candidate: Description of the pane being claimed (``pane_id``,
            ``provider``, ``pane_current_command`` are read here).
        event_log: Receives ``leader_receiver.claim_refused`` /
            ``leader_receiver.claimed`` audit events.
        confirm: Must be true; otherwise the claim is refused without side effects.
        expected_epoch: Optional optimistic-concurrency check against the
            current owner epoch.

    Returns:
        A dict with ``ok`` and ``status`` (``refused``, ``already_bound`` or
        ``claimed``) plus status-specific fields.
    """
    if not confirm:
        return {"ok": False, "status": "refused", "reason": "confirm_required", "action": "team-agent claim-leader --confirm"}

    # ``state.setdefault("team_owner", {})`` hands back None when the key is
    # present with a None value, which other code in this module tolerates via
    # ``state.get("team_owner") or ...``. Normalise to a dict stored in state so
    # the epoch bump below is persisted.
    owner = state.get("team_owner")
    if not isinstance(owner, dict):
        owner = {}
        state["team_owner"] = owner
    receiver = state.get("leader_receiver") or {}
    current_epoch = int(owner.get("owner_epoch") or receiver.get("owner_epoch") or 0)

    if expected_epoch is not None and current_epoch != expected_epoch:
        event_log.write("leader_receiver.claim_refused", reason="owner_epoch_advanced", owner_epoch=current_epoch, bound_pane_id=receiver.get("pane_id"))
        return {"ok": False, "status": "refused", "reason": "owner_epoch_advanced", "owner_epoch": current_epoch, "bound_pane_id": receiver.get("pane_id")}

    # NOTE(review): when neither side has a pane_id this compares None == None
    # and reports already_bound — confirm callers always pass a pane_id.
    if receiver.get("pane_id") == candidate.get("pane_id"):
        return {"ok": True, "status": "already_bound", "leader_receiver": receiver, "owner_epoch": current_epoch}

    # Imported lazily, and only once the cheap refusals above are out of the way.
    from team_agent.messaging.leader_panes import _leader_command_looks_usable, _receiver_from_target, _target_matches_owner_identity, _uuid_prefix

    if not _target_matches_owner_identity(candidate, owner):
        event_log.write("leader_receiver.claim_refused", reason="uuid_mismatch", candidate_pane_id=candidate.get("pane_id"))
        return {"ok": False, "status": "refused", "reason": "uuid_mismatch"}

    provider = str(candidate.get("provider") or receiver.get("provider") or "codex")
    if not _leader_command_looks_usable(str(candidate.get("pane_current_command", "")), provider):
        return {"ok": False, "status": "refused", "reason": "wrong_command", "candidate_pane_id": candidate.get("pane_id")}

    next_epoch = current_epoch + 1
    new_receiver = _receiver_from_target(candidate, provider, owner.get("leader_session_uuid"), next_epoch)
    owner["owner_epoch"] = next_epoch
    state["leader_receiver"] = new_receiver

    from team_agent.runtime import _runtime_lock, save_runtime_state

    # Only the save is performed under the runtime lock.
    with _runtime_lock(workspace, "leader_receiver"):
        save_runtime_state(workspace, state)
    event_log.write("leader_receiver.claimed", pane_id=new_receiver["pane_id"], owner_epoch=next_epoch, uuid_prefix=_uuid_prefix(owner))
    return {"ok": True, "status": "claimed", "leader_receiver": new_receiver, "owner_epoch": next_epoch}
|
|
377
|
+
|
|
378
|
+
|
|
204
379
|
def _fail_leader_delivery(
|
|
205
380
|
workspace: Path,
|
|
206
381
|
state: dict[str, Any],
|
|
@@ -302,12 +477,6 @@ def _format_team_agent_message(payload: dict[str, Any]) -> str:
|
|
|
302
477
|
|
|
303
478
|
|
|
304
479
|
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
480
|
|
|
312
481
|
|
|
313
482
|
|
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
"""Gap 28 (Slice 2 Stage 2): observe-only detection of leader-pane API errors.
|
|
2
|
+
|
|
3
|
+
The coordinator tick captures the leader pane scrollback once per cycle, scans it for
|
|
4
|
+
known upstream-API error patterns (Claude/Codex CLI errors that occur mid-turn), and
|
|
5
|
+
emits a structured `leader.api_error` audit event. The intent is observability — auto-
|
|
6
|
+
retry belongs to the upstream CLI; this module never touches the pane.
|
|
7
|
+
|
|
8
|
+
Event schema (logged via EventLog.write):
|
|
9
|
+
|
|
10
|
+
event: 'leader.api_error'
|
|
11
|
+
ts: ISO-8601 UTC (added by EventLog)
|
|
12
|
+
leader_session_uuid: str | None
|
|
13
|
+
error_class: 'Overloaded' | 'RateLimit' | 'Timeout' |
|
|
14
|
+
'NetworkError' | 'Unknown'
|
|
15
|
+
provider: 'claude' | 'codex' | 'claude_code' | str | None
|
|
16
|
+
partial_response_streamed: bool (heuristic: assistant text before the error)
|
|
17
|
+
worker_dispatch_just_before: list[str] (leader→worker msg_ids in the prior 60s)
|
|
18
|
+
retry_count: int (always 0 — the framework does not retry today)
|
|
19
|
+
matched_pattern_snippet: str (the captured error line, ≤160 chars)
|
|
20
|
+
|
|
21
|
+
Detection dedupes within the coordinator state via a (error_class, snippet-tail)
|
|
22
|
+
fingerprint stored under `state['coordinator']['last_api_error_fingerprint']`. A
|
|
23
|
+
clean tick (no error pattern present) clears the fingerprint so the next genuine
|
|
24
|
+
error re-emits. This keeps event volume bounded while still catching distinct
|
|
25
|
+
errors as they occur.
|
|
26
|
+
"""
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
import re
|
|
30
|
+
from datetime import datetime, timedelta, timezone
|
|
31
|
+
from pathlib import Path
|
|
32
|
+
from typing import Any, Callable
|
|
33
|
+
|
|
34
|
+
from team_agent.events import EventLog
|
|
35
|
+
from team_agent.message_store import MessageStore
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# Spark MEDIUM sweeps (2026-05-26):
|
|
39
|
+
# (#3) Require an API/provider context marker near the error keyword. Bare '503' /
|
|
40
|
+
# 'fetch failed' / 'timed out' in user text used to false-fire.
|
|
41
|
+
# (#7) Match across short sliding windows of 1-3 adjacent lines so wrapped tmux
|
|
42
|
+
# output ("claude:\n request timed out") still resolves to a single
|
|
43
|
+
# detection. Window joined with a single space; capped at _WINDOW_MAX_CHARS
|
|
44
|
+
# so the scan stays bounded.
|
|
45
|
+
_API_CONTEXT = (
|
|
46
|
+
r"(?:API\s+Error|HTTP\s*Error|HTTPError|request\s+failed|"
|
|
47
|
+
r"codex|claude|Anthropic|OpenAI|TypeError)"
|
|
48
|
+
)
|
|
49
|
+
|
|
50
|
+
# Patterns operate against a sliding window of up to 3 joined lines. The window
|
|
51
|
+
# never contains '\n' (lines are joined with a single space), so `[^\n]` and `.`
|
|
52
|
+
# behave the same; we use `[^\n]` for self-documentation.
|
|
53
|
+
_ERROR_PATTERNS: list[tuple[re.Pattern[str], str]] = [
|
|
54
|
+
# Overloaded — keyword itself already includes the "API Error:" prefix.
|
|
55
|
+
(re.compile(r"API\s+Error:\s*Overloaded", re.IGNORECASE), "Overloaded"),
|
|
56
|
+
# RateLimit — 429 with "Too Many Requests" is sufficiently specific; require it
|
|
57
|
+
# appear AFTER an API context marker OR before "Too Many Requests" tightly.
|
|
58
|
+
(re.compile(rf"(?:{_API_CONTEXT}[^\n]*\b429\b|\b429\s+Too\s+Many\s+Requests)", re.IGNORECASE), "RateLimit"),
|
|
59
|
+
# 5xx — must share a window with an API-context marker on either side.
|
|
60
|
+
(re.compile(rf"{_API_CONTEXT}[^\n]{{0,200}}\b5(?:00|02|03|04)\b", re.IGNORECASE), "NetworkError"),
|
|
61
|
+
(re.compile(rf"\b5(?:00|02|03|04)\b[^\n]{{0,200}}{_API_CONTEXT}", re.IGNORECASE), "NetworkError"),
|
|
62
|
+
# fetch failed — needs an API-context marker in the same window. The TypeError
|
|
63
|
+
# marker on its own counts (Node fetch frames the error this way).
|
|
64
|
+
(re.compile(rf"{_API_CONTEXT}[^\n]{{0,200}}fetch\s+failed", re.IGNORECASE), "NetworkError"),
|
|
65
|
+
(re.compile(rf"fetch\s+failed[^\n]{{0,200}}{_API_CONTEXT}", re.IGNORECASE), "NetworkError"),
|
|
66
|
+
# Timeout — likewise requires an API-context marker in the window, except for
|
|
67
|
+
# the unambiguous syscall token ETIMEDOUT.
|
|
68
|
+
(re.compile(rf"{_API_CONTEXT}[^\n]{{0,200}}(?:request|connection)\s+(?:timed\s+out|timeout)", re.IGNORECASE), "Timeout"),
|
|
69
|
+
(re.compile(rf"(?:request|connection)\s+(?:timed\s+out|timeout)[^\n]{{0,200}}{_API_CONTEXT}", re.IGNORECASE), "Timeout"),
|
|
70
|
+
(re.compile(r"\bETIMEDOUT\b", re.IGNORECASE), "Timeout"),
|
|
71
|
+
]
|
|
72
|
+
|
|
73
|
+
_RECENT_LINE_WINDOW = 100 # scan only the most recent N lines
|
|
74
|
+
_SLIDING_WINDOW_LINES = 3 # join up to 3 adjacent lines per scan window
|
|
75
|
+
_WINDOW_MAX_CHARS = 400 # discard windows beyond this length to bound work
|
|
76
|
+
_DISPATCH_WINDOW_SECONDS = 60 # leader→worker sends counted within this lookback
|
|
77
|
+
_PARTIAL_RESPONSE_HEAD_BYTES = 4000
|
|
78
|
+
|
|
79
|
+
_PARTIAL_RESPONSE_HINT = re.compile(
|
|
80
|
+
r"(?:^|\n)\s*(?:Assistant|⏺|●|> |I'll |I will |I'm |I am |Let me )",
|
|
81
|
+
re.IGNORECASE,
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def detect_leader_api_errors(
    workspace: Path,
    state: dict[str, Any],
    store: MessageStore,
    event_log: EventLog,
    *,
    capture_fn: Callable[[str], dict[str, Any]] | None = None,
    now_fn: Callable[[], datetime] | None = None,
) -> list[dict[str, Any]]:
    """Coordinator-tick entry point; returns the emitted events (zero or one).

    Captures the direct-tmux leader pane, looks for a known API error pattern
    and writes a ``leader.api_error`` event unless the same error fingerprint
    was already recorded under ``state['coordinator']``. A capture with no
    error pattern clears the stored fingerprint.

    ``capture_fn`` and ``now_fn`` let callers substitute the pane capture and
    the clock; they default to the tmux capture helper and the current UTC time.
    """
    receiver = state.get("leader_receiver") or {}
    if receiver.get("mode") != "direct_tmux":
        return []
    pane = receiver.get("pane_id")
    if not pane:
        return []

    grab = capture_fn if capture_fn else _default_capture_fn()
    captured = grab(str(pane))
    if not captured.get("ok"):
        return []

    scrollback = str(captured.get("capture") or "")
    coordinator_state = state.setdefault("coordinator", {})
    fingerprint_key = "last_api_error_fingerprint"

    hit = _match_first_error(scrollback)
    if not hit:
        # Clean tick: forget the previous fingerprint so a later error re-emits.
        if coordinator_state.get(fingerprint_key):
            coordinator_state[fingerprint_key] = None
        return []

    error_class, snippet = hit
    fingerprint = f"{error_class}::{snippet[-120:]}"
    if coordinator_state.get(fingerprint_key) == fingerprint:
        return []
    coordinator_state[fingerprint_key] = fingerprint

    now = now_fn() if now_fn else datetime.now(timezone.utc)
    cutoff_iso = (now - timedelta(seconds=_DISPATCH_WINDOW_SECONDS)).isoformat()

    # Prefer the team owner's session uuid, then the receiver's, else None.
    owner_uuid = str((state.get("team_owner") or {}).get("leader_session_uuid") or "")
    receiver_uuid = str(receiver.get("leader_session_uuid") or "")
    leader_uuid = owner_uuid or receiver_uuid or None
    provider = str(receiver.get("provider") or "") or None

    streamed = _scrollback_has_partial_response(scrollback, snippet)
    dispatches = _recent_leader_dispatches(store, cutoff_iso)
    event = event_log.write(
        "leader.api_error",
        leader_session_uuid=leader_uuid,
        error_class=error_class,
        provider=provider,
        partial_response_streamed=streamed,
        worker_dispatch_just_before=dispatches,
        retry_count=0,
        matched_pattern_snippet=snippet[:160],
    )
    return [event]
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _default_capture_fn() -> Callable[[str], dict[str, Any]]:
    """Return the tmux pane-capture helper, imported only when first requested."""
    from team_agent.messaging.deps import _capture_tmux_pane_text as capture_pane

    return capture_pane
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _match_first_error(scrollback: str) -> tuple[str, str] | None:
    """Find the freshest API error in the recent scrollback.

    Spark MEDIUM #7: scans sliding windows of 1..N adjacent lines. Lines inside
    a window are stripped and joined with a single space so a wrapped pair
    such as
        claude:
          request timed out
    is detected as one event without permitting unbounded cross-line matches.
    The latest window wins so the freshest error is reported.

    Returns ``(error_class, snippet)`` or ``None``. The snippet is at most 240
    characters of the matching window and always contains the start of the
    matched text.
    """
    if not scrollback:
        return None
    lines = [line.strip() for line in scrollback.splitlines()[-_RECENT_LINE_WINDOW:]]
    snippet_max = 240  # upper bound on the returned snippet length
    lead_chars = 40  # context kept before a match that sits deep in the window

    # Walk start positions newest-first and return on the first hit: this picks
    # the same window as scanning everything and keeping the largest start
    # (smallest matching size, first matching pattern) with less work.
    for start in range(len(lines) - 1, -1, -1):
        for size in range(1, _SLIDING_WINDOW_LINES + 1):
            end = start + size
            if end > len(lines):
                break
            window = " ".join(line for line in lines[start:end] if line)
            if not window:
                continue
            # Spark MEDIUM sweep #3 (2026-05-26): keep the tail of over-long
            # windows instead of dropping them. Errors land at the END of
            # verbose diagnostics, and scanning the last _WINDOW_MAX_CHARS
            # still bounds regex cost.
            if len(window) > _WINDOW_MAX_CHARS:
                window = window[-_WINDOW_MAX_CHARS:]
            for pattern, error_class in _ERROR_PATTERNS:
                match = pattern.search(window)
                if not match:
                    continue
                if match.end() <= snippet_max:
                    snippet = window[:snippet_max]
                else:
                    # The head of the window would cut the error off; anchor
                    # the snippet just before the match instead.
                    begin = max(0, match.start() - lead_chars)
                    snippet = window[begin:begin + snippet_max]
                return error_class, snippet
    return None
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def _scrollback_has_partial_response(scrollback: str, error_snippet: str) -> bool:
    """Heuristic: did assistant-looking text appear shortly before the error?

    Locates the last occurrence of ``error_snippet`` in ``scrollback`` and
    checks the preceding ``_PARTIAL_RESPONSE_HEAD_BYTES`` characters against
    ``_PARTIAL_RESPONSE_HINT``.

    The snippet is produced from stripped lines joined with single spaces, so
    for wrapped or indented errors it is not a literal substring of the raw
    capture. When the exact search fails, the snippet is searched again with
    every whitespace run allowed to match any whitespace (including newlines).
    Returns ``False`` when the snippet cannot be located at all.
    """
    idx = scrollback.rfind(error_snippet)
    if idx == -1:
        tokens = error_snippet.split()
        if not tokens:
            return False
        loose = re.compile(r"\s+".join(re.escape(token) for token in tokens))
        # Keep the last hit, mirroring rfind's "most recent occurrence" choice.
        for found in loose.finditer(scrollback):
            idx = found.start()
        if idx == -1:
            return False
    head = scrollback[max(0, idx - _PARTIAL_RESPONSE_HEAD_BYTES): idx]
    return bool(_PARTIAL_RESPONSE_HINT.search(head))
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def _recent_leader_dispatches(store: MessageStore, cutoff_iso: str) -> list[str]:
    """Return ids of leader-sent messages created at or after ``cutoff_iso``.

    Rows come from ``store.messages()``; a failure to read them yields an empty
    list because this only feeds an observability field. Timestamps are
    compared as parsed datetimes so that differing UTC offsets, a trailing
    ``Z`` or a space separator do not skew the filter. If either side cannot be
    parsed, the plain string comparison is used instead.
    """

    def _parse(stamp: str):
        text = stamp.strip()
        if text.endswith(("Z", "z")):
            # datetime.fromisoformat only accepts the "Z" suffix from 3.11 on.
            text = f"{text[:-1]}+00:00"
        try:
            parsed = datetime.fromisoformat(text)
        except ValueError:
            return None
        if parsed.tzinfo is None:
            # NOTE(review): naive timestamps are assumed to be UTC — confirm
            # against what MessageStore writes into created_at.
            parsed = parsed.replace(tzinfo=timezone.utc)
        return parsed

    try:
        rows = store.messages()
    except Exception:
        # Deliberate best-effort: no dispatch list is better than a failed tick.
        return []

    cutoff = _parse(cutoff_iso)
    out: list[str] = []
    for row in rows:
        sender = str(row.get("sender") or "")
        if sender not in {"leader", "Leader"} and not _looks_like_leader_sender(sender):
            continue
        created = str(row.get("created_at") or "")
        if not created:
            continue
        created_at = _parse(created) if cutoff is not None else None
        if created_at is not None:
            too_old = created_at < cutoff
        else:
            too_old = created < cutoff_iso
        if too_old:
            continue
        msg_id = str(row.get("message_id") or "")
        if msg_id:
            out.append(msg_id)
    return out
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def _looks_like_leader_sender(sender: str) -> bool:
    """Tell whether ``sender`` names the leader.

    True when the name equals ``leader`` in any letter case, or when it begins
    with the lowercase prefix ``leader`` (the prefix test is case-sensitive).
    """
    if sender.lower() == "leader":
        return True
    return sender.startswith("leader")
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
__all__ = ["detect_leader_api_errors"]
|