@team-agent/installer 0.2.2 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/schemas/team.schema.json +6 -0
- package/src/team_agent/abnormal_track.py +253 -0
- package/src/team_agent/approvals/runtime_prompts.py +1 -1
- package/src/team_agent/cli/commands.py +104 -3
- package/src/team_agent/cli/parser.py +10 -1
- package/src/team_agent/compiler.py +1 -1
- package/src/team_agent/coordinator/lifecycle.py +23 -2
- package/src/team_agent/diagnose/orphan_cleanup.py +199 -28
- package/src/team_agent/display/__init__.py +31 -0
- package/src/team_agent/display/adaptive.py +425 -0
- package/src/team_agent/display/backend.py +46 -0
- package/src/team_agent/display/close.py +6 -0
- package/src/team_agent/display/rebuild.py +102 -0
- package/src/team_agent/display/tiling.py +156 -0
- package/src/team_agent/display/worker_window.py +4 -0
- package/src/team_agent/display/workspace.py +36 -127
- package/src/team_agent/idle_predicate.py +200 -0
- package/src/team_agent/idle_takeover.py +59 -0
- package/src/team_agent/idle_takeover_wiring.py +111 -0
- package/src/team_agent/launch/core.py +14 -4
- package/src/team_agent/leader/__init__.py +444 -61
- package/src/team_agent/lifecycle/operations.py +1 -0
- package/src/team_agent/lifecycle/start.py +1 -1
- package/src/team_agent/message_store/core.py +38 -11
- package/src/team_agent/message_store/leader_notification_log.py +47 -26
- package/src/team_agent/message_store/schema.py +8 -2
- package/src/team_agent/messaging/delivery.py +336 -1
- package/src/team_agent/messaging/leader.py +13 -4
- package/src/team_agent/messaging/leader_api_errors.py +216 -0
- package/src/team_agent/messaging/leader_panes.py +294 -0
- package/src/team_agent/messaging/scheduler.py +12 -0
- package/src/team_agent/messaging/send.py +54 -26
- package/src/team_agent/messaging/tmux_io.py +202 -33
- package/src/team_agent/messaging/tmux_prompt.py +87 -0
- package/src/team_agent/messaging/trust_auto_answer.py +52 -0
- package/src/team_agent/provider_state/README.md +78 -0
- package/src/team_agent/provider_state/__init__.py +86 -0
- package/src/team_agent/provider_state/claude.py +86 -0
- package/src/team_agent/provider_state/codex.py +84 -0
- package/src/team_agent/provider_state/common.py +207 -0
- package/src/team_agent/provider_state/registry.py +118 -0
- package/src/team_agent/restart/orchestration.py +215 -12
- package/src/team_agent/runtime.py +65 -15
- package/src/team_agent/sessions/capture.py +65 -15
- package/src/team_agent/spec.py +63 -3
- package/src/team_agent/status/queries.py +32 -1
- package/src/team_agent/wake.py +58 -0
- package/src/team_agent/watch/__init__.py +145 -0
|
@@ -2,11 +2,12 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import json
|
|
4
4
|
import sqlite3
|
|
5
|
+
import time
|
|
5
6
|
import uuid
|
|
6
7
|
from contextlib import closing
|
|
7
8
|
from datetime import datetime, timedelta, timezone
|
|
8
9
|
from pathlib import Path
|
|
9
|
-
from typing import Any
|
|
10
|
+
from typing import Any, Callable
|
|
10
11
|
|
|
11
12
|
from . import agent_health as _agent_health
|
|
12
13
|
from . import result_watchers as _result_watchers
|
|
@@ -15,6 +16,28 @@ from team_agent.paths import runtime_dir
|
|
|
15
16
|
from team_agent.spec import validate_result_envelope
|
|
16
17
|
|
|
17
18
|
|
|
19
|
+
def _is_sqlite_locked(exc: sqlite3.OperationalError) -> bool:
    """Return True when *exc* reports SQLite lock contention.

    The check is a case-insensitive substring match on the error text for the
    three lock messages this module retries on: "database is locked",
    "database table is locked" and "database schema is locked".
    """
    message = str(exc).lower()
    return (
        "database is locked" in message
        or "database table is locked" in message
        or "database schema is locked" in message
    )


def _with_sqlite_busy_retry(
    action: Callable[[], None],
    *,
    attempts: int = 6,
    initial_delay: float = 0.05,
) -> None:
    """Run *action*, retrying with exponential backoff on SQLite lock errors.

    Args:
        action: Zero-argument callable to execute; its return value is ignored.
        attempts: Maximum number of times *action* is called. The default of 6
            matches the previously hard-coded loop bound.
        initial_delay: Seconds slept after the first failed attempt; the delay
            doubles after every further failure. With the defaults the total
            sleep before giving up is 0.05 + 0.1 + 0.2 + 0.4 + 0.8 = 1.55 s.

    Raises:
        ValueError: If *attempts* is smaller than 1 (the action would never run).
        sqlite3.OperationalError: Re-raised unchanged when the error is not a
            lock error, or when the final attempt also fails.
    """
    if attempts < 1:
        raise ValueError("attempts must be at least 1")
    delay = initial_delay
    # Count down so the "last attempt" test cannot drift from the loop bound.
    for remaining in range(attempts - 1, -1, -1):
        try:
            action()
            return
        except sqlite3.OperationalError as exc:
            if remaining == 0 or not _is_sqlite_locked(exc):
                raise
        time.sleep(delay)
        delay *= 2
|
|
39
|
+
|
|
40
|
+
|
|
18
41
|
class MessageStore:
|
|
19
42
|
SCHEMA_VERSION = SCHEMA_VERSION
|
|
20
43
|
|
|
@@ -27,13 +50,16 @@ class MessageStore:
|
|
|
27
50
|
def connect(self) -> sqlite3.Connection:
|
|
28
51
|
conn = sqlite3.connect(self.path, timeout=30.0, isolation_level=None)
|
|
29
52
|
conn.row_factory = sqlite3.Row
|
|
30
|
-
conn.execute("PRAGMA journal_mode=WAL")
|
|
31
53
|
conn.execute("PRAGMA busy_timeout=30000")
|
|
54
|
+
conn.execute("PRAGMA journal_mode=WAL")
|
|
32
55
|
return conn
|
|
33
56
|
|
|
34
57
|
def _init(self) -> None:
|
|
35
|
-
|
|
36
|
-
|
|
58
|
+
def initialize() -> None:
|
|
59
|
+
with closing(self.connect()) as conn:
|
|
60
|
+
initialize_schema(conn)
|
|
61
|
+
|
|
62
|
+
_with_sqlite_busy_retry(initialize)
|
|
37
63
|
|
|
38
64
|
def create_message(
|
|
39
65
|
self,
|
|
@@ -331,17 +357,17 @@ class MessageStore:
|
|
|
331
357
|
return counts
|
|
332
358
|
|
|
333
359
|
def add_result(self, envelope: dict[str, Any], owner_team_id: str | None = None) -> str:
|
|
334
|
-
_ = owner_team_id
|
|
335
360
|
validate_result_envelope(envelope)
|
|
336
361
|
result_id = f"res_{uuid.uuid4().hex[:12]}"
|
|
337
362
|
with closing(self.connect()) as conn:
|
|
338
363
|
with conn:
|
|
339
364
|
conn.execute(
|
|
340
365
|
"""
|
|
341
|
-
insert into results(result_id, task_id, agent_id, envelope, status, created_at)
|
|
342
|
-
values (?, ?, ?, ?, ?, ?)
|
|
366
|
+
insert into results(owner_team_id, result_id, task_id, agent_id, envelope, status, created_at)
|
|
367
|
+
values (?, ?, ?, ?, ?, ?, ?)
|
|
343
368
|
""",
|
|
344
369
|
(
|
|
370
|
+
owner_team_id,
|
|
345
371
|
result_id,
|
|
346
372
|
envelope["task_id"],
|
|
347
373
|
envelope["agent_id"],
|
|
@@ -423,16 +449,17 @@ class MessageStore:
|
|
|
423
449
|
return dict(row) if row else None
|
|
424
450
|
|
|
425
451
|
def latest_results(self, limit: int = 5, owner_team_id: str | None = None) -> list[dict[str, Any]]:
|
|
426
|
-
|
|
452
|
+
owner_clause = "and owner_team_id = ?" if owner_team_id else ""
|
|
453
|
+
args: tuple[Any, ...] = (owner_team_id, limit) if owner_team_id else (limit,)
|
|
427
454
|
with closing(self.connect()) as conn:
|
|
428
455
|
rows = conn.execute(
|
|
429
|
-
"""
|
|
456
|
+
f"""
|
|
430
457
|
select * from results
|
|
431
|
-
where status != 'invalid'
|
|
458
|
+
where status != 'invalid' {owner_clause}
|
|
432
459
|
order by created_at desc
|
|
433
460
|
limit ?
|
|
434
461
|
""",
|
|
435
|
-
|
|
462
|
+
args,
|
|
436
463
|
).fetchall()
|
|
437
464
|
return [dict(row) for row in reversed(rows)]
|
|
438
465
|
|
|
@@ -11,9 +11,20 @@ from __future__ import annotations
|
|
|
11
11
|
|
|
12
12
|
from contextlib import closing
|
|
13
13
|
from datetime import datetime, timedelta, timezone
|
|
14
|
+
import sqlite3
|
|
15
|
+
import time
|
|
14
16
|
from typing import Any
|
|
15
17
|
|
|
16
18
|
|
|
19
|
+
def _sqlite_locked(exc: sqlite3.OperationalError) -> bool:
    """Tell whether *exc* is one of the SQLite lock-contention errors.

    The error text is lower-cased and searched for any of the three lock
    messages; any other OperationalError yields False.
    """
    lowered = str(exc).lower()
    lock_markers = (
        "database is locked",
        "database table is locked",
        "database schema is locked",
    )
    return any(marker in lowered for marker in lock_markers)
|
|
26
|
+
|
|
27
|
+
|
|
17
28
|
def claim_leader_notification_delivery(
|
|
18
29
|
store: Any,
|
|
19
30
|
*,
|
|
@@ -28,32 +39,42 @@ def claim_leader_notification_delivery(
|
|
|
28
39
|
rowcount=0 means a prior row exists for (result_id, leader_session_uuid); SELECT
|
|
29
40
|
it and return so the caller can decide to suppress (same envelope_hash) or surface
|
|
30
41
|
legitimate-duplicate (different envelope_hash)."""
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
42
|
+
delay = 0.05
|
|
43
|
+
row = None
|
|
44
|
+
for attempt in range(6):
|
|
45
|
+
now = datetime.now(timezone.utc).isoformat()
|
|
46
|
+
try:
|
|
47
|
+
with closing(store.connect()) as conn:
|
|
48
|
+
with conn:
|
|
49
|
+
cur = conn.execute(
|
|
50
|
+
"insert or ignore into leader_notification_log("
|
|
51
|
+
" result_id, leader_session_uuid, notified_message_id, notified_at,"
|
|
52
|
+
" leader_pane_id_at_notify, envelope_content_hash, owner_team_id"
|
|
53
|
+
") values (?, ?, ?, ?, ?, ?, ?)",
|
|
54
|
+
(
|
|
55
|
+
result_id, leader_session_uuid, proposed_message_id, now,
|
|
56
|
+
pane_id, envelope_hash, owner_team_id,
|
|
57
|
+
),
|
|
58
|
+
)
|
|
59
|
+
if cur.rowcount == 1:
|
|
60
|
+
return {
|
|
61
|
+
"status": "claimed_by_you",
|
|
62
|
+
"notified_message_id": proposed_message_id,
|
|
63
|
+
"notified_at": now,
|
|
64
|
+
"envelope_content_hash": envelope_hash,
|
|
65
|
+
}
|
|
66
|
+
row = conn.execute(
|
|
67
|
+
"select notified_message_id, notified_at, envelope_content_hash, "
|
|
68
|
+
"leader_pane_id_at_notify from leader_notification_log "
|
|
69
|
+
"where result_id = ? and leader_session_uuid = ?",
|
|
70
|
+
(result_id, leader_session_uuid),
|
|
71
|
+
).fetchone()
|
|
72
|
+
break
|
|
73
|
+
except sqlite3.OperationalError as exc:
|
|
74
|
+
if not _sqlite_locked(exc) or attempt == 5:
|
|
75
|
+
raise
|
|
76
|
+
time.sleep(delay)
|
|
77
|
+
delay *= 2
|
|
57
78
|
if row is None:
|
|
58
79
|
# Should not happen (INSERT OR IGNORE returned 0 → row must exist), but be defensive.
|
|
59
80
|
return {"status": "claimed_by_you", "notified_message_id": proposed_message_id,
|
|
@@ -22,7 +22,7 @@ MESSAGE_COLUMNS = {
|
|
|
22
22
|
"error",
|
|
23
23
|
"delivery_attempts",
|
|
24
24
|
}
|
|
25
|
-
RESULT_COLUMNS = {"result_id", "task_id", "agent_id", "envelope", "status", "created_at"}
|
|
25
|
+
RESULT_COLUMNS = {"owner_team_id", "result_id", "task_id", "agent_id", "envelope", "status", "created_at"}
|
|
26
26
|
SCHEDULED_EVENT_COLUMNS = {
|
|
27
27
|
"id",
|
|
28
28
|
"owner_team_id",
|
|
@@ -125,6 +125,7 @@ def initialize_schema(conn: sqlite3.Connection) -> None:
|
|
|
125
125
|
"""
|
|
126
126
|
create table if not exists results (
|
|
127
127
|
result_id text primary key,
|
|
128
|
+
owner_team_id text,
|
|
128
129
|
task_id text not null,
|
|
129
130
|
agent_id text not null,
|
|
130
131
|
envelope text not null,
|
|
@@ -215,7 +216,12 @@ def initialize_schema(conn: sqlite3.Connection) -> None:
|
|
|
215
216
|
"owner_team_id": "alter table messages add column owner_team_id text",
|
|
216
217
|
},
|
|
217
218
|
)
|
|
218
|
-
_ensure_table_columns(
|
|
219
|
+
_ensure_table_columns(
|
|
220
|
+
conn,
|
|
221
|
+
"results",
|
|
222
|
+
RESULT_COLUMNS,
|
|
223
|
+
{"owner_team_id": "alter table results add column owner_team_id text"},
|
|
224
|
+
)
|
|
219
225
|
_ensure_table_columns(
|
|
220
226
|
conn,
|
|
221
227
|
"scheduled_events",
|
|
@@ -10,15 +10,62 @@ from team_agent.messaging.deps import (
|
|
|
10
10
|
core_render_message,
|
|
11
11
|
)
|
|
12
12
|
|
|
13
|
+
from datetime import datetime, timedelta, timezone
|
|
13
14
|
from pathlib import Path
|
|
14
15
|
from typing import Any
|
|
15
16
|
|
|
17
|
+
|
|
18
|
+
def _tmux_pane_width(target: str) -> dict[str, Any]:
    """Ask tmux for the width, in display columns, of the pane ``target``.

    This is the live wiring seam for the trust-prompt truncation matcher.

    Returns:
        ``{"ok": True, "pane_width": <int>}`` when tmux reports a positive
        integer width, otherwise ``{"ok": False, "error": "..."}``. A default
        width is never substituted: callers must treat a failure as "no
        boundary signal" so the workspace matcher falls back to exact equality
        and a hard-truncated prompt is never auto-answered on guesswork.
    """
    from team_agent.messaging.deps import run_cmd

    def _failure(reason: str) -> dict[str, Any]:
        # Uniform shape for every non-success outcome.
        return {"ok": False, "error": reason}

    command = ["tmux", "display-message", "-p", "-t", str(target), "-F", "#{pane_width}"]
    try:
        proc = run_cmd(command, timeout=2)
    except Exception as exc:  # pragma: no cover - defensive; tmux not present, timeout, etc.
        return _failure(f"tmux_query_failed:{exc.__class__.__name__}")

    if getattr(proc, "returncode", 1) != 0:
        # Surface only the first stderr line so the error stays short.
        stderr_lines = (getattr(proc, "stderr", "") or "").strip().splitlines()
        return _failure(stderr_lines[0] if stderr_lines else "tmux_query_nonzero")

    stdout = (getattr(proc, "stdout", "") or "").strip()
    if not stdout:
        return _failure("empty_output")

    try:
        columns = int(stdout.splitlines()[0].strip())
    except (ValueError, IndexError):
        return _failure("unparseable_output")

    if columns <= 0:
        return _failure("non_positive_width")
    return {"ok": True, "pane_width": columns}
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
# Spark MEDIUM sweep #3 (2026-05-26): retry_needed bounded backoff.
#
# _TRUST_RETRY_BACKOFF_SECONDS maps an attempt number to the delay (seconds)
# BEFORE the attempt with that number runs. Attempt 1 is the original delivery
# and therefore has no entry; attempt 2 fires 5s after retry_needed, attempt 3
# fires 15s after the previous, and attempt 4 fires 30s after the previous.
#
# _TRUST_RETRY_MAX_ATTEMPTS bounds the total — the 4th retry_needed is terminal
# and emits leader_panes.trust_auto_answer_exhausted.
_TRUST_RETRY_BACKOFF_SECONDS = {2: 5, 3: 15, 4: 30}
_TRUST_RETRY_MAX_ATTEMPTS = 4
|
|
60
|
+
|
|
16
61
|
def _deliver_pending_message(
|
|
17
62
|
workspace: Path,
|
|
18
63
|
state: dict[str, Any],
|
|
19
64
|
message_id: str,
|
|
20
65
|
wait_visible: bool = True,
|
|
21
66
|
timeout: float = 30.0,
|
|
67
|
+
*,
|
|
68
|
+
_trust_retry_attempt: int = 1,
|
|
22
69
|
) -> dict[str, Any]:
|
|
23
70
|
store = MessageStore(workspace)
|
|
24
71
|
row = next((m for m in store.messages() if m["message_id"] == message_id), None)
|
|
@@ -65,9 +112,58 @@ def _deliver_pending_message(
|
|
|
65
112
|
attempts=3 if wait_visible else 1,
|
|
66
113
|
provider=agent_state.get("provider", "fake"),
|
|
67
114
|
)
|
|
115
|
+
if not injection.get("ok") and injection.get("detected") == "codex_trust_prompt":
|
|
116
|
+
# Gap 29 (Stage 2): opt-in trust auto-answer. The helper enforces both the
|
|
117
|
+
# opt-in flag and a workspace-dir match before sending '1'+Enter, then we
|
|
118
|
+
# retry the original paste once the prompt has actually been dismissed.
|
|
119
|
+
# Bypassed entirely when opt-out (default) — the existing failed envelope
|
|
120
|
+
# is preserved.
|
|
121
|
+
from team_agent.messaging.leader_panes import attempt_trust_auto_answer
|
|
122
|
+
pane_target = injection.get("pane_id") or target
|
|
123
|
+
# Live wiring: query the tmux pane width now and hand it to the trust
|
|
124
|
+
# matcher via state["pane_width"]. On failure we leave pane_width
|
|
125
|
+
# absent so the matcher falls back to exact equality (fail-safe — a
|
|
126
|
+
# right-edge truncated prefix is never auto-answered on guesswork).
|
|
127
|
+
width_query = _tmux_pane_width(pane_target)
|
|
128
|
+
trust_state = dict(state) if isinstance(state, dict) else {}
|
|
129
|
+
if width_query.get("ok"):
|
|
130
|
+
trust_state["pane_width"] = width_query["pane_width"]
|
|
131
|
+
answer = attempt_trust_auto_answer(
|
|
132
|
+
workspace,
|
|
133
|
+
pane_target,
|
|
134
|
+
injection.get("pane_capture_tail") or "",
|
|
135
|
+
EventLog(workspace),
|
|
136
|
+
state=trust_state,
|
|
137
|
+
)
|
|
138
|
+
if answer.get("answered"):
|
|
139
|
+
# Spark MEDIUM #4 (2026-05-26): replace the fixed 0.3s sleep with a
|
|
140
|
+
# bounded poll. Slow terminals can take well over a second to clear
|
|
141
|
+
# the trust prompt; sleeping a fixed amount races dismissal and
|
|
142
|
+
# leaves the retry hitting the same codex_trust_prompt state. We
|
|
143
|
+
# poll for prompt dismissal up to 3s; if still present, return a
|
|
144
|
+
# retry_needed envelope and let the upstream scheduler decide
|
|
145
|
+
# whether to back off and try again later.
|
|
146
|
+
dismissed = _wait_for_trust_prompt_dismissal(
|
|
147
|
+
injection.get("pane_id") or target, timeout=3.0,
|
|
148
|
+
)
|
|
149
|
+
if not dismissed:
|
|
150
|
+
return _handle_trust_retry_needed(
|
|
151
|
+
workspace, state, store, message_id, target, injection,
|
|
152
|
+
attempt=_trust_retry_attempt,
|
|
153
|
+
)
|
|
154
|
+
injection = _tmux_inject_text(
|
|
155
|
+
target,
|
|
156
|
+
text,
|
|
157
|
+
"Enter",
|
|
158
|
+
f"team-agent-send-{message_id}-trust-retry",
|
|
159
|
+
attempts=3 if wait_visible else 1,
|
|
160
|
+
provider=agent_state.get("provider", "fake"),
|
|
161
|
+
)
|
|
68
162
|
if injection["ok"]:
|
|
69
163
|
store.mark(message_id, "submitted")
|
|
70
|
-
EventLog(workspace)
|
|
164
|
+
send_event_log = EventLog(workspace)
|
|
165
|
+
_stamp_first_send_at_if_leader_to_worker(state, row, send_event_log)
|
|
166
|
+
send_event_log.write(
|
|
71
167
|
"send.submitted",
|
|
72
168
|
message_id=message_id,
|
|
73
169
|
target=target,
|
|
@@ -112,9 +208,248 @@ def _deliver_pending_message(
|
|
|
112
208
|
"turn_verification": injection.get("turn_verification"),
|
|
113
209
|
"paste_attempts": injection.get("attempts"),
|
|
114
210
|
"submit_attempts": injection.get("submit_attempts"),
|
|
211
|
+
"detected": injection.get("detected"),
|
|
212
|
+
"pane_id": injection.get("pane_id"),
|
|
213
|
+
"pane_mode": injection.get("pane_mode"),
|
|
214
|
+
"pane_capture_tail": injection.get("pane_capture_tail"),
|
|
115
215
|
}
|
|
116
216
|
|
|
117
217
|
|
|
218
|
+
def _handle_trust_retry_needed(
    workspace: Path,
    state: dict[str, Any],
    store: MessageStore,
    message_id: str,
    target: str,
    injection: dict[str, Any],
    *,
    attempt: int,
) -> dict[str, Any]:
    """Handle a trust prompt that was answered but not dismissed in time.

    Spark MEDIUM sweep #3: bounded-backoff consumer replacing the dead-end
    failed mark. ``attempt`` is the number of the delivery that JUST failed
    (1 = the original delivery; 2..4 = the scheduler-fired retries). ``state``
    is accepted for signature parity and is not read here.

    Behaviour:
      * ``attempt + 1 <= _TRUST_RETRY_MAX_ATTEMPTS``: add a ``trust_retry``
        scheduled event for the message, then hold the message in 'failed'
        (reason ``trust_retry_scheduled``) so _deliver_pending_messages does
        not race the scheduler. Emits
        ``leader_panes.trust_auto_answer_retry_needed`` followed by
        ``leader_panes.trust_auto_answer_retry_scheduled`` and returns an
        envelope with ``status='retry_scheduled'``.
      * otherwise: terminal. Marks the message failed, emits
        ``leader_panes.trust_auto_answer_exhausted`` and returns an envelope
        with ``status='trust_auto_answer_exhausted'``.

    Both envelopes have ``ok=False`` and echo the ``detected``, ``pane_id``,
    ``pane_mode`` and ``pane_capture_tail`` fields of ``injection``.
    """
    audit = EventLog(workspace)
    pane_id = injection.get("pane_id")
    # Diagnostics copied verbatim from the failed injection into either envelope.
    pane_details = {
        "detected": injection.get("detected"),
        "pane_id": pane_id,
        "pane_mode": injection.get("pane_mode"),
        "pane_capture_tail": injection.get("pane_capture_tail"),
    }
    upcoming = attempt + 1

    if upcoming > _TRUST_RETRY_MAX_ATTEMPTS:
        store.mark(message_id, "failed", "trust_auto_answer_exhausted")
        audit.write(
            "leader_panes.trust_auto_answer_exhausted",
            message_id=message_id,
            workspace=str(workspace),
            attempts=attempt,
            target=target,
            pane_id=pane_id,
            reason="trust_auto_answer_exhausted",
        )
        return {
            "ok": False,
            "status": "trust_auto_answer_exhausted",
            "reason": "trust_auto_answer_exhausted",
            "attempts": attempt,
            **pane_details,
        }

    # Attempts without an explicit entry reuse the delay of the final attempt.
    fallback_delay = _TRUST_RETRY_BACKOFF_SECONDS[_TRUST_RETRY_MAX_ATTEMPTS]
    backoff = _TRUST_RETRY_BACKOFF_SECONDS.get(upcoming, fallback_delay)
    due_at = (datetime.now(timezone.utc) + timedelta(seconds=backoff)).isoformat()
    retry_payload = {
        "message_id": message_id,
        "attempt": upcoming,
        "max_attempts": _TRUST_RETRY_MAX_ATTEMPTS,
        "first_target": target,
    }
    event_id = store.add_scheduled_event(
        due_at,
        message_id,
        "trust_retry",
        retry_payload,
        owner_team_id=_message_owner_team_id(store, message_id),
    )
    # Hold the message in 'failed' so _deliver_pending_messages does not race
    # the scheduled retry. The scheduler consumer resets it to 'accepted' just
    # before re-delivery.
    store.mark(message_id, "failed", "trust_retry_scheduled")
    audit.write(
        "leader_panes.trust_auto_answer_retry_needed",
        message_id=message_id,
        workspace=str(workspace),
        pane_id=pane_id or target,
        target=target,
        reason="trust_prompt_not_dismissed_after_answer",
        attempt=attempt,
    )
    audit.write(
        "leader_panes.trust_auto_answer_retry_scheduled",
        message_id=message_id,
        workspace=str(workspace),
        scheduled_event_id=event_id,
        due_at=due_at,
        next_attempt=upcoming,
        max_attempts=_TRUST_RETRY_MAX_ATTEMPTS,
        backoff_seconds=backoff,
    )
    return {
        "ok": False,
        "status": "retry_scheduled",
        "reason": "trust_prompt_not_dismissed_after_answer",
        "stage": "trust_auto_answer_dismissal_wait",
        "verification": "trust_prompt_not_dismissed_after_answer",
        "scheduled_event_id": event_id,
        "scheduled_retry_at": due_at,
        "next_attempt": upcoming,
        "max_attempts": _TRUST_RETRY_MAX_ATTEMPTS,
        **pane_details,
    }
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
def _message_owner_team_id(store: MessageStore, message_id: str) -> str | None:
    """Return the ``owner_team_id`` of a stored message as a string.

    Yields None when ``_message_by_id`` finds no row for ``message_id`` or when
    the row's ``owner_team_id`` is missing or falsy.
    """
    record = _message_by_id(store, message_id)
    owner = record.get("owner_team_id") if record else None
    return str(owner) if owner else None
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
def _execute_trust_retry(
    workspace: Path,
    store: MessageStore,
    event_log: EventLog,
    payload: dict[str, Any],
    *,
    owner_team_id: str | None = None,
) -> dict[str, Any]:
    """Scheduler-side consumer for scheduled events of kind ``trust_retry``.

    Reads ``message_id``, ``attempt`` and ``max_attempts`` from ``payload``,
    flips the message back to 'accepted' so claim_for_delivery succeeds, and
    re-runs ``_deliver_pending_message`` with ``_trust_retry_attempt`` set to
    the payload's attempt. The delivery result is returned unchanged, so the
    outcome is a success, a further scheduled retry, or the terminal exhausted
    envelope.

    Returns ``{"ok": False, "reason": ...}`` without delivering when the
    payload carries no message id (``trust_retry_missing_message_id``) or the
    message row no longer exists (``message_missing``; also logged as
    ``leader_panes.trust_auto_answer_retry_skipped``).
    """
    from team_agent.state import load_runtime_state

    message_id = str(payload.get("message_id") or "")
    if not message_id:
        return {"ok": False, "reason": "trust_retry_missing_message_id"}

    attempt = int(payload.get("attempt") or 1)
    if not _message_by_id(store, message_id):
        event_log.write(
            "leader_panes.trust_auto_answer_retry_skipped",
            message_id=message_id,
            reason="message_missing",
            attempt=attempt,
        )
        return {"ok": False, "reason": "message_missing"}

    # Reset to accepted so claim_for_delivery succeeds. The previous attempt
    # left the row in 'failed' status with reason='trust_retry_scheduled'.
    # NOTE(review): the reset is unconditional; confirm nothing else can move
    # the message to another status between scheduling and this consumer.
    store.mark(message_id, "accepted", "trust_retry_resuming")
    max_attempts = int(payload.get("max_attempts") or _TRUST_RETRY_MAX_ATTEMPTS)
    event_log.write(
        "leader_panes.trust_auto_answer_retry_attempted",
        message_id=message_id,
        workspace=str(workspace),
        attempt=attempt,
        max_attempts=max_attempts,
    )

    # Prefer the owning team's state slice when the runtime state has one.
    runtime_state = load_runtime_state(workspace)
    teams = runtime_state.get("teams") if owner_team_id else None
    team_state = teams.get(owner_team_id) if isinstance(teams, dict) else None
    if isinstance(team_state, dict):
        runtime_state = team_state

    return _deliver_pending_message(
        workspace,
        runtime_state,
        message_id,
        wait_visible=True,
        timeout=30.0,
        _trust_retry_attempt=attempt,
    )
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
def _stamp_first_send_at_if_leader_to_worker(
    state: dict[str, Any],
    row: dict[str, Any],
    event_log: EventLog | None = None,
) -> None:
    """Record the first successful leader -> worker send on the worker's state.

    Route B atomicity (2026-05-27): the presence of ``first_send_at`` drives
    restart's resumability decision — a worker the leader has interacted with
    has accumulated conversation state, so a missing session_id at restart
    time IS an atomicity violation, while a worker that never received work
    legitimately fresh-starts.

    The stamp is written at most once per worker (re-sends are no-ops) and
    only for messages whose sender is "leader", "Leader" or the configured
    ``state["leader"]["id"]``; worker-to-worker peer messages do not count.
    The mutation is made in place on ``state["agents"][recipient]``; this
    function does not persist it, so the caller is expected to save the state.

    C1 (cr verdict, 2026-05-27): on the one-time null -> timestamp transition a
    ``worker.first_interaction`` event (worker_id, first_send_at, message_id)
    is written to ``event_log`` when one is supplied.

    Args:
        state: Runtime state holding optional "leader" and "agents" mappings.
        row: Message row; "sender", "recipient" and "message_id" are read.
        event_log: Optional audit sink exposing ``write(name, **fields)``.
    """
    recipient = str(row.get("recipient") or "")
    if not recipient:
        return

    # Tolerate a malformed "leader" entry the same way "agents" is tolerated
    # below: anything that is not a mapping falls back to the default id rather
    # than raising after the message has already been submitted.
    leader = state.get("leader")
    configured_id = leader.get("id") if isinstance(leader, dict) else None
    leader_id = str(configured_id or "leader")

    sender = str(row.get("sender") or "")
    if sender not in {"leader", "Leader", leader_id}:
        return

    agents = state.get("agents")
    if not isinstance(agents, dict):
        return
    agent_state = agents.get(recipient)
    if not isinstance(agent_state, dict):
        return
    if agent_state.get("first_send_at"):
        # Already stamped: keep the original timestamp and emit nothing.
        return

    stamp = datetime.now(timezone.utc).isoformat()
    agent_state["first_send_at"] = stamp
    if event_log is not None:
        event_log.write(
            "worker.first_interaction",
            worker_id=recipient,
            first_send_at=stamp,
            message_id=str(row.get("message_id") or ""),
        )
|
|
425
|
+
|
|
426
|
+
|
|
427
|
+
def _wait_for_trust_prompt_dismissal(target: str, *, timeout: float = 3.0, poll_interval: float = 0.1) -> bool:
    """Poll the pane until the trust prompt is gone or ``timeout`` expires.

    Spark MEDIUM #4: bounded poll for trust prompt dismissal. The pane tail is
    captured at least once, even with a zero or negative timeout (negative
    values are clamped to 0). Returns True as soon as
    detect_non_input_scrollback no longer reports "codex_trust_prompt", and
    False if the prompt is still reported once the deadline has passed. Uses
    the same detector the inject path uses so behaviour stays consistent.
    """
    import time as _time
    from team_agent.messaging.tmux_prompt import detect_non_input_scrollback

    expires_at = _time.monotonic() + max(timeout, 0.0)
    # NOTE(review): _capture_pane_tail returns "" when the capture fails; that
    # presumably does not classify as a trust prompt, so a failed capture reads
    # as "dismissed" — confirm this is the intended fallback.
    while detect_non_input_scrollback(_capture_pane_tail(target)) == "codex_trust_prompt":
        if _time.monotonic() >= expires_at:
            return False
        _time.sleep(poll_interval)
    return True
|
|
443
|
+
|
|
444
|
+
|
|
445
|
+
def _capture_pane_tail(target: str) -> str:
    """Return the captured text of pane ``target``, or "" when capture fails.

    Wraps ``_capture_tmux_pane_text``: a result without a truthy ``ok`` flag,
    or with a missing/empty ``capture`` value, yields the empty string.
    """
    from team_agent.messaging.deps import _capture_tmux_pane_text

    result = _capture_tmux_pane_text(target)
    if result.get("ok"):
        return str(result.get("capture") or "")
    return ""
|
|
451
|
+
|
|
452
|
+
|
|
118
453
|
def _deliver_pending_messages(workspace: Path, state: dict[str, Any], event_log: EventLog) -> list[str]:
|
|
119
454
|
store = MessageStore(workspace)
|
|
120
455
|
delivered: list[str] = []
|
|
@@ -251,6 +251,19 @@ def _send_to_leader_receiver(
|
|
|
251
251
|
f"team-agent-leader-receiver-{message_id}",
|
|
252
252
|
provider=receiver.get("provider", "codex"),
|
|
253
253
|
)
|
|
254
|
+
if not injection.get("ok") and injection.get("detected") == "codex_trust_prompt":
|
|
255
|
+
from team_agent.messaging.trust_auto_answer import retry_injection_after_trust_auto_answer
|
|
256
|
+
injection = retry_injection_after_trust_auto_answer(
|
|
257
|
+
workspace,
|
|
258
|
+
state,
|
|
259
|
+
event_log,
|
|
260
|
+
injection,
|
|
261
|
+
target,
|
|
262
|
+
text,
|
|
263
|
+
submit_key,
|
|
264
|
+
f"team-agent-leader-receiver-{message_id}-trust-retry",
|
|
265
|
+
receiver.get("provider", "codex"),
|
|
266
|
+
)
|
|
254
267
|
if injection["ok"]:
|
|
255
268
|
store.mark(message_id, "submitted")
|
|
256
269
|
event_log.write(
|
|
@@ -466,10 +479,6 @@ def _format_team_agent_message(payload: dict[str, Any]) -> str:
|
|
|
466
479
|
|
|
467
480
|
|
|
468
481
|
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
482
|
|
|
474
483
|
|
|
475
484
|
|