@team-agent/installer 0.2.1 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/schemas/team.schema.json +6 -0
- package/src/team_agent/approvals/runtime_prompts.py +1 -1
- package/src/team_agent/cli/commands.py +122 -6
- package/src/team_agent/cli/parser.py +42 -1
- package/src/team_agent/coordinator/__main__.py +21 -2
- package/src/team_agent/coordinator/lifecycle.py +11 -0
- package/src/team_agent/diagnose/orphan_cleanup.py +364 -0
- package/src/team_agent/events.py +47 -0
- package/src/team_agent/launch/core.py +2 -1
- package/src/team_agent/leader/__init__.py +273 -60
- package/src/team_agent/lifecycle/agents.py +54 -2
- package/src/team_agent/lifecycle/operations.py +87 -9
- package/src/team_agent/lifecycle/start.py +1 -1
- package/src/team_agent/message_store/core.py +8 -7
- package/src/team_agent/message_store/leader_notification_log.py +132 -0
- package/src/team_agent/message_store/result_watchers.py +144 -1
- package/src/team_agent/message_store/schema.py +31 -2
- package/src/team_agent/messaging/delivery.py +293 -1
- package/src/team_agent/messaging/idle_alerts.py +109 -9
- package/src/team_agent/messaging/leader.py +179 -10
- package/src/team_agent/messaging/leader_api_errors.py +216 -0
- package/src/team_agent/messaging/leader_panes.py +393 -23
- package/src/team_agent/messaging/result_delivery.py +219 -4
- package/src/team_agent/messaging/results.py +12 -21
- package/src/team_agent/messaging/scheduler.py +24 -2
- package/src/team_agent/messaging/send.py +21 -26
- package/src/team_agent/messaging/tmux_io.py +153 -23
- package/src/team_agent/messaging/tmux_prompt.py +87 -0
- package/src/team_agent/messaging/trust_auto_answer.py +44 -0
- package/src/team_agent/restart/orchestration.py +207 -4
- package/src/team_agent/runtime.py +7 -7
- package/src/team_agent/rust_core.py +157 -3
- package/src/team_agent/sessions/capture.py +65 -15
- package/src/team_agent/spec.py +59 -0
- package/src/team_agent/state.py +153 -10
- package/src/team_agent/status/inbox.py +33 -3
- package/src/team_agent/status/queries.py +32 -1
- package/src/team_agent/watch/__init__.py +145 -0
|
@@ -20,6 +20,7 @@ from team_agent.messaging.deps import (
|
|
|
20
20
|
|
|
21
21
|
from pathlib import Path
|
|
22
22
|
from typing import Any
|
|
23
|
+
from team_agent.messaging.tmux_prompt import detect_non_input_scrollback, non_input_scrollback_window
|
|
23
24
|
|
|
24
25
|
def _tmux_inject_text(
|
|
25
26
|
target: str,
|
|
@@ -28,6 +29,8 @@ def _tmux_inject_text(
|
|
|
28
29
|
buffer_name: str,
|
|
29
30
|
attempts: int = 3,
|
|
30
31
|
provider: str = "fake",
|
|
32
|
+
*,
|
|
33
|
+
bypass_non_input_gate: bool = False,
|
|
31
34
|
) -> dict[str, Any]:
|
|
32
35
|
token_match = re.search(r"\[team-agent-token:([^\]]+)\]", text)
|
|
33
36
|
token = token_match.group(1) if token_match else ""
|
|
@@ -37,15 +40,25 @@ def _tmux_inject_text(
|
|
|
37
40
|
submit_settle_timeout = _tmux_submit_settle_timeout(text)
|
|
38
41
|
text_bytes = _tmux_text_size(text)
|
|
39
42
|
for attempt in range(1, max(attempts, 1) + 1):
|
|
40
|
-
prepared =
|
|
43
|
+
prepared = (
|
|
44
|
+
{"ok": True, "verification": "non_input_gate_bypassed"}
|
|
45
|
+
if bypass_non_input_gate
|
|
46
|
+
else _prepare_tmux_pane_for_input(target)
|
|
47
|
+
)
|
|
41
48
|
if not prepared["ok"]:
|
|
42
|
-
attempt_log.append(
|
|
49
|
+
attempt_log.append(_prepare_failure_attempt(attempt, prepared))
|
|
43
50
|
return {
|
|
44
51
|
"ok": False,
|
|
52
|
+
"status": "failed",
|
|
45
53
|
"stage": prepared["stage"],
|
|
54
|
+
"reason": prepared.get("reason"),
|
|
46
55
|
"error": prepared.get("error"),
|
|
47
56
|
"attempts": attempt_log,
|
|
48
57
|
"verification": prepared["verification"],
|
|
58
|
+
"detected": prepared.get("detected"),
|
|
59
|
+
"pane_id": prepared.get("pane_id"),
|
|
60
|
+
"pane_mode": prepared.get("pane_mode"),
|
|
61
|
+
"pane_capture_tail": prepared.get("pane_capture_tail"),
|
|
49
62
|
}
|
|
50
63
|
baseline = _capture_tmux_pane_text(target)
|
|
51
64
|
if not baseline["ok"]:
|
|
@@ -97,6 +110,9 @@ def _tmux_inject_text(
|
|
|
97
110
|
attempt_entry["buffer_delete_error"] = deleted.get("error")
|
|
98
111
|
if prepared.get("recovered_from_mode"):
|
|
99
112
|
attempt_entry["recovered_from_mode"] = True
|
|
113
|
+
attempt_entry["recovered_from_pane_mode"] = prepared.get("pane_mode")
|
|
114
|
+
if prepared.get("warning_event"):
|
|
115
|
+
attempt_entry["warning_event"] = prepared["warning_event"]
|
|
100
116
|
attempt_log.append(attempt_entry)
|
|
101
117
|
if not visible:
|
|
102
118
|
time.sleep(0.2)
|
|
@@ -276,50 +292,164 @@ def _tmux_load_buffer_stdin(buffer_name: str, text: str) -> subprocess.Completed
|
|
|
276
292
|
|
|
277
293
|
|
|
278
294
|
def _prepare_tmux_pane_for_input(target: str) -> dict[str, Any]:
    """Check that the tmux pane ``target`` can take pasted input, leaving a pane mode if needed.

    Steps, in order:
      1. Read the pane mode; a failed read returns stage ``pane-mode-check``.
      2. Capture the last 30 lines; a failed capture returns stage ``pane-tail-capture``.
      3. If the captured tail looks like a non-input screen, return a refusal.
      4. If the pane is in no mode, report it ready.
      5. Otherwise send the mode's exit key and poll for up to 1.5 seconds
         until the pane reports no mode.

    Returns a dict with ``ok`` plus ``verification``; failures also carry
    ``stage`` and ``error``. A successful recovery sets ``recovered_from_mode``
    and ``pane_mode``, and ``warning_event`` when the cancel step produced one.
    """
    initial_mode = _pane_mode(target)
    if not initial_mode["ok"]:
        return {
            "ok": False,
            "stage": "pane-mode-check",
            "verification": "pane_mode_check_failed",
            "error": initial_mode.get("error") or "tmux pane mode check failed",
        }

    tail = _pane_capture_tail(target, lines=30)
    if not tail["ok"]:
        return {
            "ok": False,
            "stage": "pane-tail-capture",
            "verification": "pane_tail_capture_failed",
            "error": tail.get("error") or "tmux capture-pane failed",
        }

    pane_mode = _normalize_pane_mode(initial_mode.get("pane_mode"))
    capture_tail = str(tail.get("capture") or "")

    # A blocking prompt in the scrollback takes precedence over the pane mode.
    blocker = detect_non_input_scrollback(capture_tail)
    if blocker:
        return _non_input_refusal(target, pane_mode, capture_tail, blocker)
    if not pane_mode:
        return {"ok": True, "verification": "pane_input_ready"}

    cancel = _pane_mode_cancel(target, pane_mode)
    warning_event = cancel.get("warning_event")
    if not cancel["ok"]:
        return _non_input_refusal(
            target,
            pane_mode,
            capture_tail,
            f"tmux_{pane_mode}",
            error=cancel.get("error") or "tmux pane mode cancel failed",
            verification="pane_mode_cancel_failed",
            warning_event=warning_event,
        )

    # Poll until the pane has left the mode or the grace period runs out.
    give_up_at = time.monotonic() + 1.5
    while True:
        recheck = _pane_mode(target)
        if not recheck["ok"]:
            return {
                "ok": False,
                "stage": "pane-mode-check",
                "verification": "pane_mode_recheck_failed",
                "error": recheck.get("error") or "tmux pane mode recheck failed",
            }
        if not _normalize_pane_mode(recheck.get("pane_mode")):
            recovered = {
                "ok": True,
                "verification": "pane_input_ready_after_mode_cancel",
                "recovered_from_mode": True,
                "pane_mode": pane_mode,
            }
            if warning_event:
                recovered["warning_event"] = warning_event
            return recovered
        if time.monotonic() >= give_up_at:
            return _non_input_refusal(
                target,
                pane_mode,
                capture_tail,
                f"tmux_{pane_mode}",
                error=f"tmux pane stayed in {pane_mode} after cancel",
                verification="pane_mode_still_active_after_cancel",
                warning_event=warning_event,
            )
        time.sleep(0.1)
|
|
317
361
|
|
|
318
362
|
|
|
363
|
+
def _pane_mode(target: str) -> dict[str, Any]:
    """Ask tmux for the ``#{pane_mode}`` format value of ``target``.

    Returns ``{"ok": True, "pane_mode": <stripped stdout>}`` on success, or
    ``{"ok": False, "error": ...}`` when tmux exits non-zero.
    """
    command = ["tmux", "display-message", "-p", "-t", target, "#{pane_mode}"]
    proc = run_cmd(command, timeout=5)
    if proc.returncode == 0:
        return {"ok": True, "pane_mode": proc.stdout.strip()}
    return {"ok": False, "error": proc.stderr.strip() or "tmux pane mode check failed"}
|
|
368
|
+
|
|
369
|
+
|
|
370
|
+
def _pane_capture_tail(target: str, lines: int = 30) -> dict[str, Any]:
    """Run ``tmux capture-pane -p`` for ``target`` with a start line of ``-lines``.

    Returns ``{"ok": True, "capture": <raw stdout>}`` on success; on a non-zero
    exit returns ``ok`` False with an empty ``capture`` and an ``error`` message.
    """
    command = ["tmux", "capture-pane", "-p", "-S", f"-{lines}", "-t", target]
    proc = run_cmd(command, timeout=5)
    if proc.returncode == 0:
        return {"ok": True, "capture": proc.stdout}
    return {
        "ok": False,
        "capture": "",
        "error": proc.stderr.strip() or "tmux capture-pane failed",
    }
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
def _pane_mode_cancel(target: str, pane_mode: str) -> dict[str, Any]:
    """Send the key sequence that should make ``target`` leave ``pane_mode``.

    Known modes map to fixed ``tmux send-keys`` arguments: ``copy-mode`` uses
    ``-X cancel``, ``tree-mode`` and ``view-mode`` use ``q``, ``client-mode``
    uses ``d``. Any other mode falls back to ``-X cancel`` and is tagged with
    the ``pane_mode_unknown_cancel_attempted`` warning event.

    On success returns ``ok`` True with ``mode`` and ``args`` (plus
    ``warning_event`` when one was set). On failure returns ``ok`` False with
    ``error`` and a ``warning_event`` key that may be ``None``.
    """
    mode = _normalize_pane_mode(pane_mode)
    exit_keys = {
        "copy-mode": ["-X", "cancel"],
        "tree-mode": ["q"],
        "view-mode": ["q"],
        "client-mode": ["d"],
    }
    keys = exit_keys.get(mode)
    warning_event = None
    if keys is None:
        # Unrecognised mode: try the copy-mode cancel and flag the guess.
        keys = ["-X", "cancel"]
        warning_event = "pane_mode_unknown_cancel_attempted"
    args = ["tmux", "send-keys", "-t", target, *keys]

    proc = run_cmd(args, timeout=10)
    if proc.returncode != 0:
        return {
            "ok": False,
            "error": proc.stderr.strip() or f"tmux {mode or 'unknown'} cancel failed",
            "warning_event": warning_event,
        }
    outcome = {"ok": True, "mode": mode, "args": args}
    if warning_event:
        outcome["warning_event"] = warning_event
    return outcome
|
|
400
|
+
|
|
401
|
+
|
|
402
|
+
def _normalize_pane_mode(mode: Any) -> str:
|
|
403
|
+
value = str(mode or "").strip()
|
|
404
|
+
if value == "0":
|
|
405
|
+
return ""
|
|
406
|
+
if value == "1":
|
|
407
|
+
return "copy-mode"
|
|
408
|
+
return value
|
|
409
|
+
|
|
410
|
+
|
|
411
|
+
def _non_input_refusal(
    target: str,
    pane_mode: str,
    capture_tail: str,
    detected: str,
    *,
    error: str | None = None,
    verification: str = "recipient_pane_in_non_input_mode",
    warning_event: str | None = None,
) -> dict[str, Any]:
    """Build the failure dict returned when a pane must not receive pasted input.

    The dict always has stage ``pre-paste-pane-state`` and reason
    ``recipient_pane_in_non_input_mode``. ``error`` defaults to that same
    reason string. ``pane_capture_tail`` is the cleaned scrollback window, or
    the last 10 raw lines when that window is empty. ``warning_event`` is
    only added when truthy.
    """
    tail_excerpt = non_input_scrollback_window(capture_tail) or _last_lines(capture_tail, 10)
    refusal = {
        "ok": False,
        "status": "failed",
        "stage": "pre-paste-pane-state",
        "reason": "recipient_pane_in_non_input_mode",
        "error": error or "recipient_pane_in_non_input_mode",
        "verification": verification,
        "detected": detected,
        "pane_id": target,
        "pane_mode": pane_mode,
        "pane_capture_tail": tail_excerpt,
    }
    if warning_event:
        refusal["warning_event"] = warning_event
    return refusal
|
|
319
436
|
|
|
320
437
|
|
|
438
|
+
def _prepare_failure_attempt(attempt: int, prepared: dict[str, Any]) -> dict[str, Any]:
|
|
439
|
+
entry = {
|
|
440
|
+
"attempt": attempt,
|
|
441
|
+
"visible": False,
|
|
442
|
+
"verification": prepared["verification"],
|
|
443
|
+
}
|
|
444
|
+
for key in ("reason", "detected", "pane_id", "pane_mode", "pane_capture_tail", "warning_event"):
|
|
445
|
+
if key in prepared:
|
|
446
|
+
entry[key] = prepared[key]
|
|
447
|
+
return entry
|
|
321
448
|
|
|
322
449
|
|
|
450
|
+
def _last_lines(text: str, count: int) -> str:
|
|
451
|
+
lines = text.splitlines()
|
|
452
|
+
return "\n".join(lines[-count:])
|
|
323
453
|
|
|
324
454
|
|
|
325
455
|
|
|
@@ -12,6 +12,93 @@ from team_agent.messaging.deps import (
|
|
|
12
12
|
from pathlib import Path
|
|
13
13
|
from typing import Any
|
|
14
14
|
|
|
15
|
+
|
|
16
|
+
_ANSI_ESCAPE_RE = re.compile(r"\x1b\[[0-?]*[ -/]*[@-~]")
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def detect_non_input_scrollback(capture_tail: str) -> str | None:
    """Classify a pane capture that appears to be waiting on something other than input.

    Blocking-prompt patterns are tried in a fixed order against the cleaned,
    non-empty tail lines. The first one that matches decides the result: its
    label is returned, unless an input-ready prompt appears after the most
    recent stale-prompt line, in which case ``None`` is returned. If no
    blocking prompt matches and the last line ends like a shell prompt
    (``$`` or ``%``), ``"shell_prompt_cli_dead"`` is returned.

    Returns one of ``codex_trust_prompt``, ``codex_first_run_auth``,
    ``codex_compaction_warning``, ``generic_press_enter``, ``y_n_confirm``,
    ``numbered_menu``, ``shell_prompt_cli_dead``, or ``None``.
    """
    nonempty = _non_input_scrollback_lines(capture_tail)
    tail_text = "\n".join(nonempty)
    lower = tail_text.lower()
    stale_before_input = _stale_non_input_before_ready_prompt(nonempty)

    # (label, lazy predicate) pairs; order matters because the first hit wins.
    blocking_checks = (
        (
            "codex_trust_prompt",
            lambda: re.search(r"do\s+you\s+trust\s+the\s+contents\s+of\s+this\s+directory", lower),
        ),
        (
            "codex_first_run_auth",
            lambda: "press enter to log in" in lower or "press enter to login" in lower,
        ),
        ("codex_compaction_warning", lambda: "capability may degrade" in lower),
        ("generic_press_enter", lambda: re.search(r"press\s+(enter|return)\s+to\s+continue", lower)),
        ("generic_press_enter", lambda: re.search(r"press\s+any\s+key", lower)),
        (
            "y_n_confirm",
            # Case-sensitive on purpose: matched against the original-case text.
            lambda: re.search(r"(\(y/n\)|\([yY]/n\)|\[y/N\]|\[Y/n\]|\[y/n\])", tail_text),
        ),
        (
            "numbered_menu",
            # A "1." option line immediately followed by a "2." option line.
            lambda: any(
                _starts_numbered_choice(first, "1") and _starts_numbered_choice(second, "2")
                for first, second in zip(nonempty, nonempty[1:])
            ),
        ),
    )
    for label, matches in blocking_checks:
        if matches():
            return None if stale_before_input else label

    # NOTE(review): any last line ending in "$" or "%" after a word, path or
    # space character matches here (e.g. a trailing percentage) - confirm
    # that this breadth is intended.
    if nonempty and re.search(r"(^|[\s~/.\w-])[$%]\s*$", nonempty[-1]):
        return "shell_prompt_cli_dead"
    return None
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def non_input_scrollback_window(capture_tail: str, limit: int = 15) -> str:
    """Return the cleaned, non-empty tail lines of a pane capture as one string.

    At most ``limit`` lines are kept; they are joined with newlines.
    """
    window_lines = _non_input_scrollback_lines(capture_tail, limit=limit)
    return "\n".join(window_lines)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _non_input_scrollback_lines(capture_tail: str, limit: int = 15) -> list[str]:
    """Return the last ``limit`` non-blank lines of a pane capture.

    Each line has ANSI CSI escape sequences removed and trailing whitespace
    stripped before blank lines are dropped. A non-positive ``limit`` yields
    an empty list; without that guard ``[-0:]`` would return every line.
    """
    if limit <= 0:
        return []
    cleaned = (_ANSI_ESCAPE_RE.sub("", line).rstrip() for line in capture_tail.splitlines())
    # Dropping every blank line also removes trailing blanks, so no separate
    # trailing-blank trim is needed.
    return [line for line in cleaned if line.strip()][-limit:]
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _starts_numbered_choice(line: str, number: str) -> bool:
|
|
72
|
+
return bool(re.match(rf"^\s*(?:[›❯>]\s*)?{number}\.\s+", line))
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _stale_non_input_before_ready_prompt(lines: list[str]) -> bool:
    """Report whether the latest blocking-prompt line is followed by an input-ready prompt.

    A line counts as blocking when it contains the directory-trust question,
    a "press enter/return to continue" or "press any key" request, or starts
    with menu option ``1.`` or ``2.``. Returns True only when at least one
    such line exists and some input-ready prompt line comes after the last one.
    """

    def blocks_input(line: str) -> bool:
        lowered = line.lower()
        return bool(
            "do you trust the contents of this directory" in lowered
            or re.search(r"press\s+(enter|return)\s+to\s+continue", lowered)
            or re.search(r"press\s+any\s+key", lowered)
            or _starts_numbered_choice(line, "1")
            or _starts_numbered_choice(line, "2")
        )

    last_blocker = max(
        (index for index, line in enumerate(lines) if blocks_input(line)),
        default=-1,
    )
    last_ready = max(
        (index for index, line in enumerate(lines) if _is_input_ready_prompt(line)),
        default=-1,
    )
    return last_blocker >= 0 and last_ready > last_blocker
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _is_input_ready_prompt(line: str) -> bool:
    """Report whether ``line`` looks like a CLI prompt that is accepting input.

    Menu option lines starting with ``1.`` or ``2.`` are never treated as
    ready. Otherwise the stripped line is ready when it starts with a prompt
    glyph (``›``, ``❯`` or ``>``) followed by whitespace and a non-space
    character, or when it ends with ``codex`` or ``claude`` (any case)
    followed by a prompt glyph.
    """
    if any(_starts_numbered_choice(line, digit) for digit in ("1", "2")):
        return False
    stripped = line.strip()
    glyph_then_text = re.match(r"^[›❯>]\s+\S", stripped) is not None
    if glyph_then_text:
        return True
    named_cli_prompt = re.search(r"\b(codex|claude)\s*[>›❯]\s*$", stripped, re.IGNORECASE)
    return named_cli_prompt is not None
|
|
100
|
+
|
|
101
|
+
|
|
15
102
|
def _enable_codex_fast_mode(session_name: str, window_name: str) -> dict[str, Any]:
|
|
16
103
|
target = f"{session_name}:{window_name}"
|
|
17
104
|
proc = run_cmd(["tmux", "send-keys", "-t", target, "/fast", "Enter"], timeout=10)
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from team_agent.events import EventLog
|
|
7
|
+
from team_agent.messaging.deps import _tmux_inject_text
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def retry_injection_after_trust_auto_answer(
    workspace: Path,
    state: dict[str, Any],
    event_log: EventLog,
    injection: dict[str, Any],
    target: str,
    text: str,
    submit_key: str,
    buffer_name: str,
    provider: str,
) -> dict[str, Any]:
    """Try to auto-answer a trust prompt, then retry the text injection once.

    The pane is taken from ``injection["pane_id"]``, falling back to
    ``target``. Outcomes:
      * the auto-answer did not report ``answered`` - ``injection`` is
        returned unchanged;
      * the prompt was answered but not dismissed within 3 seconds - a copy
        of ``injection`` is returned with ``error``, ``verification`` and
        ``stage`` overwritten to describe that;
      * otherwise - the result of a fresh ``_tmux_inject_text`` call.
    """
    # Imported lazily, as in the original module layout.
    from team_agent.messaging.delivery import _wait_for_trust_prompt_dismissal
    from team_agent.messaging.leader_panes import attempt_trust_auto_answer

    pane = injection.get("pane_id") or target
    answer = attempt_trust_auto_answer(
        workspace,
        pane,
        injection.get("pane_capture_tail") or "",
        event_log,
        state=state,
    )
    if not answer.get("answered"):
        return injection

    if _wait_for_trust_prompt_dismissal(pane, timeout=3.0):
        return _tmux_inject_text(
            target,
            text,
            submit_key,
            buffer_name,
            provider=provider,
        )

    return {
        **injection,
        "error": "trust_prompt_not_dismissed_after_answer",
        "verification": "trust_prompt_not_dismissed_after_answer",
        "stage": "trust_auto_answer_dismissal_wait",
    }
|
|
@@ -84,15 +84,72 @@ def restart(workspace: Path, allow_fresh: bool = False, team: str | None = None)
|
|
|
84
84
|
raise RuntimeError(_tmux_session_conflict_error(session_name))
|
|
85
85
|
runtime_cfg = _effective_runtime_config(spec.get("runtime", {}))
|
|
86
86
|
display_backend = spec.get("runtime", {}).get("display_backend", state.get("display_backend", "none"))
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
87
|
+
# Stage 7 S5 — Slice 6 lifecycle atomicity contract: compute restart_agents
|
|
88
|
+
# early so we can pre-validate resumability BEFORE any destructive teardown
|
|
89
|
+
# (ghostty close, tmux session creation). Without --allow-fresh, every
|
|
90
|
+
# non-paused worker MUST be resumable; if any is not, refuse the operation
|
|
91
|
+
# atomically with a structured result and a restart.atomic_refusal event.
|
|
92
|
+
# No rollback path is needed because nothing has been created yet.
|
|
91
93
|
restart_agents = [
|
|
92
94
|
agent
|
|
93
95
|
for agent in spec.get("agents", [])
|
|
94
96
|
if state.get("agents", {}).get(agent["id"], {}).get("status") != "paused" and not agent.get("paused")
|
|
95
97
|
]
|
|
98
|
+
# cr strict-typing (2026-05-27): refuse the operation deterministically
|
|
99
|
+
# before any decision logic if any persisted first_send_at is corrupt
|
|
100
|
+
# (empty string, 0, False, literal "null", any non-ISO garbage). This
|
|
101
|
+
# avoids silent misclassification through Python truthiness and gives the
|
|
102
|
+
# operator a clear audit signal that state.json is damaged.
|
|
103
|
+
invalid_first_send_at = _collect_corrupt_first_send_at(restart_agents, state)
|
|
104
|
+
if invalid_first_send_at:
|
|
105
|
+
for entry in invalid_first_send_at:
|
|
106
|
+
event_log.write(
|
|
107
|
+
"restart.first_send_at_invalid",
|
|
108
|
+
worker_id=entry["worker_id"],
|
|
109
|
+
raw_first_send_at=entry["raw_first_send_at"],
|
|
110
|
+
raw_first_send_at_type=entry["raw_first_send_at_type"],
|
|
111
|
+
)
|
|
112
|
+
invalid_names = [entry["worker_id"] for entry in invalid_first_send_at]
|
|
113
|
+
return {
|
|
114
|
+
"ok": False,
|
|
115
|
+
"status": "refused",
|
|
116
|
+
"reason": "invalid_first_send_at",
|
|
117
|
+
"invalid_first_send_at": invalid_first_send_at,
|
|
118
|
+
"allow_fresh": bool(allow_fresh),
|
|
119
|
+
"error": (
|
|
120
|
+
f"Cannot restart: workers {invalid_names} have a corrupt "
|
|
121
|
+
"first_send_at in state.json (only null/missing or a valid "
|
|
122
|
+
"ISO-8601 UTC timestamp string is accepted). Inspect the "
|
|
123
|
+
"restart.first_send_at_invalid audit events for raw values "
|
|
124
|
+
"and repair state.json before retrying."
|
|
125
|
+
),
|
|
126
|
+
}
|
|
127
|
+
# cr C2: emit one restart.resume_decision event per non-paused worker so
|
|
128
|
+
# every restart attempt produces an auditable per-worker classification.
|
|
129
|
+
# The function returns only refused workers — populated when
|
|
130
|
+
# allow_fresh=False AND at least one interacted worker cannot be repaired.
|
|
131
|
+
refused = _emit_resume_decisions(
|
|
132
|
+
workspace, restart_agents, state, get_adapter, event_log, allow_fresh,
|
|
133
|
+
)
|
|
134
|
+
if refused:
|
|
135
|
+
event_log.write(
|
|
136
|
+
"restart.atomic_refusal",
|
|
137
|
+
unresumable=refused,
|
|
138
|
+
allow_fresh=bool(allow_fresh),
|
|
139
|
+
reason="resume_atomicity",
|
|
140
|
+
)
|
|
141
|
+
return {
|
|
142
|
+
"ok": False,
|
|
143
|
+
"status": "refused",
|
|
144
|
+
"reason": "resume_atomicity",
|
|
145
|
+
"unresumable": refused,
|
|
146
|
+
"allow_fresh": bool(allow_fresh),
|
|
147
|
+
"error": _format_atomic_refusal_error(refused),
|
|
148
|
+
}
|
|
149
|
+
_close_ghostty_workspace(state, event_log)
|
|
150
|
+
for agent_id, agent_state in state.get("agents", {}).items():
|
|
151
|
+
_close_ghostty_display(agent_id, agent_state, event_log)
|
|
152
|
+
state["display_backend"] = display_backend
|
|
96
153
|
_ensure_agent_start_requirements(workspace, restart_agents, event_log, "restart")
|
|
97
154
|
first = True
|
|
98
155
|
restarted: list[dict[str, Any]] = []
|
|
@@ -271,6 +328,7 @@ def restart(workspace: Path, allow_fresh: bool = False, team: str | None = None)
|
|
|
271
328
|
event_log,
|
|
272
329
|
timeout_s=1.5,
|
|
273
330
|
exclude_session_ids=known_session_ids,
|
|
331
|
+
raise_on_missed=False,
|
|
274
332
|
)
|
|
275
333
|
if display_backend in GHOSTTY_DISPLAY_BACKENDS:
|
|
276
334
|
display_jobs.append((agent["id"], agent))
|
|
@@ -315,6 +373,151 @@ def restart(workspace: Path, allow_fresh: bool = False, team: str | None = None)
|
|
|
315
373
|
return {"ok": True, "session_name": session_name, "agents": restarted, "coordinator": coordinator}
|
|
316
374
|
|
|
317
375
|
|
|
376
|
+
_FIRST_SEND_AT_ABSENT = "absent"
|
|
377
|
+
_FIRST_SEND_AT_VALID = "valid"
|
|
378
|
+
_FIRST_SEND_AT_CORRUPT = "corrupt"
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
def _classify_first_send_at(value: Any) -> str:
|
|
382
|
+
"""Strict first_send_at typing (cr verdict, 2026-05-27).
|
|
383
|
+
|
|
384
|
+
Returns one of:
|
|
385
|
+
"absent" — None or missing field (worker never-interacted).
|
|
386
|
+
"valid" — non-empty ISO-8601 UTC string parseable by datetime.fromisoformat.
|
|
387
|
+
"corrupt" — anything else: empty string, 0, False, literal "null", garbage.
|
|
388
|
+
|
|
389
|
+
The contract requires that corrupt values be detected deterministically
|
|
390
|
+
before any restart decision so we never silent-misclassify a worker's
|
|
391
|
+
interaction state via Python truthiness.
|
|
392
|
+
"""
|
|
393
|
+
if value is None:
|
|
394
|
+
return _FIRST_SEND_AT_ABSENT
|
|
395
|
+
if not isinstance(value, str):
|
|
396
|
+
return _FIRST_SEND_AT_CORRUPT
|
|
397
|
+
if not value:
|
|
398
|
+
return _FIRST_SEND_AT_CORRUPT
|
|
399
|
+
try:
|
|
400
|
+
datetime.fromisoformat(value)
|
|
401
|
+
except (ValueError, TypeError):
|
|
402
|
+
return _FIRST_SEND_AT_CORRUPT
|
|
403
|
+
return _FIRST_SEND_AT_VALID
|
|
404
|
+
|
|
405
|
+
|
|
406
|
+
def _collect_corrupt_first_send_at(
    restart_agents: list[dict[str, Any]],
    state: dict[str, Any],
) -> list[dict[str, Any]]:
    """List the workers in ``restart_agents`` whose persisted ``first_send_at`` is corrupt.

    Each record holds ``worker_id``, the raw value, and the raw value's type
    name. A worker whose persisted state is not a dict is treated as having
    no ``first_send_at``.
    """
    invalid: list[dict[str, Any]] = []
    for agent in restart_agents:
        worker_id = agent["id"]
        persisted = state.get("agents", {}).get(worker_id, {})
        raw_value = persisted.get("first_send_at") if isinstance(persisted, dict) else None
        if _classify_first_send_at(raw_value) == _FIRST_SEND_AT_CORRUPT:
            invalid.append(
                {
                    "worker_id": worker_id,
                    "raw_first_send_at": raw_value,
                    "raw_first_send_at_type": type(raw_value).__name__,
                }
            )
    return invalid
|
|
426
|
+
|
|
427
|
+
|
|
428
|
+
def _emit_resume_decisions(
    workspace: Path,
    restart_agents: list[dict[str, Any]],
    state: dict[str, Any],
    get_adapter_fn: Any,
    event_log: EventLog,
    allow_fresh: bool,
) -> list[dict[str, Any]]:
    """Write one ``restart.resume_decision`` event per worker and return the refused ones.

    Decision matrix:

        resumable                                     -> "resume"
        not resumable, no valid first_send_at         -> "fresh_start"
        not resumable, interacted, allow_fresh        -> "fresh_start"
        not resumable, interacted, not allow_fresh    -> "refuse"

    A worker is resumable when it has a session id the adapter reports as
    resumable, or when a session can be recovered - first from events, then
    through the adapter - while excluding session ids held by other agents.

    Returns a list of ``{"agent_id", "reason", "session_id", "first_send_at"}``
    dicts for every worker whose decision was "refuse".
    """
    from team_agent.sessions.resume import recover_resume_session_from_events

    refused: list[dict[str, Any]] = []
    for agent in restart_agents:
        agent_id = agent["id"]
        previous = state.get("agents", {}).get(agent_id, {})
        session_id = previous.get("session_id")
        first_send_at = previous.get("first_send_at")
        has_first_send_at = _classify_first_send_at(first_send_at) == _FIRST_SEND_AT_VALID
        has_session_id = bool(session_id)
        adapter = get_adapter_fn(agent["provider"])

        resumable = has_session_id and adapter.session_is_resumable(previous, workspace)
        if not resumable:
            # Session ids owned by other agents must not be picked up by recovery.
            known_session_ids = {
                str(other.get("session_id"))
                for other_id, other in state.get("agents", {}).items()
                if other_id != agent_id and other.get("session_id")
            }
            repaired = recover_resume_session_from_events(
                workspace, agent_id, previous, adapter, known_session_ids,
            ) or adapter.recover_session_id(
                agent_id, previous, workspace, known_session_ids,
            )
            resumable = bool(repaired)

        if resumable:
            decision = "resume"
        elif has_first_send_at and not allow_fresh:
            decision = "refuse"
        else:
            decision = "fresh_start"

        event_log.write(
            "restart.resume_decision",
            worker_id=agent_id,
            has_first_send_at=has_first_send_at,
            has_session_id=has_session_id,
            allow_fresh=bool(allow_fresh),
            decision=decision,
            first_send_at=first_send_at if has_first_send_at else None,
            session_id=session_id,
        )
        if decision != "refuse":
            continue
        refused.append(
            {
                "agent_id": agent_id,
                "reason": "no_persisted_session_id" if not session_id else "session_unresumable",
                "session_id": session_id,
                "first_send_at": first_send_at,
            }
        )
    return refused
|
|
501
|
+
|
|
502
|
+
|
|
503
|
+
def _format_atomic_refusal_error(refused: list[dict[str, Any]]) -> str:
|
|
504
|
+
"""C4 (cr verdict, 2026-05-27): the human-readable refusal error must
|
|
505
|
+
name every refused worker AND its first_send_at timestamp so an operator
|
|
506
|
+
can decide whether to pass --allow-fresh and accept losing that
|
|
507
|
+
interaction history."""
|
|
508
|
+
names = [item["agent_id"] for item in refused]
|
|
509
|
+
details = ". ".join(
|
|
510
|
+
f"{item['agent_id']} was first interacted with at {item.get('first_send_at')}; "
|
|
511
|
+
"its persisted session is missing"
|
|
512
|
+
for item in refused
|
|
513
|
+
)
|
|
514
|
+
return (
|
|
515
|
+
f"Cannot restart: workers {names} have no resumable session despite "
|
|
516
|
+
f"previous interaction. {details}. "
|
|
517
|
+
"Pass --allow-fresh if you accept losing that interaction history."
|
|
518
|
+
)
|
|
519
|
+
|
|
520
|
+
|
|
318
521
|
def rollback_restart_session(session_name: str, event_log: EventLog) -> dict[str, Any]:
|
|
319
522
|
from team_agent.runtime import run_cmd
|
|
320
523
|
proc = run_cmd(["tmux", "kill-session", "-t", session_name], timeout=10)
|
|
@@ -67,6 +67,8 @@ from team_agent.display import (
|
|
|
67
67
|
from team_agent.leader import (
|
|
68
68
|
attach_leader,
|
|
69
69
|
attach_leader_to_state as _attach_leader_to_state,
|
|
70
|
+
claim_leader,
|
|
71
|
+
leader_identity,
|
|
70
72
|
leader_session_name as _leader_session_name,
|
|
71
73
|
leader_start_plan,
|
|
72
74
|
start_leader,
|
|
@@ -438,12 +440,10 @@ for _name in (
|
|
|
438
440
|
assert hasattr(_launch_pkg, _name), f"team_agent.launch missing {_name}"
|
|
439
441
|
del _launch_pkg, _name
|
|
440
442
|
|
|
441
|
-
# Leader lane re-exports keep runtime
|
|
442
|
-
# runtime.leader_start_plan, runtime._attach_leader_to_state,
|
|
443
|
-
# runtime._leader_session_name resolving for CLI handlers and tests.
|
|
443
|
+
# Leader lane re-exports keep runtime leader helpers resolving for CLI handlers and tests.
|
|
444
444
|
import team_agent.leader as _leader_pkg
|
|
445
445
|
assert attach_leader is _leader_pkg.attach_leader
|
|
446
|
-
for _name in ("attach_leader", "attach_leader_to_state", "leader_session_name", "leader_start_plan", "start_leader"):
|
|
446
|
+
for _name in ("attach_leader", "attach_leader_to_state", "claim_leader", "leader_identity", "leader_session_name", "leader_start_plan", "start_leader"):
|
|
447
447
|
assert hasattr(_leader_pkg, _name), f"team_agent.leader missing {_name}"
|
|
448
448
|
del _leader_pkg, _name
|
|
449
449
|
from team_agent.task_graph import ready_tasks, update_task_status
|
|
@@ -674,7 +674,7 @@ def _handle_startup_prompts_and_verify_window(
|
|
|
674
674
|
session_name: str,
|
|
675
675
|
start_mode: str,
|
|
676
676
|
) -> bool:
|
|
677
|
-
handled_prompts = adapter.handle_startup_prompts(session_name, agent_id, checks=
|
|
677
|
+
handled_prompts = adapter.handle_startup_prompts(session_name, agent_id, checks=20, sleep_s=0.5)
|
|
678
678
|
for prompt_event in handled_prompts:
|
|
679
679
|
event_log.write(f"{event_prefix}.startup_prompt_handled", agent_id=agent_id, provider=provider, **prompt_event)
|
|
680
680
|
deadline = time.monotonic() + 1.0
|
|
@@ -840,10 +840,10 @@ def _retry_or_failed(task: dict[str, Any]) -> str:
|
|
|
840
840
|
return "failed"
|
|
841
841
|
|
|
842
842
|
|
|
843
|
-
def _deliver_pending_message(
    workspace: Path,
    state: dict[str, Any],
    message_id: str,
    wait_visible: bool = True,
    timeout: float = 30.0,
    *,
    _trust_retry_attempt: int = 1,
) -> dict[str, Any]:
    """Delegate to ``team_agent.messaging.delivery._deliver_pending_message``.

    All arguments are forwarded unchanged; ``_trust_retry_attempt`` is passed
    by keyword. The implementation is imported at call time.
    """
    from team_agent.messaging.delivery import _deliver_pending_message as delivery_impl

    outcome = delivery_impl(
        workspace,
        state,
        message_id,
        wait_visible,
        timeout,
        _trust_retry_attempt=_trust_retry_attempt,
    )
    return outcome
|
|
847
847
|
|
|
848
848
|
def _enable_codex_fast_mode(session_name: str, window_name: str) -> dict[str, Any]:
|
|
849
849
|
from team_agent.messaging.tmux_prompt import _enable_codex_fast_mode as impl
|