@team-agent/installer 0.2.2 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/schemas/team.schema.json +6 -0
- package/src/team_agent/abnormal_track.py +253 -0
- package/src/team_agent/approvals/runtime_prompts.py +1 -1
- package/src/team_agent/cli/commands.py +104 -3
- package/src/team_agent/cli/parser.py +10 -1
- package/src/team_agent/compiler.py +1 -1
- package/src/team_agent/coordinator/lifecycle.py +23 -2
- package/src/team_agent/diagnose/orphan_cleanup.py +199 -28
- package/src/team_agent/display/__init__.py +31 -0
- package/src/team_agent/display/adaptive.py +425 -0
- package/src/team_agent/display/backend.py +46 -0
- package/src/team_agent/display/close.py +6 -0
- package/src/team_agent/display/rebuild.py +102 -0
- package/src/team_agent/display/tiling.py +156 -0
- package/src/team_agent/display/worker_window.py +4 -0
- package/src/team_agent/display/workspace.py +36 -127
- package/src/team_agent/idle_predicate.py +200 -0
- package/src/team_agent/idle_takeover.py +59 -0
- package/src/team_agent/idle_takeover_wiring.py +111 -0
- package/src/team_agent/launch/core.py +14 -4
- package/src/team_agent/leader/__init__.py +444 -61
- package/src/team_agent/lifecycle/operations.py +1 -0
- package/src/team_agent/lifecycle/start.py +1 -1
- package/src/team_agent/message_store/core.py +38 -11
- package/src/team_agent/message_store/leader_notification_log.py +47 -26
- package/src/team_agent/message_store/schema.py +8 -2
- package/src/team_agent/messaging/delivery.py +336 -1
- package/src/team_agent/messaging/leader.py +13 -4
- package/src/team_agent/messaging/leader_api_errors.py +216 -0
- package/src/team_agent/messaging/leader_panes.py +294 -0
- package/src/team_agent/messaging/scheduler.py +12 -0
- package/src/team_agent/messaging/send.py +54 -26
- package/src/team_agent/messaging/tmux_io.py +202 -33
- package/src/team_agent/messaging/tmux_prompt.py +87 -0
- package/src/team_agent/messaging/trust_auto_answer.py +52 -0
- package/src/team_agent/provider_state/README.md +78 -0
- package/src/team_agent/provider_state/__init__.py +86 -0
- package/src/team_agent/provider_state/claude.py +86 -0
- package/src/team_agent/provider_state/codex.py +84 -0
- package/src/team_agent/provider_state/common.py +207 -0
- package/src/team_agent/provider_state/registry.py +118 -0
- package/src/team_agent/restart/orchestration.py +215 -12
- package/src/team_agent/runtime.py +65 -15
- package/src/team_agent/sessions/capture.py +65 -15
- package/src/team_agent/spec.py +63 -3
- package/src/team_agent/status/queries.py +32 -1
- package/src/team_agent/wake.py +58 -0
- package/src/team_agent/watch/__init__.py +145 -0
|
@@ -8,18 +8,19 @@ from typing import Any
|
|
|
8
8
|
from team_agent.events import EventLog
|
|
9
9
|
from team_agent.message_store import MessageStore
|
|
10
10
|
from team_agent.permissions import resolve_permissions
|
|
11
|
+
from team_agent.display.backend import display_backend_has_worker_views, display_backend_opens_before_leader_rebind, resolve_restart_display_backend
|
|
12
|
+
from team_agent.display.close import close_team_display_backends
|
|
13
|
+
from team_agent.display.rebuild import rebuild_restart_display_after_rebind
|
|
11
14
|
from team_agent.restart.selection import select_restart_state
|
|
12
15
|
from team_agent.restart.snapshot import save_team_runtime_snapshot
|
|
13
16
|
from team_agent.spec import load_spec
|
|
14
17
|
from team_agent.state import (
|
|
15
18
|
check_team_owner,
|
|
16
|
-
load_runtime_state,
|
|
17
19
|
populate_team_owner_from_env,
|
|
18
20
|
save_runtime_state,
|
|
19
21
|
write_team_state,
|
|
20
22
|
)
|
|
21
23
|
|
|
22
|
-
|
|
23
24
|
def restart(workspace: Path, allow_fresh: bool = False, team: str | None = None) -> dict[str, Any]:
|
|
24
25
|
# Lazy-import everything from team_agent.runtime so existing tests that
|
|
25
26
|
# patch runtime.shell_resume_command_for_agent / runtime.run_cmd /
|
|
@@ -27,7 +28,6 @@ def restart(workspace: Path, allow_fresh: bool = False, team: str | None = None)
|
|
|
27
28
|
# at call time. Runtime re-exports the provider helpers, so this also
|
|
28
29
|
# routes through the providers module without binding it directly.
|
|
29
30
|
from team_agent.runtime import (
|
|
30
|
-
GHOSTTY_DISPLAY_BACKENDS,
|
|
31
31
|
ResumeUnavailable,
|
|
32
32
|
RuntimeError,
|
|
33
33
|
_attach_profile_resume_root,
|
|
@@ -35,7 +35,6 @@ def restart(workspace: Path, allow_fresh: bool = False, team: str | None = None)
|
|
|
35
35
|
_capture_agent_session,
|
|
36
36
|
_clear_session_capture_fields,
|
|
37
37
|
_close_ghostty_display,
|
|
38
|
-
_close_ghostty_workspace,
|
|
39
38
|
_compile_team_dir_spec,
|
|
40
39
|
_effective_runtime_config,
|
|
41
40
|
_ensure_agent_start_requirements,
|
|
@@ -83,16 +82,73 @@ def restart(workspace: Path, allow_fresh: bool = False, team: str | None = None)
|
|
|
83
82
|
)
|
|
84
83
|
raise RuntimeError(_tmux_session_conflict_error(session_name))
|
|
85
84
|
runtime_cfg = _effective_runtime_config(spec.get("runtime", {}))
|
|
86
|
-
display_backend = spec
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
85
|
+
display_backend = resolve_restart_display_backend(spec, state, event_log)
|
|
86
|
+
# Stage 7 S5 — Slice 6 lifecycle atomicity contract: compute restart_agents
|
|
87
|
+
# early so we can pre-validate resumability BEFORE any destructive teardown
|
|
88
|
+
# (ghostty close, tmux session creation). Without --allow-fresh, every
|
|
89
|
+
# non-paused worker MUST be resumable; if any is not, refuse the operation
|
|
90
|
+
# atomically with a structured result and a restart.atomic_refusal event.
|
|
91
|
+
# No rollback path is needed because nothing has been created yet.
|
|
91
92
|
restart_agents = [
|
|
92
93
|
agent
|
|
93
94
|
for agent in spec.get("agents", [])
|
|
94
95
|
if state.get("agents", {}).get(agent["id"], {}).get("status") != "paused" and not agent.get("paused")
|
|
95
96
|
]
|
|
97
|
+
# cr strict-typing (2026-05-27): refuse the operation deterministically
|
|
98
|
+
# before any decision logic if any persisted first_send_at is corrupt
|
|
99
|
+
# (empty string, 0, False, literal "null", any non-ISO garbage). This
|
|
100
|
+
# avoids silent misclassification through Python truthiness and gives the
|
|
101
|
+
# operator a clear audit signal that state.json is damaged.
|
|
102
|
+
invalid_first_send_at = _collect_corrupt_first_send_at(restart_agents, state)
|
|
103
|
+
if invalid_first_send_at:
|
|
104
|
+
for entry in invalid_first_send_at:
|
|
105
|
+
event_log.write(
|
|
106
|
+
"restart.first_send_at_invalid",
|
|
107
|
+
worker_id=entry["worker_id"],
|
|
108
|
+
raw_first_send_at=entry["raw_first_send_at"],
|
|
109
|
+
raw_first_send_at_type=entry["raw_first_send_at_type"],
|
|
110
|
+
)
|
|
111
|
+
invalid_names = [entry["worker_id"] for entry in invalid_first_send_at]
|
|
112
|
+
return {
|
|
113
|
+
"ok": False,
|
|
114
|
+
"status": "refused",
|
|
115
|
+
"reason": "invalid_first_send_at",
|
|
116
|
+
"invalid_first_send_at": invalid_first_send_at,
|
|
117
|
+
"allow_fresh": bool(allow_fresh),
|
|
118
|
+
"error": (
|
|
119
|
+
f"Cannot restart: workers {invalid_names} have a corrupt "
|
|
120
|
+
"first_send_at in state.json (only null/missing or a valid "
|
|
121
|
+
"ISO-8601 UTC timestamp string is accepted). Inspect the "
|
|
122
|
+
"restart.first_send_at_invalid audit events for raw values "
|
|
123
|
+
"and repair state.json before retrying."
|
|
124
|
+
),
|
|
125
|
+
}
|
|
126
|
+
# cr C2: emit one restart.resume_decision event per non-paused worker so
|
|
127
|
+
# every restart attempt produces an auditable per-worker classification.
|
|
128
|
+
# The function returns only refused workers — populated when
|
|
129
|
+
# allow_fresh=False AND at least one interacted worker cannot be repaired.
|
|
130
|
+
refused = _emit_resume_decisions(
|
|
131
|
+
workspace, restart_agents, state, get_adapter, event_log, allow_fresh,
|
|
132
|
+
)
|
|
133
|
+
if refused:
|
|
134
|
+
event_log.write(
|
|
135
|
+
"restart.atomic_refusal",
|
|
136
|
+
unresumable=refused,
|
|
137
|
+
allow_fresh=bool(allow_fresh),
|
|
138
|
+
reason="resume_atomicity",
|
|
139
|
+
)
|
|
140
|
+
return {
|
|
141
|
+
"ok": False,
|
|
142
|
+
"status": "refused",
|
|
143
|
+
"reason": "resume_atomicity",
|
|
144
|
+
"unresumable": refused,
|
|
145
|
+
"allow_fresh": bool(allow_fresh),
|
|
146
|
+
"error": _format_atomic_refusal_error(refused),
|
|
147
|
+
}
|
|
148
|
+
close_team_display_backends(state, event_log)
|
|
149
|
+
for agent_id, agent_state in state.get("agents", {}).items():
|
|
150
|
+
_close_ghostty_display(agent_id, agent_state, event_log)
|
|
151
|
+
state["display_backend"] = display_backend
|
|
96
152
|
_ensure_agent_start_requirements(workspace, restart_agents, event_log, "restart")
|
|
97
153
|
first = True
|
|
98
154
|
restarted: list[dict[str, Any]] = []
|
|
@@ -271,8 +327,9 @@ def restart(workspace: Path, allow_fresh: bool = False, team: str | None = None)
|
|
|
271
327
|
event_log,
|
|
272
328
|
timeout_s=1.5,
|
|
273
329
|
exclude_session_ids=known_session_ids,
|
|
330
|
+
raise_on_missed=False,
|
|
274
331
|
)
|
|
275
|
-
if display_backend
|
|
332
|
+
if display_backend_has_worker_views(display_backend):
|
|
276
333
|
display_jobs.append((agent["id"], agent))
|
|
277
334
|
new_agents[agent["id"]] = agent_state
|
|
278
335
|
restarted.append(
|
|
@@ -283,7 +340,7 @@ def restart(workspace: Path, allow_fresh: bool = False, team: str | None = None)
|
|
|
283
340
|
"display_target": None,
|
|
284
341
|
}
|
|
285
342
|
)
|
|
286
|
-
display_results = _open_worker_displays(workspace, session_name, display_jobs, event_log, display_backend)
|
|
343
|
+
display_results = _open_worker_displays(workspace, session_name, display_jobs, event_log, display_backend) if display_backend_opens_before_leader_rebind(display_backend) else {}
|
|
287
344
|
for agent_id, display in display_results.items():
|
|
288
345
|
if agent_id in new_agents:
|
|
289
346
|
new_agents[agent_id]["display"] = display
|
|
@@ -309,12 +366,158 @@ def restart(workspace: Path, allow_fresh: bool = False, team: str | None = None)
|
|
|
309
366
|
write_team_state(workspace, spec, state)
|
|
310
367
|
from team_agent.leader import autobind_leader_receiver_from_env
|
|
311
368
|
leader_provider = str(spec.get("leader", {}).get("provider") or "codex")
|
|
312
|
-
autobind_leader_receiver_from_env(workspace, leader_provider, source="restart")
|
|
369
|
+
rebound_receiver = autobind_leader_receiver_from_env(workspace, leader_provider, source="restart")
|
|
370
|
+
rebuild_restart_display_after_rebind(display_backend, workspace, session_name, spec, event_log, restarted, receiver=rebound_receiver)
|
|
313
371
|
coordinator = start_coordinator(workspace)
|
|
314
372
|
event_log.write("restart.complete", session=session_name, agents=restarted, coordinator=coordinator)
|
|
315
373
|
return {"ok": True, "session_name": session_name, "agents": restarted, "coordinator": coordinator}
|
|
316
374
|
|
|
317
375
|
|
|
376
|
+
_FIRST_SEND_AT_ABSENT = "absent"
|
|
377
|
+
_FIRST_SEND_AT_VALID = "valid"
|
|
378
|
+
_FIRST_SEND_AT_CORRUPT = "corrupt"
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
def _classify_first_send_at(value: Any) -> str:
|
|
382
|
+
"""Strict first_send_at typing (cr verdict, 2026-05-27).
|
|
383
|
+
|
|
384
|
+
Returns one of:
|
|
385
|
+
"absent" — None or missing field (worker never-interacted).
|
|
386
|
+
"valid" — non-empty ISO-8601 UTC string parseable by datetime.fromisoformat.
|
|
387
|
+
"corrupt" — anything else: empty string, 0, False, literal "null", garbage.
|
|
388
|
+
|
|
389
|
+
The contract requires that corrupt values be detected deterministically
|
|
390
|
+
before any restart decision so we never silent-misclassify a worker's
|
|
391
|
+
interaction state via Python truthiness.
|
|
392
|
+
"""
|
|
393
|
+
if value is None:
|
|
394
|
+
return _FIRST_SEND_AT_ABSENT
|
|
395
|
+
if not isinstance(value, str):
|
|
396
|
+
return _FIRST_SEND_AT_CORRUPT
|
|
397
|
+
if not value:
|
|
398
|
+
return _FIRST_SEND_AT_CORRUPT
|
|
399
|
+
try:
|
|
400
|
+
datetime.fromisoformat(value)
|
|
401
|
+
except (ValueError, TypeError):
|
|
402
|
+
return _FIRST_SEND_AT_CORRUPT
|
|
403
|
+
return _FIRST_SEND_AT_VALID
|
|
404
|
+
|
|
405
|
+
|
|
406
|
+
def _collect_corrupt_first_send_at(
|
|
407
|
+
restart_agents: list[dict[str, Any]],
|
|
408
|
+
state: dict[str, Any],
|
|
409
|
+
) -> list[dict[str, Any]]:
|
|
410
|
+
"""Walk every non-paused worker and flag any whose persisted first_send_at
|
|
411
|
+
is corrupt. Returns the list of invalid records ready for the
|
|
412
|
+
`restart.first_send_at_invalid` event and the refusal envelope."""
|
|
413
|
+
invalid: list[dict[str, Any]] = []
|
|
414
|
+
for agent in restart_agents:
|
|
415
|
+
agent_id = agent["id"]
|
|
416
|
+
previous = state.get("agents", {}).get(agent_id, {})
|
|
417
|
+
raw = previous.get("first_send_at") if isinstance(previous, dict) else None
|
|
418
|
+
if _classify_first_send_at(raw) != _FIRST_SEND_AT_CORRUPT:
|
|
419
|
+
continue
|
|
420
|
+
invalid.append({
|
|
421
|
+
"worker_id": agent_id,
|
|
422
|
+
"raw_first_send_at": raw,
|
|
423
|
+
"raw_first_send_at_type": type(raw).__name__,
|
|
424
|
+
})
|
|
425
|
+
return invalid
|
|
426
|
+
|
|
427
|
+
|
|
428
|
+
def _emit_resume_decisions(
    workspace: Path,
    restart_agents: list[dict[str, Any]],
    state: dict[str, Any],
    get_adapter_fn: Any,
    event_log: EventLog,
    allow_fresh: bool,
) -> list[dict[str, Any]]:
    """Classify each restart candidate and log one resume decision per worker.

    Decision matrix, evaluated per worker:

        resumable                                   -> "resume"
        not resumable, no valid first_send_at       -> "fresh_start"
        not resumable, interacted, allow_fresh      -> "fresh_start"
        not resumable, interacted, not allow_fresh  -> "refuse"

    A worker is resumable when it has a persisted session_id the adapter
    reports as resumable, or when a session can be recovered: first via
    recover_resume_session_from_events, then via adapter.recover_session_id.
    The original author notes this is meant to mirror the repair chain in
    sessions.resume.prepare_resume_state, which is not visible here.

    Args:
        workspace: Workspace root handed to the adapter and recovery helpers.
        restart_agents: Agent specs to classify; each needs "id" and "provider".
        state: Runtime state; per-agent records are read from state["agents"].
        get_adapter_fn: Callable mapping a provider name to its adapter.
        event_log: Receives one "restart.resume_decision" event per worker.
        allow_fresh: Whether losing interaction history is acceptable.

    Returns:
        The refused workers, each with "agent_id", "reason", "session_id"
        and "first_send_at". Empty when no worker was refused.
    """
    from team_agent.sessions.resume import recover_resume_session_from_events

    refusals: list[dict[str, Any]] = []
    for worker in restart_agents:
        worker_id = worker["id"]
        persisted = state.get("agents", {}).get(worker_id, {})
        session_id = persisted.get("session_id")
        first_send_at = persisted.get("first_send_at")
        interacted = _classify_first_send_at(first_send_at) == _FIRST_SEND_AT_VALID
        session_known = bool(session_id)
        adapter = get_adapter_fn(worker["provider"])

        resumable = session_known and adapter.session_is_resumable(persisted, workspace)
        if not resumable:
            # Session ids owned by other workers are passed to both recovery
            # helpers alongside this worker's own record.
            other_sessions = set()
            for other_id, other_state in state.get("agents", {}).items():
                if other_id != worker_id and other_state.get("session_id"):
                    other_sessions.add(str(other_state.get("session_id")))
            recovered = recover_resume_session_from_events(
                workspace, worker_id, persisted, adapter, other_sessions,
            ) or adapter.recover_session_id(
                worker_id, persisted, workspace, other_sessions,
            )
            resumable = bool(recovered)

        if resumable:
            decision = "resume"
        elif not interacted or allow_fresh:
            decision = "fresh_start"
        else:
            decision = "refuse"

        event_log.write(
            "restart.resume_decision",
            worker_id=worker_id,
            has_first_send_at=interacted,
            has_session_id=session_known,
            allow_fresh=bool(allow_fresh),
            decision=decision,
            first_send_at=first_send_at if interacted else None,
            session_id=session_id,
        )

        if decision != "refuse":
            continue
        refusal_reason = "session_unresumable" if session_id else "no_persisted_session_id"
        refusals.append(
            {
                "agent_id": worker_id,
                "reason": refusal_reason,
                "session_id": session_id,
                "first_send_at": first_send_at,
            }
        )
    return refusals
|
|
501
|
+
|
|
502
|
+
|
|
503
|
+
def _format_atomic_refusal_error(refused: list[dict[str, Any]]) -> str:
|
|
504
|
+
"""C4 (cr verdict, 2026-05-27): the human-readable refusal error must
|
|
505
|
+
name every refused worker AND its first_send_at timestamp so an operator
|
|
506
|
+
can decide whether to pass --allow-fresh and accept losing that
|
|
507
|
+
interaction history."""
|
|
508
|
+
names = [item["agent_id"] for item in refused]
|
|
509
|
+
details = ". ".join(
|
|
510
|
+
f"{item['agent_id']} was first interacted with at {item.get('first_send_at')}; "
|
|
511
|
+
"its persisted session is missing"
|
|
512
|
+
for item in refused
|
|
513
|
+
)
|
|
514
|
+
return (
|
|
515
|
+
f"Cannot restart: workers {names} have no resumable session despite "
|
|
516
|
+
f"previous interaction. {details}. "
|
|
517
|
+
"Pass --allow-fresh if you accept losing that interaction history."
|
|
518
|
+
)
|
|
519
|
+
|
|
520
|
+
|
|
318
521
|
def rollback_restart_session(session_name: str, event_log: EventLog) -> dict[str, Any]:
|
|
319
522
|
from team_agent.runtime import run_cmd
|
|
320
523
|
proc = run_cmd(["tmux", "kill-session", "-t", session_name], timeout=10)
|
|
@@ -39,10 +39,12 @@ from team_agent.providers import (
|
|
|
39
39
|
shell_resume_command_for_agent,
|
|
40
40
|
)
|
|
41
41
|
from team_agent.display import (
|
|
42
|
+
GHOSTTY_DISPLAY_BACKENDS,
|
|
42
43
|
GHOSTTY_WORKSPACE_PANES_PER_WINDOW,
|
|
43
44
|
close_ghostty_display as _close_ghostty_display,
|
|
44
45
|
close_ghostty_workspace as _close_ghostty_workspace,
|
|
45
46
|
close_ghostty_workspace_slot as _close_ghostty_workspace_slot,
|
|
47
|
+
close_team_display_backends as _close_team_display_backends,
|
|
46
48
|
ghostty_app_exists as _ghostty_app_exists,
|
|
47
49
|
ghostty_attach_args as _ghostty_attach_args,
|
|
48
50
|
ghostty_command as _ghostty_command,
|
|
@@ -65,6 +67,7 @@ from team_agent.display import (
|
|
|
65
67
|
set_ghostty_workspace_pane_title as _set_ghostty_workspace_pane_title,
|
|
66
68
|
)
|
|
67
69
|
from team_agent.leader import (
|
|
70
|
+
LEADER_OWNERSHIP_LOCK,
|
|
68
71
|
attach_leader,
|
|
69
72
|
attach_leader_to_state as _attach_leader_to_state,
|
|
70
73
|
claim_leader,
|
|
@@ -456,7 +459,6 @@ TMUX_PANE_FORMAT = (
|
|
|
456
459
|
"#{pane_current_path}\t#{session_attached}\t#{pane_in_mode}"
|
|
457
460
|
)
|
|
458
461
|
HEALTH_STATUSES = {"RUNNING", "IDLE", "AWAITING_APPROVAL", "BLOCKED", "ERROR", "DONE"}
|
|
459
|
-
GHOSTTY_DISPLAY_BACKENDS = {"ghostty", "ghostty_window", "ghostty_workspace"}
|
|
460
462
|
DELIVERY_CAPTURE_LINES = 40
|
|
461
463
|
SUBMITTED_DELIVERY_STATUSES = {"injected", "visible", "submitted", "submitted_unverified", "delivered", "acknowledged"}
|
|
462
464
|
TMUX_STDIN_BUFFER_THRESHOLD = 16 * 1024
|
|
@@ -480,7 +482,6 @@ def ensure_workspace_dirs(workspace: Path) -> None:
|
|
|
480
482
|
path.mkdir(parents=True, exist_ok=True)
|
|
481
483
|
|
|
482
484
|
|
|
483
|
-
|
|
484
485
|
def shutdown(workspace: Path, keep_logs: bool = True, team: str | None = None) -> dict[str, Any]:
|
|
485
486
|
from team_agent.state import resolve_team_scoped_state
|
|
486
487
|
state, refusal = resolve_team_scoped_state(workspace, team)
|
|
@@ -521,7 +522,7 @@ def shutdown(workspace: Path, keep_logs: bool = True, team: str | None = None) -
|
|
|
521
522
|
if proc.returncode == 0:
|
|
522
523
|
log_path.write_text(proc.stdout, encoding="utf-8")
|
|
523
524
|
captured.append(str(log_path))
|
|
524
|
-
|
|
525
|
+
_close_team_display_backends(state, event_log)
|
|
525
526
|
for agent_id, agent_state in state.get("agents", {}).items():
|
|
526
527
|
_close_ghostty_display(agent_id, agent_state, event_log)
|
|
527
528
|
closed_displays.add(agent_id)
|
|
@@ -535,7 +536,7 @@ def shutdown(workspace: Path, keep_logs: bool = True, team: str | None = None) -
|
|
|
535
536
|
event_log.write("shutdown.kill_session", session=session_name, keep_logs=keep_logs, captured=captured)
|
|
536
537
|
else:
|
|
537
538
|
event_log.write("shutdown.idempotent", session=session_name, reason="session missing")
|
|
538
|
-
|
|
539
|
+
_close_team_display_backends(state, event_log)
|
|
539
540
|
for agent_id, agent_state in state.get("agents", {}).items():
|
|
540
541
|
if agent_id not in closed_displays:
|
|
541
542
|
_close_ghostty_display(agent_id, agent_state, event_log)
|
|
@@ -617,7 +618,7 @@ def takeover(workspace: Path, team: str | None = None, confirm: bool = False) ->
|
|
|
617
618
|
"reason": "no_caller_identity",
|
|
618
619
|
"action": "set TEAM_AGENT_LEADER_PANE_ID/PROVIDER/MACHINE_FINGERPRINT or run from a tmux pane",
|
|
619
620
|
}
|
|
620
|
-
with _runtime_lock(workspace,
|
|
621
|
+
with _runtime_lock(workspace, LEADER_OWNERSHIP_LOCK):
|
|
621
622
|
try:
|
|
622
623
|
team_state = select_runtime_state(workspace, team)
|
|
623
624
|
except RuntimeError as exc:
|
|
@@ -628,23 +629,72 @@ def takeover(workspace: Path, team: str | None = None, confirm: bool = False) ->
|
|
|
628
629
|
"team": team,
|
|
629
630
|
"error": str(exc),
|
|
630
631
|
}
|
|
631
|
-
previous_owner = team_state.get("team_owner")
|
|
632
|
+
previous_owner = team_state.get("team_owner") if isinstance(team_state.get("team_owner"), dict) else {}
|
|
633
|
+
previous_receiver = team_state.get("leader_receiver") if isinstance(team_state.get("leader_receiver"), dict) else {}
|
|
634
|
+
from team_agent.leader import _lease_epoch, _receiver_from_claim_target
|
|
635
|
+
next_epoch = _lease_epoch(previous_owner, previous_receiver) + 1
|
|
636
|
+
leader_uuid = str(previous_owner.get("leader_session_uuid") or "")
|
|
632
637
|
new_owner = {
|
|
633
638
|
"pane_id": pane_id,
|
|
634
639
|
"provider": os.environ.get("TEAM_AGENT_LEADER_PROVIDER", ""),
|
|
635
640
|
"machine_fingerprint": os.environ.get("TEAM_AGENT_MACHINE_FINGERPRINT", ""),
|
|
641
|
+
"owner_epoch": next_epoch,
|
|
636
642
|
"claimed_at": datetime.now(timezone.utc).isoformat(),
|
|
637
643
|
"claimed_via": "takeover",
|
|
638
644
|
}
|
|
645
|
+
if leader_uuid:
|
|
646
|
+
new_owner["leader_session_uuid"] = leader_uuid
|
|
639
647
|
team_state["team_owner"] = new_owner
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
648
|
+
# C11/C17: takeover converges on the same lease mutation as claim-leader.
|
|
649
|
+
# Rebind the leader receiver to the caller pane and write owner + receiver
|
|
650
|
+
# to both state locations together, so takeover never leaves the receiver
|
|
651
|
+
# pointing at the old (often dead) pane.
|
|
652
|
+
targets_result = core_list_targets()
|
|
653
|
+
targets = targets_result.get("targets", []) if isinstance(targets_result, dict) and targets_result.get("ok") else []
|
|
654
|
+
caller_target = next((item for item in targets if isinstance(item, dict) and str(item.get("pane_id")) == str(pane_id)), None)
|
|
655
|
+
new_receiver = None
|
|
656
|
+
if caller_target:
|
|
657
|
+
new_receiver = _receiver_from_claim_target(
|
|
658
|
+
caller_target,
|
|
659
|
+
previous_receiver,
|
|
660
|
+
leader_uuid or None,
|
|
661
|
+
next_epoch,
|
|
662
|
+
)
|
|
663
|
+
new_receiver["discovery"] = "takeover"
|
|
664
|
+
team_state["leader_receiver"] = new_receiver
|
|
665
|
+
from team_agent.leader import _write_lease_dual_state
|
|
666
|
+
_write_lease_dual_state(workspace, team_state)
|
|
667
|
+
# C11: takeover converges on the same lease audit events as claim-leader
|
|
668
|
+
# instead of a divergent legacy team_owner.takeover record.
|
|
669
|
+
event_log = EventLog(workspace)
|
|
670
|
+
uuid_prefix = leader_uuid[:8]
|
|
671
|
+
old_pane_id = previous_receiver.get("pane_id") or (previous_owner or {}).get("pane_id")
|
|
672
|
+
if new_receiver is not None:
|
|
673
|
+
event_log.write(
|
|
674
|
+
"leader_receiver.rebind_applied",
|
|
675
|
+
reason="takeover_confirmed",
|
|
676
|
+
old_pane_id=old_pane_id,
|
|
677
|
+
new_pane_id=pane_id,
|
|
678
|
+
owner_epoch=next_epoch,
|
|
679
|
+
uuid_prefix=uuid_prefix,
|
|
680
|
+
team_id=team,
|
|
681
|
+
)
|
|
682
|
+
event_log.write(
|
|
683
|
+
"owner_epoch_advanced",
|
|
684
|
+
reason="takeover_confirmed",
|
|
685
|
+
old_pane_id=old_pane_id,
|
|
686
|
+
new_pane_id=pane_id,
|
|
687
|
+
owner_epoch=next_epoch,
|
|
688
|
+
uuid_prefix=uuid_prefix,
|
|
689
|
+
team_id=team,
|
|
690
|
+
previous_owner=previous_owner or None,
|
|
645
691
|
new_owner=new_owner,
|
|
692
|
+
receiver_rebound=bool(new_receiver),
|
|
646
693
|
)
|
|
647
|
-
|
|
694
|
+
response = {"ok": True, "status": "claimed", "team": team, "team_owner": new_owner, "previous_owner": previous_owner or None, "owner_epoch": next_epoch}
|
|
695
|
+
if new_receiver is not None:
|
|
696
|
+
response["leader_receiver"] = new_receiver
|
|
697
|
+
return response
|
|
648
698
|
|
|
649
699
|
|
|
650
700
|
def _running_agent_state(workspace: Path, agent: dict[str, Any], previous: dict[str, Any]) -> dict[str, Any]:
|
|
@@ -674,7 +724,7 @@ def _handle_startup_prompts_and_verify_window(
|
|
|
674
724
|
session_name: str,
|
|
675
725
|
start_mode: str,
|
|
676
726
|
) -> bool:
|
|
677
|
-
handled_prompts = adapter.handle_startup_prompts(session_name, agent_id, checks=
|
|
727
|
+
handled_prompts = adapter.handle_startup_prompts(session_name, agent_id, checks=20, sleep_s=0.5)
|
|
678
728
|
for prompt_event in handled_prompts:
|
|
679
729
|
event_log.write(f"{event_prefix}.startup_prompt_handled", agent_id=agent_id, provider=provider, **prompt_event)
|
|
680
730
|
deadline = time.monotonic() + 1.0
|
|
@@ -840,10 +890,10 @@ def _retry_or_failed(task: dict[str, Any]) -> str:
|
|
|
840
890
|
return "failed"
|
|
841
891
|
|
|
842
892
|
|
|
843
|
-
def _deliver_pending_message(workspace: Path, state: dict[str, Any], message_id: str, wait_visible: bool = True, timeout: float = 30.0) -> dict[str, Any]:
|
|
893
|
+
def _deliver_pending_message(workspace: Path, state: dict[str, Any], message_id: str, wait_visible: bool = True, timeout: float = 30.0, *, _trust_retry_attempt: int = 1) -> dict[str, Any]:
|
|
844
894
|
from team_agent.messaging.delivery import _deliver_pending_message as impl
|
|
845
895
|
|
|
846
|
-
return impl(workspace, state, message_id, wait_visible, timeout)
|
|
896
|
+
return impl(workspace, state, message_id, wait_visible, timeout, _trust_retry_attempt=_trust_retry_attempt)
|
|
847
897
|
|
|
848
898
|
def _enable_codex_fast_mode(session_name: str, window_name: str) -> dict[str, Any]:
|
|
849
899
|
from team_agent.messaging.tmux_prompt import _enable_codex_fast_mode as impl
|
|
@@ -1,14 +1,25 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import time
|
|
3
4
|
from datetime import datetime, timezone
|
|
4
5
|
from pathlib import Path
|
|
5
6
|
from typing import Any
|
|
6
7
|
|
|
8
|
+
from team_agent.errors import RuntimeError as TeamAgentRuntimeError
|
|
7
9
|
from team_agent.events import EventLog
|
|
8
10
|
from team_agent.providers import get_adapter
|
|
9
11
|
from team_agent.state import SESSION_CAPTURE_FIELDS, SESSION_STATE_FIELDS
|
|
10
12
|
|
|
11
13
|
|
|
14
|
+
# Stage 7 S6 (2026-05-27): capture_agent_session used to do a single adapter
|
|
15
|
+
# call and silently return None on miss, leaving status='running' workers with
|
|
16
|
+
# session_id=null. Slow worker startups (Codex writing the rollout file a few
|
|
17
|
+
# tenths of a second after window creation) raced this check. We now poll on a
|
|
18
|
+
# small interval inside the caller's timeout_s budget so the adapter's own
|
|
19
|
+
# fast-path call doesn't have to absorb all the latency on its own.
|
|
20
|
+
_CAPTURE_POLL_INTERVAL_SECONDS = 0.05
|
|
21
|
+
|
|
22
|
+
|
|
12
23
|
def capture_missing_sessions(
|
|
13
24
|
workspace: Path,
|
|
14
25
|
state: dict[str, Any],
|
|
@@ -25,6 +36,10 @@ def capture_missing_sessions(
|
|
|
25
36
|
for aid, item in state.get("agents", {}).items()
|
|
26
37
|
if aid != agent_id and item.get("session_id")
|
|
27
38
|
}
|
|
39
|
+
# capture_missing_sessions is invoked from coordinator_tick, diagnose,
|
|
40
|
+
# status, etc. with very short timeouts; a transient miss should NOT
|
|
41
|
+
# crash those paths. The loud raise contract belongs to direct callers
|
|
42
|
+
# (e.g. lifecycle start/restart) who own the worker's atomicity.
|
|
28
43
|
result = capture_agent_session(
|
|
29
44
|
workspace,
|
|
30
45
|
agent_id,
|
|
@@ -32,6 +47,7 @@ def capture_missing_sessions(
|
|
|
32
47
|
event_log,
|
|
33
48
|
timeout_s=timeout_s,
|
|
34
49
|
exclude_session_ids=known_session_ids,
|
|
50
|
+
raise_on_missed=False,
|
|
35
51
|
)
|
|
36
52
|
if result:
|
|
37
53
|
captured.append(agent_id)
|
|
@@ -53,6 +69,7 @@ def capture_agent_session(
|
|
|
53
69
|
event_log: EventLog,
|
|
54
70
|
timeout_s: float,
|
|
55
71
|
exclude_session_ids: set[str] | None = None,
|
|
72
|
+
raise_on_missed: bool = True,
|
|
56
73
|
) -> dict[str, Any] | None:
|
|
57
74
|
if agent_state.get("session_id"):
|
|
58
75
|
return None
|
|
@@ -66,21 +83,54 @@ def capture_agent_session(
|
|
|
66
83
|
"exclude_session_ids": sorted(exclude_session_ids or set()),
|
|
67
84
|
"claude_projects_root": agent_state.get("claude_projects_root"),
|
|
68
85
|
}
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
86
|
+
deadline = time.monotonic() + max(timeout_s, 0.0)
|
|
87
|
+
while True:
|
|
88
|
+
# Pass timeout_s=0 so the adapter does a single fast-path check; the
|
|
89
|
+
# outer loop owns the polling budget so behaviour stays consistent
|
|
90
|
+
# whether or not the adapter has its own internal sleep.
|
|
91
|
+
result = adapter.capture_session_id(agent_id, spawn_context, timeout_s=0)
|
|
92
|
+
if isinstance(result, dict) and result.get("session_id"):
|
|
93
|
+
copy_session_metadata(agent_state, result)
|
|
94
|
+
agent_state.pop("_pending_session_id", None)
|
|
95
|
+
event_log.write(
|
|
96
|
+
"session.captured",
|
|
97
|
+
agent_id=agent_id,
|
|
98
|
+
provider=agent_state.get("provider"),
|
|
99
|
+
session_id=agent_state.get("session_id"),
|
|
100
|
+
rollout_path=agent_state.get("rollout_path"),
|
|
101
|
+
captured_via=agent_state.get("captured_via"),
|
|
102
|
+
attribution_confidence=agent_state.get("attribution_confidence"),
|
|
103
|
+
)
|
|
104
|
+
return result
|
|
105
|
+
if time.monotonic() >= deadline:
|
|
106
|
+
break
|
|
107
|
+
time.sleep(_CAPTURE_POLL_INTERVAL_SECONDS)
|
|
108
|
+
# Timeout. Slice 1 atomicity contract: a worker whose status is 'running'
|
|
109
|
+
# must NEVER be left with session_id=null — that half-state is what made
|
|
110
|
+
# Mac mini Stage 7 S5/S6 unreproducible and breaks resume on next restart.
|
|
111
|
+
# Emit a structured attention event so the coordinator/operator sees the
|
|
112
|
+
# miss, then raise so callers cannot accidentally treat the None as a
|
|
113
|
+
# silent "no-op". Non-running workers (still starting, paused, stopped)
|
|
114
|
+
# legitimately have no session yet, so they still get the silent-None
|
|
115
|
+
# return that existing callers expect.
|
|
116
|
+
if agent_state.get("status") == "running":
|
|
117
|
+
event_log.write(
|
|
118
|
+
"session.capture_required_attention",
|
|
119
|
+
agent_id=agent_id,
|
|
120
|
+
provider=agent_state.get("provider"),
|
|
121
|
+
timeout_s=timeout_s,
|
|
122
|
+
spawn_cwd=agent_state.get("spawn_cwd"),
|
|
123
|
+
session_name=agent_state.get("session_name"),
|
|
124
|
+
window=agent_state.get("window", agent_id),
|
|
125
|
+
)
|
|
126
|
+
if raise_on_missed:
|
|
127
|
+
raise TeamAgentRuntimeError(
|
|
128
|
+
f"Failed to capture session_id for agent {agent_id}: adapter "
|
|
129
|
+
f"did not produce a session within {timeout_s}s. Worker is "
|
|
130
|
+
"running but unidentifiable; this is a Slice 1 atomicity "
|
|
131
|
+
"violation."
|
|
132
|
+
)
|
|
133
|
+
return None
|
|
84
134
|
|
|
85
135
|
|
|
86
136
|
def copy_session_metadata(target: dict[str, Any], source: dict[str, Any]) -> None:
|