@team-agent/installer 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/schemas/team.schema.json +6 -0
- package/src/team_agent/approvals/runtime_prompts.py +1 -1
- package/src/team_agent/cli/commands.py +104 -3
- package/src/team_agent/cli/parser.py +10 -1
- package/src/team_agent/coordinator/lifecycle.py +3 -0
- package/src/team_agent/diagnose/orphan_cleanup.py +199 -28
- package/src/team_agent/launch/core.py +2 -1
- package/src/team_agent/lifecycle/operations.py +1 -0
- package/src/team_agent/lifecycle/start.py +1 -1
- package/src/team_agent/message_store/core.py +8 -7
- package/src/team_agent/message_store/schema.py +8 -2
- package/src/team_agent/messaging/delivery.py +293 -1
- package/src/team_agent/messaging/leader.py +13 -4
- package/src/team_agent/messaging/leader_api_errors.py +216 -0
- package/src/team_agent/messaging/leader_panes.py +200 -0
- package/src/team_agent/messaging/scheduler.py +12 -0
- package/src/team_agent/messaging/send.py +21 -26
- package/src/team_agent/messaging/tmux_io.py +153 -23
- package/src/team_agent/messaging/tmux_prompt.py +87 -0
- package/src/team_agent/messaging/trust_auto_answer.py +44 -0
- package/src/team_agent/restart/orchestration.py +207 -4
- package/src/team_agent/runtime.py +3 -3
- package/src/team_agent/sessions/capture.py +65 -15
- package/src/team_agent/spec.py +59 -0
- package/src/team_agent/status/queries.py +32 -1
- package/src/team_agent/watch/__init__.py +145 -0
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from team_agent.events import EventLog
|
|
7
|
+
from team_agent.messaging.deps import _tmux_inject_text
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def retry_injection_after_trust_auto_answer(
    workspace: Path,
    state: dict[str, Any],
    event_log: EventLog,
    injection: dict[str, Any],
    target: str,
    text: str,
    submit_key: str,
    buffer_name: str,
    provider: str,
) -> dict[str, Any]:
    """Try to auto-answer a trust prompt, then re-run the text injection.

    ``injection`` is the result dict of an earlier injection attempt. Its
    ``pane_id`` (falling back to ``target`` when absent or falsy) and its
    ``pane_capture_tail`` (falling back to ``""``) are handed to
    ``attempt_trust_auto_answer``.

    Returns:
        * ``injection`` itself, untouched, when the auto-answer helper does
          not report ``answered``.
        * A shallow copy of ``injection`` with ``error``, ``verification`` and
          ``stage`` overwritten when the prompt is not reported as dismissed
          by ``_wait_for_trust_prompt_dismissal`` (called with ``timeout=3.0``).
        * Otherwise, whatever ``_tmux_inject_text`` returns for the retry.
    """
    # Imported at call time, as in the rest of this module.
    from team_agent.messaging.delivery import _wait_for_trust_prompt_dismissal
    from team_agent.messaging.leader_panes import attempt_trust_auto_answer

    pane = injection.get("pane_id") or target
    outcome = attempt_trust_auto_answer(
        workspace,
        pane,
        injection.get("pane_capture_tail") or "",
        event_log,
        state=state,
    )
    if not outcome.get("answered"):
        # Nothing was answered, so the earlier result stands as-is.
        return injection

    if _wait_for_trust_prompt_dismissal(pane, timeout=3.0):
        # Prompt is gone: retry the injection against the original target.
        return _tmux_inject_text(target, text, submit_key, buffer_name, provider=provider)

    # The prompt was answered but is still showing; report that on a copy so
    # the caller's dict is not mutated.
    return {
        **injection,
        "error": "trust_prompt_not_dismissed_after_answer",
        "verification": "trust_prompt_not_dismissed_after_answer",
        "stage": "trust_auto_answer_dismissal_wait",
    }
|
|
@@ -84,15 +84,72 @@ def restart(workspace: Path, allow_fresh: bool = False, team: str | None = None)
|
|
|
84
84
|
raise RuntimeError(_tmux_session_conflict_error(session_name))
|
|
85
85
|
runtime_cfg = _effective_runtime_config(spec.get("runtime", {}))
|
|
86
86
|
display_backend = spec.get("runtime", {}).get("display_backend", state.get("display_backend", "none"))
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
87
|
+
# Stage 7 S5 — Slice 6 lifecycle atomicity contract: compute restart_agents
|
|
88
|
+
# early so we can pre-validate resumability BEFORE any destructive teardown
|
|
89
|
+
# (ghostty close, tmux session creation). Without --allow-fresh, every
|
|
90
|
+
# non-paused worker MUST be resumable; if any is not, refuse the operation
|
|
91
|
+
# atomically with a structured result and a restart.atomic_refusal event.
|
|
92
|
+
# No rollback path is needed because nothing has been created yet.
|
|
91
93
|
restart_agents = [
|
|
92
94
|
agent
|
|
93
95
|
for agent in spec.get("agents", [])
|
|
94
96
|
if state.get("agents", {}).get(agent["id"], {}).get("status") != "paused" and not agent.get("paused")
|
|
95
97
|
]
|
|
98
|
+
# cr strict-typing (2026-05-27): refuse the operation deterministically
|
|
99
|
+
# before any decision logic if any persisted first_send_at is corrupt
|
|
100
|
+
# (empty string, 0, False, literal "null", any non-ISO garbage). This
|
|
101
|
+
# avoids silent misclassification through Python truthiness and gives the
|
|
102
|
+
# operator a clear audit signal that state.json is damaged.
|
|
103
|
+
invalid_first_send_at = _collect_corrupt_first_send_at(restart_agents, state)
|
|
104
|
+
if invalid_first_send_at:
|
|
105
|
+
for entry in invalid_first_send_at:
|
|
106
|
+
event_log.write(
|
|
107
|
+
"restart.first_send_at_invalid",
|
|
108
|
+
worker_id=entry["worker_id"],
|
|
109
|
+
raw_first_send_at=entry["raw_first_send_at"],
|
|
110
|
+
raw_first_send_at_type=entry["raw_first_send_at_type"],
|
|
111
|
+
)
|
|
112
|
+
invalid_names = [entry["worker_id"] for entry in invalid_first_send_at]
|
|
113
|
+
return {
|
|
114
|
+
"ok": False,
|
|
115
|
+
"status": "refused",
|
|
116
|
+
"reason": "invalid_first_send_at",
|
|
117
|
+
"invalid_first_send_at": invalid_first_send_at,
|
|
118
|
+
"allow_fresh": bool(allow_fresh),
|
|
119
|
+
"error": (
|
|
120
|
+
f"Cannot restart: workers {invalid_names} have a corrupt "
|
|
121
|
+
"first_send_at in state.json (only null/missing or a valid "
|
|
122
|
+
"ISO-8601 UTC timestamp string is accepted). Inspect the "
|
|
123
|
+
"restart.first_send_at_invalid audit events for raw values "
|
|
124
|
+
"and repair state.json before retrying."
|
|
125
|
+
),
|
|
126
|
+
}
|
|
127
|
+
# cr C2: emit one restart.resume_decision event per non-paused worker so
|
|
128
|
+
# every restart attempt produces an auditable per-worker classification.
|
|
129
|
+
# The function returns only refused workers — populated when
|
|
130
|
+
# allow_fresh=False AND at least one interacted worker cannot be repaired.
|
|
131
|
+
refused = _emit_resume_decisions(
|
|
132
|
+
workspace, restart_agents, state, get_adapter, event_log, allow_fresh,
|
|
133
|
+
)
|
|
134
|
+
if refused:
|
|
135
|
+
event_log.write(
|
|
136
|
+
"restart.atomic_refusal",
|
|
137
|
+
unresumable=refused,
|
|
138
|
+
allow_fresh=bool(allow_fresh),
|
|
139
|
+
reason="resume_atomicity",
|
|
140
|
+
)
|
|
141
|
+
return {
|
|
142
|
+
"ok": False,
|
|
143
|
+
"status": "refused",
|
|
144
|
+
"reason": "resume_atomicity",
|
|
145
|
+
"unresumable": refused,
|
|
146
|
+
"allow_fresh": bool(allow_fresh),
|
|
147
|
+
"error": _format_atomic_refusal_error(refused),
|
|
148
|
+
}
|
|
149
|
+
_close_ghostty_workspace(state, event_log)
|
|
150
|
+
for agent_id, agent_state in state.get("agents", {}).items():
|
|
151
|
+
_close_ghostty_display(agent_id, agent_state, event_log)
|
|
152
|
+
state["display_backend"] = display_backend
|
|
96
153
|
_ensure_agent_start_requirements(workspace, restart_agents, event_log, "restart")
|
|
97
154
|
first = True
|
|
98
155
|
restarted: list[dict[str, Any]] = []
|
|
@@ -271,6 +328,7 @@ def restart(workspace: Path, allow_fresh: bool = False, team: str | None = None)
|
|
|
271
328
|
event_log,
|
|
272
329
|
timeout_s=1.5,
|
|
273
330
|
exclude_session_ids=known_session_ids,
|
|
331
|
+
raise_on_missed=False,
|
|
274
332
|
)
|
|
275
333
|
if display_backend in GHOSTTY_DISPLAY_BACKENDS:
|
|
276
334
|
display_jobs.append((agent["id"], agent))
|
|
@@ -315,6 +373,151 @@ def restart(workspace: Path, allow_fresh: bool = False, team: str | None = None)
|
|
|
315
373
|
return {"ok": True, "session_name": session_name, "agents": restarted, "coordinator": coordinator}
|
|
316
374
|
|
|
317
375
|
|
|
376
|
+
_FIRST_SEND_AT_ABSENT = "absent"
|
|
377
|
+
_FIRST_SEND_AT_VALID = "valid"
|
|
378
|
+
_FIRST_SEND_AT_CORRUPT = "corrupt"
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
def _classify_first_send_at(value: Any) -> str:
|
|
382
|
+
"""Strict first_send_at typing (cr verdict, 2026-05-27).
|
|
383
|
+
|
|
384
|
+
Returns one of:
|
|
385
|
+
"absent" — None or missing field (worker never-interacted).
|
|
386
|
+
"valid" — non-empty ISO-8601 UTC string parseable by datetime.fromisoformat.
|
|
387
|
+
"corrupt" — anything else: empty string, 0, False, literal "null", garbage.
|
|
388
|
+
|
|
389
|
+
The contract requires that corrupt values be detected deterministically
|
|
390
|
+
before any restart decision so we never silent-misclassify a worker's
|
|
391
|
+
interaction state via Python truthiness.
|
|
392
|
+
"""
|
|
393
|
+
if value is None:
|
|
394
|
+
return _FIRST_SEND_AT_ABSENT
|
|
395
|
+
if not isinstance(value, str):
|
|
396
|
+
return _FIRST_SEND_AT_CORRUPT
|
|
397
|
+
if not value:
|
|
398
|
+
return _FIRST_SEND_AT_CORRUPT
|
|
399
|
+
try:
|
|
400
|
+
datetime.fromisoformat(value)
|
|
401
|
+
except (ValueError, TypeError):
|
|
402
|
+
return _FIRST_SEND_AT_CORRUPT
|
|
403
|
+
return _FIRST_SEND_AT_VALID
|
|
404
|
+
|
|
405
|
+
|
|
406
|
+
def _collect_corrupt_first_send_at(
    restart_agents: list[dict[str, Any]],
    state: dict[str, Any],
) -> list[dict[str, Any]]:
    """Collect one record per worker whose persisted first_send_at is corrupt.

    For every agent in ``restart_agents`` the persisted entry is looked up in
    ``state["agents"]`` by the agent's ``id``. A missing entry, or an entry
    that is not a dict, is treated as having no first_send_at. Each returned
    record carries ``worker_id``, the raw value, and the raw value's type
    name, ready for the ``restart.first_send_at_invalid`` event and the
    refusal envelope.
    """
    corrupt_records: list[dict[str, Any]] = []
    for agent in restart_agents:
        worker_id = agent["id"]
        persisted = state.get("agents", {}).get(worker_id, {})
        raw_value = None
        if isinstance(persisted, dict):
            raw_value = persisted.get("first_send_at")
        if _classify_first_send_at(raw_value) == _FIRST_SEND_AT_CORRUPT:
            corrupt_records.append(
                {
                    "worker_id": worker_id,
                    "raw_first_send_at": raw_value,
                    "raw_first_send_at_type": type(raw_value).__name__,
                }
            )
    return corrupt_records
|
|
426
|
+
|
|
427
|
+
|
|
428
|
+
def _emit_resume_decisions(
    workspace: Path,
    restart_agents: list[dict[str, Any]],
    state: dict[str, Any],
    get_adapter_fn: Any,
    event_log: EventLog,
    allow_fresh: bool,
) -> list[dict[str, Any]]:
    """Route B audit-events contract (cr C2, 2026-05-27).

    Classifies every worker in ``restart_agents`` and writes exactly ONE
    ``restart.resume_decision`` event per worker:

        resumable                                   -> decision = "resume"
        not resumable AND not interacted            -> decision = "fresh_start"
        not resumable AND interacted AND fresh      -> decision = "fresh_start"
        not resumable AND interacted AND not fresh  -> decision = "refuse"

    "Interacted" means the persisted ``first_send_at`` classifies as valid.
    A worker is resumable when it has a persisted ``session_id`` that the
    provider adapter reports as resumable, or, failing that, when either
    ``recover_resume_session_from_events`` or the adapter's
    ``recover_session_id`` returns something truthy. Per the original
    author's note this repair chain is intended to mirror
    ``sessions.resume.prepare_resume_state``; that function is not visible
    here, so treat the parity as an assumption to re-verify when either side
    changes.

    Returns the refused workers only (``agent_id``, ``reason``,
    ``session_id``, ``first_send_at``), for use by the atomic-refusal path.
    """
    from team_agent.sessions.resume import recover_resume_session_from_events

    refusals: list[dict[str, Any]] = []
    for agent in restart_agents:
        worker_id = agent["id"]
        persisted = state.get("agents", {}).get(worker_id, {})
        session_id = persisted.get("session_id")
        first_send_at = persisted.get("first_send_at")
        interacted = _classify_first_send_at(first_send_at) == _FIRST_SEND_AT_VALID
        has_session_id = bool(session_id)
        adapter = get_adapter_fn(agent["provider"])

        # The adapter is only consulted when a session id was persisted.
        resumable = has_session_id and adapter.session_is_resumable(persisted, workspace)
        if not resumable:
            # Session ids owned by the other workers, passed to both recovery
            # helpers.
            other_session_ids = {
                str(other.get("session_id"))
                for other_id, other in state.get("agents", {}).items()
                if other_id != worker_id and other.get("session_id")
            }
            # Event-log recovery first; the adapter's own recovery runs only
            # when that comes back empty.
            recovered = recover_resume_session_from_events(
                workspace, worker_id, persisted, adapter, other_session_ids,
            ) or adapter.recover_session_id(
                worker_id, persisted, workspace, other_session_ids,
            )
            resumable = bool(recovered)

        if resumable:
            decision = "resume"
        elif interacted and not allow_fresh:
            decision = "refuse"
        else:
            decision = "fresh_start"

        event_log.write(
            "restart.resume_decision",
            worker_id=worker_id,
            has_first_send_at=interacted,
            has_session_id=has_session_id,
            allow_fresh=bool(allow_fresh),
            decision=decision,
            first_send_at=first_send_at if interacted else None,
            session_id=session_id,
        )

        if decision != "refuse":
            continue
        refusals.append(
            {
                "agent_id": worker_id,
                "reason": "session_unresumable" if session_id else "no_persisted_session_id",
                "session_id": session_id,
                "first_send_at": first_send_at,
            }
        )
    return refusals
|
|
501
|
+
|
|
502
|
+
|
|
503
|
+
def _format_atomic_refusal_error(refused: list[dict[str, Any]]) -> str:
|
|
504
|
+
"""C4 (cr verdict, 2026-05-27): the human-readable refusal error must
|
|
505
|
+
name every refused worker AND its first_send_at timestamp so an operator
|
|
506
|
+
can decide whether to pass --allow-fresh and accept losing that
|
|
507
|
+
interaction history."""
|
|
508
|
+
names = [item["agent_id"] for item in refused]
|
|
509
|
+
details = ". ".join(
|
|
510
|
+
f"{item['agent_id']} was first interacted with at {item.get('first_send_at')}; "
|
|
511
|
+
"its persisted session is missing"
|
|
512
|
+
for item in refused
|
|
513
|
+
)
|
|
514
|
+
return (
|
|
515
|
+
f"Cannot restart: workers {names} have no resumable session despite "
|
|
516
|
+
f"previous interaction. {details}. "
|
|
517
|
+
"Pass --allow-fresh if you accept losing that interaction history."
|
|
518
|
+
)
|
|
519
|
+
|
|
520
|
+
|
|
318
521
|
def rollback_restart_session(session_name: str, event_log: EventLog) -> dict[str, Any]:
|
|
319
522
|
from team_agent.runtime import run_cmd
|
|
320
523
|
proc = run_cmd(["tmux", "kill-session", "-t", session_name], timeout=10)
|
|
@@ -674,7 +674,7 @@ def _handle_startup_prompts_and_verify_window(
|
|
|
674
674
|
session_name: str,
|
|
675
675
|
start_mode: str,
|
|
676
676
|
) -> bool:
|
|
677
|
-
handled_prompts = adapter.handle_startup_prompts(session_name, agent_id, checks=
|
|
677
|
+
handled_prompts = adapter.handle_startup_prompts(session_name, agent_id, checks=20, sleep_s=0.5)
|
|
678
678
|
for prompt_event in handled_prompts:
|
|
679
679
|
event_log.write(f"{event_prefix}.startup_prompt_handled", agent_id=agent_id, provider=provider, **prompt_event)
|
|
680
680
|
deadline = time.monotonic() + 1.0
|
|
@@ -840,10 +840,10 @@ def _retry_or_failed(task: dict[str, Any]) -> str:
|
|
|
840
840
|
return "failed"
|
|
841
841
|
|
|
842
842
|
|
|
843
|
-
def _deliver_pending_message(workspace: Path, state: dict[str, Any], message_id: str, wait_visible: bool = True, timeout: float = 30.0) -> dict[str, Any]:
|
|
843
|
+
def _deliver_pending_message(
    workspace: Path,
    state: dict[str, Any],
    message_id: str,
    wait_visible: bool = True,
    timeout: float = 30.0,
    *,
    _trust_retry_attempt: int = 1,
) -> dict[str, Any]:
    """Forward to ``team_agent.messaging.delivery._deliver_pending_message``.

    All arguments are passed through unchanged, with ``_trust_retry_attempt``
    forwarded by keyword; the delegate's return value is returned as-is. The
    delegate is imported at call time rather than at module import.
    """
    from team_agent.messaging.delivery import _deliver_pending_message as _delivery_impl

    return _delivery_impl(
        workspace,
        state,
        message_id,
        wait_visible,
        timeout,
        _trust_retry_attempt=_trust_retry_attempt,
    )
|
|
847
847
|
|
|
848
848
|
def _enable_codex_fast_mode(session_name: str, window_name: str) -> dict[str, Any]:
|
|
849
849
|
from team_agent.messaging.tmux_prompt import _enable_codex_fast_mode as impl
|
|
@@ -1,14 +1,25 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import time
|
|
3
4
|
from datetime import datetime, timezone
|
|
4
5
|
from pathlib import Path
|
|
5
6
|
from typing import Any
|
|
6
7
|
|
|
8
|
+
from team_agent.errors import RuntimeError as TeamAgentRuntimeError
|
|
7
9
|
from team_agent.events import EventLog
|
|
8
10
|
from team_agent.providers import get_adapter
|
|
9
11
|
from team_agent.state import SESSION_CAPTURE_FIELDS, SESSION_STATE_FIELDS
|
|
10
12
|
|
|
11
13
|
|
|
14
|
+
# Stage 7 S6 (2026-05-27): capture_agent_session used to do a single adapter
|
|
15
|
+
# call and silently return None on miss, leaving status='running' workers with
|
|
16
|
+
# session_id=null. Slow worker startups (Codex writing the rollout file a few
|
|
17
|
+
# tenths of a second after window creation) raced this check. We now poll on a
|
|
18
|
+
# small interval inside the caller's timeout_s budget so the adapter's own
|
|
19
|
+
# fast-path call doesn't have to absorb all the latency on its own.
|
|
20
|
+
_CAPTURE_POLL_INTERVAL_SECONDS = 0.05
|
|
21
|
+
|
|
22
|
+
|
|
12
23
|
def capture_missing_sessions(
|
|
13
24
|
workspace: Path,
|
|
14
25
|
state: dict[str, Any],
|
|
@@ -25,6 +36,10 @@ def capture_missing_sessions(
|
|
|
25
36
|
for aid, item in state.get("agents", {}).items()
|
|
26
37
|
if aid != agent_id and item.get("session_id")
|
|
27
38
|
}
|
|
39
|
+
# capture_missing_sessions is invoked from coordinator_tick, diagnose,
|
|
40
|
+
# status, etc. with very short timeouts; a transient miss should NOT
|
|
41
|
+
# crash those paths. The loud raise contract belongs to direct callers
|
|
42
|
+
# (e.g. lifecycle start/restart) who own the worker's atomicity.
|
|
28
43
|
result = capture_agent_session(
|
|
29
44
|
workspace,
|
|
30
45
|
agent_id,
|
|
@@ -32,6 +47,7 @@ def capture_missing_sessions(
|
|
|
32
47
|
event_log,
|
|
33
48
|
timeout_s=timeout_s,
|
|
34
49
|
exclude_session_ids=known_session_ids,
|
|
50
|
+
raise_on_missed=False,
|
|
35
51
|
)
|
|
36
52
|
if result:
|
|
37
53
|
captured.append(agent_id)
|
|
@@ -53,6 +69,7 @@ def capture_agent_session(
|
|
|
53
69
|
event_log: EventLog,
|
|
54
70
|
timeout_s: float,
|
|
55
71
|
exclude_session_ids: set[str] | None = None,
|
|
72
|
+
raise_on_missed: bool = True,
|
|
56
73
|
) -> dict[str, Any] | None:
|
|
57
74
|
if agent_state.get("session_id"):
|
|
58
75
|
return None
|
|
@@ -66,21 +83,54 @@ def capture_agent_session(
|
|
|
66
83
|
"exclude_session_ids": sorted(exclude_session_ids or set()),
|
|
67
84
|
"claude_projects_root": agent_state.get("claude_projects_root"),
|
|
68
85
|
}
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
86
|
+
deadline = time.monotonic() + max(timeout_s, 0.0)
|
|
87
|
+
while True:
|
|
88
|
+
# Pass timeout_s=0 so the adapter does a single fast-path check; the
|
|
89
|
+
# outer loop owns the polling budget so behaviour stays consistent
|
|
90
|
+
# whether or not the adapter has its own internal sleep.
|
|
91
|
+
result = adapter.capture_session_id(agent_id, spawn_context, timeout_s=0)
|
|
92
|
+
if isinstance(result, dict) and result.get("session_id"):
|
|
93
|
+
copy_session_metadata(agent_state, result)
|
|
94
|
+
agent_state.pop("_pending_session_id", None)
|
|
95
|
+
event_log.write(
|
|
96
|
+
"session.captured",
|
|
97
|
+
agent_id=agent_id,
|
|
98
|
+
provider=agent_state.get("provider"),
|
|
99
|
+
session_id=agent_state.get("session_id"),
|
|
100
|
+
rollout_path=agent_state.get("rollout_path"),
|
|
101
|
+
captured_via=agent_state.get("captured_via"),
|
|
102
|
+
attribution_confidence=agent_state.get("attribution_confidence"),
|
|
103
|
+
)
|
|
104
|
+
return result
|
|
105
|
+
if time.monotonic() >= deadline:
|
|
106
|
+
break
|
|
107
|
+
time.sleep(_CAPTURE_POLL_INTERVAL_SECONDS)
|
|
108
|
+
# Timeout. Slice 1 atomicity contract: a worker whose status is 'running'
|
|
109
|
+
# must NEVER be left with session_id=null — that half-state is what made
|
|
110
|
+
# Mac mini Stage 7 S5/S6 unreproducible and breaks resume on next restart.
|
|
111
|
+
# Emit a structured attention event so the coordinator/operator sees the
|
|
112
|
+
# miss, then raise so callers cannot accidentally treat the None as a
|
|
113
|
+
# silent "no-op". Non-running workers (still starting, paused, stopped)
|
|
114
|
+
# legitimately have no session yet, so they still get the silent-None
|
|
115
|
+
# return that existing callers expect.
|
|
116
|
+
if agent_state.get("status") == "running":
|
|
117
|
+
event_log.write(
|
|
118
|
+
"session.capture_required_attention",
|
|
119
|
+
agent_id=agent_id,
|
|
120
|
+
provider=agent_state.get("provider"),
|
|
121
|
+
timeout_s=timeout_s,
|
|
122
|
+
spawn_cwd=agent_state.get("spawn_cwd"),
|
|
123
|
+
session_name=agent_state.get("session_name"),
|
|
124
|
+
window=agent_state.get("window", agent_id),
|
|
125
|
+
)
|
|
126
|
+
if raise_on_missed:
|
|
127
|
+
raise TeamAgentRuntimeError(
|
|
128
|
+
f"Failed to capture session_id for agent {agent_id}: adapter "
|
|
129
|
+
f"did not produce a session within {timeout_s}s. Worker is "
|
|
130
|
+
"running but unidentifiable; this is a Slice 1 atomicity "
|
|
131
|
+
"violation."
|
|
132
|
+
)
|
|
133
|
+
return None
|
|
84
134
|
|
|
85
135
|
|
|
86
136
|
def copy_session_metadata(target: dict[str, Any], source: dict[str, Any]) -> None:
|
package/src/team_agent/spec.py
CHANGED
|
@@ -27,9 +27,60 @@ def load_yaml(path: Path) -> dict[str, Any]:
|
|
|
27
27
|
def load_spec(path: Path) -> dict[str, Any]:
    """Read the YAML spec at ``path``, validate it, and return it.

    Validation uses the spec file's directory as ``base_dir``. Once
    validation has returned, load-time deprecation signals are emitted for
    the loaded spec before it is handed back to the caller.
    """
    loaded = load_yaml(path)
    validate_spec(loaded, base_dir=path.parent)
    # Deprecations fire at load time, not lazily in a later code path.
    _emit_load_time_deprecations(loaded, path)
    return loaded
|
|
31
32
|
|
|
32
33
|
|
|
34
|
+
def _emit_load_time_deprecations(spec: dict[str, Any], path: Path) -> None:
|
|
35
|
+
"""Stage 7 S7 (2026-05-27): deprecation signals attached to the spec field
|
|
36
|
+
itself must fire when the YAML is read, not lazily inside the trust-prompt
|
|
37
|
+
code path. A user with the deprecated field in team.spec.yaml needs to see
|
|
38
|
+
the warning even when startup never reaches attempt_trust_auto_answer.
|
|
39
|
+
|
|
40
|
+
The leader-panes helper owns the one-shot stderr guard + the structured
|
|
41
|
+
audit event, so we reuse it. EventLog points at the WORKSPACE ROOT (not
|
|
42
|
+
the spec file's directory) so a quick-start layout that stores the spec
|
|
43
|
+
under <workspace>/.team/current/team.spec.yaml still routes the audit
|
|
44
|
+
event into the single canonical <workspace>/.team/logs/events.jsonl
|
|
45
|
+
instead of a doubled <workspace>/.team/current/.team/logs/events.jsonl
|
|
46
|
+
nesting.
|
|
47
|
+
"""
|
|
48
|
+
runtime = spec.get("runtime")
|
|
49
|
+
if not isinstance(runtime, dict):
|
|
50
|
+
return
|
|
51
|
+
if not bool(runtime.get("auto_trust_own_workspace")):
|
|
52
|
+
return
|
|
53
|
+
# Local import keeps the spec module free of messaging-layer coupling at
|
|
54
|
+
# import time; only YAMLs that opt into the deprecated field pay the cost.
|
|
55
|
+
from team_agent.events import EventLog
|
|
56
|
+
from team_agent.messaging.leader_panes import _emit_spec_opt_in_deprecation
|
|
57
|
+
_emit_spec_opt_in_deprecation(EventLog(_resolve_workspace_root(path)))
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _resolve_workspace_root(spec_path: Path) -> Path:
|
|
61
|
+
"""Find the workspace root that owns this spec.
|
|
62
|
+
|
|
63
|
+
A workspace root is the directory whose `.team/` subdirectory holds the
|
|
64
|
+
runtime state, logs, artifacts, and (for quick-start layouts) the spec
|
|
65
|
+
itself under `.team/current/`. We climb from the spec file's parent
|
|
66
|
+
looking for the first ancestor that has a `.team/` child. If no ancestor
|
|
67
|
+
qualifies (fresh workspace before init, or a spec deliberately placed
|
|
68
|
+
outside any team workspace), we fall back to `spec_path.parent` which is
|
|
69
|
+
the legacy single-layout behaviour.
|
|
70
|
+
|
|
71
|
+
Implementation note: we use real filesystem evidence (`(dir/.team).is_dir()`)
|
|
72
|
+
rather than path-string parsing so the resolver works correctly even when
|
|
73
|
+
workspace paths legitimately contain a `.team` segment.
|
|
74
|
+
"""
|
|
75
|
+
direct_parent = spec_path.parent
|
|
76
|
+
if (direct_parent / ".team").is_dir():
|
|
77
|
+
return direct_parent
|
|
78
|
+
for ancestor in direct_parent.parents:
|
|
79
|
+
if (ancestor / ".team").is_dir():
|
|
80
|
+
return ancestor
|
|
81
|
+
return direct_parent
|
|
82
|
+
|
|
83
|
+
|
|
33
84
|
def validate_spec(spec: dict[str, Any], base_dir: Path | None = None) -> None:
|
|
34
85
|
messages = _basic_schema_errors(spec)
|
|
35
86
|
messages.extend(_semantic_errors(spec, base_dir or Path.cwd()))
|
|
@@ -190,6 +241,12 @@ def _check_runtime(runtime: Any, errors: list[str]) -> None:
|
|
|
190
241
|
"tick_interval_sec",
|
|
191
242
|
"push_min_interval_sec",
|
|
192
243
|
"stuck_timeout_sec",
|
|
244
|
+
# Gap 29 / F3 deprecation (2026-05-26): accept the legacy spec opt-in so
|
|
245
|
+
# YAMLs that still set it validate and the deprecation warning + structured
|
|
246
|
+
# event in messaging/leader_panes.py can fire. The preferred per-session
|
|
247
|
+
# opt-in is the env var TEAM_AGENT_AUTO_TRUST_OWN_WORKSPACE; this spec
|
|
248
|
+
# field will be removed in 0.3.0.
|
|
249
|
+
"auto_trust_own_workspace",
|
|
193
250
|
}
|
|
194
251
|
_check_keys(runtime, "/runtime", required, allowed, errors)
|
|
195
252
|
if not isinstance(runtime, dict):
|
|
@@ -200,6 +257,8 @@ def _check_runtime(runtime: Any, errors: list[str]) -> None:
|
|
|
200
257
|
errors.append("/runtime/display_backend: invalid display backend")
|
|
201
258
|
if "dangerous_auto_approve" in runtime and not isinstance(runtime["dangerous_auto_approve"], bool):
|
|
202
259
|
errors.append("/runtime/dangerous_auto_approve: must be a boolean")
|
|
260
|
+
if "auto_trust_own_workspace" in runtime and not isinstance(runtime["auto_trust_own_workspace"], bool):
|
|
261
|
+
errors.append("/runtime/auto_trust_own_workspace: must be a boolean")
|
|
203
262
|
_check_list(runtime.get("startup_order"), "/runtime/startup_order", errors)
|
|
204
263
|
|
|
205
264
|
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import json
|
|
4
|
+
from datetime import datetime
|
|
4
5
|
from pathlib import Path
|
|
5
6
|
from typing import Any
|
|
6
7
|
|
|
@@ -11,6 +12,24 @@ from team_agent.status.compact import compact_status
|
|
|
11
12
|
from team_agent.status.constants import PENDING_DELIVERY_STATUSES
|
|
12
13
|
|
|
13
14
|
|
|
15
|
+
def _interacted_marker(first_send_at: Any) -> str:
|
|
16
|
+
"""C3 (cr verdict, 2026-05-27): render the persisted first_send_at as a
|
|
17
|
+
user-visible status field. Valid ISO 8601 UTC strings pass through; any
|
|
18
|
+
other shape (None, empty string, 0, False, corrupt garbage) renders as
|
|
19
|
+
the literal "never" so the operator sees a consistent classification
|
|
20
|
+
instead of leaking raw garbage into status output. Restart enforces
|
|
21
|
+
strict typing separately (corrupt values fail the operation); status is
|
|
22
|
+
a read-only surface and tolerantly degrades to "never" rather than
|
|
23
|
+
failing the status command."""
|
|
24
|
+
if isinstance(first_send_at, str) and first_send_at:
|
|
25
|
+
try:
|
|
26
|
+
datetime.fromisoformat(first_send_at)
|
|
27
|
+
except (ValueError, TypeError):
|
|
28
|
+
return "never"
|
|
29
|
+
return first_send_at
|
|
30
|
+
return "never"
|
|
31
|
+
|
|
32
|
+
|
|
14
33
|
def status(workspace: Path, as_json: bool = False, *, compact: bool = False) -> dict[str, Any]:
|
|
15
34
|
from team_agent.runtime import (
|
|
16
35
|
_capture_missing_sessions,
|
|
@@ -31,12 +50,24 @@ def status(workspace: Path, as_json: bool = False, *, compact: bool = False) ->
|
|
|
31
50
|
save_runtime_state(workspace, state)
|
|
32
51
|
session_name = state.get("session_name")
|
|
33
52
|
tmux_exists = _tmux_session_exists(session_name) if session_name else False
|
|
53
|
+
# C3 (cr verdict): enrich each worker entry with an explicit `interacted`
|
|
54
|
+
# field derived from the persisted first_send_at. The original entry
|
|
55
|
+
# passes through unchanged so any pre-existing field (including raw
|
|
56
|
+
# first_send_at) stays visible.
|
|
57
|
+
enriched_agents: dict[str, Any] = {}
|
|
58
|
+
for aid, raw in (state.get("agents") or {}).items():
|
|
59
|
+
if isinstance(raw, dict):
|
|
60
|
+
entry = dict(raw)
|
|
61
|
+
entry["interacted"] = _interacted_marker(raw.get("first_send_at"))
|
|
62
|
+
else:
|
|
63
|
+
entry = raw
|
|
64
|
+
enriched_agents[aid] = entry
|
|
34
65
|
result = {
|
|
35
66
|
"team": state.get("leader", {}).get("id", "leader"),
|
|
36
67
|
"session_name": session_name,
|
|
37
68
|
"tmux_session_present": tmux_exists,
|
|
38
69
|
"leader_receiver": state.get("leader_receiver", {}),
|
|
39
|
-
"agents":
|
|
70
|
+
"agents": enriched_agents,
|
|
40
71
|
"agent_health": store.agent_health(),
|
|
41
72
|
"tasks": state.get("tasks", []),
|
|
42
73
|
"messages": store.message_counts(),
|