@team-agent/installer 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/team_agent/cli/__init__.py +2 -0
- package/src/team_agent/cli/commands.py +22 -3
- package/src/team_agent/cli/parser.py +40 -1
- package/src/team_agent/coordinator/__main__.py +21 -2
- package/src/team_agent/coordinator/lifecycle.py +23 -0
- package/src/team_agent/diagnose/orphan_cleanup.py +193 -0
- package/src/team_agent/events.py +47 -0
- package/src/team_agent/leader/__init__.py +273 -60
- package/src/team_agent/lifecycle/agents.py +54 -2
- package/src/team_agent/lifecycle/operations.py +86 -9
- package/src/team_agent/lifecycle/paste_buffer_hygiene.py +39 -0
- package/src/team_agent/lifecycle/start.py +3 -0
- package/src/team_agent/message_store/leader_notification_log.py +132 -0
- package/src/team_agent/message_store/result_watchers.py +144 -1
- package/src/team_agent/message_store/schema.py +23 -0
- package/src/team_agent/messaging/delivery.py +10 -0
- package/src/team_agent/messaging/idle_alerts.py +227 -21
- package/src/team_agent/messaging/leader.py +166 -6
- package/src/team_agent/messaging/leader_panes.py +193 -23
- package/src/team_agent/messaging/owner_bypass.py +29 -0
- package/src/team_agent/messaging/result_delivery.py +219 -4
- package/src/team_agent/messaging/results.py +12 -21
- package/src/team_agent/messaging/scheduler.py +22 -2
- package/src/team_agent/messaging/send.py +9 -2
- package/src/team_agent/messaging/session_drift.py +94 -0
- package/src/team_agent/runtime.py +22 -14
- package/src/team_agent/rust_core.py +157 -3
- package/src/team_agent/state.py +167 -10
- package/src/team_agent/status/inbox.py +33 -3
|
@@ -13,7 +13,7 @@ from pathlib import Path
|
|
|
13
13
|
from typing import Any
|
|
14
14
|
|
|
15
15
|
from team_agent.events import EventLog
|
|
16
|
-
from team_agent.state import load_runtime_state, save_runtime_state
|
|
16
|
+
from team_agent.state import apply_first_time_leader_binding, derive_leader_session_uuid, leader_env_exports, load_runtime_state, save_runtime_state, save_team_scoped_state, select_runtime_state, team_state_key, validate_leader_uuid_from_targets
|
|
17
17
|
|
|
18
18
|
|
|
19
19
|
def attach_leader(workspace: Path, pane: str | None = None, provider: str = "codex") -> dict[str, Any]:
|
|
@@ -63,28 +63,16 @@ def start_leader(
|
|
|
63
63
|
confirm_attach: bool = False,
|
|
64
64
|
attach_session: str | None = None,
|
|
65
65
|
) -> None:
|
|
66
|
-
plan = leader_start_plan(
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
workspace,
|
|
70
|
-
attach_existing=attach_existing,
|
|
71
|
-
confirm_attach=confirm_attach,
|
|
72
|
-
attach_session=attach_session,
|
|
73
|
-
)
|
|
66
|
+
plan = leader_start_plan(provider, provider_args, workspace, attach_existing=attach_existing, confirm_attach=confirm_attach, attach_session=attach_session)
|
|
67
|
+
if plan.get("leader_session_uuid_source") == "override":
|
|
68
|
+
EventLog(workspace).write("leader_session_uuid.override", source="explicit-override", uuid_prefix=str(plan.get("leader_session_uuid") or "")[:12], team_id=plan.get("team_id"))
|
|
74
69
|
if plan["mode"] == "new_tmux_session" and not sys.stdin.isatty():
|
|
75
70
|
plan = dict(plan)
|
|
76
71
|
argv = list(plan["argv"])
|
|
77
72
|
argv.insert(2, "-d")
|
|
78
73
|
plan["argv"] = argv
|
|
79
74
|
plan["detached"] = True
|
|
80
|
-
EventLog(workspace).write(
|
|
81
|
-
"leader.start",
|
|
82
|
-
provider=provider,
|
|
83
|
-
workspace=str(workspace),
|
|
84
|
-
mode=plan["mode"],
|
|
85
|
-
session_name=plan.get("session_name"),
|
|
86
|
-
argv=plan["argv"],
|
|
87
|
-
)
|
|
75
|
+
EventLog(workspace).write("leader.start", provider=provider, workspace=str(workspace), mode=plan["mode"], session_name=plan.get("session_name"), argv=_leader_plan_log_argv(plan), leader_session_uuid_source=plan.get("leader_session_uuid_source"), uuid_prefix=str(plan.get("leader_session_uuid") or "")[:12] or None)
|
|
88
76
|
_run_leader_plan(plan, workspace)
|
|
89
77
|
|
|
90
78
|
|
|
@@ -110,33 +98,23 @@ def leader_start_plan(
|
|
|
110
98
|
if not adapter.is_installed():
|
|
111
99
|
raise RuntimeError(f"Provider {provider} command {adapter.command_name!r} not found")
|
|
112
100
|
argv = [adapter.command_name, *provider_args]
|
|
101
|
+
identity = _leader_identity_context(workspace)
|
|
102
|
+
leader_env = _leader_provider_env(provider, identity)
|
|
113
103
|
if attach_session:
|
|
114
104
|
if not confirm_attach:
|
|
115
105
|
raise RuntimeError("--attach-session requires --confirm")
|
|
116
|
-
return {
|
|
117
|
-
"mode": "attach_existing",
|
|
118
|
-
"provider": provider,
|
|
119
|
-
"workspace": str(workspace),
|
|
120
|
-
"session_name": attach_session,
|
|
121
|
-
"argv": ["tmux", "attach-session", "-t", attach_session],
|
|
122
|
-
}
|
|
106
|
+
return {"mode": "attach_existing", "provider": provider, "workspace": str(workspace), "session_name": attach_session, "argv": ["tmux", "attach-session", "-t", attach_session]}
|
|
123
107
|
if os.environ.get("TMUX"):
|
|
124
|
-
return {"mode": "exec_provider", "provider": provider, "workspace": str(workspace), "argv": argv}
|
|
108
|
+
return {"mode": "exec_provider", "provider": provider, "workspace": str(workspace), "argv": argv, "env": {**os.environ, **leader_env}, **identity}
|
|
125
109
|
if not shutil_which("tmux"):
|
|
126
110
|
raise RuntimeError("tmux is not installed; install tmux 3.3+ or start the leader from an existing tmux pane")
|
|
127
111
|
session_name = leader_session_name(provider, workspace)
|
|
128
112
|
if _tmux_session_exists(session_name):
|
|
129
|
-
return {
|
|
130
|
-
|
|
131
|
-
"provider": provider,
|
|
132
|
-
"workspace": str(workspace),
|
|
133
|
-
"session_name": session_name,
|
|
134
|
-
"argv": ["tmux", "attach-session", "-t", session_name],
|
|
135
|
-
}
|
|
136
|
-
exports = ""
|
|
113
|
+
return {"mode": "attach_existing", "provider": provider, "workspace": str(workspace), "session_name": session_name, "argv": ["tmux", "attach-session", "-t", session_name]}
|
|
114
|
+
exports = " ".join(f"{key}={shlex.quote(value)}" for key, value in leader_env.items())
|
|
137
115
|
if os.environ.get("PATH"):
|
|
138
|
-
exports = f"PATH={shlex.quote(os.environ['PATH'])}
|
|
139
|
-
shell = f"cd {shlex.quote(str(workspace))} && {exports}exec {shlex.join(argv)}"
|
|
116
|
+
exports = f"{exports} PATH={shlex.quote(os.environ['PATH'])}"
|
|
117
|
+
shell = f"cd {shlex.quote(str(workspace))} && export {exports} && exec {shlex.join(argv)}"
|
|
140
118
|
tmux_args = ["tmux", "new-session", "-s", session_name, "-n", provider, "-c", str(workspace)]
|
|
141
119
|
return {
|
|
142
120
|
"mode": "new_tmux_session",
|
|
@@ -144,6 +122,8 @@ def leader_start_plan(
|
|
|
144
122
|
"workspace": str(workspace),
|
|
145
123
|
"session_name": session_name,
|
|
146
124
|
"argv": [*tmux_args, "sh", "-lc", shell],
|
|
125
|
+
"leader_env": leader_env,
|
|
126
|
+
**identity,
|
|
147
127
|
"detached": False,
|
|
148
128
|
}
|
|
149
129
|
|
|
@@ -174,7 +154,7 @@ def _run_leader_plan(plan: dict[str, Any], workspace: Path) -> None:
|
|
|
174
154
|
try:
|
|
175
155
|
if plan["mode"] == "exec_provider":
|
|
176
156
|
os.chdir(workspace)
|
|
177
|
-
proc = subprocess.Popen(plan["argv"])
|
|
157
|
+
proc = subprocess.Popen(plan["argv"], env=plan.get("env"))
|
|
178
158
|
if plan.get("detached") and session_name:
|
|
179
159
|
proc.wait()
|
|
180
160
|
while _tmux_session_exists_local(str(session_name)):
|
|
@@ -206,6 +186,70 @@ def leader_session_name(provider: str, workspace: Path) -> str:
|
|
|
206
186
|
return f"team-agent-leader-{provider}-{folder}-{digest}"
|
|
207
187
|
|
|
208
188
|
|
|
189
|
+
def _leader_identity_context(workspace: Path, team: str | None = None, state: dict[str, Any] | None = None) -> dict[str, Any]:
    """Assemble the identity record describing the leader session for *workspace*.

    Args:
        workspace: Workspace directory; its resolved path is part of the record.
        team: Team selector, only used when *state* has to be loaded here.
        state: Runtime state to read from. A missing or empty value is
            replaced by ``_load_identity_state(workspace, team)``.

    Returns:
        A dict with the keys ``leader_session_uuid``,
        ``leader_session_uuid_source``, ``machine_fingerprint``,
        ``workspace_abspath``, ``os_user`` and ``team_id``.
    """
    if not state:
        state = _load_identity_state(workspace, team)

    team_id = team_state_key(state)
    machine = _identity_machine_fingerprint(state)
    user = _identity_os_user()
    workspace_abspath = str(workspace.resolve())

    # UUID precedence: explicit environment override, then a UUID already
    # recorded in state, then a value derived from machine/workspace/user/team.
    override = os.environ.get("TEAM_AGENT_LEADER_SESSION_UUID_OVERRIDE") or ""
    if override:
        leader_uuid = override
        uuid_source = "override"
    else:
        # The source label is "derived" for both the stored and the freshly
        # derived UUID; only the environment override is reported separately.
        uuid_source = "derived"
        leader_uuid = _state_leader_session_uuid(state) or derive_leader_session_uuid(
            machine,
            workspace_abspath,
            user,
            team_id,
        )

    return {
        "leader_session_uuid": leader_uuid,
        "leader_session_uuid_source": uuid_source,
        "machine_fingerprint": machine,
        "workspace_abspath": workspace_abspath,
        "os_user": user,
        "team_id": team_id,
    }
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def _load_identity_state(workspace: Path, team: str | None) -> dict[str, Any]:
    """Load the runtime state used for identity lookups.

    Tries ``select_runtime_state(workspace, team)`` first. If that raises any
    ``Exception``, the result of ``load_runtime_state(workspace)`` is returned
    instead, so a failed team selection does not abort the identity lookup.
    """
    try:
        selected = select_runtime_state(workspace, team)
    except Exception:
        # Deliberate best-effort fallback to the plain workspace state.
        selected = load_runtime_state(workspace)
    return selected
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def _identity_machine_fingerprint(state: dict[str, Any]) -> str:
    """Return the machine fingerprint to use for leader identity.

    Lookup order:
      1. ``machine_fingerprint`` stored on ``state["team_owner"]``.
      2. ``machine_fingerprint`` stored on ``state["leader_receiver"]``.
      3. The ``TEAM_AGENT_MACHINE_FINGERPRINT`` environment variable.
      4. The network node name of this machine.

    Args:
        state: Runtime state mapping; records that are not dicts are ignored.

    Returns:
        The fingerprint as a string (stored values are passed through ``str``).
    """
    # Local import keeps this block self-contained without touching the
    # module-level import list.
    import platform

    for record in (state.get("team_owner"), state.get("leader_receiver")):
        if isinstance(record, dict) and record.get("machine_fingerprint"):
            return str(record["machine_fingerprint"])
    # platform.node() reports the same node name as os.uname().nodename where
    # os.uname exists, but does not raise AttributeError on platforms without it.
    return os.environ.get("TEAM_AGENT_MACHINE_FINGERPRINT") or platform.node()
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def _identity_os_user() -> str:
    """Return the current OS user name as reported by the environment.

    Checks ``USER`` first and ``USERNAME`` second; the first non-empty value
    wins. Returns an empty string when neither variable is set to a value.
    """
    for variable in ("USER", "USERNAME"):
        name = os.environ.get(variable)
        if name:
            return name
    return ""
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def _state_leader_session_uuid(state: dict[str, Any]) -> str:
    """Return the leader session UUID already recorded in *state*, if any.

    ``state["team_owner"]`` is consulted before ``state["leader_receiver"]``;
    entries that are not dicts or carry no truthy ``leader_session_uuid`` are
    skipped. Returns an empty string when neither record provides one.
    """
    records = (state.get(key) for key in ("team_owner", "leader_receiver"))
    stored = (
        str(entry["leader_session_uuid"])
        for entry in records
        if isinstance(entry, dict) and entry.get("leader_session_uuid")
    )
    return next(stored, "")
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def _leader_provider_env(provider: str, identity: dict[str, Any]) -> dict[str, str]:
    """Build the ``TEAM_AGENT_*`` environment variables for a leader process.

    Args:
        provider: Provider name, exported as ``TEAM_AGENT_LEADER_PROVIDER``.
        identity: Identity record; must contain ``leader_session_uuid``,
            ``machine_fingerprint``, ``workspace_abspath`` and ``team_id``.

    Returns:
        Mapping of environment variable name to string value.

    Raises:
        KeyError: If *identity* lacks one of the required keys.
    """
    env: dict[str, str] = {"TEAM_AGENT_LEADER_PROVIDER": provider}
    # (environment variable, identity key) pairs, in export order.
    exported = (
        ("TEAM_AGENT_LEADER_SESSION_UUID", "leader_session_uuid"),
        ("TEAM_AGENT_MACHINE_FINGERPRINT", "machine_fingerprint"),
        ("TEAM_AGENT_WORKSPACE", "workspace_abspath"),
        ("TEAM_AGENT_TEAM_ID", "team_id"),
    )
    for env_name, identity_key in exported:
        env[env_name] = str(identity[identity_key])
    return env
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def _leader_plan_log_argv(plan: dict[str, Any]) -> list[str]:
    """Return the plan's argv in a form suitable for the event log.

    When the plan carries a ``leader_session_uuid``, every occurrence of it
    inside an argv element is replaced by its first 12 characters followed by
    ``...`` and each element is converted to ``str``. When the plan has no
    UUID, the original ``plan["argv"]`` object is returned untouched.
    """
    argv = plan["argv"]
    session_uuid = str(plan.get("leader_session_uuid") or "")
    if session_uuid:
        shortened = f"{session_uuid[:12]}..."
        return [str(item).replace(session_uuid, shortened) for item in argv]
    return argv
|
|
251
|
+
|
|
252
|
+
|
|
209
253
|
def attach_leader_to_state(
|
|
210
254
|
workspace: Path,
|
|
211
255
|
state: dict[str, Any],
|
|
@@ -221,12 +265,17 @@ def attach_leader_to_state(
|
|
|
221
265
|
_resolve_leader_pane,
|
|
222
266
|
_target_fingerprint,
|
|
223
267
|
_validate_leader_receiver,
|
|
268
|
+
core_list_targets,
|
|
224
269
|
get_adapter,
|
|
270
|
+
run_cmd,
|
|
225
271
|
)
|
|
226
272
|
get_adapter(provider)
|
|
227
273
|
pane_info, discovery = _resolve_leader_pane(pane, provider, workspace=workspace, require_current=require_current)
|
|
228
274
|
inferred_provider = _leader_command_provider(pane_info.get("pane_current_command", ""))
|
|
229
275
|
receiver_provider = inferred_provider or provider
|
|
276
|
+
identity = _leader_identity_context(workspace, state=state)
|
|
277
|
+
if identity.get("leader_session_uuid_source") == "override":
|
|
278
|
+
event_log.write("leader_session_uuid.override", source="explicit-override", uuid_prefix=str(identity.get("leader_session_uuid") or "")[:12], team_id=identity.get("team_id"))
|
|
230
279
|
receiver = {
|
|
231
280
|
"mode": "direct_tmux",
|
|
232
281
|
"status": "attached",
|
|
@@ -242,38 +291,200 @@ def attach_leader_to_state(
|
|
|
242
291
|
"attached_at": datetime.now(timezone.utc).isoformat(),
|
|
243
292
|
"discovery": discovery,
|
|
244
293
|
}
|
|
294
|
+
if not state.get("team_owner") and source in {"launch", "quick_start"}:
|
|
295
|
+
validation = apply_first_time_leader_binding(workspace, state, receiver, pane_info, identity, source)
|
|
296
|
+
if not validation["ok"]:
|
|
297
|
+
event_log.write("leader_receiver.attach_failed", target=pane or pane_info.get("pane_id"), discovery=discovery, provider=provider, reason=validation["reason"], error=validation.get("error"), source=source, first_time=True, uuid_prefix=str(identity.get("leader_session_uuid") or "")[:12])
|
|
298
|
+
raise RuntimeError(f"leader pane validation failed: {validation['reason']}")
|
|
299
|
+
_set_tmux_leader_environment(receiver, identity, event_log, run_cmd)
|
|
300
|
+
event_log.write("leader_receiver.attached", target=receiver["pane_id"], session_name=receiver["session_name"], window_index=receiver["window_index"], window_name=receiver["window_name"], pane_index=receiver["pane_index"], pane_tty=receiver["pane_tty"], pane_current_command=receiver["pane_current_command"], provider=receiver_provider, requested_provider=provider if receiver_provider != provider else None, discovery=discovery, source=source, first_time=True, uuid_prefix=str(identity.get("leader_session_uuid") or "")[:12], leader_session_uuid_source=identity.get("leader_session_uuid_source"))
|
|
301
|
+
return receiver, validation
|
|
302
|
+
if receiver_provider != "fake":
|
|
303
|
+
receiver["leader_session_uuid"] = identity["leader_session_uuid"]
|
|
245
304
|
if receiver_provider != provider:
|
|
246
305
|
receiver["requested_provider"] = provider
|
|
247
|
-
validation =
|
|
306
|
+
validation = validate_leader_uuid_from_targets(receiver, core_list_targets())
|
|
307
|
+
if validation["ok"]:
|
|
308
|
+
validation = _validate_leader_receiver(receiver)
|
|
248
309
|
if not validation["ok"]:
|
|
249
|
-
event_log.write(
|
|
250
|
-
|
|
251
|
-
target=pane or pane_info.get("pane_id"),
|
|
252
|
-
discovery=discovery,
|
|
253
|
-
provider=provider,
|
|
254
|
-
reason=validation["reason"],
|
|
255
|
-
error=validation.get("error"),
|
|
256
|
-
source=source,
|
|
257
|
-
)
|
|
258
|
-
raise RuntimeError(f"leader pane validation failed: {validation['reason']}")
|
|
310
|
+
event_log.write("leader_receiver.attach_failed", target=pane or pane_info.get("pane_id"), discovery=discovery, provider=provider, reason=validation["reason"], error=validation.get("error"), source=source, uuid_prefix=str(identity.get("leader_session_uuid") or "")[:12])
|
|
311
|
+
raise RuntimeError(_strict_leader_validation_error(validation))
|
|
259
312
|
if validation.get("warning"):
|
|
260
313
|
receiver["warning"] = validation["warning"]
|
|
261
314
|
state["leader_receiver"] = receiver
|
|
315
|
+
event_log.write("leader_receiver.attached", target=receiver["pane_id"], session_name=receiver["session_name"], window_index=receiver["window_index"], window_name=receiver["window_name"], pane_index=receiver["pane_index"], pane_tty=receiver["pane_tty"], pane_current_command=receiver["pane_current_command"], provider=receiver_provider, requested_provider=provider if receiver_provider != provider else None, discovery=discovery, source=source, uuid_prefix=str(identity.get("leader_session_uuid") or "")[:12], leader_session_uuid_source=identity.get("leader_session_uuid_source"))
|
|
316
|
+
return receiver, validation
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
def _set_tmux_leader_environment(receiver: dict[str, Any], identity: dict[str, Any], event_log: EventLog, run_cmd: Any) -> None:
|
|
320
|
+
session_name = receiver.get("session_name")
|
|
321
|
+
if not session_name:
|
|
322
|
+
return
|
|
323
|
+
failures: dict[str, str] = {}
|
|
324
|
+
for key, value in leader_env_exports(receiver, identity).items():
|
|
325
|
+
proc = run_cmd(["tmux", "set-environment", "-t", str(session_name), key, value], timeout=5)
|
|
326
|
+
if proc.returncode != 0:
|
|
327
|
+
failures[key] = proc.stderr.strip() or "tmux set-environment failed"
|
|
262
328
|
event_log.write(
|
|
263
|
-
"leader_receiver.
|
|
264
|
-
|
|
265
|
-
session_name=
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
pane_index=receiver["pane_index"],
|
|
269
|
-
pane_tty=receiver["pane_tty"],
|
|
270
|
-
pane_current_command=receiver["pane_current_command"],
|
|
271
|
-
provider=receiver_provider,
|
|
272
|
-
requested_provider=provider if receiver_provider != provider else None,
|
|
273
|
-
discovery=discovery,
|
|
274
|
-
source=source,
|
|
329
|
+
"leader_receiver.first_time_env_seeded",
|
|
330
|
+
pane_id=receiver.get("pane_id"),
|
|
331
|
+
session_name=session_name,
|
|
332
|
+
ok=not failures,
|
|
333
|
+
failed_keys=sorted(failures),
|
|
275
334
|
)
|
|
276
|
-
|
|
335
|
+
|
|
336
|
+
def _strict_leader_validation_error(validation: dict[str, Any]) -> str:
    """Format the error text for a failed strict leader-pane validation.

    The message leads with ``validation["reason"]`` and then explains that the
    team already has a ``team_owner`` and points at the takeover command.

    Raises:
        KeyError: If *validation* has no ``reason`` entry.
    """
    reason = validation["reason"]
    guidance = (
        "first quick-start uses cwd+command match only; this team already has team_owner "
        "so strict UUID gate applies; use team-agent takeover --confirm if you intend to take over"
    )
    return f"leader pane validation failed: {reason}. {guidance}"
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
def leader_identity(workspace: Path, team: str | None = None) -> dict[str, Any]:
    """Report the leader identity for *workspace* without exposing the full UUID.

    Args:
        workspace: Workspace directory whose runtime state is inspected.
        team: Optional team selector forwarded to the state loader.

    Returns:
        A dict with ``ok`` set to ``True``, the first 12 characters of the
        leader session UUID (``uuid_prefix``), the machine/workspace/user/team
        identity fields, the caller's pane id taken from the environment,
        the receiver's ``attached_at`` (or ``last_seen_at``) timestamp and the
        UUID source label.
    """
    state = _load_identity_state(workspace, team)
    identity = _leader_identity_context(workspace, team=team, state=state)

    receiver = state.get("leader_receiver")
    if not isinstance(receiver, dict):
        receiver = {}

    # Explicit pane id wins over tmux's own TMUX_PANE; empty values become None.
    current_pane = os.environ.get("TEAM_AGENT_LEADER_PANE_ID") or os.environ.get("TMUX_PANE") or None
    last_seen = receiver.get("attached_at") or receiver.get("last_seen_at")

    report: dict[str, Any] = {"ok": True, "uuid_prefix": str(identity["leader_session_uuid"])[:12]}
    for key in ("machine_fingerprint", "workspace_abspath", "os_user", "team_id"):
        report[key] = identity[key]
    report["current_pane_id"] = current_pane
    report["last_seen_at"] = last_seen
    report["source"] = identity["leader_session_uuid_source"]
    return report
|
|
359
|
+
|
|
360
|
+
|
|
361
|
+
def claim_leader(workspace: Path, team: str | None = None, confirm: bool = False) -> dict[str, Any]:
|
|
362
|
+
from team_agent.runtime import RuntimeError, _runtime_lock, core_list_targets
|
|
363
|
+
current_pane = os.environ.get("TEAM_AGENT_LEADER_PANE_ID") or os.environ.get("TMUX_PANE")
|
|
364
|
+
if not current_pane:
|
|
365
|
+
return {"ok": False, "status": "refused", "reason": "no_caller_pane", "action": "run from a tmux leader pane"}
|
|
366
|
+
with _runtime_lock(workspace, "leader_receiver"):
|
|
367
|
+
state = select_runtime_state(workspace, team)
|
|
368
|
+
event_log = EventLog(workspace)
|
|
369
|
+
incident = _latest_ambiguous_incident(event_log, team_state_key(state))
|
|
370
|
+
if not incident:
|
|
371
|
+
return {"ok": False, "status": "refused", "reason": "no_ambiguous_candidates"}
|
|
372
|
+
candidates = [str(item) for item in incident.get("candidates", [])]
|
|
373
|
+
if current_pane not in candidates:
|
|
374
|
+
return {"ok": False, "status": "refused", "reason": "caller_not_candidate", "candidates": candidates}
|
|
375
|
+
receiver = state.get("leader_receiver") or {}
|
|
376
|
+
if receiver.get("pane_id") == current_pane:
|
|
377
|
+
return {"ok": True, "status": "already_bound", "leader_receiver": receiver}
|
|
378
|
+
if _incident_already_claimed(event_log, str(incident.get("incident_id"))):
|
|
379
|
+
return _claim_lost_race(receiver)
|
|
380
|
+
if receiver.get("pane_id") in candidates and receiver.get("pane_id") != incident.get("old_pane_id"):
|
|
381
|
+
return _claim_lost_race(receiver)
|
|
382
|
+
if not confirm:
|
|
383
|
+
return {"ok": True, "status": "dry_run", "would_bind_pane_id": current_pane, "candidates": candidates}
|
|
384
|
+
targets = core_list_targets()
|
|
385
|
+
if not targets.get("ok"):
|
|
386
|
+
raise RuntimeError(str(targets.get("error") or "tmux target scan failed"))
|
|
387
|
+
target = next((item for item in targets.get("targets", []) if item.get("pane_id") == current_pane), None)
|
|
388
|
+
if not target:
|
|
389
|
+
return {"ok": False, "status": "refused", "reason": "candidate_pane_missing", "pane_id": current_pane}
|
|
390
|
+
owner = state.setdefault("team_owner", {})
|
|
391
|
+
expected_uuid = str(owner.get("leader_session_uuid") or _leader_identity_context(workspace, team=team, state=state)["leader_session_uuid"])
|
|
392
|
+
target_uuid = _target_leader_session_uuid(target)
|
|
393
|
+
if target_uuid != expected_uuid:
|
|
394
|
+
return {"ok": False, "status": "refused", "reason": "leader_session_uuid_mismatch", "uuid_prefix": expected_uuid[:12]}
|
|
395
|
+
epoch = int(owner.get("owner_epoch") or receiver.get("owner_epoch") or 0) + 1
|
|
396
|
+
owner.update({"pane_id": current_pane, "owner_epoch": epoch, "claimed_at": datetime.now(timezone.utc).isoformat(), "claimed_via": "claim-leader"})
|
|
397
|
+
state["leader_receiver"] = _receiver_from_claim_target(target, receiver, expected_uuid, epoch)
|
|
398
|
+
save_team_scoped_state(workspace, state)
|
|
399
|
+
losers = [pane for pane in candidates if pane != current_pane]
|
|
400
|
+
event_log.write(
|
|
401
|
+
"leader_receiver.claim_applied",
|
|
402
|
+
incident_id=incident.get("incident_id"),
|
|
403
|
+
winner_pane_id=current_pane,
|
|
404
|
+
losers=losers,
|
|
405
|
+
owner_epoch=epoch,
|
|
406
|
+
uuid_prefix=expected_uuid[:12],
|
|
407
|
+
)
|
|
408
|
+
# Stage 11.9 (Gap 26 Mac mini Scenario 3): result watchers that stalled while the
|
|
409
|
+
# broadcast was waiting for a human claim need fresh budget against the newly bound
|
|
410
|
+
# pane. Per-watcher leader_receiver.claim_requeue events + immediate retry.
|
|
411
|
+
from team_agent.message_store import MessageStore
|
|
412
|
+
from team_agent.messaging.result_delivery import requeue_after_claim_leader
|
|
413
|
+
requeued = requeue_after_claim_leader(
|
|
414
|
+
workspace,
|
|
415
|
+
MessageStore(workspace),
|
|
416
|
+
event_log,
|
|
417
|
+
team_state_key(state),
|
|
418
|
+
current_pane,
|
|
419
|
+
incident_ts=incident.get("ts"),
|
|
420
|
+
)
|
|
421
|
+
response: dict[str, Any] = {
|
|
422
|
+
"ok": True,
|
|
423
|
+
"status": "claimed",
|
|
424
|
+
"leader_receiver": state["leader_receiver"],
|
|
425
|
+
"owner_epoch": epoch,
|
|
426
|
+
"losers": losers,
|
|
427
|
+
"requeued_watchers": [item["watcher_id"] for item in requeued],
|
|
428
|
+
}
|
|
429
|
+
# Stage 13 (silent-loss arm mailbox-hint route, 2026-05-26 second roundtable):
|
|
430
|
+
# the framework cannot guarantee every worker message reached the leader pane during
|
|
431
|
+
# the ambiguous-state window (retry budgets may have exhausted before the human
|
|
432
|
+
# claimed). Pointing the leader agent at the inbox lets it self-recover by reading
|
|
433
|
+
# the messages that landed in storage but never injected to a pane.
|
|
434
|
+
incident_ts = incident.get("ts")
|
|
435
|
+
if incident_ts:
|
|
436
|
+
response["inbox_hint"] = {
|
|
437
|
+
"message": (
|
|
438
|
+
"During the previous ambiguous-leader state, some worker messages may "
|
|
439
|
+
"not have been auto-delivered to this pane. Run the command below to "
|
|
440
|
+
"retrieve them."
|
|
441
|
+
),
|
|
442
|
+
"command": f"team-agent inbox leader --since {incident_ts}",
|
|
443
|
+
"since": incident_ts,
|
|
444
|
+
"incident_id": incident.get("incident_id"),
|
|
445
|
+
}
|
|
446
|
+
return response
|
|
447
|
+
|
|
448
|
+
|
|
449
|
+
def _latest_ambiguous_incident(event_log: EventLog, team_id: str) -> dict[str, Any] | None:
    """Find the newest ``leader_receiver.ambiguous_candidates`` event for a team.

    Only the last 200 log entries (``event_log.tail(200)``) are searched,
    newest first. An event matches when its ``team_id`` equals *team_id* or is
    absent/``None``.

    Returns:
        The matching event dict, or ``None`` when no entry qualifies.
    """
    recent = event_log.tail(200)
    matches = (
        entry
        for entry in reversed(recent)
        if entry.get("event") == "leader_receiver.ambiguous_candidates"
        and entry.get("team_id") in {None, team_id}
    )
    return next(matches, None)
|
|
456
|
+
|
|
457
|
+
|
|
458
|
+
def _incident_already_claimed(event_log: EventLog, incident_id: str) -> bool:
    """Tell whether a claim was already applied for *incident_id*.

    Scans the last 200 log entries (``event_log.tail(200)``) for a
    ``leader_receiver.claim_applied`` event whose ``incident_id`` equals the
    given one.
    """
    for entry in event_log.tail(200):
        if entry.get("event") != "leader_receiver.claim_applied":
            continue
        if entry.get("incident_id") == incident_id:
            return True
    return False
|
|
460
|
+
|
|
461
|
+
|
|
462
|
+
def _claim_lost_race(receiver: dict[str, Any]) -> dict[str, Any]:
    """Build the refusal payload returned when another pane already won the claim.

    Args:
        receiver: The currently bound leader receiver record; its ``pane_id``
            and ``owner_epoch`` are echoed back to the caller.

    Returns:
        A refusal dict with reason ``owner_epoch_advanced``.
    """
    bound_pane = receiver.get("pane_id")
    refusal: dict[str, Any] = {
        "ok": False,
        "status": "refused",
        "reason": "owner_epoch_advanced",
    }
    refusal["error"] = f"team already bound to pane {bound_pane}; you lost the race"
    refusal["bound_pane_id"] = bound_pane
    refusal["owner_epoch"] = receiver.get("owner_epoch")
    return refusal
|
|
464
|
+
|
|
465
|
+
|
|
466
|
+
def _target_leader_session_uuid(target: dict[str, Any]) -> str:
    """Extract the leader session UUID advertised by a scanned tmux target.

    A truthy top-level ``leader_session_uuid`` takes precedence; otherwise the
    ``TEAM_AGENT_LEADER_SESSION_UUID`` entry of the target's ``leader_env``
    dict is used. Returns an empty string when neither is present.
    """
    direct = target.get("leader_session_uuid")
    if direct:
        return str(direct)
    env = target.get("leader_env")
    if not isinstance(env, dict):
        return ""
    return str(env.get("TEAM_AGENT_LEADER_SESSION_UUID") or "")
|
|
469
|
+
|
|
470
|
+
|
|
471
|
+
def _receiver_from_claim_target(target: dict[str, Any], previous: dict[str, Any], leader_uuid: str, owner_epoch: int) -> dict[str, Any]:
    """Build the ``leader_receiver`` record for a pane that won a claim.

    Args:
        target: Scanned tmux target for the claiming pane; ``pane_id`` is
            required, the remaining pane fields are read with ``.get``.
        previous: The receiver record being replaced; only its ``provider``
            is carried over (defaulting to ``"codex"``).
        leader_uuid: Leader session UUID to store on the new record.
        owner_epoch: Ownership epoch to store on the new record.

    Returns:
        A receiver dict in ``direct_tmux`` mode with status ``attached``,
        discovery ``claim_leader`` and a UTC ISO-8601 ``attached_at`` stamp.

    Raises:
        KeyError: If *target* has no ``pane_id``.
    """
    receiver: dict[str, Any] = {
        "mode": "direct_tmux",
        "status": "attached",
        "provider": previous.get("provider") or "codex",
        "pane_id": target["pane_id"],
    }
    # Window and pane indexes are stored as strings; other fields as-is.
    stringified = ("window_index", "pane_index")
    for field in ("session_name", "window_index", "window_name", "pane_index", "pane_tty", "pane_current_command"):
        value = target.get(field)
        receiver[field] = str(value) if field in stringified else value
    receiver.update(
        leader_session_uuid=leader_uuid,
        owner_epoch=owner_epoch,
        attached_at=datetime.now(timezone.utc).isoformat(),
        discovery="claim_leader",
    )
    return receiver
|
|
277
488
|
|
|
278
489
|
|
|
279
490
|
def autobind_leader_receiver_from_env(
|
|
@@ -314,6 +525,8 @@ __all__ = [
|
|
|
314
525
|
"attach_leader",
|
|
315
526
|
"attach_leader_to_state",
|
|
316
527
|
"autobind_leader_receiver_from_env",
|
|
528
|
+
"claim_leader",
|
|
529
|
+
"leader_identity",
|
|
317
530
|
"leader_session_name",
|
|
318
531
|
"leader_start_plan",
|
|
319
532
|
"start_leader",
|
|
@@ -57,30 +57,80 @@ def remove_agent(
|
|
|
57
57
|
|
|
58
58
|
rollback = _RemoveRollback(workspace, spec_path, spec, state, dynamic_role_file, store, agent_id, False)
|
|
59
59
|
stopped: dict[str, Any] | None = None
|
|
60
|
+
cleared_locations: list[str] = []
|
|
61
|
+
current_step = "init"
|
|
62
|
+
current_resource: str | None = None
|
|
63
|
+
|
|
64
|
+
def _step_done(name: str, resource: str | None = None, **extra: Any) -> None:
|
|
65
|
+
cleared_locations.append(name)
|
|
66
|
+
event_log.write(
|
|
67
|
+
"lifecycle.remove_step_completed",
|
|
68
|
+
agent_id=agent_id,
|
|
69
|
+
step=name,
|
|
70
|
+
resource=resource,
|
|
71
|
+
**extra,
|
|
72
|
+
)
|
|
73
|
+
|
|
60
74
|
try:
|
|
61
75
|
if running and force:
|
|
76
|
+
current_step, current_resource = "stop_agent", agent_id
|
|
62
77
|
stopped = runtime.stop_agent(workspace, agent_id, team=team)
|
|
63
78
|
rollback.restore_running = True
|
|
64
79
|
state, _refusal_after = resolve_team_scoped_state(workspace, team)
|
|
80
|
+
_step_done("stop_agent", resource=agent_id, stopped=stopped)
|
|
81
|
+
|
|
82
|
+
current_step, current_resource = "workspace_state", "state.json:agents"
|
|
65
83
|
removed_state = copy.deepcopy(state)
|
|
66
84
|
removed_state.get("agents", {}).pop(agent_id, None)
|
|
67
85
|
save_team_scoped_state(workspace, removed_state)
|
|
86
|
+
_step_done("workspace_state", resource=current_resource)
|
|
68
87
|
|
|
88
|
+
current_step, current_resource = "spec_yaml", str(spec_path)
|
|
69
89
|
removed_spec = copy.deepcopy(spec)
|
|
70
90
|
removed_spec["agents"] = [item for item in removed_spec.get("agents", []) if item.get("id") != agent_id]
|
|
71
91
|
startup_order = removed_spec.get("runtime", {}).get("startup_order")
|
|
72
92
|
if isinstance(startup_order, list):
|
|
73
93
|
removed_spec["runtime"]["startup_order"] = [item for item in startup_order if item != agent_id]
|
|
74
94
|
validate_spec(removed_spec, base_dir=spec_path.parent)
|
|
95
|
+
current_step, current_resource = "team_state_md", "team_state.md"
|
|
75
96
|
team_state_path = write_team_state(workspace, removed_spec, removed_state)
|
|
97
|
+
_step_done("team_state_md", resource=str(team_state_path))
|
|
98
|
+
current_step, current_resource = "spec_yaml", str(spec_path)
|
|
76
99
|
write_spec(spec_path, removed_spec)
|
|
100
|
+
_step_done("spec_yaml", resource=str(spec_path))
|
|
77
101
|
|
|
102
|
+
current_step, current_resource = "role_file", str(dynamic_role_file)
|
|
78
103
|
role_file_removed = _remove_dynamic_role_file(dynamic_role_file, bool(agent_state.get("dynamic_role_file")))
|
|
104
|
+
if role_file_removed:
|
|
105
|
+
_step_done("role_file", resource=str(dynamic_role_file))
|
|
106
|
+
|
|
107
|
+
current_step, current_resource = "agent_health", agent_id
|
|
79
108
|
_delete_agent_health(store, agent_id)
|
|
109
|
+
_step_done("agent_health", resource=agent_id)
|
|
80
110
|
except Exception as exc:
|
|
81
111
|
rollback_result = rollback.restore(runtime, event_log)
|
|
82
|
-
event_log.write(
|
|
83
|
-
|
|
112
|
+
event_log.write(
|
|
113
|
+
"remove_agent.rollback",
|
|
114
|
+
agent_id=agent_id,
|
|
115
|
+
ok=rollback_result["ok"],
|
|
116
|
+
error=str(exc),
|
|
117
|
+
failed_step=current_step,
|
|
118
|
+
resource=current_resource,
|
|
119
|
+
cleared_before_failure=cleared_locations,
|
|
120
|
+
rollback=rollback_result,
|
|
121
|
+
)
|
|
122
|
+
event_log.write(
|
|
123
|
+
"lifecycle.remove_rolled_back",
|
|
124
|
+
agent_id=agent_id,
|
|
125
|
+
ok=rollback_result["ok"],
|
|
126
|
+
failed_step=current_step,
|
|
127
|
+
resource=current_resource,
|
|
128
|
+
rollback_errors=rollback_result.get("errors", []),
|
|
129
|
+
)
|
|
130
|
+
raise RuntimeError(
|
|
131
|
+
f"remove-agent failed for {agent_id} at step={current_step} "
|
|
132
|
+
f"resource={current_resource}: {exc}; rollback_ok={rollback_result['ok']}"
|
|
133
|
+
) from exc
|
|
84
134
|
|
|
85
135
|
runtime._save_team_runtime_snapshot(workspace, removed_state)
|
|
86
136
|
warning = None
|
|
@@ -93,6 +143,7 @@ def remove_agent(
|
|
|
93
143
|
force=force,
|
|
94
144
|
stopped=stopped,
|
|
95
145
|
role_file_removed=role_file_removed,
|
|
146
|
+
cleared_locations=cleared_locations,
|
|
96
147
|
)
|
|
97
148
|
except Exception as exc:
|
|
98
149
|
warning = f"remove-agent completed but success event logging failed: {exc}"
|
|
@@ -105,6 +156,7 @@ def remove_agent(
|
|
|
105
156
|
"stopped": stopped,
|
|
106
157
|
"state_file": str(team_state_path),
|
|
107
158
|
"role_file_removed": role_file_removed,
|
|
159
|
+
"cleared_locations": cleared_locations,
|
|
108
160
|
**({"warning": warning} if warning else {}),
|
|
109
161
|
}
|
|
110
162
|
|
|
@@ -154,32 +154,109 @@ def add_agent(workspace: Path, agent_id: str, *, role_file_path: str, open_displ
|
|
|
154
154
|
old_spec_text = spec_path.read_text(encoding="utf-8")
|
|
155
155
|
old_state = copy.deepcopy(state)
|
|
156
156
|
old_dynamic = dynamic_path.read_bytes() if dynamic_path.exists() else None
|
|
157
|
+
# Stage 11.11 (Gap 15 follow-up): snapshot team_state.md BEFORE any write so the rollback
|
|
158
|
+
# path can restore it byte-equal. Pre-Stage-11.11 the rollback handler omitted this file
|
|
159
|
+
# and Mac mini Scenario 6 left orphan entries after induced add-agent failures.
|
|
160
|
+
team_state_path = workspace / spec.get("context", {}).get("state_file", "team_state.md")
|
|
161
|
+
old_team_state = team_state_path.read_bytes() if team_state_path.exists() else None
|
|
157
162
|
event_log = EventLog(workspace)
|
|
163
|
+
cleared_locations: list[str] = []
|
|
164
|
+
current_step = "init"
|
|
165
|
+
current_resource: str | None = None
|
|
166
|
+
|
|
167
|
+
def _step_done(name: str, resource: str | None = None, **extra: Any) -> None:
|
|
168
|
+
cleared_locations.append(name)
|
|
169
|
+
event_log.write(
|
|
170
|
+
"lifecycle.add_step_completed",
|
|
171
|
+
agent_id=agent_id,
|
|
172
|
+
step=name,
|
|
173
|
+
resource=resource,
|
|
174
|
+
**extra,
|
|
175
|
+
)
|
|
176
|
+
|
|
158
177
|
try:
|
|
178
|
+
current_step, current_resource = "role_file", str(dynamic_path)
|
|
159
179
|
dynamic_dir.mkdir(parents=True, exist_ok=True)
|
|
160
180
|
dynamic_path.write_bytes(role_bytes)
|
|
181
|
+
_step_done("role_file", resource=str(dynamic_path))
|
|
182
|
+
|
|
183
|
+
current_step, current_resource = "compile_role_doc", str(dynamic_path)
|
|
161
184
|
agent = compile_role_doc_agent(dynamic_path, team_dir, agent_id)
|
|
185
|
+
_step_done("compile_role_doc", resource=str(dynamic_path))
|
|
186
|
+
|
|
187
|
+
current_step, current_resource = "spec_yaml", str(spec_path)
|
|
162
188
|
spec.setdefault("agents", []).append(agent)
|
|
163
189
|
spec.setdefault("runtime", {}).setdefault("startup_order", []).append(agent_id)
|
|
164
190
|
validate_spec(spec, base_dir=spec_path.parent)
|
|
165
191
|
write_spec(spec_path, spec)
|
|
192
|
+
_step_done("spec_yaml", resource=str(spec_path))
|
|
193
|
+
|
|
194
|
+
current_step, current_resource = "team_state_md", "team_state.md"
|
|
166
195
|
write_team_state(workspace, spec, state)
|
|
196
|
+
_step_done("team_state_md", resource="team_state.md")
|
|
197
|
+
|
|
198
|
+
current_step, current_resource = "start_agent", agent_id
|
|
167
199
|
started = start_agent(workspace, agent_id, open_display=open_display, allow_fresh=True, team=team)
|
|
200
|
+
_step_done("start_agent", resource=agent_id, started=started)
|
|
201
|
+
|
|
202
|
+
current_step, current_resource = "workspace_state", "state.json:agents"
|
|
168
203
|
state, _refusal_after = resolve_team_scoped_state(workspace, team)
|
|
169
204
|
state["agents"][agent_id]["dynamic_role_file"] = str(dynamic_path.relative_to(workspace))
|
|
170
205
|
state["agents"][agent_id]["role_file_sha"] = role_sha
|
|
171
206
|
save_team_scoped_state(workspace, state)
|
|
172
207
|
state_path = write_team_state(workspace, spec, state)
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
208
|
+
_step_done("workspace_state", resource="state.json:agents")
|
|
209
|
+
except Exception as exc:
|
|
210
|
+
rolled_back: list[str] = []
|
|
211
|
+
rollback_errors: list[str] = []
|
|
212
|
+
try:
|
|
213
|
+
spec_path.write_text(old_spec_text, encoding="utf-8")
|
|
214
|
+
rolled_back.append("spec_yaml")
|
|
215
|
+
event_log.write("lifecycle.add_step_rolled_back", agent_id=agent_id,
|
|
216
|
+
step="spec_yaml", resource=str(spec_path))
|
|
217
|
+
except Exception as restore_exc:
|
|
218
|
+
rollback_errors.append(f"spec_yaml:{restore_exc}")
|
|
219
|
+
try:
|
|
220
|
+
save_team_scoped_state(workspace, old_state)
|
|
221
|
+
rolled_back.append("workspace_state")
|
|
222
|
+
event_log.write("lifecycle.add_step_rolled_back", agent_id=agent_id,
|
|
223
|
+
step="workspace_state", resource="state.json:agents")
|
|
224
|
+
except Exception as restore_exc:
|
|
225
|
+
rollback_errors.append(f"workspace_state:{restore_exc}")
|
|
226
|
+
try:
|
|
227
|
+
if old_team_state is None:
|
|
228
|
+
team_state_path.unlink(missing_ok=True)
|
|
229
|
+
else:
|
|
230
|
+
team_state_path.parent.mkdir(parents=True, exist_ok=True)
|
|
231
|
+
team_state_path.write_bytes(old_team_state)
|
|
232
|
+
rolled_back.append("team_state_md")
|
|
233
|
+
event_log.write("lifecycle.add_step_rolled_back", agent_id=agent_id,
|
|
234
|
+
step="team_state_md", resource=str(team_state_path))
|
|
235
|
+
except Exception as restore_exc:
|
|
236
|
+
rollback_errors.append(f"team_state_md:{restore_exc}")
|
|
237
|
+
try:
|
|
238
|
+
if old_dynamic is None:
|
|
239
|
+
dynamic_path.unlink(missing_ok=True)
|
|
240
|
+
else:
|
|
241
|
+
dynamic_path.parent.mkdir(parents=True, exist_ok=True)
|
|
242
|
+
dynamic_path.write_bytes(old_dynamic)
|
|
243
|
+
rolled_back.append("role_file")
|
|
244
|
+
event_log.write("lifecycle.add_step_rolled_back", agent_id=agent_id,
|
|
245
|
+
step="role_file", resource=str(dynamic_path))
|
|
246
|
+
except Exception as restore_exc:
|
|
247
|
+
rollback_errors.append(f"role_file:{restore_exc}")
|
|
248
|
+
event_log.write(
|
|
249
|
+
"lifecycle.add_failed",
|
|
250
|
+
agent_id=agent_id,
|
|
251
|
+
failed_step=current_step,
|
|
252
|
+
failed_resource=current_resource,
|
|
253
|
+
cleared_locations=cleared_locations,
|
|
254
|
+
rolled_back=rolled_back,
|
|
255
|
+
rollback_errors=rollback_errors,
|
|
256
|
+
reason=str(exc),
|
|
257
|
+
)
|
|
181
258
|
raise
|
|
182
|
-
event_log.write("add_agent.complete", agent_id=agent_id, role_file=str(dynamic_path), role_file_sha=role_sha, started=started)
|
|
259
|
+
event_log.write("add_agent.complete", agent_id=agent_id, role_file=str(dynamic_path), role_file_sha=role_sha, started=started, cleared_locations=cleared_locations)
|
|
183
260
|
return {
|
|
184
261
|
"ok": True,
|
|
185
262
|
"agent_id": agent_id,
|