@team-agent/installer 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,7 +13,7 @@ from pathlib import Path
13
13
  from typing import Any
14
14
 
15
15
  from team_agent.events import EventLog
16
- from team_agent.state import load_runtime_state, save_runtime_state
16
+ from team_agent.state import apply_first_time_leader_binding, derive_leader_session_uuid, leader_env_exports, load_runtime_state, save_runtime_state, save_team_scoped_state, select_runtime_state, team_state_key, validate_leader_uuid_from_targets
17
17
 
18
18
 
19
19
  def attach_leader(workspace: Path, pane: str | None = None, provider: str = "codex") -> dict[str, Any]:
@@ -63,28 +63,16 @@ def start_leader(
63
63
  confirm_attach: bool = False,
64
64
  attach_session: str | None = None,
65
65
  ) -> None:
66
- plan = leader_start_plan(
67
- provider,
68
- provider_args,
69
- workspace,
70
- attach_existing=attach_existing,
71
- confirm_attach=confirm_attach,
72
- attach_session=attach_session,
73
- )
66
+ plan = leader_start_plan(provider, provider_args, workspace, attach_existing=attach_existing, confirm_attach=confirm_attach, attach_session=attach_session)
67
+ if plan.get("leader_session_uuid_source") == "override":
68
+ EventLog(workspace).write("leader_session_uuid.override", source="explicit-override", uuid_prefix=str(plan.get("leader_session_uuid") or "")[:12], team_id=plan.get("team_id"))
74
69
  if plan["mode"] == "new_tmux_session" and not sys.stdin.isatty():
75
70
  plan = dict(plan)
76
71
  argv = list(plan["argv"])
77
72
  argv.insert(2, "-d")
78
73
  plan["argv"] = argv
79
74
  plan["detached"] = True
80
- EventLog(workspace).write(
81
- "leader.start",
82
- provider=provider,
83
- workspace=str(workspace),
84
- mode=plan["mode"],
85
- session_name=plan.get("session_name"),
86
- argv=plan["argv"],
87
- )
75
+ EventLog(workspace).write("leader.start", provider=provider, workspace=str(workspace), mode=plan["mode"], session_name=plan.get("session_name"), argv=_leader_plan_log_argv(plan), leader_session_uuid_source=plan.get("leader_session_uuid_source"), uuid_prefix=str(plan.get("leader_session_uuid") or "")[:12] or None)
88
76
  _run_leader_plan(plan, workspace)
89
77
 
90
78
 
@@ -110,33 +98,23 @@ def leader_start_plan(
110
98
  if not adapter.is_installed():
111
99
  raise RuntimeError(f"Provider {provider} command {adapter.command_name!r} not found")
112
100
  argv = [adapter.command_name, *provider_args]
101
+ identity = _leader_identity_context(workspace)
102
+ leader_env = _leader_provider_env(provider, identity)
113
103
  if attach_session:
114
104
  if not confirm_attach:
115
105
  raise RuntimeError("--attach-session requires --confirm")
116
- return {
117
- "mode": "attach_existing",
118
- "provider": provider,
119
- "workspace": str(workspace),
120
- "session_name": attach_session,
121
- "argv": ["tmux", "attach-session", "-t", attach_session],
122
- }
106
+ return {"mode": "attach_existing", "provider": provider, "workspace": str(workspace), "session_name": attach_session, "argv": ["tmux", "attach-session", "-t", attach_session]}
123
107
  if os.environ.get("TMUX"):
124
- return {"mode": "exec_provider", "provider": provider, "workspace": str(workspace), "argv": argv}
108
+ return {"mode": "exec_provider", "provider": provider, "workspace": str(workspace), "argv": argv, "env": {**os.environ, **leader_env}, **identity}
125
109
  if not shutil_which("tmux"):
126
110
  raise RuntimeError("tmux is not installed; install tmux 3.3+ or start the leader from an existing tmux pane")
127
111
  session_name = leader_session_name(provider, workspace)
128
112
  if _tmux_session_exists(session_name):
129
- return {
130
- "mode": "attach_existing",
131
- "provider": provider,
132
- "workspace": str(workspace),
133
- "session_name": session_name,
134
- "argv": ["tmux", "attach-session", "-t", session_name],
135
- }
136
- exports = ""
113
+ return {"mode": "attach_existing", "provider": provider, "workspace": str(workspace), "session_name": session_name, "argv": ["tmux", "attach-session", "-t", session_name]}
114
+ exports = " ".join(f"{key}={shlex.quote(value)}" for key, value in leader_env.items())
137
115
  if os.environ.get("PATH"):
138
- exports = f"PATH={shlex.quote(os.environ['PATH'])} "
139
- shell = f"cd {shlex.quote(str(workspace))} && {exports}exec {shlex.join(argv)}"
116
+ exports = f"{exports} PATH={shlex.quote(os.environ['PATH'])}"
117
+ shell = f"cd {shlex.quote(str(workspace))} && export {exports} && exec {shlex.join(argv)}"
140
118
  tmux_args = ["tmux", "new-session", "-s", session_name, "-n", provider, "-c", str(workspace)]
141
119
  return {
142
120
  "mode": "new_tmux_session",
@@ -144,6 +122,8 @@ def leader_start_plan(
144
122
  "workspace": str(workspace),
145
123
  "session_name": session_name,
146
124
  "argv": [*tmux_args, "sh", "-lc", shell],
125
+ "leader_env": leader_env,
126
+ **identity,
147
127
  "detached": False,
148
128
  }
149
129
 
@@ -174,7 +154,7 @@ def _run_leader_plan(plan: dict[str, Any], workspace: Path) -> None:
174
154
  try:
175
155
  if plan["mode"] == "exec_provider":
176
156
  os.chdir(workspace)
177
- proc = subprocess.Popen(plan["argv"])
157
+ proc = subprocess.Popen(plan["argv"], env=plan.get("env"))
178
158
  if plan.get("detached") and session_name:
179
159
  proc.wait()
180
160
  while _tmux_session_exists_local(str(session_name)):
@@ -206,6 +186,70 @@ def leader_session_name(provider: str, workspace: Path) -> str:
206
186
  return f"team-agent-leader-{provider}-{folder}-{digest}"
207
187
 
208
188
 
189
def _leader_identity_context(workspace: Path, team: str | None = None, state: dict[str, Any] | None = None) -> dict[str, Any]:
    """Assemble the identity fields that describe the leader of *workspace*.

    A falsy ``state`` (``None`` or empty) is replaced by the state returned
    from ``_load_identity_state``.  The session UUID is taken, in order, from
    the ``TEAM_AGENT_LEADER_SESSION_UUID_OVERRIDE`` environment variable, from
    the UUID recorded in the state, or from ``derive_leader_session_uuid``.

    Returns:
        A dict carrying the UUID, its source label, the machine fingerprint,
        the resolved workspace path, the OS user and the team id.  The source
        label is ``"override"`` only when the environment override is set;
        every other case (including a UUID read from the state) is reported
        as ``"derived"``.
    """
    if not state:
        state = _load_identity_state(workspace, team)
    team_id = team_state_key(state)
    machine = _identity_machine_fingerprint(state)
    user = _identity_os_user()
    override = os.environ.get("TEAM_AGENT_LEADER_SESSION_UUID_OVERRIDE") or ""

    if override:
        leader_uuid = override
        uuid_source = "override"
    else:
        uuid_source = "derived"
        leader_uuid = _state_leader_session_uuid(state)
        if not leader_uuid:
            # Nothing recorded yet: compute the UUID from the identity inputs.
            leader_uuid = derive_leader_session_uuid(
                machine,
                str(workspace.resolve()),
                user,
                team_id,
            )

    context: dict[str, Any] = {}
    context["leader_session_uuid"] = leader_uuid
    context["leader_session_uuid_source"] = uuid_source
    context["machine_fingerprint"] = machine
    context["workspace_abspath"] = str(workspace.resolve())
    context["os_user"] = user
    context["team_id"] = team_id
    return context
209
+
210
+
211
def _load_identity_state(workspace: Path, team: str | None) -> dict[str, Any]:
    """Return the runtime state used for identity lookups.

    ``select_runtime_state(workspace, team)`` is tried first; if it raises any
    ``Exception`` the result of ``load_runtime_state(workspace)`` is returned
    instead.
    """
    try:
        selected = select_runtime_state(workspace, team)
    except Exception:
        # Best-effort fallback: any selection failure falls back to the plain
        # workspace state rather than propagating.
        selected = load_runtime_state(workspace)
    return selected
216
+
217
+
218
+ def _identity_machine_fingerprint(state: dict[str, Any]) -> str:
219
+ for record in (state.get("team_owner"), state.get("leader_receiver")):
220
+ if isinstance(record, dict) and record.get("machine_fingerprint"):
221
+ return str(record["machine_fingerprint"])
222
+ return os.environ.get("TEAM_AGENT_MACHINE_FINGERPRINT") or os.uname().nodename
223
+
224
+
225
+ def _identity_os_user() -> str:
226
+ return os.environ.get("USER") or os.environ.get("USERNAME") or ""
227
+
228
+
229
+ def _state_leader_session_uuid(state: dict[str, Any]) -> str:
230
+ for record in (state.get("team_owner"), state.get("leader_receiver")):
231
+ if isinstance(record, dict) and record.get("leader_session_uuid"):
232
+ return str(record["leader_session_uuid"])
233
+ return ""
234
+
235
+
236
+ def _leader_provider_env(provider: str, identity: dict[str, Any]) -> dict[str, str]:
237
+ return {
238
+ "TEAM_AGENT_LEADER_PROVIDER": provider,
239
+ "TEAM_AGENT_LEADER_SESSION_UUID": str(identity["leader_session_uuid"]),
240
+ "TEAM_AGENT_MACHINE_FINGERPRINT": str(identity["machine_fingerprint"]),
241
+ "TEAM_AGENT_WORKSPACE": str(identity["workspace_abspath"]),
242
+ "TEAM_AGENT_TEAM_ID": str(identity["team_id"]),
243
+ }
244
+
245
+
246
+ def _leader_plan_log_argv(plan: dict[str, Any]) -> list[str]:
247
+ uuid_value = str(plan.get("leader_session_uuid") or "")
248
+ if not uuid_value:
249
+ return plan["argv"]
250
+ return [str(part).replace(uuid_value, f"{uuid_value[:12]}...") for part in plan["argv"]]
251
+
252
+
209
253
  def attach_leader_to_state(
210
254
  workspace: Path,
211
255
  state: dict[str, Any],
@@ -221,12 +265,17 @@ def attach_leader_to_state(
221
265
  _resolve_leader_pane,
222
266
  _target_fingerprint,
223
267
  _validate_leader_receiver,
268
+ core_list_targets,
224
269
  get_adapter,
270
+ run_cmd,
225
271
  )
226
272
  get_adapter(provider)
227
273
  pane_info, discovery = _resolve_leader_pane(pane, provider, workspace=workspace, require_current=require_current)
228
274
  inferred_provider = _leader_command_provider(pane_info.get("pane_current_command", ""))
229
275
  receiver_provider = inferred_provider or provider
276
+ identity = _leader_identity_context(workspace, state=state)
277
+ if identity.get("leader_session_uuid_source") == "override":
278
+ event_log.write("leader_session_uuid.override", source="explicit-override", uuid_prefix=str(identity.get("leader_session_uuid") or "")[:12], team_id=identity.get("team_id"))
230
279
  receiver = {
231
280
  "mode": "direct_tmux",
232
281
  "status": "attached",
@@ -242,38 +291,200 @@ def attach_leader_to_state(
242
291
  "attached_at": datetime.now(timezone.utc).isoformat(),
243
292
  "discovery": discovery,
244
293
  }
294
+ if not state.get("team_owner") and source in {"launch", "quick_start"}:
295
+ validation = apply_first_time_leader_binding(workspace, state, receiver, pane_info, identity, source)
296
+ if not validation["ok"]:
297
+ event_log.write("leader_receiver.attach_failed", target=pane or pane_info.get("pane_id"), discovery=discovery, provider=provider, reason=validation["reason"], error=validation.get("error"), source=source, first_time=True, uuid_prefix=str(identity.get("leader_session_uuid") or "")[:12])
298
+ raise RuntimeError(f"leader pane validation failed: {validation['reason']}")
299
+ _set_tmux_leader_environment(receiver, identity, event_log, run_cmd)
300
+ event_log.write("leader_receiver.attached", target=receiver["pane_id"], session_name=receiver["session_name"], window_index=receiver["window_index"], window_name=receiver["window_name"], pane_index=receiver["pane_index"], pane_tty=receiver["pane_tty"], pane_current_command=receiver["pane_current_command"], provider=receiver_provider, requested_provider=provider if receiver_provider != provider else None, discovery=discovery, source=source, first_time=True, uuid_prefix=str(identity.get("leader_session_uuid") or "")[:12], leader_session_uuid_source=identity.get("leader_session_uuid_source"))
301
+ return receiver, validation
302
+ if receiver_provider != "fake":
303
+ receiver["leader_session_uuid"] = identity["leader_session_uuid"]
245
304
  if receiver_provider != provider:
246
305
  receiver["requested_provider"] = provider
247
- validation = _validate_leader_receiver(receiver)
306
+ validation = validate_leader_uuid_from_targets(receiver, core_list_targets())
307
+ if validation["ok"]:
308
+ validation = _validate_leader_receiver(receiver)
248
309
  if not validation["ok"]:
249
- event_log.write(
250
- "leader_receiver.attach_failed",
251
- target=pane or pane_info.get("pane_id"),
252
- discovery=discovery,
253
- provider=provider,
254
- reason=validation["reason"],
255
- error=validation.get("error"),
256
- source=source,
257
- )
258
- raise RuntimeError(f"leader pane validation failed: {validation['reason']}")
310
+ event_log.write("leader_receiver.attach_failed", target=pane or pane_info.get("pane_id"), discovery=discovery, provider=provider, reason=validation["reason"], error=validation.get("error"), source=source, uuid_prefix=str(identity.get("leader_session_uuid") or "")[:12])
311
+ raise RuntimeError(_strict_leader_validation_error(validation))
259
312
  if validation.get("warning"):
260
313
  receiver["warning"] = validation["warning"]
261
314
  state["leader_receiver"] = receiver
315
+ event_log.write("leader_receiver.attached", target=receiver["pane_id"], session_name=receiver["session_name"], window_index=receiver["window_index"], window_name=receiver["window_name"], pane_index=receiver["pane_index"], pane_tty=receiver["pane_tty"], pane_current_command=receiver["pane_current_command"], provider=receiver_provider, requested_provider=provider if receiver_provider != provider else None, discovery=discovery, source=source, uuid_prefix=str(identity.get("leader_session_uuid") or "")[:12], leader_session_uuid_source=identity.get("leader_session_uuid_source"))
316
+ return receiver, validation
317
+
318
+
319
def _set_tmux_leader_environment(receiver: dict[str, Any], identity: dict[str, Any], event_log: EventLog, run_cmd: Any) -> None:
    """Seed the receiver's tmux session with the leader environment variables.

    Does nothing when *receiver* has no ``session_name``.  Otherwise each pair
    from ``leader_env_exports(receiver, identity)`` is applied with
    ``tmux set-environment -t <session>`` through *run_cmd* (5 second timeout),
    and a single ``leader_receiver.first_time_env_seeded`` event is written
    listing the keys whose command returned a non-zero exit code.
    """
    session_name = receiver.get("session_name")
    if session_name:
        failures: dict[str, str] = {}
        for env_key, env_value in leader_env_exports(receiver, identity).items():
            command = ["tmux", "set-environment", "-t", str(session_name), env_key, env_value]
            result = run_cmd(command, timeout=5)
            if result.returncode == 0:
                continue
            # Keep tmux's own message when it provided one.
            failures[env_key] = result.stderr.strip() or "tmux set-environment failed"
        event_log.write(
            "leader_receiver.first_time_env_seeded",
            pane_id=receiver.get("pane_id"),
            session_name=session_name,
            ok=not failures,
            failed_keys=sorted(failures),
        )
276
- return receiver, validation
335
+
336
+ def _strict_leader_validation_error(validation: dict[str, Any]) -> str:
337
+ return (
338
+ f"leader pane validation failed: {validation['reason']}. "
339
+ "first quick-start uses cwd+command match only; this team already has team_owner "
340
+ "so strict UUID gate applies; use team-agent takeover --confirm if you intend to take over"
341
+ )
342
+
343
+
344
def leader_identity(workspace: Path, team: str | None = None) -> dict[str, Any]:
    """Report the leader identity for *workspace* (optionally scoped to *team*).

    The state is loaded with ``_load_identity_state`` and the identity fields
    come from ``_leader_identity_context``.  Only the first 12 characters of
    the session UUID are exposed.  ``current_pane_id`` is read from
    ``TEAM_AGENT_LEADER_PANE_ID`` or ``TMUX_PANE`` (``None`` when neither is
    set), and ``last_seen_at`` from the stored ``leader_receiver`` record's
    ``attached_at`` or ``last_seen_at``.
    """
    state = _load_identity_state(workspace, team)
    identity = _leader_identity_context(workspace, team=team, state=state)

    receiver = state.get("leader_receiver")
    if not isinstance(receiver, dict):
        # Ignore a missing or malformed receiver record.
        receiver = {}

    report: dict[str, Any] = {"ok": True}
    report["uuid_prefix"] = str(identity["leader_session_uuid"])[:12]
    for field in ("machine_fingerprint", "workspace_abspath", "os_user", "team_id"):
        report[field] = identity[field]
    report["current_pane_id"] = os.environ.get("TEAM_AGENT_LEADER_PANE_ID") or os.environ.get("TMUX_PANE") or None
    report["last_seen_at"] = receiver.get("attached_at") or receiver.get("last_seen_at")
    report["source"] = identity["leader_session_uuid_source"]
    return report
359
+
360
+
361
+ def claim_leader(workspace: Path, team: str | None = None, confirm: bool = False) -> dict[str, Any]:
# Bind the calling tmux pane as leader receiver after an ambiguous-candidates incident.
# Returns a result dict: refusals carry ok=False/status="refused" plus a reason; other
# statuses produced here are "already_bound", "dry_run" (confirm is false) and "claimed".
# Raises the RuntimeError imported from team_agent.runtime when the tmux target scan fails.
# NOTE(review): indentation is lost in this rendering, so the extent of the
# `with _runtime_lock(...)` block below cannot be determined from here.
362
+ from team_agent.runtime import RuntimeError, _runtime_lock, core_list_targets
363
# The caller's pane comes from TEAM_AGENT_LEADER_PANE_ID, falling back to TMUX_PANE.
+ current_pane = os.environ.get("TEAM_AGENT_LEADER_PANE_ID") or os.environ.get("TMUX_PANE")
364
+ if not current_pane:
365
+ return {"ok": False, "status": "refused", "reason": "no_caller_pane", "action": "run from a tmux leader pane"}
366
+ with _runtime_lock(workspace, "leader_receiver"):
367
+ state = select_runtime_state(workspace, team)
368
+ event_log = EventLog(workspace)
369
# Only the most recent matching ambiguous-candidates event is considered.
+ incident = _latest_ambiguous_incident(event_log, team_state_key(state))
370
+ if not incident:
371
+ return {"ok": False, "status": "refused", "reason": "no_ambiguous_candidates"}
372
+ candidates = [str(item) for item in incident.get("candidates", [])]
373
+ if current_pane not in candidates:
374
+ return {"ok": False, "status": "refused", "reason": "caller_not_candidate", "candidates": candidates}
375
+ receiver = state.get("leader_receiver") or {}
376
+ if receiver.get("pane_id") == current_pane:
377
+ return {"ok": True, "status": "already_bound", "leader_receiver": receiver}
378
# Lost-race cases: the incident already has a claim_applied event, or the stored
# receiver is a different candidate pane that is not the incident's old_pane_id.
+ if _incident_already_claimed(event_log, str(incident.get("incident_id"))):
379
+ return _claim_lost_race(receiver)
380
+ if receiver.get("pane_id") in candidates and receiver.get("pane_id") != incident.get("old_pane_id"):
381
+ return _claim_lost_race(receiver)
382
# Without confirm, report what would be bound; nothing below this check runs.
+ if not confirm:
383
+ return {"ok": True, "status": "dry_run", "would_bind_pane_id": current_pane, "candidates": candidates}
384
+ targets = core_list_targets()
385
+ if not targets.get("ok"):
386
+ raise RuntimeError(str(targets.get("error") or "tmux target scan failed"))
387
+ target = next((item for item in targets.get("targets", []) if item.get("pane_id") == current_pane), None)
388
+ if not target:
389
+ return {"ok": False, "status": "refused", "reason": "candidate_pane_missing", "pane_id": current_pane}
390
# setdefault creates an empty team_owner record in state when none exists yet.
+ owner = state.setdefault("team_owner", {})
391
# Expected UUID: the owner's recorded UUID, else the one from _leader_identity_context.
+ expected_uuid = str(owner.get("leader_session_uuid") or _leader_identity_context(workspace, team=team, state=state)["leader_session_uuid"])
392
+ target_uuid = _target_leader_session_uuid(target)
393
+ if target_uuid != expected_uuid:
394
+ return {"ok": False, "status": "refused", "reason": "leader_session_uuid_mismatch", "uuid_prefix": expected_uuid[:12]}
395
# New epoch is one more than the owner's (or, failing that, the receiver's) epoch; 0 if neither has one.
+ epoch = int(owner.get("owner_epoch") or receiver.get("owner_epoch") or 0) + 1
396
+ owner.update({"pane_id": current_pane, "owner_epoch": epoch, "claimed_at": datetime.now(timezone.utc).isoformat(), "claimed_via": "claim-leader"})
397
+ state["leader_receiver"] = _receiver_from_claim_target(target, receiver, expected_uuid, epoch)
398
+ save_team_scoped_state(workspace, state)
399
+ losers = [pane for pane in candidates if pane != current_pane]
400
+ event_log.write(
401
+ "leader_receiver.claim_applied",
402
+ incident_id=incident.get("incident_id"),
403
+ winner_pane_id=current_pane,
404
+ losers=losers,
405
+ owner_epoch=epoch,
406
+ uuid_prefix=expected_uuid[:12],
407
+ )
408
+ # Stage 11.9 (Gap 26 Mac mini Scenario 3): result watchers that stalled while the
409
+ # broadcast was waiting for a human claim need fresh budget against the newly bound
410
+ # pane. Per-watcher leader_receiver.claim_requeue events + immediate retry.
411
+ from team_agent.message_store import MessageStore
412
+ from team_agent.messaging.result_delivery import requeue_after_claim_leader
413
+ requeued = requeue_after_claim_leader(
414
+ workspace,
415
+ MessageStore(workspace),
416
+ event_log,
417
+ team_state_key(state),
418
+ current_pane,
419
+ incident_ts=incident.get("ts"),
420
+ )
421
+ response: dict[str, Any] = {
422
+ "ok": True,
423
+ "status": "claimed",
424
+ "leader_receiver": state["leader_receiver"],
425
+ "owner_epoch": epoch,
426
+ "losers": losers,
427
+ "requeued_watchers": [item["watcher_id"] for item in requeued],
428
+ }
429
+ # Stage 13 (silent-loss arm mailbox-hint route, 2026-05-26 second roundtable):
430
+ # the framework cannot guarantee every worker message reached the leader pane during
431
+ # the ambiguous-state window (retry budgets may have exhausted before the human
432
+ # claimed). Pointing the leader agent at the inbox lets it self-recover by reading
433
+ # the messages that landed in storage but never injected to a pane.
434
+ incident_ts = incident.get("ts")
435
# The inbox hint is attached only when the incident event carries a "ts" value.
+ if incident_ts:
436
+ response["inbox_hint"] = {
437
+ "message": (
438
+ "During the previous ambiguous-leader state, some worker messages may "
439
+ "not have been auto-delivered to this pane. Run the command below to "
440
+ "retrieve them."
441
+ ),
442
+ "command": f"team-agent inbox leader --since {incident_ts}",
443
+ "since": incident_ts,
444
+ "incident_id": incident.get("incident_id"),
445
+ }
446
+ return response
447
+
448
+
449
+ def _latest_ambiguous_incident(event_log: EventLog, team_id: str) -> dict[str, Any] | None:
450
+ for event in reversed(event_log.tail(200)):
451
+ if event.get("event") != "leader_receiver.ambiguous_candidates":
452
+ continue
453
+ if event.get("team_id") in {None, team_id}:
454
+ return event
455
+ return None
456
+
457
+
458
+ def _incident_already_claimed(event_log: EventLog, incident_id: str) -> bool:
459
+ return any(event.get("event") == "leader_receiver.claim_applied" and event.get("incident_id") == incident_id for event in event_log.tail(200))
460
+
461
+
462
+ def _claim_lost_race(receiver: dict[str, Any]) -> dict[str, Any]:
463
+ return {"ok": False, "status": "refused", "reason": "owner_epoch_advanced", "error": f"team already bound to pane {receiver.get('pane_id')}; you lost the race", "bound_pane_id": receiver.get("pane_id"), "owner_epoch": receiver.get("owner_epoch")}
464
+
465
+
466
+ def _target_leader_session_uuid(target: dict[str, Any]) -> str:
467
+ env = target.get("leader_env") if isinstance(target.get("leader_env"), dict) else {}
468
+ return str(target.get("leader_session_uuid") or env.get("TEAM_AGENT_LEADER_SESSION_UUID") or "")
469
+
470
+
471
+ def _receiver_from_claim_target(target: dict[str, Any], previous: dict[str, Any], leader_uuid: str, owner_epoch: int) -> dict[str, Any]:
472
+ return {
473
+ "mode": "direct_tmux",
474
+ "status": "attached",
475
+ "provider": previous.get("provider") or "codex",
476
+ "pane_id": target["pane_id"],
477
+ "session_name": target.get("session_name"),
478
+ "window_index": str(target.get("window_index")),
479
+ "window_name": target.get("window_name"),
480
+ "pane_index": str(target.get("pane_index")),
481
+ "pane_tty": target.get("pane_tty"),
482
+ "pane_current_command": target.get("pane_current_command"),
483
+ "leader_session_uuid": leader_uuid,
484
+ "owner_epoch": owner_epoch,
485
+ "attached_at": datetime.now(timezone.utc).isoformat(),
486
+ "discovery": "claim_leader",
487
+ }
277
488
 
278
489
 
279
490
  def autobind_leader_receiver_from_env(
@@ -314,6 +525,8 @@ __all__ = [
314
525
  "attach_leader",
315
526
  "attach_leader_to_state",
316
527
  "autobind_leader_receiver_from_env",
528
+ "claim_leader",
529
+ "leader_identity",
317
530
  "leader_session_name",
318
531
  "leader_start_plan",
319
532
  "start_leader",
@@ -57,30 +57,80 @@ def remove_agent(
57
57
 
58
58
  rollback = _RemoveRollback(workspace, spec_path, spec, state, dynamic_role_file, store, agent_id, False)
59
59
  stopped: dict[str, Any] | None = None
60
+ cleared_locations: list[str] = []
61
+ current_step = "init"
62
+ current_resource: str | None = None
63
+
64
+ def _step_done(name: str, resource: str | None = None, **extra: Any) -> None:
65
+ cleared_locations.append(name)
66
+ event_log.write(
67
+ "lifecycle.remove_step_completed",
68
+ agent_id=agent_id,
69
+ step=name,
70
+ resource=resource,
71
+ **extra,
72
+ )
73
+
60
74
  try:
61
75
  if running and force:
76
+ current_step, current_resource = "stop_agent", agent_id
62
77
  stopped = runtime.stop_agent(workspace, agent_id, team=team)
63
78
  rollback.restore_running = True
64
79
  state, _refusal_after = resolve_team_scoped_state(workspace, team)
80
+ _step_done("stop_agent", resource=agent_id, stopped=stopped)
81
+
82
+ current_step, current_resource = "workspace_state", "state.json:agents"
65
83
  removed_state = copy.deepcopy(state)
66
84
  removed_state.get("agents", {}).pop(agent_id, None)
67
85
  save_team_scoped_state(workspace, removed_state)
86
+ _step_done("workspace_state", resource=current_resource)
68
87
 
88
+ current_step, current_resource = "spec_yaml", str(spec_path)
69
89
  removed_spec = copy.deepcopy(spec)
70
90
  removed_spec["agents"] = [item for item in removed_spec.get("agents", []) if item.get("id") != agent_id]
71
91
  startup_order = removed_spec.get("runtime", {}).get("startup_order")
72
92
  if isinstance(startup_order, list):
73
93
  removed_spec["runtime"]["startup_order"] = [item for item in startup_order if item != agent_id]
74
94
  validate_spec(removed_spec, base_dir=spec_path.parent)
95
+ current_step, current_resource = "team_state_md", "team_state.md"
75
96
  team_state_path = write_team_state(workspace, removed_spec, removed_state)
97
+ _step_done("team_state_md", resource=str(team_state_path))
98
+ current_step, current_resource = "spec_yaml", str(spec_path)
76
99
  write_spec(spec_path, removed_spec)
100
+ _step_done("spec_yaml", resource=str(spec_path))
77
101
 
102
+ current_step, current_resource = "role_file", str(dynamic_role_file)
78
103
  role_file_removed = _remove_dynamic_role_file(dynamic_role_file, bool(agent_state.get("dynamic_role_file")))
104
+ if role_file_removed:
105
+ _step_done("role_file", resource=str(dynamic_role_file))
106
+
107
+ current_step, current_resource = "agent_health", agent_id
79
108
  _delete_agent_health(store, agent_id)
109
+ _step_done("agent_health", resource=agent_id)
80
110
  except Exception as exc:
81
111
  rollback_result = rollback.restore(runtime, event_log)
82
- event_log.write("remove_agent.rollback", agent_id=agent_id, ok=rollback_result["ok"], error=str(exc), rollback=rollback_result)
83
- raise RuntimeError(f"remove-agent failed for {agent_id}: {exc}; rollback_ok={rollback_result['ok']}") from exc
112
+ event_log.write(
113
+ "remove_agent.rollback",
114
+ agent_id=agent_id,
115
+ ok=rollback_result["ok"],
116
+ error=str(exc),
117
+ failed_step=current_step,
118
+ resource=current_resource,
119
+ cleared_before_failure=cleared_locations,
120
+ rollback=rollback_result,
121
+ )
122
+ event_log.write(
123
+ "lifecycle.remove_rolled_back",
124
+ agent_id=agent_id,
125
+ ok=rollback_result["ok"],
126
+ failed_step=current_step,
127
+ resource=current_resource,
128
+ rollback_errors=rollback_result.get("errors", []),
129
+ )
130
+ raise RuntimeError(
131
+ f"remove-agent failed for {agent_id} at step={current_step} "
132
+ f"resource={current_resource}: {exc}; rollback_ok={rollback_result['ok']}"
133
+ ) from exc
84
134
 
85
135
  runtime._save_team_runtime_snapshot(workspace, removed_state)
86
136
  warning = None
@@ -93,6 +143,7 @@ def remove_agent(
93
143
  force=force,
94
144
  stopped=stopped,
95
145
  role_file_removed=role_file_removed,
146
+ cleared_locations=cleared_locations,
96
147
  )
97
148
  except Exception as exc:
98
149
  warning = f"remove-agent completed but success event logging failed: {exc}"
@@ -105,6 +156,7 @@ def remove_agent(
105
156
  "stopped": stopped,
106
157
  "state_file": str(team_state_path),
107
158
  "role_file_removed": role_file_removed,
159
+ "cleared_locations": cleared_locations,
108
160
  **({"warning": warning} if warning else {}),
109
161
  }
110
162
 
@@ -154,32 +154,109 @@ def add_agent(workspace: Path, agent_id: str, *, role_file_path: str, open_displ
154
154
  old_spec_text = spec_path.read_text(encoding="utf-8")
155
155
  old_state = copy.deepcopy(state)
156
156
  old_dynamic = dynamic_path.read_bytes() if dynamic_path.exists() else None
157
+ # Stage 11.11 (Gap 15 follow-up): snapshot team_state.md BEFORE any write so the rollback
158
+ # path can restore it byte-equal. Pre-Stage-11.11 the rollback handler omitted this file
159
+ # and Mac mini Scenario 6 left orphan entries after induced add-agent failures.
160
+ team_state_path = workspace / spec.get("context", {}).get("state_file", "team_state.md")
161
+ old_team_state = team_state_path.read_bytes() if team_state_path.exists() else None
157
162
  event_log = EventLog(workspace)
163
+ cleared_locations: list[str] = []
164
+ current_step = "init"
165
+ current_resource: str | None = None
166
+
167
+ def _step_done(name: str, resource: str | None = None, **extra: Any) -> None:
168
+ cleared_locations.append(name)
169
+ event_log.write(
170
+ "lifecycle.add_step_completed",
171
+ agent_id=agent_id,
172
+ step=name,
173
+ resource=resource,
174
+ **extra,
175
+ )
176
+
158
177
  try:
178
+ current_step, current_resource = "role_file", str(dynamic_path)
159
179
  dynamic_dir.mkdir(parents=True, exist_ok=True)
160
180
  dynamic_path.write_bytes(role_bytes)
181
+ _step_done("role_file", resource=str(dynamic_path))
182
+
183
+ current_step, current_resource = "compile_role_doc", str(dynamic_path)
161
184
  agent = compile_role_doc_agent(dynamic_path, team_dir, agent_id)
185
+ _step_done("compile_role_doc", resource=str(dynamic_path))
186
+
187
+ current_step, current_resource = "spec_yaml", str(spec_path)
162
188
  spec.setdefault("agents", []).append(agent)
163
189
  spec.setdefault("runtime", {}).setdefault("startup_order", []).append(agent_id)
164
190
  validate_spec(spec, base_dir=spec_path.parent)
165
191
  write_spec(spec_path, spec)
192
+ _step_done("spec_yaml", resource=str(spec_path))
193
+
194
+ current_step, current_resource = "team_state_md", "team_state.md"
166
195
  write_team_state(workspace, spec, state)
196
+ _step_done("team_state_md", resource="team_state.md")
197
+
198
+ current_step, current_resource = "start_agent", agent_id
167
199
  started = start_agent(workspace, agent_id, open_display=open_display, allow_fresh=True, team=team)
200
+ _step_done("start_agent", resource=agent_id, started=started)
201
+
202
+ current_step, current_resource = "workspace_state", "state.json:agents"
168
203
  state, _refusal_after = resolve_team_scoped_state(workspace, team)
169
204
  state["agents"][agent_id]["dynamic_role_file"] = str(dynamic_path.relative_to(workspace))
170
205
  state["agents"][agent_id]["role_file_sha"] = role_sha
171
206
  save_team_scoped_state(workspace, state)
172
207
  state_path = write_team_state(workspace, spec, state)
173
- except Exception:
174
- spec_path.write_text(old_spec_text, encoding="utf-8")
175
- save_team_scoped_state(workspace, old_state)
176
- if old_dynamic is None:
177
- dynamic_path.unlink(missing_ok=True)
178
- else:
179
- dynamic_path.parent.mkdir(parents=True, exist_ok=True)
180
- dynamic_path.write_bytes(old_dynamic)
208
+ _step_done("workspace_state", resource="state.json:agents")
209
+ except Exception as exc:
210
+ rolled_back: list[str] = []
211
+ rollback_errors: list[str] = []
212
+ try:
213
+ spec_path.write_text(old_spec_text, encoding="utf-8")
214
+ rolled_back.append("spec_yaml")
215
+ event_log.write("lifecycle.add_step_rolled_back", agent_id=agent_id,
216
+ step="spec_yaml", resource=str(spec_path))
217
+ except Exception as restore_exc:
218
+ rollback_errors.append(f"spec_yaml:{restore_exc}")
219
+ try:
220
+ save_team_scoped_state(workspace, old_state)
221
+ rolled_back.append("workspace_state")
222
+ event_log.write("lifecycle.add_step_rolled_back", agent_id=agent_id,
223
+ step="workspace_state", resource="state.json:agents")
224
+ except Exception as restore_exc:
225
+ rollback_errors.append(f"workspace_state:{restore_exc}")
226
+ try:
227
+ if old_team_state is None:
228
+ team_state_path.unlink(missing_ok=True)
229
+ else:
230
+ team_state_path.parent.mkdir(parents=True, exist_ok=True)
231
+ team_state_path.write_bytes(old_team_state)
232
+ rolled_back.append("team_state_md")
233
+ event_log.write("lifecycle.add_step_rolled_back", agent_id=agent_id,
234
+ step="team_state_md", resource=str(team_state_path))
235
+ except Exception as restore_exc:
236
+ rollback_errors.append(f"team_state_md:{restore_exc}")
237
+ try:
238
+ if old_dynamic is None:
239
+ dynamic_path.unlink(missing_ok=True)
240
+ else:
241
+ dynamic_path.parent.mkdir(parents=True, exist_ok=True)
242
+ dynamic_path.write_bytes(old_dynamic)
243
+ rolled_back.append("role_file")
244
+ event_log.write("lifecycle.add_step_rolled_back", agent_id=agent_id,
245
+ step="role_file", resource=str(dynamic_path))
246
+ except Exception as restore_exc:
247
+ rollback_errors.append(f"role_file:{restore_exc}")
248
+ event_log.write(
249
+ "lifecycle.add_failed",
250
+ agent_id=agent_id,
251
+ failed_step=current_step,
252
+ failed_resource=current_resource,
253
+ cleared_locations=cleared_locations,
254
+ rolled_back=rolled_back,
255
+ rollback_errors=rollback_errors,
256
+ reason=str(exc),
257
+ )
181
258
  raise
182
- event_log.write("add_agent.complete", agent_id=agent_id, role_file=str(dynamic_path), role_file_sha=role_sha, started=started)
259
+ event_log.write("add_agent.complete", agent_id=agent_id, role_file=str(dynamic_path), role_file_sha=role_sha, started=started, cleared_locations=cleared_locations)
183
260
  return {
184
261
  "ok": True,
185
262
  "agent_id": agent_id,