@team-agent/installer 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. package/package.json +1 -1
  2. package/src/team_agent/cli/__init__.py +2 -0
  3. package/src/team_agent/cli/commands.py +22 -3
  4. package/src/team_agent/cli/parser.py +40 -1
  5. package/src/team_agent/coordinator/__main__.py +21 -2
  6. package/src/team_agent/coordinator/lifecycle.py +23 -0
  7. package/src/team_agent/diagnose/orphan_cleanup.py +193 -0
  8. package/src/team_agent/events.py +47 -0
  9. package/src/team_agent/leader/__init__.py +273 -60
  10. package/src/team_agent/lifecycle/agents.py +54 -2
  11. package/src/team_agent/lifecycle/operations.py +86 -9
  12. package/src/team_agent/lifecycle/paste_buffer_hygiene.py +39 -0
  13. package/src/team_agent/lifecycle/start.py +3 -0
  14. package/src/team_agent/message_store/leader_notification_log.py +132 -0
  15. package/src/team_agent/message_store/result_watchers.py +144 -1
  16. package/src/team_agent/message_store/schema.py +23 -0
  17. package/src/team_agent/messaging/delivery.py +10 -0
  18. package/src/team_agent/messaging/idle_alerts.py +227 -21
  19. package/src/team_agent/messaging/leader.py +166 -6
  20. package/src/team_agent/messaging/leader_panes.py +193 -23
  21. package/src/team_agent/messaging/owner_bypass.py +29 -0
  22. package/src/team_agent/messaging/result_delivery.py +219 -4
  23. package/src/team_agent/messaging/results.py +12 -21
  24. package/src/team_agent/messaging/scheduler.py +22 -2
  25. package/src/team_agent/messaging/send.py +9 -2
  26. package/src/team_agent/messaging/session_drift.py +94 -0
  27. package/src/team_agent/runtime.py +22 -14
  28. package/src/team_agent/rust_core.py +157 -3
  29. package/src/team_agent/state.py +167 -10
  30. package/src/team_agent/status/inbox.py +33 -3
@@ -1,5 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import hashlib
4
+
3
5
  from team_agent.messaging.deps import (
4
6
  EventLog,
5
7
  RuntimeError,
@@ -9,6 +11,7 @@ from team_agent.messaging.deps import (
9
11
  _tmux_current_client_pane_info as _runtime_tmux_current_client_pane_info,
10
12
  _tmux_list_panes as _runtime_tmux_list_panes,
11
13
  _tmux_pane_info as _runtime_tmux_pane_info,
14
+ _tmux_inject_text,
12
15
  core_list_targets,
13
16
  datetime,
14
17
  os,
@@ -20,6 +23,8 @@ from team_agent.messaging.deps import (
20
23
  from pathlib import Path
21
24
  from typing import Any
22
25
 
26
+ _AMBIGUOUS_DEBOUNCE_SECONDS = 60
27
+
23
28
  def _resolve_leader_pane(
24
29
  pane: str | None,
25
30
  provider: str,
@@ -208,17 +213,40 @@ def _target_fingerprint(pane_info: dict[str, Any]) -> str:
208
213
  )
209
214
 
210
215
 
216
def is_bound_pane_still_valid(state: dict[str, Any], store: Any | None = None) -> dict[str, Any]:
    """Validate the leader receiver recorded in ``state``.

    A copy of ``state["leader_receiver"]`` is validated; when that copy has no
    ``leader_session_uuid`` but ``state["team_owner"]`` is a dict carrying one,
    the owner's value is filled in first so the validator can compare it.

    ``store`` is accepted for interface compatibility and is not used here.
    Returns whatever ``_validate_leader_receiver`` returns.
    """
    bound = dict(state.get("leader_receiver") or {})
    team_owner = state.get("team_owner")
    owner_uuid = team_owner.get("leader_session_uuid") if isinstance(team_owner, dict) else None
    if owner_uuid and not bound.get("leader_session_uuid"):
        bound["leader_session_uuid"] = owner_uuid
    return _validate_leader_receiver(bound)
222
+
223
+
211
224
  def _rediscover_leader_receiver(
212
225
  receiver: dict[str, Any],
213
226
  event_log: EventLog,
214
227
  owner_identity: dict[str, Any] | None = None,
228
+ invalidation_reason: str | None = None,
229
+ team_id: str | None = None,
215
230
  ) -> dict[str, Any]:
216
231
  provider = str(receiver.get("provider") or "codex")
217
- if provider != "codex":
218
- return {"status": "missing", "reason": "rediscovery_only_for_codex"}
232
+ if provider == "fake":
233
+ return {"status": "missing", "reason": "rediscovery_not_supported_for_fake"}
219
234
  targets = core_list_targets()
220
235
  if not targets.get("ok"):
221
236
  event_log.write("leader_receiver.rediscover_failed", provider=provider, error=targets.get("error"))
237
+ # Stage 15 CI fix: when the tmux target scan itself fails (no server, no daemon,
238
+ # CI env without tmux), the caller has no way to recover unless we also emit
239
+ # rebind_required. Without this, _refresh_leader_receiver_or_flag_rebind silently
240
+ # returns and report_result queues against the stale pane with zero audit signal.
241
+ event_log.write(
242
+ "leader_receiver.rebind_required",
243
+ old_pane_id=receiver.get("pane_id"),
244
+ reason=invalidation_reason,
245
+ provider=provider,
246
+ team_id=team_id,
247
+ rediscovery_status="failed",
248
+ error=targets.get("error"),
249
+ )
222
250
  return {"status": "failed", "error": targets.get("error")}
223
251
  candidates = [
224
252
  target
@@ -228,16 +256,26 @@ def _rediscover_leader_receiver(
228
256
  if owner_identity:
229
257
  owner_candidates = [target for target in candidates if _target_matches_owner_identity(target, owner_identity)]
230
258
  if len(owner_candidates) == 1:
231
- return _rediscovered_receiver(receiver, provider, owner_candidates[0], event_log, owner_identity)
259
+ return _rediscovered_receiver(receiver, provider, owner_candidates[0], event_log, owner_identity, invalidation_reason)
232
260
  if len(owner_candidates) > 1:
261
+ incident = _broadcast_ambiguous_candidates(
262
+ receiver,
263
+ provider,
264
+ owner_candidates,
265
+ event_log,
266
+ owner_identity,
267
+ team_id,
268
+ )
233
269
  event_log.write(
234
270
  "leader_receiver.rediscover_ambiguous",
235
271
  provider=provider,
236
272
  old_target=receiver.get("pane_id"),
237
273
  candidates=[target.get("pane_id") for target in owner_candidates],
238
274
  owner_identity=owner_identity,
275
+ incident_id=incident.get("incident_id"),
276
+ deduped=incident.get("deduped"),
239
277
  )
240
- return {"status": "ambiguous", "candidates": owner_candidates, "owner_identity": owner_identity}
278
+ return {"status": "ambiguous", "candidates": owner_candidates, "owner_identity": owner_identity, **incident}
241
279
  event_log.write(
242
280
  "leader_receiver.rediscover_missing",
243
281
  provider=provider,
@@ -245,9 +283,19 @@ def _rediscover_leader_receiver(
245
283
  owner_identity=owner_identity,
246
284
  candidate_count=len(candidates),
247
285
  )
286
+ event_log.write(
287
+ "leader_receiver.rebind_required",
288
+ old_pane_id=receiver.get("pane_id"),
289
+ reason=invalidation_reason,
290
+ provider=provider,
291
+ team_id=team_id,
292
+ uuid_prefix=_uuid_prefix(owner_identity),
293
+ owner_identity=owner_identity,
294
+ recovery_action="open the owning leader pane or run team-agent claim-leader --confirm from a matching pane",
295
+ )
248
296
  return {"status": "missing", "owner_identity": owner_identity}
249
297
  if len(candidates) == 1:
250
- return _rediscovered_receiver(receiver, provider, candidates[0], event_log, None)
298
+ return _rediscovered_receiver(receiver, provider, candidates[0], event_log, None, invalidation_reason)
251
299
  if len(candidates) > 1:
252
300
  event_log.write(
253
301
  "leader_receiver.rediscover_ambiguous",
@@ -255,12 +303,19 @@ def _rediscover_leader_receiver(
255
303
  old_target=receiver.get("pane_id"),
256
304
  candidates=[target.get("pane_id") for target in candidates],
257
305
  )
306
+ event_log.write("leader_receiver.rebind_required", old_pane_id=receiver.get("pane_id"), reason=invalidation_reason, provider=provider, team_id=team_id, rediscovery_status="ambiguous")
258
307
  return {"status": "ambiguous", "candidates": candidates}
259
308
  event_log.write("leader_receiver.rediscover_missing", provider=provider, old_target=receiver.get("pane_id"))
309
+ event_log.write("leader_receiver.rebind_required", old_pane_id=receiver.get("pane_id"), reason=invalidation_reason, provider=provider, team_id=team_id, rediscovery_status="missing")
260
310
  return {"status": "missing"}
261
311
 
262
312
 
263
313
  def _target_matches_owner_identity(target: dict[str, Any], owner_identity: dict[str, Any]) -> bool:
314
+ expected_uuid = owner_identity.get("leader_session_uuid")
315
+ if expected_uuid:
316
+ actual_uuid = _target_leader_session_uuid(target)
317
+ if actual_uuid:
318
+ return actual_uuid == expected_uuid
264
319
  env = target.get("leader_env") if isinstance(target.get("leader_env"), dict) else {}
265
320
  return (
266
321
  env.get("TEAM_AGENT_LEADER_PANE_ID") == (owner_identity.get("pane_id") or "")
@@ -269,14 +324,31 @@ def _target_matches_owner_identity(target: dict[str, Any], owner_identity: dict[
269
324
  )
270
325
 
271
326
 
272
- def _rediscovered_receiver(
273
- receiver: dict[str, Any],
274
- provider: str,
275
- target: dict[str, Any],
276
- event_log: EventLog,
277
- owner_identity: dict[str, Any] | None,
278
- ) -> dict[str, Any]:
279
- updated = {
327
+ def _target_leader_session_uuid(target: dict[str, Any]) -> str:
328
+ env = target.get("leader_env") if isinstance(target.get("leader_env"), dict) else {}
329
+ return str(target.get("leader_session_uuid") or env.get("TEAM_AGENT_LEADER_SESSION_UUID") or "")
330
+
331
+
332
def _leader_uuid_for_bound_pane(receiver: dict[str, Any], pane_info: dict[str, Any]) -> str:
    """Find the leader session UUID for the pane a receiver is bound to.

    Lookup order: the UUID on ``pane_info``, then the one on ``receiver``.
    If neither carries one, the target list from ``core_list_targets()`` is
    scanned for the entry whose ``pane_id`` equals ``pane_info["pane_id"]``.
    Returns ``""`` when the scan fails or no entry matches.
    """
    for source in (pane_info, receiver):
        known = _target_leader_session_uuid(source)
        if known:
            return known
    listing = core_list_targets()
    if not listing.get("ok"):
        return ""
    wanted = pane_info.get("pane_id")
    match = next(
        (entry for entry in listing.get("targets", []) if entry.get("pane_id") == wanted),
        None,
    )
    if match is None:
        return ""
    return _target_leader_session_uuid(match)
344
+
345
+
346
+ def _uuid_prefix(owner_identity: dict[str, Any] | None) -> str:
347
+ return str((owner_identity or {}).get("leader_session_uuid") or "")[:8]
348
+
349
+
350
+ def _receiver_from_target(target: dict[str, Any], provider: str, leader_uuid: str | None, owner_epoch: int | None = None) -> dict[str, Any]:
351
+ receiver = {
280
352
  "mode": "direct_tmux",
281
353
  "status": "attached",
282
354
  "provider": provider,
@@ -289,8 +361,83 @@ def _rediscovered_receiver(
289
361
  "pane_current_command": target["pane_current_command"],
290
362
  "fingerprint": target.get("fingerprint") or _target_fingerprint(target),
291
363
  "attached_at": datetime.now(timezone.utc).isoformat(),
292
- "discovery": "stale_rediscovery_owner_identity" if owner_identity else "stale_rediscovery_unique_candidate",
293
364
  }
365
+ if leader_uuid:
366
+ receiver["leader_session_uuid"] = leader_uuid
367
+ if owner_epoch is not None:
368
+ receiver["owner_epoch"] = owner_epoch
369
+ return receiver
370
+
371
+
372
def _broadcast_ambiguous_candidates(
    receiver: dict[str, Any],
    provider: str,
    candidates: list[dict[str, Any]],
    event_log: EventLog,
    owner_identity: dict[str, Any],
    team_id: str | None,
) -> dict[str, Any]:
    """Prompt every ambiguous candidate pane to claim leadership, at most once per bucket.

    An ``incident_id`` is derived from a SHA-256 over the team id, the sorted
    candidate pane ids and the current debounce bucket (truncated to 16 hex
    chars). If the last 200 events already contain a
    ``leader_receiver.ambiguous_candidates`` entry with that id, nothing is
    injected and ``{"incident_id": ..., "deduped": True}`` is returned.

    Otherwise one ``leader_receiver.ambiguous_candidates`` event is written,
    the claim prompt is injected into each candidate pane, and the outcome of
    each injection is logged as ``leader_receiver.ambiguous_candidate_queued``.
    Returns ``{"incident_id": ..., "deduped": False}``.
    """
    pane_ids = sorted(str(item.get("pane_id")) for item in candidates)
    window = _ambiguous_debounce_bucket()
    digest_input = "\0".join([str(team_id or ""), *pane_ids, window])
    incident_id = hashlib.sha256(digest_input.encode("utf-8")).hexdigest()[:16]

    # Debounce: an identical incident already announced in the recent log is not repeated.
    for logged in event_log.tail(200):
        if (
            logged.get("event") == "leader_receiver.ambiguous_candidates"
            and logged.get("incident_id") == incident_id
        ):
            return {"incident_id": incident_id, "deduped": True}

    prompt = _ambiguous_candidate_prompt(team_id, len(candidates))
    event_log.write(
        "leader_receiver.ambiguous_candidates",
        incident_id=incident_id,
        old_pane_id=receiver.get("pane_id"),
        candidates=pane_ids,
        provider=provider,
        team_id=team_id,
        uuid_prefix=_uuid_prefix(owner_identity),
        debounce_bucket=window,
    )
    for item in candidates:
        pane_id = str(item.get("pane_id") or "")
        # Fourth argument: a per-incident, per-pane label with the pane's leading '%' stripped.
        label = f"team-agent-leader-ambiguous-{incident_id}-{pane_id.strip('%')}"
        outcome = _tmux_inject_text(pane_id, prompt, "Enter", label, provider=provider)
        event_log.write(
            "leader_receiver.ambiguous_candidate_queued",
            incident_id=incident_id,
            pane_id=pane_id,
            ok=bool(outcome.get("ok")),
            error=outcome.get("error"),
        )
    return {"incident_id": incident_id, "deduped": False}
413
+
414
+
415
def _ambiguous_debounce_bucket() -> str:
    """Return the start of the current debounce window as a UTC ISO-8601 string.

    The current UTC timestamp is floored to a multiple of
    ``_AMBIGUOUS_DEBOUNCE_SECONDS``, so every call inside the same window
    yields the same string.
    """
    window = _AMBIGUOUS_DEBOUNCE_SECONDS
    seconds_now = datetime.now(timezone.utc).timestamp()
    window_start = int(seconds_now // window) * window
    return datetime.fromtimestamp(window_start, timezone.utc).isoformat()
419
+
420
+
421
+ def _ambiguous_candidate_prompt(team_id: str | None, candidate_count: int) -> str:
422
+ others = max(candidate_count - 1, 0)
423
+ return (
424
+ f"Team `{team_id or 'current'}` has no bound leader. This window and {others} other window(s) all qualify. "
425
+ "To claim this window as the team leader, run: `team-agent claim-leader --confirm`. "
426
+ "Only the first such call wins; subsequent calls from other windows will be refused."
427
+ )
428
+
429
+
430
+ def _rediscovered_receiver(
431
+ receiver: dict[str, Any],
432
+ provider: str,
433
+ target: dict[str, Any],
434
+ event_log: EventLog,
435
+ owner_identity: dict[str, Any] | None,
436
+ invalidation_reason: str | None = None,
437
+ ) -> dict[str, Any]:
438
+ leader_uuid = _target_leader_session_uuid(target) or (owner_identity or {}).get("leader_session_uuid") or receiver.get("leader_session_uuid")
439
+ updated = _receiver_from_target(target, provider, leader_uuid)
440
+ updated["discovery"] = "stale_rediscovery_owner_identity" if owner_identity else "stale_rediscovery_unique_candidate"
294
441
  event_log.write(
295
442
  "leader_receiver.rediscovered",
296
443
  provider=provider,
@@ -299,6 +446,14 @@ def _rediscovered_receiver(
299
446
  candidate_count=1,
300
447
  owner_identity=owner_identity,
301
448
  )
449
+ event_log.write(
450
+ "leader_receiver.rebind_applied",
451
+ old_pane_id=receiver.get("pane_id"),
452
+ new_pane_id=updated["pane_id"],
453
+ reason=invalidation_reason,
454
+ owner_identity=owner_identity,
455
+ uuid_prefix=_uuid_prefix(owner_identity),
456
+ )
302
457
  return {"status": "updated", "receiver": updated, "owner_identity": owner_identity}
303
458
 
304
459
 
@@ -306,6 +461,26 @@ def _validate_leader_receiver(receiver: dict[str, Any]) -> dict[str, Any]:
306
461
  pane_info = _runtime_tmux_pane_info(receiver.get("pane_id"))
307
462
  if not pane_info:
308
463
  return {"ok": False, "reason": "leader_pane_missing", "error": "tmux pane does not exist"}
464
+ provider = str(receiver.get("provider") or "codex")
465
+ if not _leader_command_looks_usable(pane_info.get("pane_current_command", ""), provider):
466
+ return {
467
+ "ok": False,
468
+ "reason": "leader_pane_wrong_command",
469
+ "error": f"pane command {pane_info.get('pane_current_command')!r} is not a leader host",
470
+ "pane": pane_info,
471
+ }
472
+ expected_uuid = receiver.get("leader_session_uuid")
473
+ if expected_uuid:
474
+ actual_uuid = _leader_uuid_for_bound_pane(receiver, pane_info)
475
+ if not actual_uuid:
476
+ return {"ok": False, "reason": "leader_uuid_missing", "error": "bound pane has no TEAM_AGENT_LEADER_SESSION_UUID", "pane": pane_info}
477
+ if actual_uuid != expected_uuid:
478
+ return {
479
+ "ok": False,
480
+ "reason": "leader_uuid_mismatch",
481
+ "error": "bound pane TEAM_AGENT_LEADER_SESSION_UUID does not match stored team owner",
482
+ "pane": pane_info,
483
+ }
309
484
  capture = run_cmd(["tmux", "capture-pane", "-p", "-S", "-40", "-t", pane_info["pane_id"]], timeout=5)
310
485
  if capture.returncode != 0:
311
486
  return {
@@ -314,14 +489,7 @@ def _validate_leader_receiver(receiver: dict[str, Any]) -> dict[str, Any]:
314
489
  "error": capture.stderr.strip() or "tmux capture-pane failed",
315
490
  "pane": pane_info,
316
491
  }
317
- warning = None
318
- provider = str(receiver.get("provider") or "codex")
319
- if not _leader_command_looks_usable(pane_info.get("pane_current_command", ""), provider):
320
- warning = (
321
- f"pane command {pane_info.get('pane_current_command')!r} is not a typical {provider} host; "
322
- "continuing because tmux capture works"
323
- )
324
- return {"ok": True, "pane": pane_info, "capture": capture.stdout, "warning": warning}
492
+ return {"ok": True, "pane": pane_info, "capture": capture.stdout, "warning": None}
325
493
 
326
494
 
327
495
  def _leader_command_looks_usable(command: str, provider: str) -> bool:
@@ -330,7 +498,9 @@ def _leader_command_looks_usable(command: str, provider: str) -> bool:
330
498
  command_name = Path(command).name
331
499
  if provider == "codex":
332
500
  return command_name in {"codex", "node", "nodejs"}
333
- return bool(command_name)
501
+ if provider in {"claude", "claude_code"}:
502
+ return command_name in {"claude", "claude.exe"}
503
+ return command_name in {"codex", "node", "nodejs", "claude", "claude.exe"}
334
504
 
335
505
 
336
506
  def _choose_leader_submit_key(provider: str, capture_text: str) -> tuple[str, str]:
@@ -0,0 +1,29 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from team_agent.events import EventLog
6
+ from team_agent.state import worker_sender_bypasses_owner_gate
7
+
8
+
9
def apply_worker_sender_bypass(
    state: dict[str, Any],
    sender: str | None,
    target: Any,
    task_id: str | None,
    event_log: EventLog,
) -> bool:
    """Report whether ``sender`` bypasses the owner gate, logging when it does.

    Delegates the decision to ``worker_sender_bypasses_owner_gate``. On a
    truthy result a ``send.bypassed_owner_gate_worker_sender`` event is
    written (the result is recorded as ``env_team_agent_id``; ``target`` is
    recorded only when it is a string) and ``True`` is returned. Otherwise
    nothing is logged and ``False`` is returned.
    """
    bypass_agent_id = worker_sender_bypasses_owner_gate(state, sender)
    if bypass_agent_id:
        logged_target = target if isinstance(target, str) else None
        event_log.write(
            "send.bypassed_owner_gate_worker_sender",
            sender=sender,
            env_team_agent_id=bypass_agent_id,
            target=logged_target,
            task_id=task_id,
        )
        return True
    return False
27
+
28
+
29
+ __all__ = ["apply_worker_sender_bypass"]
@@ -1,11 +1,14 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import json
4
+ from datetime import datetime, timezone
4
5
  from pathlib import Path
5
6
  from typing import Any
6
7
 
7
8
  from team_agent.events import EventLog
8
9
  from team_agent.message_store import MessageStore
10
+ from team_agent.message_store.leader_notification_log import peek_leader_notification
11
+ from team_agent.message_store.result_watchers import leader_notified_message_id_for_result
9
12
  from team_agent.messaging.deps import send_message
10
13
  from team_agent.messaging.internal_delivery import deliver_stored_message
11
14
 
@@ -22,7 +25,13 @@ def retry_result_deliveries(workspace: Path, event_log: EventLog) -> list[dict[s
22
25
  row = store.result_by_id(str(watcher["result_id"]))
23
26
  if not row:
24
27
  continue
25
- notified.extend(notify_result_watchers(workspace, _result_entry_from_row(row), event_log, watchers=[watcher]))
28
+ notified.extend(notify_result_watchers(
29
+ workspace,
30
+ _result_entry_from_row(row),
31
+ event_log,
32
+ watchers=[watcher],
33
+ dedupe_reason="rebind_retry",
34
+ ))
26
35
  return notified
27
36
 
28
37
 
@@ -31,6 +40,7 @@ def notify_result_watchers(
31
40
  result: dict[str, Any],
32
41
  event_log: EventLog,
33
42
  watchers: list[dict[str, Any]] | None = None,
43
+ dedupe_reason: str | None = None,
34
44
  ) -> list[dict[str, Any]]:
35
45
  store = MessageStore(workspace)
36
46
  candidates = [
@@ -67,9 +77,44 @@ def notify_result_watchers(
67
77
  }
68
78
  )
69
79
  attempts = result_delivery_attempts(event_log, primary["watcher_id"], str(result.get("result_id") or ""))
80
+ # Stage 12 (Gap 26 ∩ Gap 32 roundtable consolidation 2026-05-26): exactly-once dedupe
81
+ # lives in leader_notification_log keyed by (result_id, leader_session_uuid) and is
82
+ # consulted atomically at the injection boundary inside _send_to_leader_receiver. Here
83
+ # we add a read-only fast-path peek so concurrent notify_result_watchers calls for the
84
+ # same result short-circuit without spinning up a deliver_stored_message round-trip.
85
+ # The peek is NOT the dedupe primitive — the atomic INSERT OR IGNORE at injection is.
86
+ result_id_str = str(result.get("result_id") or "") or None
87
+ if result_id_str:
88
+ leader_uuid = _resolve_leader_session_uuid(workspace, primary.get("owner_team_id"))
89
+ if leader_uuid:
90
+ prior = peek_leader_notification(
91
+ store, result_id=result_id_str, leader_session_uuid=leader_uuid,
92
+ )
93
+ if prior:
94
+ notified.append(_mark_watcher_dedupe_skip(
95
+ store, event_log, primary, result, attempts,
96
+ prior["notified_message_id"],
97
+ dedupe_reason or "injection_log_already_notified",
98
+ notified_at=prior.get("notified_at"),
99
+ leader_session_uuid=leader_uuid,
100
+ ))
101
+ return notified
102
+ # Legacy compat: watcher.notified_message_id set by a prior path (Gap 32 reversal of
103
+ # 78055bc, or any pre-Stage-12 code) also blocks redelivery. This preserves the
104
+ # Stage 11.9-11.12 era contract while the new gate (leader_notification_log) is the
105
+ # authoritative dedupe primitive going forward.
106
+ legacy_canonical = leader_notified_message_id_for_result(
107
+ store, primary.get("owner_team_id"), result_id_str,
108
+ )
109
+ if legacy_canonical:
110
+ notified.append(_mark_watcher_dedupe_skip(
111
+ store, event_log, primary, result, attempts,
112
+ legacy_canonical,
113
+ dedupe_reason or "rebind_retry",
114
+ ))
115
+ return notified
70
116
  existing = delivered_result_message(
71
- store,
72
- str(result.get("result_id") or ""),
117
+ store, str(result.get("result_id") or ""),
73
118
  task_id=result.get("task_id"),
74
119
  owner_team_id=primary.get("owner_team_id"),
75
120
  )
@@ -83,6 +128,75 @@ def notify_result_watchers(
83
128
  return notified
84
129
 
85
130
 
131
+ def _resolve_leader_session_uuid(workspace: Path, owner_team_id: str | None) -> str | None:
132
+ """Helper: read the team's leader_session_uuid from runtime state for gate lookups."""
133
+ try:
134
+ from team_agent.messaging.deps import load_runtime_state, team_state_key
135
+ state = load_runtime_state(workspace)
136
+ if owner_team_id and isinstance(state.get("teams"), dict):
137
+ scoped = state["teams"].get(owner_team_id)
138
+ if isinstance(scoped, dict):
139
+ state = scoped
140
+ elif owner_team_id and team_state_key(state) != owner_team_id:
141
+ return None
142
+ owner = state.get("team_owner") or {}
143
+ return str(owner.get("leader_session_uuid") or "") or None
144
+ except Exception:
145
+ return None
146
+
147
+
148
+ def _infer_dedupe_reason(primary: dict[str, Any], store: MessageStore) -> str:
149
+ if primary.get("notified_message_id"):
150
+ return "rebind_retry"
151
+ return "watcher_duplicate"
152
+
153
+
154
def _mark_watcher_dedupe_skip(
    store: MessageStore,
    event_log: EventLog,
    watcher: dict[str, Any],
    result: dict[str, Any],
    attempts: int,
    canonical_message_id: str,
    reason: str,
    *,
    notified_at: str | None = None,
    leader_session_uuid: str | None = None,
) -> dict[str, Any]:
    """Record that a watcher notification was suppressed as a duplicate.

    The watcher row is marked ``"notified"`` with ``canonical_message_id`` so
    retry scans stop picking it up, and a
    ``leader_receiver.notification_dedupe_skip`` event is written with the
    watcher's previous ``notified_message_id``, the suppression ``reason`` and
    ``attempts + 1`` as the attempt number.

    Returns a delivery-style record with ``ok=True`` and ``deduped=True``.
    """
    # Captured before the store call so the event reports the pre-update value.
    previous_message_id = watcher.get("notified_message_id")
    watcher_id = watcher["watcher_id"]
    result_id = result.get("result_id")

    # Stage 12 note carried over from the original: the canonical message id here is
    # auditing metadata; the authoritative dedupe gate is leader_notification_log.
    store.mark_result_watcher(
        watcher_id,
        "notified",
        result_id=result_id,
        notified_message_id=canonical_message_id,
    )
    event_log.write(
        "leader_receiver.notification_dedupe_skip",
        result_id=result_id,
        original_message_id=previous_message_id,
        suppressed_message_id=canonical_message_id,
        reason=reason,
        team_id=watcher.get("owner_team_id"),
        watcher_id=watcher_id,
        task_id=result.get("task_id"),
        agent_id=result.get("agent_id"),
        attempt=attempts + 1,
        leader_session_uuid=leader_session_uuid,
        prior_notified_at=notified_at,
    )
    summary: dict[str, Any] = {
        "watcher_id": watcher_id,
        "result_id": result_id,
        "ok": True,
        "message_id": canonical_message_id,
        "deduped": True,
        "dedupe_reason": reason,
    }
    return summary
198
+
199
+
86
200
  def _dedupe_watchers_for_result(
87
201
  watchers: list[dict[str, Any]],
88
202
  ) -> tuple[dict[str, Any], list[dict[str, Any]]]:
@@ -114,11 +228,19 @@ def _deliver_result_to_watcher(
114
228
  return _mark_delivery_failed(store, event_log, watcher, result, attempts, str(exc))
115
229
  status = "notified" if delivery.get("ok") else "notify_failed"
116
230
  error = delivery.get("reason") or delivery.get("error")
231
+ # Stage 12: notified_message_id is now auditing metadata. The exactly-once contract
232
+ # lives in the leader_notification_log table consulted by _send_to_leader_receiver;
233
+ # whatever the gate suppresses comes back as ok=true deduped=true, and the watcher row
234
+ # records this as a successful notification with the canonical message_id.
235
+ persisted_message_id = (
236
+ delivery.get("canonical_message_id") if delivery.get("deduped")
237
+ else (delivery.get("message_id") if delivery.get("ok") else None)
238
+ )
117
239
  store.mark_result_watcher(
118
240
  watcher["watcher_id"],
119
241
  status,
120
242
  result_id=result.get("result_id"),
121
- notified_message_id=delivery.get("message_id"),
243
+ notified_message_id=persisted_message_id,
122
244
  error=error,
123
245
  )
124
246
  event_log.write(
@@ -279,6 +401,99 @@ def watcher_matches_result(watcher: dict[str, Any], result: dict[str, Any]) -> b
279
401
  return (not task_id or task_id == result.get("task_id")) and (not agent_id or agent_id == result.get("agent_id"))
280
402
 
281
403
 
404
def requeue_after_claim_leader(
    workspace: Path,
    store: MessageStore,
    event_log: EventLog,
    owner_team_id: str,
    claimed_pane_id: str,
    *,
    incident_ts: str | None = None,
) -> list[dict[str, Any]]:
    """Post-claim hook: requeue this team's un-notified result watchers.

    Selection, per watcher returned by ``store.result_watchers`` for
    ``owner_team_id``:
      - skipped when it already has a ``notified_message_id``;
      - skipped when ``incident_ts`` parses and the watcher's latest activity
        (``completed_at``, falling back to ``created_at``) is strictly
        earlier than it; watchers with no parseable timestamp are kept;
      - the watcher's current status is otherwise ignored.

    Each selected watcher is marked ``"notify_failed"`` and a
    ``leader_receiver.claim_requeue`` event is written. If anything was
    selected, ``retry_result_deliveries`` is invoked once; an exception from
    it is logged as ``leader_receiver.claim_requeue_delivery_failed`` and not
    re-raised.

    No re-fetch of the watcher row happens before the status flip. Per the
    Stage 11.12 note that accompanied this code, racing claim/scheduled-retry
    paths are expected to be serialized by the delivery path itself — that
    mechanism is not visible in this function.

    Returns the requeued records (``watcher_id``, ``result_id``,
    ``prior_state``); the list may be empty.
    """
    cutoff = _parse_iso(incident_ts)
    requeued: list[dict[str, Any]] = []

    for watcher in store.result_watchers(owner_team_id=owner_team_id):
        if watcher.get("notified_message_id"):
            continue
        activity = _parse_iso(watcher.get("completed_at")) or _parse_iso(watcher.get("created_at"))
        predates_incident = cutoff is not None and activity is not None and activity < cutoff
        if predates_incident:
            continue

        watcher_id = watcher["watcher_id"]
        prior_state = str(watcher.get("status") or "")
        result_id = watcher.get("result_id")

        # Flip to notify_failed so the retry pass below picks the watcher up.
        store.mark_result_watcher(watcher_id, "notify_failed", result_id=result_id)
        event_log.write(
            "leader_receiver.claim_requeue",
            result_id=result_id,
            watcher_id=watcher_id,
            prior_state=prior_state,
            requeued_at=datetime.now(timezone.utc).isoformat(),
            claimed_pane_id=claimed_pane_id,
            team_id=owner_team_id,
        )
        requeued.append(
            {
                "watcher_id": watcher_id,
                "result_id": result_id,
                "prior_state": prior_state,
            }
        )

    if not requeued:
        return requeued

    try:
        retry_result_deliveries(workspace, event_log)
    except Exception as exc:
        # Best effort: the requeue itself already happened, so only record the failure.
        event_log.write(
            "leader_receiver.claim_requeue_delivery_failed",
            error=str(exc),
            watcher_ids=[entry["watcher_id"] for entry in requeued],
            team_id=owner_team_id,
            claimed_pane_id=claimed_pane_id,
        )
    return requeued
483
+
484
+
485
+ def _parse_iso(text: Any) -> datetime | None:
486
+ if not isinstance(text, str) or not text:
487
+ return None
488
+ try:
489
+ dt = datetime.fromisoformat(text.replace("Z", "+00:00"))
490
+ except ValueError:
491
+ return None
492
+ if dt.tzinfo is None:
493
+ dt = dt.replace(tzinfo=timezone.utc)
494
+ return dt
495
+
496
+
282
497
  def format_result_watcher_notification(result: dict[str, Any]) -> str:
283
498
  task_id = result.get("task_id") or "unknown task"
284
499
  agent_id = result.get("agent_id") or "unknown agent"