@team-agent/installer 0.2.1 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. package/package.json +1 -1
  2. package/schemas/team.schema.json +6 -0
  3. package/src/team_agent/approvals/runtime_prompts.py +1 -1
  4. package/src/team_agent/cli/commands.py +122 -6
  5. package/src/team_agent/cli/parser.py +42 -1
  6. package/src/team_agent/coordinator/__main__.py +21 -2
  7. package/src/team_agent/coordinator/lifecycle.py +11 -0
  8. package/src/team_agent/diagnose/orphan_cleanup.py +364 -0
  9. package/src/team_agent/events.py +47 -0
  10. package/src/team_agent/launch/core.py +2 -1
  11. package/src/team_agent/leader/__init__.py +273 -60
  12. package/src/team_agent/lifecycle/agents.py +54 -2
  13. package/src/team_agent/lifecycle/operations.py +87 -9
  14. package/src/team_agent/lifecycle/start.py +1 -1
  15. package/src/team_agent/message_store/core.py +8 -7
  16. package/src/team_agent/message_store/leader_notification_log.py +132 -0
  17. package/src/team_agent/message_store/result_watchers.py +144 -1
  18. package/src/team_agent/message_store/schema.py +31 -2
  19. package/src/team_agent/messaging/delivery.py +293 -1
  20. package/src/team_agent/messaging/idle_alerts.py +109 -9
  21. package/src/team_agent/messaging/leader.py +179 -10
  22. package/src/team_agent/messaging/leader_api_errors.py +216 -0
  23. package/src/team_agent/messaging/leader_panes.py +393 -23
  24. package/src/team_agent/messaging/result_delivery.py +219 -4
  25. package/src/team_agent/messaging/results.py +12 -21
  26. package/src/team_agent/messaging/scheduler.py +24 -2
  27. package/src/team_agent/messaging/send.py +21 -26
  28. package/src/team_agent/messaging/tmux_io.py +153 -23
  29. package/src/team_agent/messaging/tmux_prompt.py +87 -0
  30. package/src/team_agent/messaging/trust_auto_answer.py +44 -0
  31. package/src/team_agent/restart/orchestration.py +207 -4
  32. package/src/team_agent/runtime.py +7 -7
  33. package/src/team_agent/rust_core.py +157 -3
  34. package/src/team_agent/sessions/capture.py +65 -15
  35. package/src/team_agent/spec.py +59 -0
  36. package/src/team_agent/state.py +153 -10
  37. package/src/team_agent/status/inbox.py +33 -3
  38. package/src/team_agent/status/queries.py +32 -1
  39. package/src/team_agent/watch/__init__.py +145 -0
@@ -1,11 +1,14 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import json
4
+ from datetime import datetime, timezone
4
5
  from pathlib import Path
5
6
  from typing import Any
6
7
 
7
8
  from team_agent.events import EventLog
8
9
  from team_agent.message_store import MessageStore
10
+ from team_agent.message_store.leader_notification_log import peek_leader_notification
11
+ from team_agent.message_store.result_watchers import leader_notified_message_id_for_result
9
12
  from team_agent.messaging.deps import send_message
10
13
  from team_agent.messaging.internal_delivery import deliver_stored_message
11
14
 
@@ -22,7 +25,13 @@ def retry_result_deliveries(workspace: Path, event_log: EventLog) -> list[dict[s
22
25
  row = store.result_by_id(str(watcher["result_id"]))
23
26
  if not row:
24
27
  continue
25
- notified.extend(notify_result_watchers(workspace, _result_entry_from_row(row), event_log, watchers=[watcher]))
28
+ notified.extend(notify_result_watchers(
29
+ workspace,
30
+ _result_entry_from_row(row),
31
+ event_log,
32
+ watchers=[watcher],
33
+ dedupe_reason="rebind_retry",
34
+ ))
26
35
  return notified
27
36
 
28
37
 
@@ -31,6 +40,7 @@ def notify_result_watchers(
31
40
  result: dict[str, Any],
32
41
  event_log: EventLog,
33
42
  watchers: list[dict[str, Any]] | None = None,
43
+ dedupe_reason: str | None = None,
34
44
  ) -> list[dict[str, Any]]:
35
45
  store = MessageStore(workspace)
36
46
  candidates = [
@@ -67,9 +77,44 @@ def notify_result_watchers(
67
77
  }
68
78
  )
69
79
  attempts = result_delivery_attempts(event_log, primary["watcher_id"], str(result.get("result_id") or ""))
80
+ # Stage 12 (Gap 26 ∩ Gap 32 roundtable consolidation 2026-05-26): exactly-once dedupe
81
+ # lives in leader_notification_log keyed by (result_id, leader_session_uuid) and is
82
+ # consulted atomically at the injection boundary inside _send_to_leader_receiver. Here
83
+ # we add a read-only fast-path peek so concurrent notify_result_watchers calls for the
84
+ # same result short-circuit without spinning up a deliver_stored_message round-trip.
85
+ # The peek is NOT the dedupe primitive — the atomic INSERT OR IGNORE at injection is.
86
+ result_id_str = str(result.get("result_id") or "") or None
87
+ if result_id_str:
88
+ leader_uuid = _resolve_leader_session_uuid(workspace, primary.get("owner_team_id"))
89
+ if leader_uuid:
90
+ prior = peek_leader_notification(
91
+ store, result_id=result_id_str, leader_session_uuid=leader_uuid,
92
+ )
93
+ if prior:
94
+ notified.append(_mark_watcher_dedupe_skip(
95
+ store, event_log, primary, result, attempts,
96
+ prior["notified_message_id"],
97
+ dedupe_reason or "injection_log_already_notified",
98
+ notified_at=prior.get("notified_at"),
99
+ leader_session_uuid=leader_uuid,
100
+ ))
101
+ return notified
102
+ # Legacy compat: watcher.notified_message_id set by a prior path (Gap 32 reversal of
103
+ # 78055bc, or any pre-Stage-12 code) also blocks redelivery. This preserves the
104
+ # Stage 11.9-11.12 era contract while the new gate (leader_notification_log) is the
105
+ # authoritative dedupe primitive going forward.
106
+ legacy_canonical = leader_notified_message_id_for_result(
107
+ store, primary.get("owner_team_id"), result_id_str,
108
+ )
109
+ if legacy_canonical:
110
+ notified.append(_mark_watcher_dedupe_skip(
111
+ store, event_log, primary, result, attempts,
112
+ legacy_canonical,
113
+ dedupe_reason or "rebind_retry",
114
+ ))
115
+ return notified
70
116
  existing = delivered_result_message(
71
- store,
72
- str(result.get("result_id") or ""),
117
+ store, str(result.get("result_id") or ""),
73
118
  task_id=result.get("task_id"),
74
119
  owner_team_id=primary.get("owner_team_id"),
75
120
  )
@@ -83,6 +128,75 @@ def notify_result_watchers(
83
128
  return notified
84
129
 
85
130
 
131
# Best-effort lookup of a team's leader_session_uuid from runtime state.
# Returns the value found under state["team_owner"]["leader_session_uuid"] as a string,
# or None when that value is missing/empty or when anything inside the `try` raises
# (the handler below catches Exception and returns None).
# NOTE(review): indentation is not preserved in this view, so it cannot be determined
# here whether the `elif owner_team_id and team_state_key(state) != owner_team_id`
# branch pairs with the outer `if owner_team_id and isinstance(...)` or with the inner
# `if isinstance(scoped, dict)`. Confirm against the real file before restructuring.
+ def _resolve_leader_session_uuid(workspace: Path, owner_team_id: str | None) -> str | None:
132
+ """Helper: read the team's leader_session_uuid from runtime state for gate lookups."""
133
+ try:
134
+ from team_agent.messaging.deps import load_runtime_state, team_state_key
135
+ state = load_runtime_state(workspace)
136
+ if owner_team_id and isinstance(state.get("teams"), dict):
137
+ scoped = state["teams"].get(owner_team_id)
138
+ if isinstance(scoped, dict):
139
+ state = scoped
140
+ elif owner_team_id and team_state_key(state) != owner_team_id:
141
+ return None
142
+ owner = state.get("team_owner") or {}
143
+ return str(owner.get("leader_session_uuid") or "") or None
144
+ except Exception:
145
+ return None
146
+
147
+
148
def _infer_dedupe_reason(primary: dict[str, Any], store: MessageStore) -> str:
    """Classify why a watcher notification is being suppressed.

    Returns ``"rebind_retry"`` when the watcher row already carries a truthy
    ``notified_message_id`` and ``"watcher_duplicate"`` otherwise. ``store`` is
    accepted for signature compatibility but is not consulted.
    """
    already_notified = bool(primary.get("notified_message_id"))
    return "rebind_retry" if already_notified else "watcher_duplicate"
152
+
153
+
154
def _mark_watcher_dedupe_skip(
    store: MessageStore,
    event_log: EventLog,
    watcher: dict[str, Any],
    result: dict[str, Any],
    attempts: int,
    canonical_message_id: str,
    reason: str,
    *,
    notified_at: str | None = None,
    leader_session_uuid: str | None = None,
) -> dict[str, Any]:
    """Record that a watcher's notification was suppressed as a duplicate.

    Marks the watcher row ``"notified"`` with ``canonical_message_id``, writes a
    ``leader_receiver.notification_dedupe_skip`` event, and returns a
    delivery-style record flagged ``deduped=True``.

    Args:
        store: Message store whose ``mark_result_watcher`` is called.
        event_log: Event log whose ``write`` is called.
        watcher: Watcher row; must contain ``watcher_id``.
        result: Result entry; ``result_id``, ``task_id`` and ``agent_id`` are read.
        attempts: Prior attempt count; the event records ``attempts + 1``.
        canonical_message_id: Message id stored on the watcher row and reported back.
        reason: Dedupe reason recorded in the event and the returned record.
        notified_at: Optional timestamp logged as ``prior_notified_at``.
        leader_session_uuid: Optional leader session id logged with the event.

    Raises:
        KeyError: If ``watcher`` has no ``watcher_id``.
    """
    watcher_id = watcher["watcher_id"]
    result_id = result.get("result_id")
    # Capture the id the watcher carried before the row is overwritten below.
    previous_message_id = watcher.get("notified_message_id")

    # Stage 12: the canonical message_id (or sentinel from the gate) is auditing metadata
    # here. The authoritative dedupe gate is leader_notification_log; this mark just keeps
    # the watcher row from being re-picked by retry scans.
    store.mark_result_watcher(
        watcher_id,
        "notified",
        result_id=result_id,
        notified_message_id=canonical_message_id,
    )

    audit_fields: dict[str, Any] = {
        "result_id": result_id,
        "original_message_id": previous_message_id,
        "suppressed_message_id": canonical_message_id,
        "reason": reason,
        "team_id": watcher.get("owner_team_id"),
        "watcher_id": watcher_id,
        "task_id": result.get("task_id"),
        "agent_id": result.get("agent_id"),
        "attempt": attempts + 1,
        "leader_session_uuid": leader_session_uuid,
        "prior_notified_at": notified_at,
    }
    event_log.write("leader_receiver.notification_dedupe_skip", **audit_fields)

    outcome: dict[str, Any] = {
        "watcher_id": watcher_id,
        "result_id": result_id,
        "ok": True,
        "message_id": canonical_message_id,
        "deduped": True,
        "dedupe_reason": reason,
    }
    return outcome
198
+
199
+
86
200
  def _dedupe_watchers_for_result(
87
201
  watchers: list[dict[str, Any]],
88
202
  ) -> tuple[dict[str, Any], list[dict[str, Any]]]:
@@ -114,11 +228,19 @@ def _deliver_result_to_watcher(
114
228
  return _mark_delivery_failed(store, event_log, watcher, result, attempts, str(exc))
115
229
  status = "notified" if delivery.get("ok") else "notify_failed"
116
230
  error = delivery.get("reason") or delivery.get("error")
231
+ # Stage 12: notified_message_id is now auditing metadata. The exactly-once contract
232
+ # lives in the leader_notification_log table consulted by _send_to_leader_receiver;
233
+ # whatever the gate suppresses comes back as ok=true deduped=true, and the watcher row
234
+ # records this as a successful notification with the canonical message_id.
235
+ persisted_message_id = (
236
+ delivery.get("canonical_message_id") if delivery.get("deduped")
237
+ else (delivery.get("message_id") if delivery.get("ok") else None)
238
+ )
117
239
  store.mark_result_watcher(
118
240
  watcher["watcher_id"],
119
241
  status,
120
242
  result_id=result.get("result_id"),
121
- notified_message_id=delivery.get("message_id"),
243
+ notified_message_id=persisted_message_id,
122
244
  error=error,
123
245
  )
124
246
  event_log.write(
@@ -279,6 +401,99 @@ def watcher_matches_result(watcher: dict[str, Any], result: dict[str, Any]) -> b
279
401
  return (not task_id or task_id == result.get("task_id")) and (not agent_id or agent_id == result.get("agent_id"))
280
402
 
281
403
 
404
def requeue_after_claim_leader(
    workspace: Path,
    store: MessageStore,
    event_log: EventLog,
    owner_team_id: str,
    claimed_pane_id: str,
    *,
    incident_ts: str | None = None,
) -> list[dict[str, Any]]:
    """Post-claim hook (Gap 26 / Mac mini Stage 11 Scenarios 3, 11.10): requeue every
    not-yet-notified leader-bound watcher of a team after a leader pane is claimed.

    Stage 11.10 semantic reframe: claim-leader means "all not-yet-delivered leader-bound
    notifications for this team_id reroute to the claimed pane". Watcher status is
    irrelevant to selection; a non-empty ``notified_message_id`` is what excludes a
    watcher here (Gap 32 once-only).

    Selection rules:
      - watcher is returned by ``store.result_watchers(owner_team_id=owner_team_id)``
      - watcher has no ``notified_message_id``
      - when ``incident_ts`` parses, the watcher's timestamp (``completed_at``, falling
        back to ``created_at``) must be at-or-after it; watchers with no parseable
        timestamp are kept. Without a parseable ``incident_ts`` every un-notified
        watcher is requeued.
      - watcher status is otherwise ignored (pending / delivery_blocked /
        delivery_exhausted / notify_failed all become candidates).

    Effects: each selected watcher is marked ``notify_failed`` and a
    ``leader_receiver.claim_requeue`` event is written. If at least one watcher was
    requeued, ``retry_result_deliveries(workspace, event_log)`` is invoked once; an
    exception from that call is recorded as
    ``leader_receiver.claim_requeue_delivery_failed`` and not propagated.

    Race handling: this function does NOT re-fetch the watcher row before flipping its
    status and does NOT emit ``leader_receiver.claim_requeue_already_in_flight``; that
    compare-and-skip step was retired in Stage 11.12. Exactly-once delivery is left to
    the dedupe performed on the notification path that ``retry_result_deliveries``
    drives.

    Args:
        workspace: Workspace passed through to ``retry_result_deliveries``.
        store: Message store providing ``result_watchers`` and ``mark_result_watcher``.
        event_log: Event log receiving the requeue / failure events.
        owner_team_id: Team whose watchers are considered.
        claimed_pane_id: Newly claimed pane id; only recorded in the emitted events.
        incident_ts: Optional ISO-8601 lower bound for watcher timestamps.

    Returns:
        One ``{"watcher_id", "result_id", "prior_state"}`` dict per requeued watcher
        (possibly empty).
    """
    # Stage 11.12: CAS re-fetch + claim_requeue_already_in_flight event retired. Eligible
    # watchers are marked notify_failed and retry_result_deliveries is left to route them;
    # per that change, concurrent claim / scheduled-retry paths are expected to be
    # de-duplicated downstream rather than here.
    incident_dt = _parse_iso(incident_ts)
    requeued: list[dict[str, Any]] = []
    for watcher in store.result_watchers(owner_team_id=owner_team_id):
        if watcher.get("notified_message_id"):
            # Already notified once; never requeue.
            continue
        latest_ts = _parse_iso(watcher.get("completed_at")) or _parse_iso(watcher.get("created_at"))
        if incident_dt and latest_ts and latest_ts < incident_dt:
            # Activity predates the incident window.
            continue
        watcher_id = watcher["watcher_id"]
        prior_state = str(watcher.get("status") or "")
        store.mark_result_watcher(
            watcher_id,
            "notify_failed",
            result_id=watcher.get("result_id"),
        )
        event_log.write(
            "leader_receiver.claim_requeue",
            result_id=watcher.get("result_id"),
            watcher_id=watcher_id,
            prior_state=prior_state,
            requeued_at=datetime.now(timezone.utc).isoformat(),
            claimed_pane_id=claimed_pane_id,
            team_id=owner_team_id,
        )
        requeued.append({
            "watcher_id": watcher_id,
            "result_id": watcher.get("result_id"),
            "prior_state": prior_state,
        })
    if requeued:
        try:
            # Called without a team filter: the retry scan takes only the workspace.
            retry_result_deliveries(workspace, event_log)
        except Exception as exc:
            # Best-effort: the watchers stay requeued; the failure is only logged.
            event_log.write(
                "leader_receiver.claim_requeue_delivery_failed",
                error=str(exc),
                watcher_ids=[r["watcher_id"] for r in requeued],
                team_id=owner_team_id,
                claimed_pane_id=claimed_pane_id,
            )
    return requeued
483
+
484
+
485
def _parse_iso(text: Any) -> datetime | None:
    """Parse an ISO-8601 timestamp string into a timezone-aware ``datetime``.

    Returns ``None`` for non-string or empty input and for strings that
    ``datetime.fromisoformat`` rejects. Every ``"Z"`` in the text is rewritten to
    ``"+00:00"`` before parsing, and a naive result is tagged as UTC.
    """
    if not (isinstance(text, str) and text):
        return None
    normalized = text.replace("Z", "+00:00")
    try:
        parsed = datetime.fromisoformat(normalized)
    except ValueError:
        return None
    # Naive timestamps are treated as UTC so they compare against aware ones.
    return parsed if parsed.tzinfo is not None else parsed.replace(tzinfo=timezone.utc)
495
+
496
+
282
497
  def format_result_watcher_notification(result: dict[str, Any]) -> str:
283
498
  task_id = result.get("task_id") or "unknown task"
284
499
  agent_id = result.get("agent_id") or "unknown agent"
@@ -359,36 +359,27 @@ def _refresh_leader_receiver_or_flag_rebind(
359
359
  receiver = state.get("leader_receiver") or {}
360
360
  if receiver.get("mode") != "direct_tmux":
361
361
  return state
362
- validation = _validate_leader_receiver(receiver)
362
+ owner_identity = state.get("team_owner") or None
363
+ receiver_for_validation = dict(receiver)
364
+ if owner_identity and owner_identity.get("leader_session_uuid") and not receiver_for_validation.get("leader_session_uuid"):
365
+ receiver_for_validation["leader_session_uuid"] = owner_identity["leader_session_uuid"]
366
+ validation = _validate_leader_receiver(receiver_for_validation)
363
367
  if validation.get("ok"):
364
368
  return state
365
- owner_identity = state.get("team_owner") or None
366
- rediscovered = _rediscover_leader_receiver(receiver, event_log, owner_identity)
369
+ rediscovered = _rediscover_leader_receiver(
370
+ receiver_for_validation,
371
+ event_log,
372
+ owner_identity,
373
+ invalidation_reason=validation.get("reason"),
374
+ team_id=team_state_key(state),
375
+ )
367
376
  if rediscovered.get("status") == "updated":
368
377
  state["leader_receiver"] = rediscovered["receiver"]
369
378
  if persist:
370
379
  save_runtime_state(workspace, state)
371
380
  else:
372
381
  save_team_scoped_state(workspace, state)
373
- event_log.write(
374
- "leader_receiver.rebind_applied",
375
- old_pane_id=receiver.get("pane_id"),
376
- new_pane_id=rediscovered["receiver"].get("pane_id"),
377
- reason=validation.get("reason"),
378
- source="report_result_notify",
379
- owner_identity=owner_identity,
380
- )
381
382
  return state
382
- event_log.write(
383
- "leader_receiver.rebind_required",
384
- old_pane_id=receiver.get("pane_id"),
385
- reason=validation.get("reason"),
386
- validation_error=validation.get("error"),
387
- rediscovery_status=rediscovered.get("status"),
388
- provider=receiver.get("provider"),
389
- source="report_result_notify",
390
- owner_identity=owner_identity,
391
- )
392
383
  return state
393
384
 
394
385
 
@@ -84,6 +84,18 @@ def _fire_due_scheduled_events(workspace: Path, store: MessageStore, event_log:
84
84
  elif row["kind"] == "health_ping":
85
85
  result = {"ok": True, "status": "logged"}
86
86
  event_log.write("coordinator.health_ping", target=row["target"], payload=payload)
87
+ elif row["kind"] == "trust_retry":
88
+ # Spark MEDIUM sweep #3 (2026-05-26) — bounded-backoff consumer
89
+ # for delivery.py:_handle_trust_retry_needed. payload carries the
90
+ # message_id and current attempt; _execute_trust_retry resets the
91
+ # row to 'accepted', re-runs _deliver_pending_message with the
92
+ # attempt threaded through, and either delivers, reschedules, or
93
+ # hits the terminal trust_auto_answer_exhausted branch.
94
+ from team_agent.messaging.delivery import _execute_trust_retry
95
+ result = _execute_trust_retry(
96
+ workspace, store, event_log, payload,
97
+ owner_team_id=row.get("owner_team_id"),
98
+ )
87
99
  else:
88
100
  result = {"ok": False, "error": f"unknown scheduled event kind: {row['kind']}"}
89
101
  if not result.get("ok") and row["kind"] == "send":
@@ -409,8 +421,18 @@ def _recent_restart_or_reset_event(event_log: EventLog, agent_id: str, since: da
409
421
  for event in reversed(event_log.tail(200)):
410
422
  if event.get("event") not in _RESTART_RESET_EVENTS:
411
423
  continue
412
- if event.get("agent_id") != agent_id and agent_id not in set(event.get("agents") or []):
413
- continue
424
+ if event.get("agent_id") != agent_id:
425
+ agents_field = event.get("agents") or []
426
+ agent_ids: set[str] = set()
427
+ for entry in agents_field:
428
+ if isinstance(entry, str):
429
+ agent_ids.add(entry)
430
+ elif isinstance(entry, dict):
431
+ aid = entry.get("agent_id")
432
+ if isinstance(aid, str):
433
+ agent_ids.add(aid)
434
+ if agent_id not in agent_ids:
435
+ continue
414
436
  try:
415
437
  ts = datetime.fromisoformat(str(event.get("ts")))
416
438
  except ValueError:
@@ -34,19 +34,10 @@ from pathlib import Path
34
34
  from typing import Any
35
35
 
36
36
  def send_message(
37
- workspace: Path,
38
- target: str | list[str] | None,
39
- content: str,
40
- task_id: str | None = None,
41
- sender: str = "leader",
42
- requires_ack: bool = True,
43
- confirm_human: bool = False,
44
- wait_visible: bool = True,
45
- timeout: float = 30.0,
46
- lock_timeout: float = 5.0,
47
- watch_result: bool = False,
48
- block_until_delivered: bool = True,
49
- team: str | None = None,
37
+ workspace: Path, target: str | list[str] | None, content: str, task_id: str | None = None,
38
+ sender: str = "leader", requires_ack: bool = True, confirm_human: bool = False,
39
+ wait_visible: bool = True, timeout: float = 30.0, lock_timeout: float = 5.0,
40
+ watch_result: bool = False, block_until_delivered: bool = True, team: str | None = None,
50
41
  ) -> dict[str, Any]:
51
42
  with _runtime_lock(workspace, "send", timeout=lock_timeout):
52
43
  return _send_message_unlocked(
@@ -66,18 +57,10 @@ def send_message(
66
57
 
67
58
 
68
59
  def _send_message_unlocked(
69
- workspace: Path,
70
- target: str | list[str] | None,
71
- content: str,
72
- task_id: str | None = None,
73
- sender: str = "leader",
74
- requires_ack: bool = True,
75
- confirm_human: bool = False,
76
- wait_visible: bool = True,
77
- timeout: float = 30.0,
78
- watch_result: bool = False,
79
- block_until_delivered: bool = True,
80
- team: str | None = None,
60
+ workspace: Path, target: str | list[str] | None, content: str, task_id: str | None = None,
61
+ sender: str = "leader", requires_ack: bool = True, confirm_human: bool = False,
62
+ wait_visible: bool = True, timeout: float = 30.0, watch_result: bool = False,
63
+ block_until_delivered: bool = True, team: str | None = None,
81
64
  ) -> dict[str, Any]:
82
65
  if team is None:
83
66
  ambiguous = ambiguous_team_target_result(load_runtime_state(workspace))
@@ -336,6 +319,8 @@ def _send_single_message_unlocked(
336
319
  "submit_verification": delivered_result.get("submit_verification"),
337
320
  "turn_verification": delivered_result.get("turn_verification"),
338
321
  }
322
+ result.update({key: delivered_result[key] for key in ("reason", "stage") if delivered_result.get(key)})
323
+ result.update(_structured_delivery_refusal(delivered_result))
339
324
  if delivered_result.get("queued"):
340
325
  result["queued"] = True
341
326
  result["reason"] = delivered_result.get("reason")
@@ -490,7 +475,7 @@ def _broadcast_targets(state: dict[str, Any], spec: dict[str, Any], sender: str)
490
475
 
491
476
 
492
477
  def _compact_broadcast_delivery(result: dict[str, Any]) -> dict[str, Any]:
493
- keys = ["ok", "status", "message_id", "to", "reason", "channel"]
478
+ keys = ["ok", "status", "message_id", "to", "reason", "channel", "detected", "pane_id", "pane_mode", "pane_capture_tail", "stage", "verification"]
494
479
  return {key: result[key] for key in keys if key in result}
495
480
 
496
481
 
@@ -498,3 +483,13 @@ def _compact_fanout_delivery(result: dict[str, Any]) -> dict[str, Any]:
498
483
  compact = _compact_broadcast_delivery(result)
499
484
  compact["delivered"] = bool(result.get("submitted") or result.get("visible") or result.get("status") in {"submitted", "visible", "delivered", "acknowledged"})
500
485
  return compact
486
+
487
+
488
def _structured_delivery_refusal(delivered_result: dict[str, Any]) -> dict[str, Any]:
    """Extract pane diagnostics from the first non-input-mode paste refusal.

    Looks through ``delivered_result["paste_attempts"]`` for the first dict entry
    whose ``reason`` is ``"recipient_pane_in_non_input_mode"`` and returns whichever
    of ``detected``, ``pane_id``, ``pane_mode`` and ``pane_capture_tail`` that entry
    contains. Returns an empty dict when ``paste_attempts`` is not a list or no
    entry matches.
    """
    paste_attempts = delivered_result.get("paste_attempts")
    if not isinstance(paste_attempts, list):
        return {}
    refusal = next(
        (
            entry
            for entry in paste_attempts
            if isinstance(entry, dict) and entry.get("reason") == "recipient_pane_in_non_input_mode"
        ),
        None,
    )
    if refusal is None:
        return {}
    detail_keys = ("detected", "pane_id", "pane_mode", "pane_capture_tail")
    return {key: refusal[key] for key in detail_keys if key in refusal}