@team-agent/installer 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30):
  1. package/package.json +1 -1
  2. package/src/team_agent/cli/__init__.py +2 -0
  3. package/src/team_agent/cli/commands.py +22 -3
  4. package/src/team_agent/cli/parser.py +40 -1
  5. package/src/team_agent/coordinator/__main__.py +21 -2
  6. package/src/team_agent/coordinator/lifecycle.py +23 -0
  7. package/src/team_agent/diagnose/orphan_cleanup.py +193 -0
  8. package/src/team_agent/events.py +47 -0
  9. package/src/team_agent/leader/__init__.py +273 -60
  10. package/src/team_agent/lifecycle/agents.py +54 -2
  11. package/src/team_agent/lifecycle/operations.py +86 -9
  12. package/src/team_agent/lifecycle/paste_buffer_hygiene.py +39 -0
  13. package/src/team_agent/lifecycle/start.py +3 -0
  14. package/src/team_agent/message_store/leader_notification_log.py +132 -0
  15. package/src/team_agent/message_store/result_watchers.py +144 -1
  16. package/src/team_agent/message_store/schema.py +23 -0
  17. package/src/team_agent/messaging/delivery.py +10 -0
  18. package/src/team_agent/messaging/idle_alerts.py +227 -21
  19. package/src/team_agent/messaging/leader.py +166 -6
  20. package/src/team_agent/messaging/leader_panes.py +193 -23
  21. package/src/team_agent/messaging/owner_bypass.py +29 -0
  22. package/src/team_agent/messaging/result_delivery.py +219 -4
  23. package/src/team_agent/messaging/results.py +12 -21
  24. package/src/team_agent/messaging/scheduler.py +22 -2
  25. package/src/team_agent/messaging/send.py +9 -2
  26. package/src/team_agent/messaging/session_drift.py +94 -0
  27. package/src/team_agent/runtime.py +22 -14
  28. package/src/team_agent/rust_core.py +157 -3
  29. package/src/team_agent/state.py +167 -10
  30. package/src/team_agent/status/inbox.py +33 -3
@@ -0,0 +1,39 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import Any
5
+
6
+ from team_agent.events import EventLog
7
+
8
# Name prefixes that this module treats as team-agent-owned tmux paste buffers.
# The generic "team-agent-" entry already covers the two more specific ones;
# all three are kept so the tuple's contents stay unchanged for any importer.
_TEAM_AGENT_BUFFER_PREFIXES = ("team-agent-send-", "team-agent-leader-receiver-", "team-agent-")


def _is_team_agent_buffer(name: str) -> bool:
    """Return True when *name* starts with one of the team-agent buffer prefixes."""
    # str.startswith accepts a tuple of candidates, so no generator/any() is needed.
    return name.startswith(_TEAM_AGENT_BUFFER_PREFIXES)
13
+
14
+
15
def cleanup_stale_team_agent_buffers(workspace: Path, event_log: EventLog, *, context: str) -> dict[str, Any]:
    """Delete every tmux paste buffer whose name carries a team-agent prefix.

    Buffer names are listed with ``tmux list-buffers``, filtered through
    ``_is_team_agent_buffer`` and removed one by one with
    ``tmux delete-buffer -b <name>``.

    Args:
        workspace: Accepted for call-site compatibility; not read here.
        event_log: Receives a ``paste_buffer_hygiene.*`` event for a failed
            listing, for successful deletions and for failed deletions.
        context: Caller-supplied label copied into every event.

    Returns:
        A dict with ``ok``, ``deleted``, ``failed``, ``scanned`` and
        ``matched``. When the listing itself fails, ``ok`` is False and a
        ``reason`` of ``"list_buffers_failed"`` is included.
    """
    # Function-scope import preserved from the original (presumably avoids an
    # import cycle with team_agent.runtime - confirm before hoisting).
    from team_agent.runtime import run_cmd

    proc = run_cmd(["tmux", "list-buffers", "-F", "#{buffer_name}"], timeout=5)
    if proc.returncode != 0:
        # run_cmd's contract is not visible here; tolerate a missing stderr stream.
        event_log.write(
            "paste_buffer_hygiene.list_failed",
            context=context,
            stderr=(proc.stderr or "").strip()[:200],
        )
        return {
            "ok": False,
            "deleted": [],
            "failed": [],
            "scanned": 0,
            "matched": 0,
            "reason": "list_buffers_failed",
        }

    names = [line.strip() for line in (proc.stdout or "").splitlines() if line.strip()]
    targets = [name for name in names if _is_team_agent_buffer(name)]

    deleted: list[str] = []
    failed: list[str] = []
    for name in targets:
        delete_proc = run_cmd(["tmux", "delete-buffer", "-b", name], timeout=5)
        # A non-zero exit can be benign (the buffer vanished between list and
        # delete), but it is recorded instead of being dropped silently.
        (deleted if delete_proc.returncode == 0 else failed).append(name)

    if deleted:
        event_log.write(
            "paste_buffer_hygiene.prevented_resume_injection",
            context=context,
            deleted_buffers=deleted,
            scanned_count=len(names),
            matched_count=len(targets),
        )
    if failed:
        event_log.write(
            "paste_buffer_hygiene.delete_failed",
            context=context,
            failed_buffers=failed,
            matched_count=len(targets),
        )
    return {
        "ok": True,
        "deleted": deleted,
        "failed": failed,
        "scanned": len(names),
        "matched": len(targets),
    }
37
+
38
+
39
+ __all__ = ["cleanup_stale_team_agent_buffers"]
@@ -219,6 +219,8 @@ def _start_agent_unlocked(workspace: Path, agent_id: str, force: bool, open_disp
219
219
  reason="rollout_missing" if start_mode == "fresh_after_missing_rollout" else "session_id_missing",
220
220
  )
221
221
 
222
+ from team_agent.lifecycle.paste_buffer_hygiene import cleanup_stale_team_agent_buffers
223
+ cleanup_stale_team_agent_buffers(workspace, event_log, context=f"start_agent:{agent_id}")
222
224
  tmux_cmd, tmux_start_mode = _tmux_start_command_for_agent_window(session_name, agent_id, command)
223
225
  event_log.write(
224
226
  "start_agent.agent_start",
@@ -273,6 +275,7 @@ def _start_agent_unlocked(workspace: Path, agent_id: str, force: bool, open_disp
273
275
  )
274
276
  command = shell_command_for_agent(command_agent, workspace, mcp_config)
275
277
  start_mode = "fresh_after_missing_rollout" if missing_resume_rollout else "fresh"
278
+ cleanup_stale_team_agent_buffers(workspace, event_log, context=f"start_agent_fallback:{agent_id}")
276
279
  tmux_cmd, tmux_start_mode = _tmux_start_command_for_agent_window(session_name, agent_id, command)
277
280
  event_log.write(
278
281
  "start_agent.agent_start",
@@ -0,0 +1,132 @@
1
+ """Stage 12 (Gap 26 ∩ Gap 32 roundtable consolidation 2026-05-26): atomic exactly-once
2
+ dedupe at the leader-pane injection boundary, keyed by (result_id, leader_session_uuid).
3
+
4
+ Replaces the bad6484 watcher-table UPSERT approach. UNIQUE primary key + SQLite
5
+ INSERT OR IGNORE gives an atomic claim that works across processes (CLI subprocess
6
+ vs coordinator daemon) and across threads without an advisory lock. Distinct
7
+ leader_session_uuid values (e.g. after takeover) each get their own row so a
8
+ re-takeover legitimately allows another delivery for the same result_id.
9
+ """
10
+ from __future__ import annotations
11
+
12
+ from contextlib import closing
13
+ from datetime import datetime, timedelta, timezone
14
+ from typing import Any
15
+
16
+
17
+ def claim_leader_notification_delivery(
18
+ store: Any,
19
+ *,
20
+ result_id: str,
21
+ leader_session_uuid: str,
22
+ proposed_message_id: str,
23
+ envelope_hash: str,
24
+ owner_team_id: str | None,
25
+ pane_id: str | None,
26
+ ) -> dict[str, Any]:
27
+ """Atomic claim. INSERT OR IGNORE → rowcount=1 means we won, fire the inject.
28
+ rowcount=0 means a prior row exists for (result_id, leader_session_uuid); SELECT
29
+ it and return so the caller can decide to suppress (same envelope_hash) or surface
30
+ legitimate-duplicate (different envelope_hash)."""
31
+ now = datetime.now(timezone.utc).isoformat()
32
+ with closing(store.connect()) as conn:
33
+ with conn:
34
+ cur = conn.execute(
35
+ "insert or ignore into leader_notification_log("
36
+ " result_id, leader_session_uuid, notified_message_id, notified_at,"
37
+ " leader_pane_id_at_notify, envelope_content_hash, owner_team_id"
38
+ ") values (?, ?, ?, ?, ?, ?, ?)",
39
+ (
40
+ result_id, leader_session_uuid, proposed_message_id, now,
41
+ pane_id, envelope_hash, owner_team_id,
42
+ ),
43
+ )
44
+ if cur.rowcount == 1:
45
+ return {
46
+ "status": "claimed_by_you",
47
+ "notified_message_id": proposed_message_id,
48
+ "notified_at": now,
49
+ "envelope_content_hash": envelope_hash,
50
+ }
51
+ row = conn.execute(
52
+ "select notified_message_id, notified_at, envelope_content_hash, "
53
+ "leader_pane_id_at_notify from leader_notification_log "
54
+ "where result_id = ? and leader_session_uuid = ?",
55
+ (result_id, leader_session_uuid),
56
+ ).fetchone()
57
+ if row is None:
58
+ # Should not happen (INSERT OR IGNORE returned 0 → row must exist), but be defensive.
59
+ return {"status": "claimed_by_you", "notified_message_id": proposed_message_id,
60
+ "notified_at": now, "envelope_content_hash": envelope_hash}
61
+ prev_message_id, prev_ts, prev_hash, prev_pane = row[0], row[1], row[2], row[3]
62
+ return {
63
+ "status": "already_notified_by",
64
+ "notified_message_id": prev_message_id,
65
+ "notified_at": prev_ts,
66
+ "envelope_content_hash": prev_hash,
67
+ "leader_pane_id_at_notify": prev_pane,
68
+ }
69
+
70
+
71
+ def peek_leader_notification(
72
+ store: Any,
73
+ *,
74
+ result_id: str,
75
+ leader_session_uuid: str,
76
+ ) -> dict[str, Any] | None:
77
+ """Read-only fast-path peek (Stage 12). Returns the existing log row for
78
+ (result_id, leader_session_uuid) or None. Used by notify_result_watchers to short-
79
+ circuit before calling deliver_stored_message; the authoritative atomic claim still
80
+ happens at the _send_to_leader_receiver injection boundary."""
81
+ with closing(store.connect()) as conn:
82
+ row = conn.execute(
83
+ "select notified_message_id, notified_at, envelope_content_hash, "
84
+ "leader_pane_id_at_notify, owner_team_id from leader_notification_log "
85
+ "where result_id = ? and leader_session_uuid = ?",
86
+ (result_id, leader_session_uuid),
87
+ ).fetchone()
88
+ if row is None:
89
+ return None
90
+ return {
91
+ "notified_message_id": row[0],
92
+ "notified_at": row[1],
93
+ "envelope_content_hash": row[2],
94
+ "leader_pane_id_at_notify": row[3],
95
+ "owner_team_id": row[4],
96
+ }
97
+
98
+
99
def prune_leader_notification_log(store: Any, *, max_age_hours: int = 24) -> int:
    """Delete log rows whose ``notified_at`` is older than *max_age_hours*.

    Intended as coordinator-tick maintenance. The cutoff is the current UTC
    time minus *max_age_hours*, compared as an ISO-8601 string.

    Returns:
        The number of rows deleted (0 when the driver reports no count).
    """
    oldest_kept = datetime.now(timezone.utc) - timedelta(hours=max_age_hours)
    # The inner `conn` context commits the delete; closing() then releases the handle.
    with closing(store.connect()) as conn, conn:
        removed = conn.execute(
            "delete from leader_notification_log where notified_at < ?",
            (oldest_kept.isoformat(),),
        ).rowcount
    return removed or 0
109
+
110
+
111
+ def leader_notification_log_rows(store: Any, *, owner_team_id: str | None = None) -> list[dict[str, Any]]:
112
+ """Test/diagnostic accessor. Returns all rows (optionally team-scoped)."""
113
+ with closing(store.connect()) as conn:
114
+ if owner_team_id is None:
115
+ rows = conn.execute(
116
+ "select * from leader_notification_log order by notified_at"
117
+ ).fetchall()
118
+ else:
119
+ rows = conn.execute(
120
+ "select * from leader_notification_log where owner_team_id = ? "
121
+ "or owner_team_id is null order by notified_at",
122
+ (owner_team_id,),
123
+ ).fetchall()
124
+ return [dict(row) for row in rows]
125
+
126
+
127
+ __all__ = [
128
+ "claim_leader_notification_delivery",
129
+ "peek_leader_notification",
130
+ "prune_leader_notification_log",
131
+ "leader_notification_log_rows",
132
+ ]
@@ -94,9 +94,152 @@ def requeue_delivery_exhausted_watchers(self) -> list[str]:
94
94
  ).fetchall()
95
95
  watcher_ids = [row[0] for row in rows]
96
96
  if watcher_ids:
97
+ # Phase D hotfix-3 (78055bc) cleared notified_message_id here; Gap 32 dedupe
98
+ # reverses that — preserve notified_message_id so the retry path can re-confirm
99
+ # (or skip if the same result_id was already injected on a different pane_id).
97
100
  conn.execute(
98
101
  "update result_watchers "
99
- "set status = 'notify_failed', error = null, notified_message_id = null, completed_at = null "
102
+ "set status = 'notify_failed', error = null, completed_at = null "
100
103
  "where status = 'delivery_exhausted'"
101
104
  )
102
105
  return watcher_ids
106
+
107
+
108
+ def claim_leader_notification(
109
+ store: Any,
110
+ owner_team_id: str | None,
111
+ result_id: str | None,
112
+ watcher_id: str,
113
+ proposed_token: str,
114
+ ) -> dict[str, Any]:
115
+ """DEPRECATED (Stage 12 roundtable retirement). The watcher-table UPSERT did not
116
+ actually prevent duplicate leader-pane injections in Mac mini real flow because two
117
+ independent code paths (scheduled_event branch + result_watchers branch) emit
118
+ deliver_attempt without coordinating at the watcher level. Replaced by
119
+ leader_notification_log.claim_leader_notification_delivery consulted inside
120
+ _send_to_leader_receiver. Kept here as a no-op shim so legacy callers / tests that
121
+ still import this symbol don't crash on import — but it does NOT perform a claim and
122
+ should NOT be used in new code."""
123
+ if not result_id:
124
+ return {"status": "deprecated_noop", "canonical_message_id": None}
125
+ return {"status": "deprecated_noop", "canonical_message_id": None}
126
+
127
+
128
+ def _claim_leader_notification_disabled_impl( # legacy reference for archaeology
129
+ store: Any,
130
+ owner_team_id: str | None,
131
+ result_id: str | None,
132
+ watcher_id: str,
133
+ proposed_token: str,
134
+ ) -> dict[str, Any]:
135
+ if not result_id:
136
+ return {"status": "no_result_id", "canonical_message_id": None}
137
+ with closing(store.connect()) as conn:
138
+ conn.isolation_level = None
139
+ try:
140
+ conn.execute("BEGIN IMMEDIATE")
141
+ if owner_team_id is None:
142
+ sibling = conn.execute(
143
+ "select notified_message_id from result_watchers "
144
+ "where result_id = ? and notified_message_id is not null "
145
+ "order by coalesce(completed_at, created_at) limit 1",
146
+ (result_id,),
147
+ ).fetchone()
148
+ else:
149
+ sibling = conn.execute(
150
+ "select notified_message_id from result_watchers "
151
+ "where result_id = ? and notified_message_id is not null "
152
+ "and (owner_team_id = ? or owner_team_id is null) "
153
+ "order by coalesce(completed_at, created_at) limit 1",
154
+ (result_id, owner_team_id),
155
+ ).fetchone()
156
+ if sibling and sibling[0]:
157
+ conn.execute("COMMIT")
158
+ return {"status": "already_notified_by", "canonical_message_id": sibling[0]}
159
+ cur = conn.execute(
160
+ "update result_watchers "
161
+ "set notified_message_id = ?, result_id = coalesce(result_id, ?) "
162
+ "where watcher_id = ? and notified_message_id is null",
163
+ (proposed_token, result_id, watcher_id),
164
+ )
165
+ if cur.rowcount == 1:
166
+ conn.execute("COMMIT")
167
+ return {"status": "claimed_by_you", "canonical_message_id": proposed_token}
168
+ row = conn.execute(
169
+ "select notified_message_id from result_watchers where watcher_id = ?",
170
+ (watcher_id,),
171
+ ).fetchone()
172
+ conn.execute("COMMIT")
173
+ return {
174
+ "status": "already_notified_by",
175
+ "canonical_message_id": (row[0] if row else None) or None,
176
+ }
177
+ except Exception:
178
+ try:
179
+ conn.execute("ROLLBACK")
180
+ except Exception:
181
+ pass
182
+ raise
183
+ finally:
184
+ conn.isolation_level = "" # restore default
185
+
186
+
187
def release_leader_notification_claim(
    store: Any,
    watcher_id: str,
    expected_token: str,
) -> bool:
    """Clear a sentinel claim after a failed delivery so a retry can re-claim.

    The watcher's ``notified_message_id`` is reset to NULL only if it still
    equals *expected_token*.

    Returns:
        True iff exactly one row was updated, i.e. the claim released was ours.
    """
    statement = (
        "update result_watchers set notified_message_id = null "
        "where watcher_id = ? and notified_message_id = ?"
    )
    with closing(store.connect()) as conn, conn:
        touched = conn.execute(statement, (watcher_id, expected_token)).rowcount
    return touched == 1
202
+
203
+
204
def promote_leader_notification_id(
    store: Any,
    watcher_id: str,
    sentinel_token: str,
    real_message_id: str,
) -> bool:
    """Swap a watcher's sentinel claim for the real message id after delivery.

    The update applies only while the watcher's ``notified_message_id`` still
    equals *sentinel_token*.

    Returns:
        True iff exactly one row was updated.
    """
    statement = (
        "update result_watchers set notified_message_id = ? "
        "where watcher_id = ? and notified_message_id = ?"
    )
    bindings = (real_message_id, watcher_id, sentinel_token)
    with closing(store.connect()) as conn, conn:
        touched = conn.execute(statement, bindings).rowcount
    return touched == 1
220
+
221
+
222
+ def leader_notified_message_id_for_result(
223
+ store: Any,
224
+ owner_team_id: str | None,
225
+ result_id: str | None,
226
+ ) -> str | None:
227
+ if not result_id:
228
+ return None
229
+ with closing(store.connect()) as conn:
230
+ if owner_team_id is None:
231
+ row = conn.execute(
232
+ "select notified_message_id from result_watchers "
233
+ "where result_id = ? and notified_message_id is not null "
234
+ "order by coalesce(completed_at, created_at) limit 1",
235
+ (result_id,),
236
+ ).fetchone()
237
+ else:
238
+ row = conn.execute(
239
+ "select notified_message_id from result_watchers "
240
+ "where result_id = ? and notified_message_id is not null "
241
+ "and (owner_team_id = ? or owner_team_id is null) "
242
+ "order by coalesce(completed_at, created_at) limit 1",
243
+ (result_id, owner_team_id),
244
+ ).fetchone()
245
+ return row[0] if row else None
@@ -231,6 +231,29 @@ def initialize_schema(conn: sqlite3.Connection) -> None:
231
231
  RESULT_WATCHER_COLUMNS,
232
232
  {"owner_team_id": "alter table result_watchers add column owner_team_id text"},
233
233
  )
234
+ # Stage 12 (Gap 26 ∩ Gap 32 roundtable consolidation 2026-05-26): dedupe leader
235
+ # notifications at the injection boundary, keyed by (result_id, leader_session_uuid).
236
+ # UNIQUE primary key + INSERT OR IGNORE in claim_leader_notification_delivery gives
237
+ # atomic exactly-once without an advisory lock. Retires the bad6484 watcher-table
238
+ # UPSERT approach.
239
+ conn.execute(
240
+ """
241
+ create table if not exists leader_notification_log (
242
+ result_id text not null,
243
+ leader_session_uuid text not null,
244
+ notified_message_id text not null,
245
+ notified_at text not null,
246
+ leader_pane_id_at_notify text,
247
+ envelope_content_hash text,
248
+ owner_team_id text,
249
+ primary key (result_id, leader_session_uuid)
250
+ )
251
+ """
252
+ )
253
+ conn.execute(
254
+ "create index if not exists idx_leader_notification_log_uuid "
255
+ "on leader_notification_log(leader_session_uuid, notified_at)"
256
+ )
234
257
  conn.execute("create index if not exists idx_messages_owner_team_id on messages(owner_team_id)")
235
258
  conn.execute("create index if not exists idx_scheduled_events_owner_team_id on scheduled_events(owner_team_id)")
236
259
  conn.execute("create index if not exists idx_agent_health_owner_team_id on agent_health(owner_team_id)")
@@ -121,6 +121,16 @@ def _deliver_pending_messages(workspace: Path, state: dict[str, Any], event_log:
121
121
  for row in store.messages():
122
122
  if row["status"] not in {"pending", "accepted"}:
123
123
  continue
124
+ agent_state = state.get("agents", {}).get(row["recipient"]) or {}
125
+ if str(agent_state.get("status") or "").lower() == "busy":
126
+ event_log.write(
127
+ "send.deferred_busy",
128
+ message_id=row["message_id"],
129
+ sender=row.get("sender"),
130
+ recipient=row["recipient"],
131
+ reason="recipient_busy",
132
+ )
133
+ continue
124
134
  result = _deliver_pending_message(workspace, state, row["message_id"], wait_visible=True, timeout=30.0)
125
135
  if result.get("ok"):
126
136
  delivered.append(row["message_id"])