@team-agent/installer 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,132 @@
1
+ """Stage 12 (Gap 26 ∩ Gap 32 roundtable consolidation 2026-05-26): atomic exactly-once
2
+ dedupe at the leader-pane injection boundary, keyed by (result_id, leader_session_uuid).
3
+
4
+ Replaces the bad6484 watcher-table UPSERT approach. UNIQUE primary key + SQLite
5
+ INSERT OR IGNORE gives an atomic claim that works across processes (CLI subprocess
6
+ vs coordinator daemon) and across threads without an advisory lock. Distinct
7
+ leader_session_uuid values (e.g. after takeover) each get their own row so a
8
+ re-takeover legitimately allows another delivery for the same result_id.
9
+ """
10
+ from __future__ import annotations
11
+
12
+ from contextlib import closing
13
+ from datetime import datetime, timedelta, timezone
14
+ from typing import Any
15
+
16
+
17
def claim_leader_notification_delivery(
    store: Any,
    *,
    result_id: str,
    leader_session_uuid: str,
    proposed_message_id: str,
    envelope_hash: str,
    owner_team_id: str | None,
    pane_id: str | None,
) -> dict[str, Any]:
    """Try to claim the single delivery slot for (result_id, leader_session_uuid).

    The claim is one ``INSERT OR IGNORE`` into ``leader_notification_log``:

    * rowcount == 1: this caller inserted the row and owns the delivery; the
      result has ``status == "claimed_by_you"`` and echoes the proposed values.
    * rowcount == 0: the insert was ignored; the stored row is read back and
      returned with ``status == "already_notified_by"`` so the caller can
      compare ``envelope_content_hash`` and decide whether to suppress.

    If the insert was ignored but no row can be read back, the function fails
    open and reports ``"claimed_by_you"`` without a log row.

    NOTE(review): SQLite's IGNORE resolution also skips rows that violate a
    NOT NULL constraint, so a ``None`` key or message id would take the
    fail-open path on every call — confirm callers never pass ``None``.
    """
    claimed_at = datetime.now(timezone.utc).isoformat()
    claim_key = (result_id, leader_session_uuid)
    # What we report whenever this caller is considered the owner of the claim.
    own_claim = {
        "status": "claimed_by_you",
        "notified_message_id": proposed_message_id,
        "notified_at": claimed_at,
        "envelope_content_hash": envelope_hash,
    }
    with closing(store.connect()) as conn:
        with conn:
            inserted = conn.execute(
                "insert or ignore into leader_notification_log("
                " result_id, leader_session_uuid, notified_message_id, notified_at,"
                " leader_pane_id_at_notify, envelope_content_hash, owner_team_id"
                ") values (?, ?, ?, ?, ?, ?, ?)",
                (
                    *claim_key, proposed_message_id, claimed_at,
                    pane_id, envelope_hash, owner_team_id,
                ),
            ).rowcount
            if inserted == 1:
                return own_claim
            existing = conn.execute(
                "select notified_message_id, notified_at, envelope_content_hash, "
                "leader_pane_id_at_notify from leader_notification_log "
                "where result_id = ? and leader_session_uuid = ?",
                claim_key,
            ).fetchone()
    if existing is None:
        # Not expected: an ignored insert implies a conflicting row. Fail open.
        return own_claim
    return {
        "status": "already_notified_by",
        "notified_message_id": existing[0],
        "notified_at": existing[1],
        "envelope_content_hash": existing[2],
        "leader_pane_id_at_notify": existing[3],
    }
69
+
70
+
71
def peek_leader_notification(
    store: Any,
    *,
    result_id: str,
    leader_session_uuid: str,
) -> dict[str, Any] | None:
    """Look up the log row for (result_id, leader_session_uuid) without writing.

    Stage 12 fast path: per the original note, notify_result_watchers uses this
    to short-circuit before calling deliver_stored_message, while the
    authoritative atomic claim still happens at the _send_to_leader_receiver
    injection boundary.

    Returns a dict of the selected columns, or ``None`` when no row exists.
    """
    selected = (
        "notified_message_id",
        "notified_at",
        "envelope_content_hash",
        "leader_pane_id_at_notify",
        "owner_team_id",
    )
    with closing(store.connect()) as conn:
        found = conn.execute(
            "select notified_message_id, notified_at, envelope_content_hash, "
            "leader_pane_id_at_notify, owner_team_id from leader_notification_log "
            "where result_id = ? and leader_session_uuid = ?",
            (result_id, leader_session_uuid),
        ).fetchone()
    if found is None:
        return None
    # Positional access keeps this working for tuple rows and sqlite3.Row alike.
    return {column: found[position] for position, column in enumerate(selected)}
97
+
98
+
99
def prune_leader_notification_log(store: Any, *, max_age_hours: int = 24) -> int:
    """Delete log rows whose ``notified_at`` is older than ``max_age_hours``.

    The cutoff is computed from the current UTC time and compared as ISO-8601
    text against the stored ``notified_at`` values. Returns the number of rows
    deleted (0 when the driver reports no count).
    """
    oldest_kept = datetime.now(timezone.utc) - timedelta(hours=max_age_hours)
    # `closing` releases the connection; the inner `conn` context commits the delete.
    with closing(store.connect()) as conn, conn:
        deleted = conn.execute(
            "delete from leader_notification_log where notified_at < ?",
            (oldest_kept.isoformat(),),
        ).rowcount
    return deleted or 0
109
+
110
+
111
def leader_notification_log_rows(store: Any, *, owner_team_id: str | None = None) -> list[dict[str, Any]]:
    """Test/diagnostic accessor: return log rows as dicts, ordered by ``notified_at``.

    Args:
        store: Object whose ``connect()`` returns a DB-API connection.
        owner_team_id: When given, only rows owned by that team or with a NULL
            ``owner_team_id`` are returned; when ``None``, all rows are returned.

    Returns:
        One ``{column_name: value}`` dict per row, keys in table column order.

    The dicts are built from ``cursor.description`` rather than ``dict(row)``,
    so the result does not depend on the connection's ``row_factory``: plain
    tuple rows and ``sqlite3.Row`` rows give the same output.
    """
    with closing(store.connect()) as conn:
        if owner_team_id is None:
            cur = conn.execute(
                "select * from leader_notification_log order by notified_at"
            )
        else:
            cur = conn.execute(
                "select * from leader_notification_log where owner_team_id = ? "
                "or owner_team_id is null order by notified_at",
                (owner_team_id,),
            )
        # Read the column names before the connection is closed.
        columns = [column[0] for column in cur.description]
        rows = cur.fetchall()
    return [dict(zip(columns, row)) for row in rows]
125
+
126
+
127
+ __all__ = [
128
+ "claim_leader_notification_delivery",
129
+ "peek_leader_notification",
130
+ "prune_leader_notification_log",
131
+ "leader_notification_log_rows",
132
+ ]
@@ -94,9 +94,152 @@ def requeue_delivery_exhausted_watchers(self) -> list[str]:
94
94
  ).fetchall()
95
95
  watcher_ids = [row[0] for row in rows]
96
96
  if watcher_ids:
97
+ # Phase D hotfix-3 (78055bc) cleared notified_message_id here; Gap 32 dedupe
98
+ # reverses that — preserve notified_message_id so the retry path can re-confirm
99
+ # (or skip if the same result_id was already injected on a different pane_id).
97
100
  conn.execute(
98
101
  "update result_watchers "
99
- "set status = 'notify_failed', error = null, notified_message_id = null, completed_at = null "
102
+ "set status = 'notify_failed', error = null, completed_at = null "
100
103
  "where status = 'delivery_exhausted'"
101
104
  )
102
105
  return watcher_ids
106
+
107
+
108
def claim_leader_notification(
    store: Any,
    owner_team_id: str | None,
    result_id: str | None,
    watcher_id: str,
    proposed_token: str,
) -> dict[str, Any]:
    """DEPRECATED no-op shim (Stage 12 roundtable retirement).

    The watcher-table UPSERT did not actually prevent duplicate leader-pane
    injections in Mac mini real flow because two independent code paths
    (scheduled_event branch + result_watchers branch) emit deliver_attempt
    without coordinating at the watcher level. It was replaced by
    leader_notification_log.claim_leader_notification_delivery, consulted inside
    _send_to_leader_receiver.

    Kept so legacy callers / tests that still import this symbol don't crash on
    import. It does NOT perform a claim, never touches ``store``, and should NOT
    be used in new code.

    Returns:
        Always ``{"status": "deprecated_noop", "canonical_message_id": None}``,
        whatever the arguments (including a missing ``result_id``).
    """
    # The old `if not result_id` branch returned this same dict, so it was dropped.
    return {"status": "deprecated_noop", "canonical_message_id": None}
126
+
127
+
128
def _claim_leader_notification_disabled_impl(  # legacy reference for archaeology
    store: Any,
    owner_team_id: str | None,
    result_id: str | None,
    watcher_id: str,
    proposed_token: str,
) -> dict[str, Any]:
    """Retired watcher-table claim, kept only as a reference implementation.

    Inside one ``BEGIN IMMEDIATE`` transaction on ``result_watchers``:

    1. If any watcher for ``result_id`` (scoped to ``owner_team_id`` or NULL-team
       rows when a team is given) already has a ``notified_message_id``, report
       ``already_notified_by`` with that id.
    2. Otherwise try to stamp ``proposed_token`` on ``watcher_id`` where it is
       still NULL; one updated row means ``claimed_by_you``.
    3. Otherwise report ``already_notified_by`` with whatever id the watcher
       row currently holds (``None`` if the row is missing or empty).

    A falsy ``result_id`` returns ``no_result_id`` without opening a connection.
    Any exception triggers a best-effort ROLLBACK and is re-raised.
    """
    if not result_id:
        return {"status": "no_result_id", "canonical_message_id": None}

    # Pick the sibling lookup up front; the two variants differ only in team scoping.
    if owner_team_id is None:
        sibling_sql = (
            "select notified_message_id from result_watchers "
            "where result_id = ? and notified_message_id is not null "
            "order by coalesce(completed_at, created_at) limit 1"
        )
        sibling_args: tuple[Any, ...] = (result_id,)
    else:
        sibling_sql = (
            "select notified_message_id from result_watchers "
            "where result_id = ? and notified_message_id is not null "
            "and (owner_team_id = ? or owner_team_id is null) "
            "order by coalesce(completed_at, created_at) limit 1"
        )
        sibling_args = (result_id, owner_team_id)

    with closing(store.connect()) as conn:
        # Manual transaction control: take the write lock explicitly.
        conn.isolation_level = None
        try:
            conn.execute("BEGIN IMMEDIATE")
            sibling = conn.execute(sibling_sql, sibling_args).fetchone()
            if sibling and sibling[0]:
                outcome = {"status": "already_notified_by", "canonical_message_id": sibling[0]}
            else:
                stamped = conn.execute(
                    "update result_watchers "
                    "set notified_message_id = ?, result_id = coalesce(result_id, ?) "
                    "where watcher_id = ? and notified_message_id is null",
                    (proposed_token, result_id, watcher_id),
                ).rowcount
                if stamped == 1:
                    outcome = {"status": "claimed_by_you", "canonical_message_id": proposed_token}
                else:
                    current = conn.execute(
                        "select notified_message_id from result_watchers where watcher_id = ?",
                        (watcher_id,),
                    ).fetchone()
                    outcome = {
                        "status": "already_notified_by",
                        "canonical_message_id": (current[0] if current else None) or None,
                    }
            conn.execute("COMMIT")
            return outcome
        except Exception:
            try:
                conn.execute("ROLLBACK")
            except Exception:
                # Nothing left to undo (e.g. no active transaction); surface the original error.
                pass
            raise
        finally:
            conn.isolation_level = ""  # restore default
185
+
186
+
187
def release_leader_notification_claim(
    store: Any,
    watcher_id: str,
    expected_token: str,
) -> bool:
    """Clear a watcher's sentinel claim so a later retry can claim again.

    The ``notified_message_id`` of ``watcher_id`` is set to NULL only if it
    still equals ``expected_token``. Returns True iff exactly one row was
    updated, i.e. the claim we owned was released.
    """
    with closing(store.connect()) as conn, conn:
        released = conn.execute(
            "update result_watchers set notified_message_id = null "
            "where watcher_id = ? and notified_message_id = ?",
            (watcher_id, expected_token),
        ).rowcount
    return released == 1
202
+
203
+
204
def promote_leader_notification_id(
    store: Any,
    watcher_id: str,
    sentinel_token: str,
    real_message_id: str,
) -> bool:
    """Swap a watcher's sentinel claim for the real message id after delivery.

    The update applies only while ``notified_message_id`` of ``watcher_id``
    still equals ``sentinel_token``. Returns True iff exactly one row was
    updated.
    """
    with closing(store.connect()) as conn, conn:
        promoted = conn.execute(
            "update result_watchers set notified_message_id = ? "
            "where watcher_id = ? and notified_message_id = ?",
            (real_message_id, watcher_id, sentinel_token),
        ).rowcount
    return promoted == 1
220
+
221
+
222
def leader_notified_message_id_for_result(
    store: Any,
    owner_team_id: str | None,
    result_id: str | None,
) -> str | None:
    """Return the earliest recorded ``notified_message_id`` for a result, if any.

    Looks in ``result_watchers`` for rows with the given ``result_id`` and a
    non-NULL ``notified_message_id``, ordered by
    ``coalesce(completed_at, created_at)``. When ``owner_team_id`` is given,
    only rows for that team or with a NULL team are considered. A falsy
    ``result_id`` returns ``None`` without touching the store.
    """
    if not result_id:
        return None
    if owner_team_id is None:
        query = (
            "select notified_message_id from result_watchers "
            "where result_id = ? and notified_message_id is not null "
            "order by coalesce(completed_at, created_at) limit 1"
        )
        args: tuple[Any, ...] = (result_id,)
    else:
        query = (
            "select notified_message_id from result_watchers "
            "where result_id = ? and notified_message_id is not null "
            "and (owner_team_id = ? or owner_team_id is null) "
            "order by coalesce(completed_at, created_at) limit 1"
        )
        args = (result_id, owner_team_id)
    with closing(store.connect()) as conn:
        hit = conn.execute(query, args).fetchone()
    if not hit:
        return None
    return hit[0]
@@ -231,6 +231,29 @@ def initialize_schema(conn: sqlite3.Connection) -> None:
231
231
  RESULT_WATCHER_COLUMNS,
232
232
  {"owner_team_id": "alter table result_watchers add column owner_team_id text"},
233
233
  )
234
+ # Stage 12 (Gap 26 ∩ Gap 32 roundtable consolidation 2026-05-26): dedupe leader
235
+ # notifications at the injection boundary, keyed by (result_id, leader_session_uuid).
236
+ # UNIQUE primary key + INSERT OR IGNORE in claim_leader_notification_delivery gives
237
+ # atomic exactly-once without an advisory lock. Retires the bad6484 watcher-table
238
+ # UPSERT approach.
239
+ conn.execute(
240
+ """
241
+ create table if not exists leader_notification_log (
242
+ result_id text not null,
243
+ leader_session_uuid text not null,
244
+ notified_message_id text not null,
245
+ notified_at text not null,
246
+ leader_pane_id_at_notify text,
247
+ envelope_content_hash text,
248
+ owner_team_id text,
249
+ primary key (result_id, leader_session_uuid)
250
+ )
251
+ """
252
+ )
253
+ conn.execute(
254
+ "create index if not exists idx_leader_notification_log_uuid "
255
+ "on leader_notification_log(leader_session_uuid, notified_at)"
256
+ )
234
257
  conn.execute("create index if not exists idx_messages_owner_team_id on messages(owner_team_id)")
235
258
  conn.execute("create index if not exists idx_scheduled_events_owner_team_id on scheduled_events(owner_team_id)")
236
259
  conn.execute("create index if not exists idx_agent_health_owner_team_id on agent_health(owner_team_id)")
@@ -6,7 +6,7 @@ from typing import Any
6
6
 
7
7
  from team_agent.events import EventLog
8
8
  from team_agent.message_store import MessageStore
9
- from team_agent.messaging.deps import load_spec, save_runtime_state, team_state_key
9
+ from team_agent.messaging.deps import load_runtime_state, load_spec, save_runtime_state, team_state_key
10
10
  from team_agent.messaging.internal_delivery import deliver_stored_message
11
11
 
12
12
 
@@ -27,6 +27,21 @@ STABLE_IDLE_SECONDS = 120
27
27
  FIRE_DEBOUNCE_SECONDS = 300
28
28
  OBLIGATION_PENDING_MIN_AGE_SECONDS = 60
29
29
 
30
+ # Event-log progress signal (Gap 32 §"Idle-Detector False Positive Continues Post Phase G hotfix-3"):
31
+ # the team_last_progress_at calculation must also count leader-side sends and worker MCP calls
32
+ # as recent team activity, not only agent_health.last_output_at. Without this, a worker that has
33
+ # called MCP but not yet emitted a visible turn shows up as idle and the idle reminder fires
34
+ # spuriously inside the stable-idle window.
35
+ _PROGRESS_EVENT_TYPES = frozenset({
36
+ "send.deliver_attempt",
37
+ "leader_receiver.deliver_attempt",
38
+ "mcp.report_result",
39
+ "mcp.send_message",
40
+ })
41
+ _PROGRESS_EVENT_PREFIXES = ("mcp.read_",)
42
+ _PROGRESS_EVENT_WINDOW_SECONDS = 300
43
+ _PROGRESS_EVENT_TAIL_LIMIT = 1000
44
+
30
45
 
31
46
  def _parse_iso(text: Any) -> datetime | None:
32
47
  if not isinstance(text, str) or not text:
@@ -62,24 +77,105 @@ def _team_last_progress_at(
62
77
  state: dict[str, Any],
63
78
  store: MessageStore,
64
79
  owner_team_id: str,
65
- ) -> datetime | None:
66
- candidates: list[datetime] = []
80
+ event_log: EventLog | None = None,
81
+ now: datetime | None = None,
82
+ workspace: Path | None = None,
83
+ ) -> tuple[datetime | None, str | None]:
84
+ sources: list[tuple[datetime, str]] = []
67
85
  coordinator = state.get("coordinator") or {}
68
86
  explicit = (coordinator.get("team_last_progress_at") or {}).get(owner_team_id)
69
87
  if isinstance(explicit, dict):
70
88
  ts = _parse_iso(explicit.get("at"))
71
89
  if ts:
72
- candidates.append(ts)
90
+ sources.append((ts, "explicit_marker"))
73
91
  elif isinstance(explicit, str):
74
92
  ts = _parse_iso(explicit)
75
93
  if ts:
76
- candidates.append(ts)
94
+ sources.append((ts, "explicit_marker"))
77
95
  health = store.agent_health(owner_team_id=owner_team_id)
78
96
  for row in health.values():
79
97
  ts = _parse_iso(row.get("last_output_at"))
80
98
  if ts:
81
- candidates.append(ts)
82
- return max(candidates) if candidates else None
99
+ sources.append((ts, "agent_health.last_output_at"))
100
+ if event_log is not None:
101
+ # Spark MEDIUM #3 (d9f740d): in multi-team workspaces an unscoped progress event in
102
+ # team A's activity must NOT suppress team B's idle_fallback. require_team_scope=True
103
+ # when the workspace has more than one team so unscoped events are ignored. The
104
+ # team-scoped state passed in here does not carry the workspace-level `teams` dict, so
105
+ # we re-read the workspace state from disk to detect multi-team shape.
106
+ require_team_scope = False
107
+ teams = state.get("teams")
108
+ if isinstance(teams, dict) and len(teams) > 1:
109
+ require_team_scope = True
110
+ elif workspace is not None:
111
+ try:
112
+ ws_teams = (load_runtime_state(workspace).get("teams") or {})
113
+ except Exception:
114
+ ws_teams = {}
115
+ if isinstance(ws_teams, dict) and len(ws_teams) > 1:
116
+ require_team_scope = True
117
+ event_ts = _scan_event_progress_signals(
118
+ event_log, owner_team_id, now or datetime.now(timezone.utc),
119
+ require_team_scope=require_team_scope,
120
+ )
121
+ if event_ts:
122
+ sources.append((event_ts, "event_log"))
123
+ if not sources:
124
+ return None, None
125
+ sources.sort(key=lambda item: item[0], reverse=True)
126
+ return sources[0]
127
+
128
+
129
+ # Stage 14 (Gap 36b) — mtime cache per (workspace_path, owner_team_id, require_team_scope).
130
+ # Mac mini 2026-05-26 evidence: _scan_event_progress_signals was a 22% CPU hot path because
131
+ # every 2-second coordinator tick parsed up to 1000 events from a 28 MB events.jsonl. With
132
+ # the cache, the parse only re-runs when the file changes; quiet workspaces pay zero file
133
+ # I/O between writes.
134
+ _PROGRESS_SCAN_CACHE: dict[tuple[str, str, bool], tuple[float, datetime | None]] = {}
135
+
136
+
137
def _scan_event_progress_signals(
    event_log: EventLog,
    owner_team_id: str,
    now: datetime,
    *,
    require_team_scope: bool = False,
) -> datetime | None:
    """Return the newest progress-event timestamp for a team within the lookback window.

    An event counts as progress when its ``event`` field is in
    ``_PROGRESS_EVENT_TYPES`` or starts with one of ``_PROGRESS_EVENT_PREFIXES``.
    Events tagged with another team (``team`` / ``owner_team_id``) are ignored;
    untagged events are ignored only when ``require_team_scope`` is true.

    Args:
        event_log: Log whose ``path`` is stat-ed and whose
            ``tail(_PROGRESS_EVENT_TAIL_LIMIT)`` supplies the events to scan.
        owner_team_id: Team whose activity is being measured.
        now: Reference time; the window is the
            ``_PROGRESS_EVENT_WINDOW_SECONDS`` seconds before it.
        require_team_scope: Skip events that carry no team tag.

    Returns:
        The latest matching timestamp that is not older than the window start,
        or ``None`` if there is none or the log file does not exist.

    Caching: the scan result is memoised per (path, team, require_team_scope)
    and reused while the file's mtime is unchanged. The cache holds the newest
    matching timestamp regardless of the window, and the window is applied on
    every call. That keeps cached and fresh answers identical for the same
    ``now``; a hit can no longer return a timestamp that has aged out of the
    window.
    """
    cache_key = (str(event_log.path), owner_team_id, require_team_scope)
    try:
        current_mtime = event_log.path.stat().st_mtime
    except FileNotFoundError:
        _PROGRESS_SCAN_CACHE.pop(cache_key, None)
        return None

    cached = _PROGRESS_SCAN_CACHE.get(cache_key)
    if cached is not None and cached[0] == current_mtime:
        latest = cached[1]
    else:
        latest = None
        for event in event_log.tail(_PROGRESS_EVENT_TAIL_LIMIT):
            event_type = str(event.get("event") or "")
            # str.startswith takes the whole prefix tuple in one call.
            if event_type not in _PROGRESS_EVENT_TYPES and not event_type.startswith(
                _PROGRESS_EVENT_PREFIXES
            ):
                continue
            event_team = event.get("team") or event.get("owner_team_id")
            if event_team is None:
                if require_team_scope:
                    continue
            elif event_team != owner_team_id:
                continue
            ts = _parse_iso(event.get("ts"))
            if ts and (latest is None or ts > latest):
                latest = ts
        _PROGRESS_SCAN_CACHE[cache_key] = (current_mtime, latest)

    # Apply the window at return time so the answer follows `now`, not the
    # moment the file was last parsed.
    window_start = now - timedelta(seconds=_PROGRESS_EVENT_WINDOW_SECONDS)
    if latest is None or latest < window_start:
        return None
    return latest
174
+
175
+
176
def _reset_progress_scan_cache() -> None:
    """Test-only hook: empty the progress-scan cache.

    Clears every entry in ``_PROGRESS_SCAN_CACHE`` so the next
    ``_scan_event_progress_signals`` call re-reads the event log.
    """
    _PROGRESS_SCAN_CACHE.clear()
83
179
 
84
180
 
85
181
  def _team_last_idle_fallback_fire_at(state: dict[str, Any], owner_team_id: str) -> datetime | None:
@@ -209,14 +305,18 @@ def detect_idle_fallbacks(
209
305
  record_team_progress(state, now, source="all_workers_idle:false", owner_team_id=owner_team_id)
210
306
  save_runtime_state(workspace, state)
211
307
  return []
212
- last_progress = _team_last_progress_at(state, store, owner_team_id)
308
+ last_progress, progress_source = _team_last_progress_at(
309
+ state, store, owner_team_id, event_log=event_log, now=now, workspace=workspace,
310
+ )
213
311
  if last_progress and (now - last_progress) < timedelta(seconds=STABLE_IDLE_SECONDS):
312
+ reason = "recent_team_progress" if progress_source == "event_log" else "stable_idle_window"
214
313
  event_log.write(
215
314
  "coordinator.idle_fallback_skipped",
216
- reason="stable_idle_window",
315
+ reason=reason,
217
316
  team=owner_team_id,
218
317
  stable_idle_seconds=STABLE_IDLE_SECONDS,
219
318
  elapsed_seconds=int((now - last_progress).total_seconds()),
319
+ progress_source=progress_source,
220
320
  )
221
321
  return []
222
322
  last_fire = _team_last_idle_fallback_fire_at(state, owner_team_id)