@team-agent/installer 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/team_agent/cli/commands.py +18 -3
- package/src/team_agent/cli/parser.py +33 -1
- package/src/team_agent/coordinator/__main__.py +21 -2
- package/src/team_agent/coordinator/lifecycle.py +8 -0
- package/src/team_agent/diagnose/orphan_cleanup.py +193 -0
- package/src/team_agent/events.py +47 -0
- package/src/team_agent/leader/__init__.py +273 -60
- package/src/team_agent/lifecycle/agents.py +54 -2
- package/src/team_agent/lifecycle/operations.py +86 -9
- package/src/team_agent/message_store/leader_notification_log.py +132 -0
- package/src/team_agent/message_store/result_watchers.py +144 -1
- package/src/team_agent/message_store/schema.py +23 -0
- package/src/team_agent/messaging/idle_alerts.py +109 -9
- package/src/team_agent/messaging/leader.py +166 -6
- package/src/team_agent/messaging/leader_panes.py +193 -23
- package/src/team_agent/messaging/result_delivery.py +219 -4
- package/src/team_agent/messaging/results.py +12 -21
- package/src/team_agent/messaging/scheduler.py +12 -2
- package/src/team_agent/runtime.py +4 -4
- package/src/team_agent/rust_core.py +157 -3
- package/src/team_agent/state.py +153 -10
- package/src/team_agent/status/inbox.py +33 -3
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
"""Stage 12 (Gap 26 ∩ Gap 32 roundtable consolidation 2026-05-26): atomic exactly-once
|
|
2
|
+
dedupe at the leader-pane injection boundary, keyed by (result_id, leader_session_uuid).
|
|
3
|
+
|
|
4
|
+
Replaces the bad6484 watcher-table UPSERT approach. UNIQUE primary key + SQLite
|
|
5
|
+
INSERT OR IGNORE gives an atomic claim that works across processes (CLI subprocess
|
|
6
|
+
vs coordinator daemon) and across threads without an advisory lock. Distinct
|
|
7
|
+
leader_session_uuid values (e.g. after takeover) each get their own row so a
|
|
8
|
+
re-takeover legitimately allows another delivery for the same result_id.
|
|
9
|
+
"""
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from contextlib import closing
|
|
13
|
+
from datetime import datetime, timedelta, timezone
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def claim_leader_notification_delivery(
    store: Any,
    *,
    result_id: str,
    leader_session_uuid: str,
    proposed_message_id: str,
    envelope_hash: str,
    owner_team_id: str | None,
    pane_id: str | None,
) -> dict[str, Any]:
    """Try to claim the delivery slot for (result_id, leader_session_uuid).

    A single ``INSERT OR IGNORE`` into ``leader_notification_log`` decides the
    outcome:

    * the insert reports one row -> ``status == "claimed_by_you"``; the caller
      won and may fire the inject. The payload echoes the proposed message id,
      the claim timestamp and ``envelope_hash``.
    * the insert reports zero rows -> the stored row for the same key is read
      back and returned with ``status == "already_notified_by"`` so the caller
      can suppress (same envelope hash) or surface a legitimate duplicate
      (different envelope hash).
    """
    claimed_at = datetime.now(timezone.utc).isoformat()
    # Payload used whenever this caller is treated as the winner.
    won = {
        "status": "claimed_by_you",
        "notified_message_id": proposed_message_id,
        "notified_at": claimed_at,
        "envelope_content_hash": envelope_hash,
    }
    insert_sql = (
        "insert or ignore into leader_notification_log("
        " result_id, leader_session_uuid, notified_message_id, notified_at,"
        " leader_pane_id_at_notify, envelope_content_hash, owner_team_id"
        ") values (?, ?, ?, ?, ?, ?, ?)"
    )
    lookup_sql = (
        "select notified_message_id, notified_at, envelope_content_hash, "
        "leader_pane_id_at_notify from leader_notification_log "
        "where result_id = ? and leader_session_uuid = ?"
    )
    insert_args = (
        result_id, leader_session_uuid, proposed_message_id, claimed_at,
        pane_id, envelope_hash, owner_team_id,
    )
    # ``closing`` releases the connection; the connection's own context manager
    # wraps the insert and the follow-up select in one transaction.
    with closing(store.connect()) as conn, conn:
        if conn.execute(insert_sql, insert_args).rowcount == 1:
            return won
        existing = conn.execute(lookup_sql, (result_id, leader_session_uuid)).fetchone()
    if existing is None:
        # Defensive: the insert was ignored yet no row matches the key.
        # NOTE(review): INSERT OR IGNORE also skips rows that violate NOT NULL,
        # so a None key/message id would land here without persisting a claim
        # - confirm callers never pass None.
        return won
    prior_message_id, prior_at, prior_hash, prior_pane = existing
    return {
        "status": "already_notified_by",
        "notified_message_id": prior_message_id,
        "notified_at": prior_at,
        "envelope_content_hash": prior_hash,
        "leader_pane_id_at_notify": prior_pane,
    }
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def peek_leader_notification(
    store: Any,
    *,
    result_id: str,
    leader_session_uuid: str,
) -> dict[str, Any] | None:
    """Look up the log row for (result_id, leader_session_uuid) without writing.

    Stage 12 fast-path: per the original note, notify_result_watchers uses this
    to short-circuit before calling deliver_stored_message, while the
    authoritative atomic claim still happens at the _send_to_leader_receiver
    injection boundary.

    Returns a dict of the stored columns, or None when no row exists.
    """
    # Result keys, in the same order as the selected columns.
    fields = (
        "notified_message_id",
        "notified_at",
        "envelope_content_hash",
        "leader_pane_id_at_notify",
        "owner_team_id",
    )
    with closing(store.connect()) as conn:
        found = conn.execute(
            "select notified_message_id, notified_at, envelope_content_hash, "
            "leader_pane_id_at_notify, owner_team_id from leader_notification_log "
            "where result_id = ? and leader_session_uuid = ?",
            (result_id, leader_session_uuid),
        ).fetchone()
    return None if found is None else dict(zip(fields, found))
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def prune_leader_notification_log(store: Any, *, max_age_hours: int = 24) -> int:
    """Delete log rows whose notified_at is older than max_age_hours.

    Intended as coordinator-tick maintenance. The cutoff is computed in UTC and
    compared against the stored ISO-8601 text. Returns the number of rows the
    DELETE reported (0 when it reported none).
    """
    oldest_kept = datetime.now(timezone.utc) - timedelta(hours=max_age_hours)
    with closing(store.connect()) as conn, conn:
        deleted = conn.execute(
            "delete from leader_notification_log where notified_at < ?",
            (oldest_kept.isoformat(),),
        ).rowcount
    return deleted if deleted else 0
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def leader_notification_log_rows(store: Any, *, owner_team_id: str | None = None) -> list[dict[str, Any]]:
    """Return log rows as dicts, ordered by notified_at (test/diagnostic accessor).

    With ``owner_team_id`` set, only rows owned by that team or rows with no
    owner are returned; otherwise every row is returned.

    NOTE(review): each row is passed to ``dict(...)``, so this presumably relies
    on ``store.connect()`` yielding mapping-style rows (e.g. ``sqlite3.Row``) -
    confirm against the store implementation.
    """
    if owner_team_id is None:
        query = "select * from leader_notification_log order by notified_at"
        params: tuple[Any, ...] = ()
    else:
        query = (
            "select * from leader_notification_log where owner_team_id = ? "
            "or owner_team_id is null order by notified_at"
        )
        params = (owner_team_id,)
    with closing(store.connect()) as conn:
        fetched = conn.execute(query, params).fetchall()
    return list(map(dict, fetched))
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
# Names exported by this module.
__all__ = [
    "claim_leader_notification_delivery",
    "peek_leader_notification",
    "prune_leader_notification_log",
    "leader_notification_log_rows",
]
|
|
@@ -94,9 +94,152 @@ def requeue_delivery_exhausted_watchers(self) -> list[str]:
|
|
|
94
94
|
).fetchall()
|
|
95
95
|
watcher_ids = [row[0] for row in rows]
|
|
96
96
|
if watcher_ids:
|
|
97
|
+
# Phase D hotfix-3 (78055bc) cleared notified_message_id here; Gap 32 dedupe
|
|
98
|
+
# reverses that — preserve notified_message_id so the retry path can re-confirm
|
|
99
|
+
# (or skip if the same result_id was already injected on a different pane_id).
|
|
97
100
|
conn.execute(
|
|
98
101
|
"update result_watchers "
|
|
99
|
-
"set status = 'notify_failed', error = null,
|
|
102
|
+
"set status = 'notify_failed', error = null, completed_at = null "
|
|
100
103
|
"where status = 'delivery_exhausted'"
|
|
101
104
|
)
|
|
102
105
|
return watcher_ids
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def claim_leader_notification(
    store: Any,
    owner_team_id: str | None,
    result_id: str | None,
    watcher_id: str,
    proposed_token: str,
) -> dict[str, Any]:
    """DEPRECATED (Stage 12 roundtable retirement) no-op shim.

    The watcher-table UPSERT did not actually prevent duplicate leader-pane
    injections in Mac mini real flow because two independent code paths
    (scheduled_event branch + result_watchers branch) emit deliver_attempt
    without coordinating at the watcher level. Replaced by
    leader_notification_log.claim_leader_notification_delivery consulted inside
    _send_to_leader_receiver.

    Kept so legacy callers / tests that still import this symbol don't crash on
    import. It does NOT perform a claim and should NOT be used in new code.

    Returns:
        Always ``{"status": "deprecated_noop", "canonical_message_id": None}``;
        every argument is ignored.
    """
    # The result is the same whether or not result_id is set, so no branch on
    # it is needed.
    return {"status": "deprecated_noop", "canonical_message_id": None}
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _claim_leader_notification_disabled_impl( # legacy reference for archaeology
|
|
129
|
+
store: Any,
|
|
130
|
+
owner_team_id: str | None,
|
|
131
|
+
result_id: str | None,
|
|
132
|
+
watcher_id: str,
|
|
133
|
+
proposed_token: str,
|
|
134
|
+
) -> dict[str, Any]:
|
|
135
|
+
if not result_id:
|
|
136
|
+
return {"status": "no_result_id", "canonical_message_id": None}
|
|
137
|
+
with closing(store.connect()) as conn:
|
|
138
|
+
conn.isolation_level = None
|
|
139
|
+
try:
|
|
140
|
+
conn.execute("BEGIN IMMEDIATE")
|
|
141
|
+
if owner_team_id is None:
|
|
142
|
+
sibling = conn.execute(
|
|
143
|
+
"select notified_message_id from result_watchers "
|
|
144
|
+
"where result_id = ? and notified_message_id is not null "
|
|
145
|
+
"order by coalesce(completed_at, created_at) limit 1",
|
|
146
|
+
(result_id,),
|
|
147
|
+
).fetchone()
|
|
148
|
+
else:
|
|
149
|
+
sibling = conn.execute(
|
|
150
|
+
"select notified_message_id from result_watchers "
|
|
151
|
+
"where result_id = ? and notified_message_id is not null "
|
|
152
|
+
"and (owner_team_id = ? or owner_team_id is null) "
|
|
153
|
+
"order by coalesce(completed_at, created_at) limit 1",
|
|
154
|
+
(result_id, owner_team_id),
|
|
155
|
+
).fetchone()
|
|
156
|
+
if sibling and sibling[0]:
|
|
157
|
+
conn.execute("COMMIT")
|
|
158
|
+
return {"status": "already_notified_by", "canonical_message_id": sibling[0]}
|
|
159
|
+
cur = conn.execute(
|
|
160
|
+
"update result_watchers "
|
|
161
|
+
"set notified_message_id = ?, result_id = coalesce(result_id, ?) "
|
|
162
|
+
"where watcher_id = ? and notified_message_id is null",
|
|
163
|
+
(proposed_token, result_id, watcher_id),
|
|
164
|
+
)
|
|
165
|
+
if cur.rowcount == 1:
|
|
166
|
+
conn.execute("COMMIT")
|
|
167
|
+
return {"status": "claimed_by_you", "canonical_message_id": proposed_token}
|
|
168
|
+
row = conn.execute(
|
|
169
|
+
"select notified_message_id from result_watchers where watcher_id = ?",
|
|
170
|
+
(watcher_id,),
|
|
171
|
+
).fetchone()
|
|
172
|
+
conn.execute("COMMIT")
|
|
173
|
+
return {
|
|
174
|
+
"status": "already_notified_by",
|
|
175
|
+
"canonical_message_id": (row[0] if row else None) or None,
|
|
176
|
+
}
|
|
177
|
+
except Exception:
|
|
178
|
+
try:
|
|
179
|
+
conn.execute("ROLLBACK")
|
|
180
|
+
except Exception:
|
|
181
|
+
pass
|
|
182
|
+
raise
|
|
183
|
+
finally:
|
|
184
|
+
conn.isolation_level = "" # restore default
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def release_leader_notification_claim(
    store: Any,
    watcher_id: str,
    expected_token: str,
) -> bool:
    """Drop a sentinel claim after a failed delivery so a retry can re-claim.

    The watcher's notified_message_id is reset to null only when it still holds
    ``expected_token``. Returns True iff exactly one row was updated, i.e. the
    claim released was the one we owned.
    """
    with closing(store.connect()) as conn, conn:
        cleared = conn.execute(
            "update result_watchers set notified_message_id = null "
            "where watcher_id = ? and notified_message_id = ?",
            (watcher_id, expected_token),
        ).rowcount
    return cleared == 1
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def promote_leader_notification_id(
    store: Any,
    watcher_id: str,
    sentinel_token: str,
    real_message_id: str,
) -> bool:
    """Swap a watcher's sentinel claim for the real message id after delivery.

    The update only applies while the watcher still holds ``sentinel_token``.
    Returns True iff exactly one row was updated.
    """
    with closing(store.connect()) as conn, conn:
        promoted = conn.execute(
            "update result_watchers set notified_message_id = ? "
            "where watcher_id = ? and notified_message_id = ?",
            (real_message_id, watcher_id, sentinel_token),
        ).rowcount
    return promoted == 1
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def leader_notified_message_id_for_result(
    store: Any,
    owner_team_id: str | None,
    result_id: str | None,
) -> str | None:
    """Return the notified_message_id already recorded for ``result_id``, if any.

    Picks the watcher row with the earliest ``coalesce(completed_at,
    created_at)`` among rows for ``result_id`` that have a non-null
    notified_message_id. When ``owner_team_id`` is given, only rows owned by
    that team or with no owner are considered. Returns None for a falsy
    ``result_id`` or when no row matches.
    """
    if not result_id:
        return None

    # The team filter is spliced in only when a team id was supplied.
    query = (
        "select notified_message_id from result_watchers "
        "where result_id = ? and notified_message_id is not null "
    )
    args: tuple[Any, ...] = (result_id,)
    if owner_team_id is not None:
        query += "and (owner_team_id = ? or owner_team_id is null) "
        args += (owner_team_id,)
    query += "order by coalesce(completed_at, created_at) limit 1"

    with closing(store.connect()) as conn:
        hit = conn.execute(query, args).fetchone()
    if hit:
        return hit[0]
    return None
|
|
@@ -231,6 +231,29 @@ def initialize_schema(conn: sqlite3.Connection) -> None:
|
|
|
231
231
|
RESULT_WATCHER_COLUMNS,
|
|
232
232
|
{"owner_team_id": "alter table result_watchers add column owner_team_id text"},
|
|
233
233
|
)
|
|
234
|
+
# Stage 12 (Gap 26 ∩ Gap 32 roundtable consolidation 2026-05-26): dedupe leader
|
|
235
|
+
# notifications at the injection boundary, keyed by (result_id, leader_session_uuid).
|
|
236
|
+
# UNIQUE primary key + INSERT OR IGNORE in claim_leader_notification_delivery gives
|
|
237
|
+
# atomic exactly-once without an advisory lock. Retires the bad6484 watcher-table
|
|
238
|
+
# UPSERT approach.
|
|
239
|
+
conn.execute(
|
|
240
|
+
"""
|
|
241
|
+
create table if not exists leader_notification_log (
|
|
242
|
+
result_id text not null,
|
|
243
|
+
leader_session_uuid text not null,
|
|
244
|
+
notified_message_id text not null,
|
|
245
|
+
notified_at text not null,
|
|
246
|
+
leader_pane_id_at_notify text,
|
|
247
|
+
envelope_content_hash text,
|
|
248
|
+
owner_team_id text,
|
|
249
|
+
primary key (result_id, leader_session_uuid)
|
|
250
|
+
)
|
|
251
|
+
"""
|
|
252
|
+
)
|
|
253
|
+
conn.execute(
|
|
254
|
+
"create index if not exists idx_leader_notification_log_uuid "
|
|
255
|
+
"on leader_notification_log(leader_session_uuid, notified_at)"
|
|
256
|
+
)
|
|
234
257
|
conn.execute("create index if not exists idx_messages_owner_team_id on messages(owner_team_id)")
|
|
235
258
|
conn.execute("create index if not exists idx_scheduled_events_owner_team_id on scheduled_events(owner_team_id)")
|
|
236
259
|
conn.execute("create index if not exists idx_agent_health_owner_team_id on agent_health(owner_team_id)")
|
|
@@ -6,7 +6,7 @@ from typing import Any
|
|
|
6
6
|
|
|
7
7
|
from team_agent.events import EventLog
|
|
8
8
|
from team_agent.message_store import MessageStore
|
|
9
|
-
from team_agent.messaging.deps import load_spec, save_runtime_state, team_state_key
|
|
9
|
+
from team_agent.messaging.deps import load_runtime_state, load_spec, save_runtime_state, team_state_key
|
|
10
10
|
from team_agent.messaging.internal_delivery import deliver_stored_message
|
|
11
11
|
|
|
12
12
|
|
|
@@ -27,6 +27,21 @@ STABLE_IDLE_SECONDS = 120
|
|
|
27
27
|
FIRE_DEBOUNCE_SECONDS = 300
|
|
28
28
|
OBLIGATION_PENDING_MIN_AGE_SECONDS = 60
|
|
29
29
|
|
|
30
|
+
# Event-log progress signal (Gap 32 §"Idle-Detector False Positive Continues Post Phase G hotfix-3"):
# the team_last_progress_at calculation must also count leader-side sends and worker MCP calls
# as recent team activity, not only agent_health.last_output_at. Without this, a worker that has
# called MCP but not yet emitted a visible turn shows up as idle and the idle reminder fires
# spuriously inside the stable-idle window.
# Event names that count as progress when matched exactly.
_PROGRESS_EVENT_TYPES = frozenset({
    "send.deliver_attempt",
    "leader_receiver.deliver_attempt",
    "mcp.report_result",
    "mcp.send_message",
})
# Event-name prefixes that also count as progress.
_PROGRESS_EVENT_PREFIXES = ("mcp.read_",)
# Events older than this many seconds before "now" are ignored by the scan.
_PROGRESS_EVENT_WINDOW_SECONDS = 300
# Number of trailing event-log entries requested per scan.
_PROGRESS_EVENT_TAIL_LIMIT = 1000
|
|
44
|
+
|
|
30
45
|
|
|
31
46
|
def _parse_iso(text: Any) -> datetime | None:
|
|
32
47
|
if not isinstance(text, str) or not text:
|
|
@@ -62,24 +77,105 @@ def _team_last_progress_at(
|
|
|
62
77
|
state: dict[str, Any],
|
|
63
78
|
store: MessageStore,
|
|
64
79
|
owner_team_id: str,
|
|
65
|
-
|
|
66
|
-
|
|
80
|
+
event_log: EventLog | None = None,
|
|
81
|
+
now: datetime | None = None,
|
|
82
|
+
workspace: Path | None = None,
|
|
83
|
+
) -> tuple[datetime | None, str | None]:
|
|
84
|
+
sources: list[tuple[datetime, str]] = []
|
|
67
85
|
coordinator = state.get("coordinator") or {}
|
|
68
86
|
explicit = (coordinator.get("team_last_progress_at") or {}).get(owner_team_id)
|
|
69
87
|
if isinstance(explicit, dict):
|
|
70
88
|
ts = _parse_iso(explicit.get("at"))
|
|
71
89
|
if ts:
|
|
72
|
-
|
|
90
|
+
sources.append((ts, "explicit_marker"))
|
|
73
91
|
elif isinstance(explicit, str):
|
|
74
92
|
ts = _parse_iso(explicit)
|
|
75
93
|
if ts:
|
|
76
|
-
|
|
94
|
+
sources.append((ts, "explicit_marker"))
|
|
77
95
|
health = store.agent_health(owner_team_id=owner_team_id)
|
|
78
96
|
for row in health.values():
|
|
79
97
|
ts = _parse_iso(row.get("last_output_at"))
|
|
80
98
|
if ts:
|
|
81
|
-
|
|
82
|
-
|
|
99
|
+
sources.append((ts, "agent_health.last_output_at"))
|
|
100
|
+
if event_log is not None:
|
|
101
|
+
# Spark MEDIUM #3 (d9f740d): in multi-team workspaces an unscoped progress event in
|
|
102
|
+
# team A's activity must NOT suppress team B's idle_fallback. require_team_scope=True
|
|
103
|
+
# when the workspace has more than one team so unscoped events are ignored. The
|
|
104
|
+
# team-scoped state passed in here does not carry the workspace-level `teams` dict, so
|
|
105
|
+
# we re-read the workspace state from disk to detect multi-team shape.
|
|
106
|
+
require_team_scope = False
|
|
107
|
+
teams = state.get("teams")
|
|
108
|
+
if isinstance(teams, dict) and len(teams) > 1:
|
|
109
|
+
require_team_scope = True
|
|
110
|
+
elif workspace is not None:
|
|
111
|
+
try:
|
|
112
|
+
ws_teams = (load_runtime_state(workspace).get("teams") or {})
|
|
113
|
+
except Exception:
|
|
114
|
+
ws_teams = {}
|
|
115
|
+
if isinstance(ws_teams, dict) and len(ws_teams) > 1:
|
|
116
|
+
require_team_scope = True
|
|
117
|
+
event_ts = _scan_event_progress_signals(
|
|
118
|
+
event_log, owner_team_id, now or datetime.now(timezone.utc),
|
|
119
|
+
require_team_scope=require_team_scope,
|
|
120
|
+
)
|
|
121
|
+
if event_ts:
|
|
122
|
+
sources.append((event_ts, "event_log"))
|
|
123
|
+
if not sources:
|
|
124
|
+
return None, None
|
|
125
|
+
sources.sort(key=lambda item: item[0], reverse=True)
|
|
126
|
+
return sources[0]
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
# Stage 14 (Gap 36b) — mtime cache per (workspace_path, owner_team_id, require_team_scope).
# Mac mini 2026-05-26 evidence: _scan_event_progress_signals was a 22% CPU hot path because
# every 2-second coordinator tick parsed up to 1000 events from a 28 MB events.jsonl. With
# the cache, the parse only re-runs when the file changes; quiet workspaces pay zero file
# I/O between writes.
# Each value is (event-log mtime at scan time, latest progress timestamp found or None).
_PROGRESS_SCAN_CACHE: dict[tuple[str, str, bool], tuple[float, datetime | None]] = {}
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _scan_event_progress_signals(
    event_log: EventLog,
    owner_team_id: str,
    now: datetime,
    *,
    require_team_scope: bool = False,
) -> datetime | None:
    """Return the newest progress-event timestamp for a team inside the window.

    Scans the last ``_PROGRESS_EVENT_TAIL_LIMIT`` entries of ``event_log`` for
    events whose name is in ``_PROGRESS_EVENT_TYPES`` or starts with one of
    ``_PROGRESS_EVENT_PREFIXES``, and keeps the latest ``ts`` that is no older
    than ``_PROGRESS_EVENT_WINDOW_SECONDS`` before ``now``.

    Team matching: an event tagged with a different team is skipped. An
    untagged event is accepted unless ``require_team_scope`` is True.

    Results are cached per (log path, team, scope flag) and reused while the
    log file's mtime is unchanged. A cached timestamp is re-checked against the
    current window, so a cache hit never returns a value that a fresh scan
    would have filtered out as too old.

    Returns None when the log file does not exist or no event qualifies.
    """
    cache_key = (str(event_log.path), owner_team_id, require_team_scope)
    try:
        current_mtime = event_log.path.stat().st_mtime
    except FileNotFoundError:
        # No log file: forget any stale entry and report no progress.
        _PROGRESS_SCAN_CACHE.pop(cache_key, None)
        return None
    window_start = now - timedelta(seconds=_PROGRESS_EVENT_WINDOW_SECONDS)
    cached = _PROGRESS_SCAN_CACHE.get(cache_key)
    if cached is not None and cached[0] == current_mtime:
        cached_ts = cached[1]
        # The file is unchanged but time has moved on: apply the same recency
        # filter the uncached path uses.
        if cached_ts is not None and cached_ts < window_start:
            return None
        return cached_ts
    latest: datetime | None = None
    for event in event_log.tail(_PROGRESS_EVENT_TAIL_LIMIT):
        event_type = str(event.get("event") or "")
        if event_type not in _PROGRESS_EVENT_TYPES and not any(
            event_type.startswith(prefix) for prefix in _PROGRESS_EVENT_PREFIXES
        ):
            continue
        event_team = event.get("team") or event.get("owner_team_id")
        if event_team is None:
            # Unscoped event: only usable when team scoping is not required.
            if require_team_scope:
                continue
        elif event_team != owner_team_id:
            continue
        ts = _parse_iso(event.get("ts"))
        if not ts or ts < window_start:
            continue
        if latest is None or ts > latest:
            latest = ts
    _PROGRESS_SCAN_CACHE[cache_key] = (current_mtime, latest)
    return latest
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def _reset_progress_scan_cache() -> None:
    """Test-only hook to force re-scan.

    Empties the module-level ``_PROGRESS_SCAN_CACHE``.
    """
    _PROGRESS_SCAN_CACHE.clear()
|
|
83
179
|
|
|
84
180
|
|
|
85
181
|
def _team_last_idle_fallback_fire_at(state: dict[str, Any], owner_team_id: str) -> datetime | None:
|
|
@@ -209,14 +305,18 @@ def detect_idle_fallbacks(
|
|
|
209
305
|
record_team_progress(state, now, source="all_workers_idle:false", owner_team_id=owner_team_id)
|
|
210
306
|
save_runtime_state(workspace, state)
|
|
211
307
|
return []
|
|
212
|
-
last_progress = _team_last_progress_at(
|
|
308
|
+
last_progress, progress_source = _team_last_progress_at(
|
|
309
|
+
state, store, owner_team_id, event_log=event_log, now=now, workspace=workspace,
|
|
310
|
+
)
|
|
213
311
|
if last_progress and (now - last_progress) < timedelta(seconds=STABLE_IDLE_SECONDS):
|
|
312
|
+
reason = "recent_team_progress" if progress_source == "event_log" else "stable_idle_window"
|
|
214
313
|
event_log.write(
|
|
215
314
|
"coordinator.idle_fallback_skipped",
|
|
216
|
-
reason=
|
|
315
|
+
reason=reason,
|
|
217
316
|
team=owner_team_id,
|
|
218
317
|
stable_idle_seconds=STABLE_IDLE_SECONDS,
|
|
219
318
|
elapsed_seconds=int((now - last_progress).total_seconds()),
|
|
319
|
+
progress_source=progress_source,
|
|
220
320
|
)
|
|
221
321
|
return []
|
|
222
322
|
last_fire = _team_last_idle_fallback_fire_at(state, owner_team_id)
|