@team-agent/installer 0.2.1 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. package/package.json +1 -1
  2. package/schemas/team.schema.json +6 -0
  3. package/src/team_agent/approvals/runtime_prompts.py +1 -1
  4. package/src/team_agent/cli/commands.py +122 -6
  5. package/src/team_agent/cli/parser.py +42 -1
  6. package/src/team_agent/coordinator/__main__.py +21 -2
  7. package/src/team_agent/coordinator/lifecycle.py +11 -0
  8. package/src/team_agent/diagnose/orphan_cleanup.py +364 -0
  9. package/src/team_agent/events.py +47 -0
  10. package/src/team_agent/launch/core.py +2 -1
  11. package/src/team_agent/leader/__init__.py +273 -60
  12. package/src/team_agent/lifecycle/agents.py +54 -2
  13. package/src/team_agent/lifecycle/operations.py +87 -9
  14. package/src/team_agent/lifecycle/start.py +1 -1
  15. package/src/team_agent/message_store/core.py +8 -7
  16. package/src/team_agent/message_store/leader_notification_log.py +132 -0
  17. package/src/team_agent/message_store/result_watchers.py +144 -1
  18. package/src/team_agent/message_store/schema.py +31 -2
  19. package/src/team_agent/messaging/delivery.py +293 -1
  20. package/src/team_agent/messaging/idle_alerts.py +109 -9
  21. package/src/team_agent/messaging/leader.py +179 -10
  22. package/src/team_agent/messaging/leader_api_errors.py +216 -0
  23. package/src/team_agent/messaging/leader_panes.py +393 -23
  24. package/src/team_agent/messaging/result_delivery.py +219 -4
  25. package/src/team_agent/messaging/results.py +12 -21
  26. package/src/team_agent/messaging/scheduler.py +24 -2
  27. package/src/team_agent/messaging/send.py +21 -26
  28. package/src/team_agent/messaging/tmux_io.py +153 -23
  29. package/src/team_agent/messaging/tmux_prompt.py +87 -0
  30. package/src/team_agent/messaging/trust_auto_answer.py +44 -0
  31. package/src/team_agent/restart/orchestration.py +207 -4
  32. package/src/team_agent/runtime.py +7 -7
  33. package/src/team_agent/rust_core.py +157 -3
  34. package/src/team_agent/sessions/capture.py +65 -15
  35. package/src/team_agent/spec.py +59 -0
  36. package/src/team_agent/state.py +153 -10
  37. package/src/team_agent/status/inbox.py +33 -3
  38. package/src/team_agent/status/queries.py +32 -1
  39. package/src/team_agent/watch/__init__.py +145 -0
@@ -10,15 +10,28 @@ from team_agent.messaging.deps import (
10
10
  core_render_message,
11
11
  )
12
12
 
13
+ from datetime import datetime, timedelta, timezone
13
14
  from pathlib import Path
14
15
  from typing import Any
15
16
 
17
+
18
+ # Spark MEDIUM sweep #3 (2026-05-26): retry_needed bounded backoff. Each entry is
19
+ # the delay (seconds) BEFORE the attempt with that number runs; attempt 1 was the
20
+ # original delivery, attempt 2 fires 5s after retry_needed, attempt 3 fires 15s
21
+ # after the previous, attempt 4 fires 30s after the previous. _TRUST_RETRY_MAX_ATTEMPTS
22
+ # bounds the total — the 4th retry_needed is terminal and emits
23
+ # leader_panes.trust_auto_answer_exhausted.
24
+ _TRUST_RETRY_BACKOFF_SECONDS = {2: 5, 3: 15, 4: 30}
25
+ _TRUST_RETRY_MAX_ATTEMPTS = 4
26
+
16
27
  def _deliver_pending_message(
17
28
  workspace: Path,
18
29
  state: dict[str, Any],
19
30
  message_id: str,
20
31
  wait_visible: bool = True,
21
32
  timeout: float = 30.0,
33
+ *,
34
+ _trust_retry_attempt: int = 1,
22
35
  ) -> dict[str, Any]:
23
36
  store = MessageStore(workspace)
24
37
  row = next((m for m in store.messages() if m["message_id"] == message_id), None)
@@ -65,9 +78,49 @@ def _deliver_pending_message(
65
78
  attempts=3 if wait_visible else 1,
66
79
  provider=agent_state.get("provider", "fake"),
67
80
  )
81
+ if not injection.get("ok") and injection.get("detected") == "codex_trust_prompt":
82
+ # Gap 29 (Stage 2): opt-in trust auto-answer. The helper enforces both the
83
+ # opt-in flag and a workspace-dir match before sending '1'+Enter, then we
84
+ # retry the original paste once the prompt has actually been dismissed.
85
+ # Bypassed entirely when opt-out (default) — the existing failed envelope
86
+ # is preserved.
87
+ from team_agent.messaging.leader_panes import attempt_trust_auto_answer
88
+ answer = attempt_trust_auto_answer(
89
+ workspace,
90
+ injection.get("pane_id") or target,
91
+ injection.get("pane_capture_tail") or "",
92
+ EventLog(workspace),
93
+ state=state,
94
+ )
95
+ if answer.get("answered"):
96
+ # Spark MEDIUM #4 (2026-05-26): replace the fixed 0.3s sleep with a
97
+ # bounded poll. Slow terminals can take well over a second to clear
98
+ # the trust prompt; sleeping a fixed amount races dismissal and
99
+ # leaves the retry hitting the same codex_trust_prompt state. We
100
+ # poll for prompt dismissal up to 3s; if still present, return a
101
+ # retry_needed envelope and let the upstream scheduler decide
102
+ # whether to back off and try again later.
103
+ dismissed = _wait_for_trust_prompt_dismissal(
104
+ injection.get("pane_id") or target, timeout=3.0,
105
+ )
106
+ if not dismissed:
107
+ return _handle_trust_retry_needed(
108
+ workspace, state, store, message_id, target, injection,
109
+ attempt=_trust_retry_attempt,
110
+ )
111
+ injection = _tmux_inject_text(
112
+ target,
113
+ text,
114
+ "Enter",
115
+ f"team-agent-send-{message_id}-trust-retry",
116
+ attempts=3 if wait_visible else 1,
117
+ provider=agent_state.get("provider", "fake"),
118
+ )
68
119
  if injection["ok"]:
69
120
  store.mark(message_id, "submitted")
70
- EventLog(workspace).write(
121
+ send_event_log = EventLog(workspace)
122
+ _stamp_first_send_at_if_leader_to_worker(state, row, send_event_log)
123
+ send_event_log.write(
71
124
  "send.submitted",
72
125
  message_id=message_id,
73
126
  target=target,
@@ -112,9 +165,248 @@ def _deliver_pending_message(
112
165
  "turn_verification": injection.get("turn_verification"),
113
166
  "paste_attempts": injection.get("attempts"),
114
167
  "submit_attempts": injection.get("submit_attempts"),
168
+ "detected": injection.get("detected"),
169
+ "pane_id": injection.get("pane_id"),
170
+ "pane_mode": injection.get("pane_mode"),
171
+ "pane_capture_tail": injection.get("pane_capture_tail"),
115
172
  }
116
173
 
117
174
 
175
def _handle_trust_retry_needed(
    workspace: Path,
    state: dict[str, Any],
    store: MessageStore,
    message_id: str,
    target: str,
    injection: dict[str, Any],
    *,
    attempt: int,
) -> dict[str, Any]:
    """Decide what happens after a trust prompt survived the auto-answer.

    ``attempt`` is the number of the delivery that just failed (1 is the
    original delivery; higher numbers are scheduler-fired retries).

    * While ``attempt + 1`` does not exceed ``_TRUST_RETRY_MAX_ATTEMPTS``, a
      ``trust_retry`` scheduled event is added for the message, the message is
      marked 'failed' with reason 'trust_retry_scheduled', and both
      ``leader_panes.trust_auto_answer_retry_needed`` and
      ``leader_panes.trust_auto_answer_retry_scheduled`` are written. The
      returned envelope has ``status='retry_scheduled'``.
    * Otherwise the message is marked 'failed' with reason
      'trust_auto_answer_exhausted', ``leader_panes.trust_auto_answer_exhausted``
      is written, and the envelope has ``status='trust_auto_answer_exhausted'``.

    Every envelope has ``ok=False`` and carries the pane diagnostics copied
    from ``injection``. ``state`` is accepted for signature parity with the
    caller but is not read here.
    """
    audit_log = EventLog(workspace)
    # Diagnostics shared by both envelopes, kept in the order callers receive them.
    pane_diagnostics = {
        "detected": injection.get("detected"),
        "pane_id": injection.get("pane_id"),
        "pane_mode": injection.get("pane_mode"),
        "pane_capture_tail": injection.get("pane_capture_tail"),
    }
    upcoming_attempt = attempt + 1

    if upcoming_attempt > _TRUST_RETRY_MAX_ATTEMPTS:
        # Terminal branch: no retry budget left.
        store.mark(message_id, "failed", "trust_auto_answer_exhausted")
        audit_log.write(
            "leader_panes.trust_auto_answer_exhausted",
            message_id=message_id,
            workspace=str(workspace),
            attempts=attempt,
            target=target,
            pane_id=pane_diagnostics["pane_id"],
            reason="trust_auto_answer_exhausted",
        )
        return {
            "ok": False,
            "status": "trust_auto_answer_exhausted",
            "reason": "trust_auto_answer_exhausted",
            "attempts": attempt,
            **pane_diagnostics,
        }

    # Delay before the upcoming attempt; unknown attempt numbers reuse the
    # delay configured for the final attempt.
    delay_seconds = _TRUST_RETRY_BACKOFF_SECONDS.get(
        upcoming_attempt,
        _TRUST_RETRY_BACKOFF_SECONDS[_TRUST_RETRY_MAX_ATTEMPTS],
    )
    retry_at = (datetime.now(timezone.utc) + timedelta(seconds=delay_seconds)).isoformat()
    retry_payload = {
        "message_id": message_id,
        "attempt": upcoming_attempt,
        "max_attempts": _TRUST_RETRY_MAX_ATTEMPTS,
        "first_target": target,
    }
    scheduled_event_id = store.add_scheduled_event(
        retry_at,
        message_id,
        "trust_retry",
        retry_payload,
        owner_team_id=_message_owner_team_id(store, message_id),
    )
    # Park the message in 'failed' so _deliver_pending_messages does not race
    # the scheduled retry; the scheduler consumer flips it back to 'accepted'
    # right before re-delivery.
    store.mark(message_id, "failed", "trust_retry_scheduled")
    audit_log.write(
        "leader_panes.trust_auto_answer_retry_needed",
        message_id=message_id,
        workspace=str(workspace),
        pane_id=pane_diagnostics["pane_id"] or target,
        target=target,
        reason="trust_prompt_not_dismissed_after_answer",
        attempt=attempt,
    )
    audit_log.write(
        "leader_panes.trust_auto_answer_retry_scheduled",
        message_id=message_id,
        workspace=str(workspace),
        scheduled_event_id=scheduled_event_id,
        due_at=retry_at,
        next_attempt=upcoming_attempt,
        max_attempts=_TRUST_RETRY_MAX_ATTEMPTS,
        backoff_seconds=delay_seconds,
    )
    return {
        "ok": False,
        "status": "retry_scheduled",
        "reason": "trust_prompt_not_dismissed_after_answer",
        "stage": "trust_auto_answer_dismissal_wait",
        "verification": "trust_prompt_not_dismissed_after_answer",
        "scheduled_event_id": scheduled_event_id,
        "scheduled_retry_at": retry_at,
        "next_attempt": upcoming_attempt,
        "max_attempts": _TRUST_RETRY_MAX_ATTEMPTS,
        **pane_diagnostics,
    }
274
+
275
+
276
def _message_owner_team_id(store: MessageStore, message_id: str) -> str | None:
    """Return the message row's ``owner_team_id`` as a string.

    Yields ``None`` when the row cannot be found or its ``owner_team_id`` is
    missing or falsy.
    """
    message = _message_by_id(store, message_id)
    owner_team = message.get("owner_team_id") if message else None
    return str(owner_team) if owner_team else None
282
+
283
+
284
def _execute_trust_retry(
    workspace: Path,
    store: MessageStore,
    event_log: EventLog,
    payload: dict[str, Any],
    *,
    owner_team_id: str | None = None,
) -> dict[str, Any]:
    """Run one scheduled ``trust_retry`` for the message named in ``payload``.

    The message is marked 'accepted' again, the runtime state is reloaded
    (narrowed to ``owner_team_id``'s entry under ``teams`` when that entry is
    a dict), and ``_deliver_pending_message`` is re-run with the payload's
    attempt number. Its result envelope is returned unchanged, so a further
    retry or the exhausted outcome is decided inside that call.

    Returns ``{"ok": False, "reason": ...}`` early when the payload has no
    ``message_id`` or the message row no longer exists.
    """
    from team_agent.state import load_runtime_state

    message_id = str(payload.get("message_id") or "")
    if not message_id:
        return {"ok": False, "reason": "trust_retry_missing_message_id"}

    attempt = int(payload.get("attempt") or 1)
    if not _message_by_id(store, message_id):
        event_log.write(
            "leader_panes.trust_auto_answer_retry_skipped",
            message_id=message_id,
            reason="message_missing",
            attempt=attempt,
        )
        return {"ok": False, "reason": "message_missing"}

    # The previous attempt parked the row as 'failed'/'trust_retry_scheduled';
    # flip it back so the delivery path can claim it.
    # NOTE(review): the row's current status is not inspected before this
    # reset — confirm nothing else can move the message on while the retry
    # is pending.
    store.mark(message_id, "accepted", "trust_retry_resuming")
    max_attempts = int(payload.get("max_attempts") or _TRUST_RETRY_MAX_ATTEMPTS)
    event_log.write(
        "leader_panes.trust_auto_answer_retry_attempted",
        message_id=message_id,
        workspace=str(workspace),
        attempt=attempt,
        max_attempts=max_attempts,
    )

    runtime_state = load_runtime_state(workspace)
    if owner_team_id:
        teams = runtime_state.get("teams")
        team_state = teams.get(owner_team_id) if isinstance(teams, dict) else None
        if isinstance(team_state, dict):
            runtime_state = team_state

    return _deliver_pending_message(
        workspace,
        runtime_state,
        message_id,
        wait_visible=True,
        timeout=30.0,
        _trust_retry_attempt=attempt,
    )
332
+
333
+
334
+ def _stamp_first_send_at_if_leader_to_worker(
335
+ state: dict[str, Any],
336
+ row: dict[str, Any],
337
+ event_log: EventLog | None = None,
338
+ ) -> None:
339
+ """Route B atomicity (2026-05-27): record the first time the leader
340
+ successfully sends work to each worker. The presence of this stamp drives
341
+ restart's resumability decision — a worker the leader has interacted with
342
+ has accumulated conversation state, so a missing session_id at restart
343
+ time IS an atomicity violation. A worker that has never received work
344
+ legitimately fresh-starts during restart.
345
+
346
+ Only stamped once per worker (idempotent across re-sends). Only fires on
347
+ leader -> worker sends; worker-to-worker peer messages do not count.
348
+ The mutation lives on the state dict the caller already saves
349
+ (`save_team_scoped_state` in send.py, or `save_runtime_state` after
350
+ coordinator_tick), so persistence is automatic.
351
+
352
+ C1 (cr verdict, 2026-05-27): when the stamp transitions null -> ts (the
353
+ one-time write), emit a `worker.first_interaction` audit event with
354
+ worker_id, first_send_at, message_id. Re-sends to the same worker hit the
355
+ idempotency guard above and do NOT re-emit. Worker-to-worker peer sends
356
+ short-circuit at the sender check and do NOT emit.
357
+ """
358
+ sender = str(row.get("sender") or "")
359
+ recipient = str(row.get("recipient") or "")
360
+ if not recipient:
361
+ return
362
+ leader_id = str((state.get("leader") or {}).get("id") or "leader")
363
+ if sender not in {"leader", "Leader", leader_id}:
364
+ return
365
+ agents = state.get("agents")
366
+ if not isinstance(agents, dict):
367
+ return
368
+ agent_state = agents.get(recipient)
369
+ if not isinstance(agent_state, dict):
370
+ return
371
+ if agent_state.get("first_send_at"):
372
+ return
373
+ stamp = datetime.now(timezone.utc).isoformat()
374
+ agent_state["first_send_at"] = stamp
375
+ if event_log is not None:
376
+ event_log.write(
377
+ "worker.first_interaction",
378
+ worker_id=recipient,
379
+ first_send_at=stamp,
380
+ message_id=str(row.get("message_id") or ""),
381
+ )
382
+
383
+
384
def _wait_for_trust_prompt_dismissal(target: str, *, timeout: float = 3.0, poll_interval: float = 0.1) -> bool:
    """Poll ``target`` until the codex trust prompt is gone or ``timeout`` elapses.

    Args:
        target: tmux target whose pane text is captured on every poll.
        timeout: Upper bound in seconds on the whole wait; negative values are
            treated as 0, which still performs a single check.
        poll_interval: Pause in seconds between checks; negative values are
            treated as 0.

    Returns:
        True as soon as ``detect_non_input_scrollback`` reports anything other
        than ``"codex_trust_prompt"`` for the captured pane text; False if the
        prompt is still detected once the deadline has passed.
    """
    import time as _time
    from team_agent.messaging.tmux_prompt import detect_non_input_scrollback

    deadline = _time.monotonic() + max(timeout, 0.0)
    # Clamp like the timeout: time.sleep raises ValueError on a negative value.
    interval = max(poll_interval, 0.0)
    while True:
        # NOTE(review): _capture_pane_tail yields "" when the capture is not ok;
        # presumably the detector does not flag an empty capture, in which case
        # a failed capture reads as "dismissed" — confirm that is intended.
        if detect_non_input_scrollback(_capture_pane_tail(target)) != "codex_trust_prompt":
            return True
        remaining = deadline - _time.monotonic()
        if remaining <= 0:
            return False
        # Never sleep past the deadline, so a large poll_interval cannot
        # stretch the wait beyond the requested timeout.
        _time.sleep(min(interval, remaining))
400
+
401
+
402
def _capture_pane_tail(target: str) -> str:
    """Return the captured pane text for ``target``.

    Falls back to the empty string when the capture result is not ``ok`` or
    carries no ``capture`` value.
    """
    from team_agent.messaging.deps import _capture_tmux_pane_text

    result = _capture_tmux_pane_text(target)
    return str(result.get("capture") or "") if result.get("ok") else ""
408
+
409
+
118
410
  def _deliver_pending_messages(workspace: Path, state: dict[str, Any], event_log: EventLog) -> list[str]:
119
411
  store = MessageStore(workspace)
120
412
  delivered: list[str] = []
@@ -6,7 +6,7 @@ from typing import Any
6
6
 
7
7
  from team_agent.events import EventLog
8
8
  from team_agent.message_store import MessageStore
9
- from team_agent.messaging.deps import load_spec, save_runtime_state, team_state_key
9
+ from team_agent.messaging.deps import load_runtime_state, load_spec, save_runtime_state, team_state_key
10
10
  from team_agent.messaging.internal_delivery import deliver_stored_message
11
11
 
12
12
 
@@ -27,6 +27,21 @@ STABLE_IDLE_SECONDS = 120
27
27
  FIRE_DEBOUNCE_SECONDS = 300
28
28
  OBLIGATION_PENDING_MIN_AGE_SECONDS = 60
29
29
 
30
+ # Event-log progress signal (Gap 32 §"Idle-Detector False Positive Continues Post Phase G hotfix-3"):
31
+ # the team_last_progress_at calculation must also count leader-side sends and worker MCP calls
32
+ # as recent team activity, not only agent_health.last_output_at. Without this, a worker that has
33
+ # called MCP but not yet emitted a visible turn shows up as idle and the idle reminder fires
34
+ # spuriously inside the stable-idle window.
35
+ _PROGRESS_EVENT_TYPES = frozenset({
36
+ "send.deliver_attempt",
37
+ "leader_receiver.deliver_attempt",
38
+ "mcp.report_result",
39
+ "mcp.send_message",
40
+ })
41
+ _PROGRESS_EVENT_PREFIXES = ("mcp.read_",)
42
+ _PROGRESS_EVENT_WINDOW_SECONDS = 300
43
+ _PROGRESS_EVENT_TAIL_LIMIT = 1000
44
+
30
45
 
31
46
  def _parse_iso(text: Any) -> datetime | None:
32
47
  if not isinstance(text, str) or not text:
@@ -62,24 +77,105 @@ def _team_last_progress_at(
62
77
  state: dict[str, Any],
63
78
  store: MessageStore,
64
79
  owner_team_id: str,
65
- ) -> datetime | None:
66
- candidates: list[datetime] = []
80
+ event_log: EventLog | None = None,
81
+ now: datetime | None = None,
82
+ workspace: Path | None = None,
83
+ ) -> tuple[datetime | None, str | None]:
84
+ sources: list[tuple[datetime, str]] = []
67
85
  coordinator = state.get("coordinator") or {}
68
86
  explicit = (coordinator.get("team_last_progress_at") or {}).get(owner_team_id)
69
87
  if isinstance(explicit, dict):
70
88
  ts = _parse_iso(explicit.get("at"))
71
89
  if ts:
72
- candidates.append(ts)
90
+ sources.append((ts, "explicit_marker"))
73
91
  elif isinstance(explicit, str):
74
92
  ts = _parse_iso(explicit)
75
93
  if ts:
76
- candidates.append(ts)
94
+ sources.append((ts, "explicit_marker"))
77
95
  health = store.agent_health(owner_team_id=owner_team_id)
78
96
  for row in health.values():
79
97
  ts = _parse_iso(row.get("last_output_at"))
80
98
  if ts:
81
- candidates.append(ts)
82
- return max(candidates) if candidates else None
99
+ sources.append((ts, "agent_health.last_output_at"))
100
+ if event_log is not None:
101
+ # Spark MEDIUM #3 (d9f740d): in multi-team workspaces an unscoped progress event in
102
+ # team A's activity must NOT suppress team B's idle_fallback. require_team_scope=True
103
+ # when the workspace has more than one team so unscoped events are ignored. The
104
+ # team-scoped state passed in here does not carry the workspace-level `teams` dict, so
105
+ # we re-read the workspace state from disk to detect multi-team shape.
106
+ require_team_scope = False
107
+ teams = state.get("teams")
108
+ if isinstance(teams, dict) and len(teams) > 1:
109
+ require_team_scope = True
110
+ elif workspace is not None:
111
+ try:
112
+ ws_teams = (load_runtime_state(workspace).get("teams") or {})
113
+ except Exception:
114
+ ws_teams = {}
115
+ if isinstance(ws_teams, dict) and len(ws_teams) > 1:
116
+ require_team_scope = True
117
+ event_ts = _scan_event_progress_signals(
118
+ event_log, owner_team_id, now or datetime.now(timezone.utc),
119
+ require_team_scope=require_team_scope,
120
+ )
121
+ if event_ts:
122
+ sources.append((event_ts, "event_log"))
123
+ if not sources:
124
+ return None, None
125
+ sources.sort(key=lambda item: item[0], reverse=True)
126
+ return sources[0]
127
+
128
+
129
+ # Stage 14 (Gap 36b) — mtime cache per (workspace_path, owner_team_id, require_team_scope).
130
+ # Mac mini 2026-05-26 evidence: _scan_event_progress_signals was a 22% CPU hot path because
131
+ # every 2-second coordinator tick parsed up to 1000 events from a 28 MB events.jsonl. With
132
+ # the cache, the parse only re-runs when the file changes; quiet workspaces pay zero file
133
+ # I/O between writes.
134
+ _PROGRESS_SCAN_CACHE: dict[tuple[str, str, bool], tuple[float, datetime | None]] = {}
135
+
136
+
137
def _scan_event_progress_signals(
    event_log: EventLog,
    owner_team_id: str,
    now: datetime,
    *,
    require_team_scope: bool = False,
) -> datetime | None:
    """Return the newest progress-event timestamp for a team inside the window.

    Scans the last ``_PROGRESS_EVENT_TAIL_LIMIT`` events of ``event_log`` for
    progress events (an exact match in ``_PROGRESS_EVENT_TYPES`` or a prefix
    in ``_PROGRESS_EVENT_PREFIXES``) and returns the latest ``ts`` that is not
    older than ``now - _PROGRESS_EVENT_WINDOW_SECONDS``.

    Args:
        event_log: Log to scan; its ``path`` is stat'ed to key the cache.
        owner_team_id: Team whose events count. Events naming another team
            (via ``team`` or ``owner_team_id``) are ignored.
        now: Reference time for the look-back window.
        require_team_scope: When True, events naming no team are ignored too.

    Returns:
        The newest qualifying timestamp, or None when there is none or the
        log file does not exist.

    The scan result is cached per (path, team, scope) against the file's
    mtime so an unchanged log is not re-parsed. The window depends on ``now``
    while the cache only tracks the file, so a cached timestamp is checked
    against the current window before it is returned; a hit therefore gives
    the same answer a fresh scan would.
    """
    cache_key = (str(event_log.path), owner_team_id, require_team_scope)
    try:
        current_mtime = event_log.path.stat().st_mtime
    except FileNotFoundError:
        _PROGRESS_SCAN_CACHE.pop(cache_key, None)
        return None
    window_start = now - timedelta(seconds=_PROGRESS_EVENT_WINDOW_SECONDS)
    cached = _PROGRESS_SCAN_CACHE.get(cache_key)
    if cached is not None and cached[0] == current_mtime:
        cached_latest = cached[1]
        # The file is unchanged but time has moved on: drop a hit that has
        # aged out of the window instead of reporting stale progress.
        if cached_latest is not None and cached_latest < window_start:
            return None
        return cached_latest
    latest: datetime | None = None
    for event in event_log.tail(_PROGRESS_EVENT_TAIL_LIMIT):
        event_type = str(event.get("event") or "")
        if event_type not in _PROGRESS_EVENT_TYPES and not any(
            event_type.startswith(prefix) for prefix in _PROGRESS_EVENT_PREFIXES
        ):
            continue
        event_team = event.get("team") or event.get("owner_team_id")
        if event_team is None:
            # Unscoped event: only counts when team scoping is not required.
            if require_team_scope:
                continue
        elif event_team != owner_team_id:
            continue
        ts = _parse_iso(event.get("ts"))
        if not ts or ts < window_start:
            continue
        if latest is None or ts > latest:
            latest = ts
    _PROGRESS_SCAN_CACHE[cache_key] = (current_mtime, latest)
    return latest
174
+
175
+
176
def _reset_progress_scan_cache() -> None:
    """Empty the progress-scan cache in place so the next scan re-reads the log.

    Intended as a test-only hook.
    """
    _PROGRESS_SCAN_CACHE.clear()
83
179
 
84
180
 
85
181
  def _team_last_idle_fallback_fire_at(state: dict[str, Any], owner_team_id: str) -> datetime | None:
@@ -209,14 +305,18 @@ def detect_idle_fallbacks(
209
305
  record_team_progress(state, now, source="all_workers_idle:false", owner_team_id=owner_team_id)
210
306
  save_runtime_state(workspace, state)
211
307
  return []
212
- last_progress = _team_last_progress_at(state, store, owner_team_id)
308
+ last_progress, progress_source = _team_last_progress_at(
309
+ state, store, owner_team_id, event_log=event_log, now=now, workspace=workspace,
310
+ )
213
311
  if last_progress and (now - last_progress) < timedelta(seconds=STABLE_IDLE_SECONDS):
312
+ reason = "recent_team_progress" if progress_source == "event_log" else "stable_idle_window"
214
313
  event_log.write(
215
314
  "coordinator.idle_fallback_skipped",
216
- reason="stable_idle_window",
315
+ reason=reason,
217
316
  team=owner_team_id,
218
317
  stable_idle_seconds=STABLE_IDLE_SECONDS,
219
318
  elapsed_seconds=int((now - last_progress).total_seconds()),
319
+ progress_source=progress_source,
220
320
  )
221
321
  return []
222
322
  last_fire = _team_last_idle_fallback_fire_at(state, owner_team_id)