@team-agent/installer 0.2.1 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/schemas/team.schema.json +6 -0
- package/src/team_agent/approvals/runtime_prompts.py +1 -1
- package/src/team_agent/cli/commands.py +122 -6
- package/src/team_agent/cli/parser.py +42 -1
- package/src/team_agent/coordinator/__main__.py +21 -2
- package/src/team_agent/coordinator/lifecycle.py +11 -0
- package/src/team_agent/diagnose/orphan_cleanup.py +364 -0
- package/src/team_agent/events.py +47 -0
- package/src/team_agent/launch/core.py +2 -1
- package/src/team_agent/leader/__init__.py +273 -60
- package/src/team_agent/lifecycle/agents.py +54 -2
- package/src/team_agent/lifecycle/operations.py +87 -9
- package/src/team_agent/lifecycle/start.py +1 -1
- package/src/team_agent/message_store/core.py +8 -7
- package/src/team_agent/message_store/leader_notification_log.py +132 -0
- package/src/team_agent/message_store/result_watchers.py +144 -1
- package/src/team_agent/message_store/schema.py +31 -2
- package/src/team_agent/messaging/delivery.py +293 -1
- package/src/team_agent/messaging/idle_alerts.py +109 -9
- package/src/team_agent/messaging/leader.py +179 -10
- package/src/team_agent/messaging/leader_api_errors.py +216 -0
- package/src/team_agent/messaging/leader_panes.py +393 -23
- package/src/team_agent/messaging/result_delivery.py +219 -4
- package/src/team_agent/messaging/results.py +12 -21
- package/src/team_agent/messaging/scheduler.py +24 -2
- package/src/team_agent/messaging/send.py +21 -26
- package/src/team_agent/messaging/tmux_io.py +153 -23
- package/src/team_agent/messaging/tmux_prompt.py +87 -0
- package/src/team_agent/messaging/trust_auto_answer.py +44 -0
- package/src/team_agent/restart/orchestration.py +207 -4
- package/src/team_agent/runtime.py +7 -7
- package/src/team_agent/rust_core.py +157 -3
- package/src/team_agent/sessions/capture.py +65 -15
- package/src/team_agent/spec.py +59 -0
- package/src/team_agent/state.py +153 -10
- package/src/team_agent/status/inbox.py +33 -3
- package/src/team_agent/status/queries.py +32 -1
- package/src/team_agent/watch/__init__.py +145 -0
|
@@ -10,15 +10,28 @@ from team_agent.messaging.deps import (
|
|
|
10
10
|
core_render_message,
|
|
11
11
|
)
|
|
12
12
|
|
|
13
|
+
from datetime import datetime, timedelta, timezone
|
|
13
14
|
from pathlib import Path
|
|
14
15
|
from typing import Any
|
|
15
16
|
|
|
17
|
+
|
|
18
|
+
# Spark MEDIUM sweep #3 (2026-05-26): retry_needed bounded backoff. Each entry is
|
|
19
|
+
# the delay (seconds) BEFORE the attempt with that number runs; attempt 1 was the
|
|
20
|
+
# original delivery, attempt 2 fires 5s after retry_needed, attempt 3 fires 15s
|
|
21
|
+
# after the previous, attempt 4 fires 30s after the previous. _TRUST_RETRY_MAX_ATTEMPTS
|
|
22
|
+
# bounds the total — the 4th retry_needed is terminal and emits
|
|
23
|
+
# leader_panes.trust_auto_answer_exhausted.
|
|
24
|
+
_TRUST_RETRY_BACKOFF_SECONDS = {2: 5, 3: 15, 4: 30}
|
|
25
|
+
_TRUST_RETRY_MAX_ATTEMPTS = 4
|
|
26
|
+
|
|
16
27
|
def _deliver_pending_message(
|
|
17
28
|
workspace: Path,
|
|
18
29
|
state: dict[str, Any],
|
|
19
30
|
message_id: str,
|
|
20
31
|
wait_visible: bool = True,
|
|
21
32
|
timeout: float = 30.0,
|
|
33
|
+
*,
|
|
34
|
+
_trust_retry_attempt: int = 1,
|
|
22
35
|
) -> dict[str, Any]:
|
|
23
36
|
store = MessageStore(workspace)
|
|
24
37
|
row = next((m for m in store.messages() if m["message_id"] == message_id), None)
|
|
@@ -65,9 +78,49 @@ def _deliver_pending_message(
|
|
|
65
78
|
attempts=3 if wait_visible else 1,
|
|
66
79
|
provider=agent_state.get("provider", "fake"),
|
|
67
80
|
)
|
|
81
|
+
if not injection.get("ok") and injection.get("detected") == "codex_trust_prompt":
|
|
82
|
+
# Gap 29 (Stage 2): opt-in trust auto-answer. The helper enforces both the
|
|
83
|
+
# opt-in flag and a workspace-dir match before sending '1'+Enter, then we
|
|
84
|
+
# retry the original paste once the prompt has actually been dismissed.
|
|
85
|
+
# Bypassed entirely when opt-out (default) — the existing failed envelope
|
|
86
|
+
# is preserved.
|
|
87
|
+
from team_agent.messaging.leader_panes import attempt_trust_auto_answer
|
|
88
|
+
answer = attempt_trust_auto_answer(
|
|
89
|
+
workspace,
|
|
90
|
+
injection.get("pane_id") or target,
|
|
91
|
+
injection.get("pane_capture_tail") or "",
|
|
92
|
+
EventLog(workspace),
|
|
93
|
+
state=state,
|
|
94
|
+
)
|
|
95
|
+
if answer.get("answered"):
|
|
96
|
+
# Spark MEDIUM #4 (2026-05-26): replace the fixed 0.3s sleep with a
|
|
97
|
+
# bounded poll. Slow terminals can take well over a second to clear
|
|
98
|
+
# the trust prompt; sleeping a fixed amount races dismissal and
|
|
99
|
+
# leaves the retry hitting the same codex_trust_prompt state. We
|
|
100
|
+
# poll for prompt dismissal up to 3s; if still present, hand off to
|
|
101
|
+
# _handle_trust_retry_needed, which schedules a bounded-backoff retry
|
|
102
|
+
# (or marks the message terminally failed once attempts are exhausted).
|
|
103
|
+
dismissed = _wait_for_trust_prompt_dismissal(
|
|
104
|
+
injection.get("pane_id") or target, timeout=3.0,
|
|
105
|
+
)
|
|
106
|
+
if not dismissed:
|
|
107
|
+
return _handle_trust_retry_needed(
|
|
108
|
+
workspace, state, store, message_id, target, injection,
|
|
109
|
+
attempt=_trust_retry_attempt,
|
|
110
|
+
)
|
|
111
|
+
injection = _tmux_inject_text(
|
|
112
|
+
target,
|
|
113
|
+
text,
|
|
114
|
+
"Enter",
|
|
115
|
+
f"team-agent-send-{message_id}-trust-retry",
|
|
116
|
+
attempts=3 if wait_visible else 1,
|
|
117
|
+
provider=agent_state.get("provider", "fake"),
|
|
118
|
+
)
|
|
68
119
|
if injection["ok"]:
|
|
69
120
|
store.mark(message_id, "submitted")
|
|
70
|
-
EventLog(workspace)
|
|
121
|
+
send_event_log = EventLog(workspace)
|
|
122
|
+
_stamp_first_send_at_if_leader_to_worker(state, row, send_event_log)
|
|
123
|
+
send_event_log.write(
|
|
71
124
|
"send.submitted",
|
|
72
125
|
message_id=message_id,
|
|
73
126
|
target=target,
|
|
@@ -112,9 +165,248 @@ def _deliver_pending_message(
|
|
|
112
165
|
"turn_verification": injection.get("turn_verification"),
|
|
113
166
|
"paste_attempts": injection.get("attempts"),
|
|
114
167
|
"submit_attempts": injection.get("submit_attempts"),
|
|
168
|
+
"detected": injection.get("detected"),
|
|
169
|
+
"pane_id": injection.get("pane_id"),
|
|
170
|
+
"pane_mode": injection.get("pane_mode"),
|
|
171
|
+
"pane_capture_tail": injection.get("pane_capture_tail"),
|
|
115
172
|
}
|
|
116
173
|
|
|
117
174
|
|
|
175
|
+
def _handle_trust_retry_needed(
    workspace: Path,
    state: dict[str, Any],
    store: MessageStore,
    message_id: str,
    target: str,
    injection: dict[str, Any],
    *,
    attempt: int,
) -> dict[str, Any]:
    """Schedule a bounded-backoff retry, or give up, after a trust prompt
    was answered but never went away.

    ``attempt`` is the number of the delivery that has just failed (1 is the
    original delivery; higher numbers are scheduler-fired retries).

    * While ``attempt + 1`` is still within ``_TRUST_RETRY_MAX_ATTEMPTS``: a
      ``trust_retry`` scheduled event is added to ``store``, the message is
      parked in ``failed`` status with reason ``trust_retry_scheduled``, the
      ``leader_panes.trust_auto_answer_retry_needed`` and
      ``leader_panes.trust_auto_answer_retry_scheduled`` events are written,
      and an envelope with ``status='retry_scheduled'`` is returned.
    * Otherwise: the message is marked ``failed`` with reason
      ``trust_auto_answer_exhausted``, the matching
      ``leader_panes.trust_auto_answer_exhausted`` event is written, and an
      envelope with ``status='trust_auto_answer_exhausted'`` is returned.

    Both envelopes carry ``ok=False`` plus the pane diagnostics copied from
    ``injection``. ``state`` is accepted for signature symmetry and is not
    read here.
    """
    audit = EventLog(workspace)
    pane_id = injection.get("pane_id")
    # Diagnostics that are appended, in this order, to either return envelope.
    pane_snapshot = {
        "detected": injection.get("detected"),
        "pane_id": pane_id,
        "pane_mode": injection.get("pane_mode"),
        "pane_capture_tail": injection.get("pane_capture_tail"),
    }
    upcoming = attempt + 1

    if upcoming > _TRUST_RETRY_MAX_ATTEMPTS:
        # Terminal branch: no further retries are scheduled.
        exhausted = "trust_auto_answer_exhausted"
        store.mark(message_id, "failed", exhausted)
        audit.write(
            "leader_panes.trust_auto_answer_exhausted",
            message_id=message_id,
            workspace=str(workspace),
            attempts=attempt,
            target=target,
            pane_id=pane_id,
            reason=exhausted,
        )
        return {
            "ok": False,
            "status": exhausted,
            "reason": exhausted,
            "attempts": attempt,
            **pane_snapshot,
        }

    delay = _TRUST_RETRY_BACKOFF_SECONDS.get(
        upcoming,
        _TRUST_RETRY_BACKOFF_SECONDS[_TRUST_RETRY_MAX_ATTEMPTS],
    )
    retry_at = (datetime.now(timezone.utc) + timedelta(seconds=delay)).isoformat()
    retry_payload = {
        "message_id": message_id,
        "attempt": upcoming,
        "max_attempts": _TRUST_RETRY_MAX_ATTEMPTS,
        "first_target": target,
    }
    scheduled_id = store.add_scheduled_event(
        retry_at,
        message_id,
        "trust_retry",
        retry_payload,
        owner_team_id=_message_owner_team_id(store, message_id),
    )
    # Park the message in 'failed' so _deliver_pending_messages does not race
    # the scheduled retry; _execute_trust_retry flips it back to 'accepted'
    # right before re-delivering.
    store.mark(message_id, "failed", "trust_retry_scheduled")

    not_dismissed = "trust_prompt_not_dismissed_after_answer"
    audit.write(
        "leader_panes.trust_auto_answer_retry_needed",
        message_id=message_id,
        workspace=str(workspace),
        pane_id=pane_id or target,
        target=target,
        reason=not_dismissed,
        attempt=attempt,
    )
    audit.write(
        "leader_panes.trust_auto_answer_retry_scheduled",
        message_id=message_id,
        workspace=str(workspace),
        scheduled_event_id=scheduled_id,
        due_at=retry_at,
        next_attempt=upcoming,
        max_attempts=_TRUST_RETRY_MAX_ATTEMPTS,
        backoff_seconds=delay,
    )
    return {
        "ok": False,
        "status": "retry_scheduled",
        "reason": not_dismissed,
        "stage": "trust_auto_answer_dismissal_wait",
        "verification": not_dismissed,
        "scheduled_event_id": scheduled_id,
        "scheduled_retry_at": retry_at,
        "next_attempt": upcoming,
        "max_attempts": _TRUST_RETRY_MAX_ATTEMPTS,
        **pane_snapshot,
    }
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def _message_owner_team_id(store: MessageStore, message_id: str) -> str | None:
    """Return the ``owner_team_id`` recorded on a stored message, as a string.

    Yields ``None`` when ``_message_by_id`` finds no row for ``message_id`` or
    when the row's ``owner_team_id`` value is missing or falsy.
    """
    record = _message_by_id(store, message_id)
    owner = record.get("owner_team_id") if record else None
    return str(owner) if owner else None
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def _execute_trust_retry(
    workspace: Path,
    store: MessageStore,
    event_log: EventLog,
    payload: dict[str, Any],
    *,
    owner_team_id: str | None = None,
) -> dict[str, Any]:
    """Run one scheduled ``trust_retry`` event.

    Reads ``message_id`` / ``attempt`` / ``max_attempts`` from ``payload``,
    flips the message back to ``accepted`` and re-runs
    ``_deliver_pending_message`` with ``_trust_retry_attempt`` set to the
    payload's attempt number. The delivery result is returned unchanged, so
    the outcome may be a success envelope, a further scheduled retry, or the
    exhausted envelope.

    Early exits (both return ``ok=False``):
      * ``trust_retry_missing_message_id`` when the payload has no message id;
      * ``message_missing`` when the store has no such message (a
        ``leader_panes.trust_auto_answer_retry_skipped`` event is written).
    """
    from team_agent.state import load_runtime_state

    message_id = str(payload.get("message_id") or "")
    if message_id == "":
        return {"ok": False, "reason": "trust_retry_missing_message_id"}

    attempt = int(payload.get("attempt") or 1)
    if not _message_by_id(store, message_id):
        event_log.write(
            "leader_panes.trust_auto_answer_retry_skipped",
            message_id=message_id,
            reason="message_missing",
            attempt=attempt,
        )
        return {"ok": False, "reason": "message_missing"}

    # The previous attempt parked the row in 'failed' with reason
    # 'trust_retry_scheduled'; move it back to 'accepted' before re-delivery.
    store.mark(message_id, "accepted", "trust_retry_resuming")
    event_log.write(
        "leader_panes.trust_auto_answer_retry_attempted",
        message_id=message_id,
        workspace=str(workspace),
        attempt=attempt,
        max_attempts=int(payload.get("max_attempts") or _TRUST_RETRY_MAX_ATTEMPTS),
    )

    runtime_state = load_runtime_state(workspace)
    if owner_team_id:
        teams = runtime_state.get("teams")
        if isinstance(teams, dict):
            # Narrow to the owning team's sub-state when one exists.
            team_state = teams.get(owner_team_id)
            if isinstance(team_state, dict):
                runtime_state = team_state

    return _deliver_pending_message(
        workspace,
        runtime_state,
        message_id,
        wait_visible=True,
        timeout=30.0,
        _trust_retry_attempt=attempt,
    )
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def _stamp_first_send_at_if_leader_to_worker(
|
|
335
|
+
state: dict[str, Any],
|
|
336
|
+
row: dict[str, Any],
|
|
337
|
+
event_log: EventLog | None = None,
|
|
338
|
+
) -> None:
|
|
339
|
+
"""Route B atomicity (2026-05-27): record the first time the leader
|
|
340
|
+
successfully sends work to each worker. The presence of this stamp drives
|
|
341
|
+
restart's resumability decision — a worker the leader has interacted with
|
|
342
|
+
has accumulated conversation state, so a missing session_id at restart
|
|
343
|
+
time IS an atomicity violation. A worker that has never received work
|
|
344
|
+
legitimately fresh-starts during restart.
|
|
345
|
+
|
|
346
|
+
Only stamped once per worker (idempotent across re-sends). Only fires on
|
|
347
|
+
leader -> worker sends; worker-to-worker peer messages do not count.
|
|
348
|
+
The mutation lives on the state dict the caller already saves
|
|
349
|
+
(`save_team_scoped_state` in send.py, or `save_runtime_state` after
|
|
350
|
+
coordinator_tick), so persistence is automatic.
|
|
351
|
+
|
|
352
|
+
C1 (cr verdict, 2026-05-27): when the stamp transitions null -> ts (the
|
|
353
|
+
one-time write), emit a `worker.first_interaction` audit event with
|
|
354
|
+
worker_id, first_send_at, message_id. Re-sends to the same worker hit the
|
|
355
|
+
idempotency guard above and do NOT re-emit. Worker-to-worker peer sends
|
|
356
|
+
short-circuit at the sender check and do NOT emit.
|
|
357
|
+
"""
|
|
358
|
+
sender = str(row.get("sender") or "")
|
|
359
|
+
recipient = str(row.get("recipient") or "")
|
|
360
|
+
if not recipient:
|
|
361
|
+
return
|
|
362
|
+
leader_id = str((state.get("leader") or {}).get("id") or "leader")
|
|
363
|
+
if sender not in {"leader", "Leader", leader_id}:
|
|
364
|
+
return
|
|
365
|
+
agents = state.get("agents")
|
|
366
|
+
if not isinstance(agents, dict):
|
|
367
|
+
return
|
|
368
|
+
agent_state = agents.get(recipient)
|
|
369
|
+
if not isinstance(agent_state, dict):
|
|
370
|
+
return
|
|
371
|
+
if agent_state.get("first_send_at"):
|
|
372
|
+
return
|
|
373
|
+
stamp = datetime.now(timezone.utc).isoformat()
|
|
374
|
+
agent_state["first_send_at"] = stamp
|
|
375
|
+
if event_log is not None:
|
|
376
|
+
event_log.write(
|
|
377
|
+
"worker.first_interaction",
|
|
378
|
+
worker_id=recipient,
|
|
379
|
+
first_send_at=stamp,
|
|
380
|
+
message_id=str(row.get("message_id") or ""),
|
|
381
|
+
)
|
|
382
|
+
|
|
383
|
+
|
|
384
|
+
def _wait_for_trust_prompt_dismissal(target: str, *, timeout: float = 3.0, poll_interval: float = 0.1) -> bool:
    """Poll the pane until the codex trust prompt is no longer detected.

    Args:
        target: tmux pane target handed to ``_capture_pane_tail``.
        timeout: upper bound, in seconds, on the total wait; negative values
            are treated as 0 (a single check, no sleeping).
        poll_interval: pause between checks, in seconds; negative values are
            treated as 0 instead of letting ``time.sleep`` raise.

    Returns:
        True as soon as ``detect_non_input_scrollback`` reports anything other
        than ``"codex_trust_prompt"`` for the captured text; False when the
        prompt is still detected once ``timeout`` has elapsed.

    The pane is always checked at least once, and each sleep is clamped to the
    time left so the wait does not overshoot ``timeout`` by a poll interval.
    """
    import time as _time
    from team_agent.messaging.tmux_prompt import detect_non_input_scrollback

    deadline = _time.monotonic() + max(timeout, 0.0)
    while True:
        # NOTE(review): _capture_pane_tail returns "" when the capture fails;
        # presumably the detector does not flag an empty string, which would
        # make a failed capture look like a dismissed prompt — confirm.
        capture = _capture_pane_tail(target)
        if detect_non_input_scrollback(capture) != "codex_trust_prompt":
            return True
        remaining = deadline - _time.monotonic()
        if remaining <= 0:
            return False
        # Never sleep past the deadline, and never pass a negative duration.
        _time.sleep(min(max(poll_interval, 0.0), remaining))
|
|
400
|
+
|
|
401
|
+
|
|
402
|
+
def _capture_pane_tail(target: str) -> str:
    """Return the captured text of pane ``target``, or ``""`` on failure.

    Wraps ``_capture_tmux_pane_text``: when its result is not flagged ``ok``
    an empty string is returned; otherwise the ``capture`` field is returned
    as a string (empty when that field is missing or falsy).
    """
    from team_agent.messaging.deps import _capture_tmux_pane_text

    result = _capture_tmux_pane_text(target)
    return str(result.get("capture") or "") if result.get("ok") else ""
|
|
408
|
+
|
|
409
|
+
|
|
118
410
|
def _deliver_pending_messages(workspace: Path, state: dict[str, Any], event_log: EventLog) -> list[str]:
|
|
119
411
|
store = MessageStore(workspace)
|
|
120
412
|
delivered: list[str] = []
|
|
@@ -6,7 +6,7 @@ from typing import Any
|
|
|
6
6
|
|
|
7
7
|
from team_agent.events import EventLog
|
|
8
8
|
from team_agent.message_store import MessageStore
|
|
9
|
-
from team_agent.messaging.deps import load_spec, save_runtime_state, team_state_key
|
|
9
|
+
from team_agent.messaging.deps import load_runtime_state, load_spec, save_runtime_state, team_state_key
|
|
10
10
|
from team_agent.messaging.internal_delivery import deliver_stored_message
|
|
11
11
|
|
|
12
12
|
|
|
@@ -27,6 +27,21 @@ STABLE_IDLE_SECONDS = 120
|
|
|
27
27
|
FIRE_DEBOUNCE_SECONDS = 300
|
|
28
28
|
OBLIGATION_PENDING_MIN_AGE_SECONDS = 60
|
|
29
29
|
|
|
30
|
+
# Event-log progress signal (Gap 32 §"Idle-Detector False Positive Continues Post Phase G hotfix-3"):
|
|
31
|
+
# the team_last_progress_at calculation must also count leader-side sends and worker MCP calls
|
|
32
|
+
# as recent team activity, not only agent_health.last_output_at. Without this, a worker that has
|
|
33
|
+
# called MCP but not yet emitted a visible turn shows up as idle and the idle reminder fires
|
|
34
|
+
# spuriously inside the stable-idle window.
|
|
35
|
+
_PROGRESS_EVENT_TYPES = frozenset({
|
|
36
|
+
"send.deliver_attempt",
|
|
37
|
+
"leader_receiver.deliver_attempt",
|
|
38
|
+
"mcp.report_result",
|
|
39
|
+
"mcp.send_message",
|
|
40
|
+
})
|
|
41
|
+
_PROGRESS_EVENT_PREFIXES = ("mcp.read_",)
|
|
42
|
+
_PROGRESS_EVENT_WINDOW_SECONDS = 300
|
|
43
|
+
_PROGRESS_EVENT_TAIL_LIMIT = 1000
|
|
44
|
+
|
|
30
45
|
|
|
31
46
|
def _parse_iso(text: Any) -> datetime | None:
|
|
32
47
|
if not isinstance(text, str) or not text:
|
|
@@ -62,24 +77,105 @@ def _team_last_progress_at(
|
|
|
62
77
|
state: dict[str, Any],
|
|
63
78
|
store: MessageStore,
|
|
64
79
|
owner_team_id: str,
|
|
65
|
-
|
|
66
|
-
|
|
80
|
+
event_log: EventLog | None = None,
|
|
81
|
+
now: datetime | None = None,
|
|
82
|
+
workspace: Path | None = None,
|
|
83
|
+
) -> tuple[datetime | None, str | None]:
|
|
84
|
+
sources: list[tuple[datetime, str]] = []
|
|
67
85
|
coordinator = state.get("coordinator") or {}
|
|
68
86
|
explicit = (coordinator.get("team_last_progress_at") or {}).get(owner_team_id)
|
|
69
87
|
if isinstance(explicit, dict):
|
|
70
88
|
ts = _parse_iso(explicit.get("at"))
|
|
71
89
|
if ts:
|
|
72
|
-
|
|
90
|
+
sources.append((ts, "explicit_marker"))
|
|
73
91
|
elif isinstance(explicit, str):
|
|
74
92
|
ts = _parse_iso(explicit)
|
|
75
93
|
if ts:
|
|
76
|
-
|
|
94
|
+
sources.append((ts, "explicit_marker"))
|
|
77
95
|
health = store.agent_health(owner_team_id=owner_team_id)
|
|
78
96
|
for row in health.values():
|
|
79
97
|
ts = _parse_iso(row.get("last_output_at"))
|
|
80
98
|
if ts:
|
|
81
|
-
|
|
82
|
-
|
|
99
|
+
sources.append((ts, "agent_health.last_output_at"))
|
|
100
|
+
if event_log is not None:
|
|
101
|
+
# Spark MEDIUM #3 (d9f740d): in multi-team workspaces an unscoped progress event in
|
|
102
|
+
# team A's activity must NOT suppress team B's idle_fallback. require_team_scope=True
|
|
103
|
+
# when the workspace has more than one team so unscoped events are ignored. The
|
|
104
|
+
# team-scoped state passed in here does not carry the workspace-level `teams` dict, so
|
|
105
|
+
# we re-read the workspace state from disk to detect multi-team shape.
|
|
106
|
+
require_team_scope = False
|
|
107
|
+
teams = state.get("teams")
|
|
108
|
+
if isinstance(teams, dict) and len(teams) > 1:
|
|
109
|
+
require_team_scope = True
|
|
110
|
+
elif workspace is not None:
|
|
111
|
+
try:
|
|
112
|
+
ws_teams = (load_runtime_state(workspace).get("teams") or {})
|
|
113
|
+
except Exception:
|
|
114
|
+
ws_teams = {}
|
|
115
|
+
if isinstance(ws_teams, dict) and len(ws_teams) > 1:
|
|
116
|
+
require_team_scope = True
|
|
117
|
+
event_ts = _scan_event_progress_signals(
|
|
118
|
+
event_log, owner_team_id, now or datetime.now(timezone.utc),
|
|
119
|
+
require_team_scope=require_team_scope,
|
|
120
|
+
)
|
|
121
|
+
if event_ts:
|
|
122
|
+
sources.append((event_ts, "event_log"))
|
|
123
|
+
if not sources:
|
|
124
|
+
return None, None
|
|
125
|
+
sources.sort(key=lambda item: item[0], reverse=True)
|
|
126
|
+
return sources[0]
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
# Stage 14 (Gap 36b) — mtime cache per (workspace_path, owner_team_id, require_team_scope).
|
|
130
|
+
# Mac mini 2026-05-26 evidence: _scan_event_progress_signals was a 22% CPU hot path because
|
|
131
|
+
# every 2-second coordinator tick parsed up to 1000 events from a 28 MB events.jsonl. With
|
|
132
|
+
# the cache, the parse only re-runs when the file changes; quiet workspaces pay zero file
|
|
133
|
+
# I/O between writes.
|
|
134
|
+
_PROGRESS_SCAN_CACHE: dict[tuple[str, str, bool], tuple[float, datetime | None]] = {}
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _scan_event_progress_signals(
    event_log: EventLog,
    owner_team_id: str,
    now: datetime,
    *,
    require_team_scope: bool = False,
) -> datetime | None:
    """Return the newest progress-event timestamp for a team within the window.

    Scans the last ``_PROGRESS_EVENT_TAIL_LIMIT`` events of ``event_log`` for
    events whose type is in ``_PROGRESS_EVENT_TYPES`` or starts with one of
    ``_PROGRESS_EVENT_PREFIXES``, and that are no older than
    ``_PROGRESS_EVENT_WINDOW_SECONDS`` relative to ``now``.

    Team scoping uses the event's ``team`` (or ``owner_team_id``) field:
      * events tagged with a different team are always ignored;
      * untagged events are ignored only when ``require_team_scope`` is True.

    Results are memoized in ``_PROGRESS_SCAN_CACHE`` per
    ``(log path, owner_team_id, require_team_scope)`` and reused while the log
    file's mtime is unchanged. A memoized timestamp is re-checked against the
    current window before being returned, so an unchanged log cannot keep
    reporting progress that has since aged out.

    Returns:
        The latest matching timestamp, or None when there is none (including
        when the log file does not exist).
    """
    cache_key = (str(event_log.path), owner_team_id, require_team_scope)
    try:
        current_mtime = event_log.path.stat().st_mtime
    except FileNotFoundError:
        # No log file: forget any stale entry and report no progress.
        _PROGRESS_SCAN_CACHE.pop(cache_key, None)
        return None

    window_start = now - timedelta(seconds=_PROGRESS_EVENT_WINDOW_SECONDS)
    cached = _PROGRESS_SCAN_CACHE.get(cache_key)
    if cached is not None and cached[0] == current_mtime:
        cached_latest = cached[1]
        # The cache is keyed on file mtime only, but the answer also depends
        # on `now`: apply the same window filter the uncached path applies.
        if cached_latest is not None and cached_latest < window_start:
            return None
        return cached_latest

    latest: datetime | None = None
    for event in event_log.tail(_PROGRESS_EVENT_TAIL_LIMIT):
        event_type = str(event.get("event") or "")
        if event_type not in _PROGRESS_EVENT_TYPES and not event_type.startswith(
            _PROGRESS_EVENT_PREFIXES
        ):
            continue
        event_team = event.get("team") or event.get("owner_team_id")
        if event_team is None:
            if require_team_scope:
                continue
        elif event_team != owner_team_id:
            continue
        ts = _parse_iso(event.get("ts"))
        if not ts or ts < window_start:
            continue
        if latest is None or ts > latest:
            latest = ts
    _PROGRESS_SCAN_CACHE[cache_key] = (current_mtime, latest)
    return latest
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def _reset_progress_scan_cache() -> None:
    """Test-only hook to force re-scan.

    Empties ``_PROGRESS_SCAN_CACHE`` so the next
    ``_scan_event_progress_signals`` call re-reads the event log instead of
    returning a memoized result.
    """
    _PROGRESS_SCAN_CACHE.clear()
|
|
83
179
|
|
|
84
180
|
|
|
85
181
|
def _team_last_idle_fallback_fire_at(state: dict[str, Any], owner_team_id: str) -> datetime | None:
|
|
@@ -209,14 +305,18 @@ def detect_idle_fallbacks(
|
|
|
209
305
|
record_team_progress(state, now, source="all_workers_idle:false", owner_team_id=owner_team_id)
|
|
210
306
|
save_runtime_state(workspace, state)
|
|
211
307
|
return []
|
|
212
|
-
last_progress = _team_last_progress_at(
|
|
308
|
+
last_progress, progress_source = _team_last_progress_at(
|
|
309
|
+
state, store, owner_team_id, event_log=event_log, now=now, workspace=workspace,
|
|
310
|
+
)
|
|
213
311
|
if last_progress and (now - last_progress) < timedelta(seconds=STABLE_IDLE_SECONDS):
|
|
312
|
+
reason = "recent_team_progress" if progress_source == "event_log" else "stable_idle_window"
|
|
214
313
|
event_log.write(
|
|
215
314
|
"coordinator.idle_fallback_skipped",
|
|
216
|
-
reason=
|
|
315
|
+
reason=reason,
|
|
217
316
|
team=owner_team_id,
|
|
218
317
|
stable_idle_seconds=STABLE_IDLE_SECONDS,
|
|
219
318
|
elapsed_seconds=int((now - last_progress).total_seconds()),
|
|
319
|
+
progress_source=progress_source,
|
|
220
320
|
)
|
|
221
321
|
return []
|
|
222
322
|
last_fire = _team_last_idle_fallback_fire_at(state, owner_team_id)
|