@team-agent/installer 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/team_agent/cli/__init__.py +2 -0
- package/src/team_agent/cli/commands.py +22 -3
- package/src/team_agent/cli/parser.py +40 -1
- package/src/team_agent/coordinator/__main__.py +21 -2
- package/src/team_agent/coordinator/lifecycle.py +23 -0
- package/src/team_agent/diagnose/orphan_cleanup.py +193 -0
- package/src/team_agent/events.py +47 -0
- package/src/team_agent/leader/__init__.py +273 -60
- package/src/team_agent/lifecycle/agents.py +54 -2
- package/src/team_agent/lifecycle/operations.py +86 -9
- package/src/team_agent/lifecycle/paste_buffer_hygiene.py +39 -0
- package/src/team_agent/lifecycle/start.py +3 -0
- package/src/team_agent/message_store/leader_notification_log.py +132 -0
- package/src/team_agent/message_store/result_watchers.py +144 -1
- package/src/team_agent/message_store/schema.py +23 -0
- package/src/team_agent/messaging/delivery.py +10 -0
- package/src/team_agent/messaging/idle_alerts.py +227 -21
- package/src/team_agent/messaging/leader.py +166 -6
- package/src/team_agent/messaging/leader_panes.py +193 -23
- package/src/team_agent/messaging/owner_bypass.py +29 -0
- package/src/team_agent/messaging/result_delivery.py +219 -4
- package/src/team_agent/messaging/results.py +12 -21
- package/src/team_agent/messaging/scheduler.py +22 -2
- package/src/team_agent/messaging/send.py +9 -2
- package/src/team_agent/messaging/session_drift.py +94 -0
- package/src/team_agent/runtime.py +22 -14
- package/src/team_agent/rust_core.py +157 -3
- package/src/team_agent/state.py +167 -10
- package/src/team_agent/status/inbox.py +33 -3
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import hashlib
|
|
4
|
+
|
|
3
5
|
from team_agent.messaging.deps import (
|
|
4
6
|
EventLog,
|
|
5
7
|
RuntimeError,
|
|
@@ -9,6 +11,7 @@ from team_agent.messaging.deps import (
|
|
|
9
11
|
_tmux_current_client_pane_info as _runtime_tmux_current_client_pane_info,
|
|
10
12
|
_tmux_list_panes as _runtime_tmux_list_panes,
|
|
11
13
|
_tmux_pane_info as _runtime_tmux_pane_info,
|
|
14
|
+
_tmux_inject_text,
|
|
12
15
|
core_list_targets,
|
|
13
16
|
datetime,
|
|
14
17
|
os,
|
|
@@ -20,6 +23,8 @@ from team_agent.messaging.deps import (
|
|
|
20
23
|
from pathlib import Path
|
|
21
24
|
from typing import Any
|
|
22
25
|
|
|
26
|
+
_AMBIGUOUS_DEBOUNCE_SECONDS = 60
|
|
27
|
+
|
|
23
28
|
def _resolve_leader_pane(
|
|
24
29
|
pane: str | None,
|
|
25
30
|
provider: str,
|
|
@@ -208,17 +213,40 @@ def _target_fingerprint(pane_info: dict[str, Any]) -> str:
|
|
|
208
213
|
)
|
|
209
214
|
|
|
210
215
|
|
|
216
|
+
def is_bound_pane_still_valid(state: dict[str, Any], store: Any | None = None) -> dict[str, Any]:
    """Validate the leader receiver recorded in ``state``.

    Operates on a shallow copy of ``state["leader_receiver"]``. When that copy has
    no ``leader_session_uuid`` but ``state["team_owner"]`` is a dict that carries
    one, the owner's UUID is copied in before validation so the check is made
    against the stored owner identity.

    ``store`` is accepted for interface compatibility and is not used here.

    Returns:
        Whatever ``_validate_leader_receiver`` returns for the prepared receiver.
    """
    bound = dict(state.get("leader_receiver") or {})
    team_owner = state.get("team_owner")
    owner_uuid = team_owner.get("leader_session_uuid") if isinstance(team_owner, dict) else None
    # Only backfill: a UUID already present on the receiver is never overwritten.
    if owner_uuid and not bound.get("leader_session_uuid"):
        bound["leader_session_uuid"] = owner_uuid
    return _validate_leader_receiver(bound)
|
|
222
|
+
|
|
223
|
+
|
|
211
224
|
def _rediscover_leader_receiver(
|
|
212
225
|
receiver: dict[str, Any],
|
|
213
226
|
event_log: EventLog,
|
|
214
227
|
owner_identity: dict[str, Any] | None = None,
|
|
228
|
+
invalidation_reason: str | None = None,
|
|
229
|
+
team_id: str | None = None,
|
|
215
230
|
) -> dict[str, Any]:
|
|
216
231
|
provider = str(receiver.get("provider") or "codex")
|
|
217
|
-
if provider
|
|
218
|
-
return {"status": "missing", "reason": "
|
|
232
|
+
if provider == "fake":
|
|
233
|
+
return {"status": "missing", "reason": "rediscovery_not_supported_for_fake"}
|
|
219
234
|
targets = core_list_targets()
|
|
220
235
|
if not targets.get("ok"):
|
|
221
236
|
event_log.write("leader_receiver.rediscover_failed", provider=provider, error=targets.get("error"))
|
|
237
|
+
# Stage 15 CI fix: when the tmux target scan itself fails (no server, no daemon,
|
|
238
|
+
# CI env without tmux), the caller has no way to recover unless we also emit
|
|
239
|
+
# rebind_required. Without this, _refresh_leader_receiver_or_flag_rebind silently
|
|
240
|
+
# returns and report_result queues against the stale pane with zero audit signal.
|
|
241
|
+
event_log.write(
|
|
242
|
+
"leader_receiver.rebind_required",
|
|
243
|
+
old_pane_id=receiver.get("pane_id"),
|
|
244
|
+
reason=invalidation_reason,
|
|
245
|
+
provider=provider,
|
|
246
|
+
team_id=team_id,
|
|
247
|
+
rediscovery_status="failed",
|
|
248
|
+
error=targets.get("error"),
|
|
249
|
+
)
|
|
222
250
|
return {"status": "failed", "error": targets.get("error")}
|
|
223
251
|
candidates = [
|
|
224
252
|
target
|
|
@@ -228,16 +256,26 @@ def _rediscover_leader_receiver(
|
|
|
228
256
|
if owner_identity:
|
|
229
257
|
owner_candidates = [target for target in candidates if _target_matches_owner_identity(target, owner_identity)]
|
|
230
258
|
if len(owner_candidates) == 1:
|
|
231
|
-
return _rediscovered_receiver(receiver, provider, owner_candidates[0], event_log, owner_identity)
|
|
259
|
+
return _rediscovered_receiver(receiver, provider, owner_candidates[0], event_log, owner_identity, invalidation_reason)
|
|
232
260
|
if len(owner_candidates) > 1:
|
|
261
|
+
incident = _broadcast_ambiguous_candidates(
|
|
262
|
+
receiver,
|
|
263
|
+
provider,
|
|
264
|
+
owner_candidates,
|
|
265
|
+
event_log,
|
|
266
|
+
owner_identity,
|
|
267
|
+
team_id,
|
|
268
|
+
)
|
|
233
269
|
event_log.write(
|
|
234
270
|
"leader_receiver.rediscover_ambiguous",
|
|
235
271
|
provider=provider,
|
|
236
272
|
old_target=receiver.get("pane_id"),
|
|
237
273
|
candidates=[target.get("pane_id") for target in owner_candidates],
|
|
238
274
|
owner_identity=owner_identity,
|
|
275
|
+
incident_id=incident.get("incident_id"),
|
|
276
|
+
deduped=incident.get("deduped"),
|
|
239
277
|
)
|
|
240
|
-
return {"status": "ambiguous", "candidates": owner_candidates, "owner_identity": owner_identity}
|
|
278
|
+
return {"status": "ambiguous", "candidates": owner_candidates, "owner_identity": owner_identity, **incident}
|
|
241
279
|
event_log.write(
|
|
242
280
|
"leader_receiver.rediscover_missing",
|
|
243
281
|
provider=provider,
|
|
@@ -245,9 +283,19 @@ def _rediscover_leader_receiver(
|
|
|
245
283
|
owner_identity=owner_identity,
|
|
246
284
|
candidate_count=len(candidates),
|
|
247
285
|
)
|
|
286
|
+
event_log.write(
|
|
287
|
+
"leader_receiver.rebind_required",
|
|
288
|
+
old_pane_id=receiver.get("pane_id"),
|
|
289
|
+
reason=invalidation_reason,
|
|
290
|
+
provider=provider,
|
|
291
|
+
team_id=team_id,
|
|
292
|
+
uuid_prefix=_uuid_prefix(owner_identity),
|
|
293
|
+
owner_identity=owner_identity,
|
|
294
|
+
recovery_action="open the owning leader pane or run team-agent claim-leader --confirm from a matching pane",
|
|
295
|
+
)
|
|
248
296
|
return {"status": "missing", "owner_identity": owner_identity}
|
|
249
297
|
if len(candidates) == 1:
|
|
250
|
-
return _rediscovered_receiver(receiver, provider, candidates[0], event_log, None)
|
|
298
|
+
return _rediscovered_receiver(receiver, provider, candidates[0], event_log, None, invalidation_reason)
|
|
251
299
|
if len(candidates) > 1:
|
|
252
300
|
event_log.write(
|
|
253
301
|
"leader_receiver.rediscover_ambiguous",
|
|
@@ -255,12 +303,19 @@ def _rediscover_leader_receiver(
|
|
|
255
303
|
old_target=receiver.get("pane_id"),
|
|
256
304
|
candidates=[target.get("pane_id") for target in candidates],
|
|
257
305
|
)
|
|
306
|
+
event_log.write("leader_receiver.rebind_required", old_pane_id=receiver.get("pane_id"), reason=invalidation_reason, provider=provider, team_id=team_id, rediscovery_status="ambiguous")
|
|
258
307
|
return {"status": "ambiguous", "candidates": candidates}
|
|
259
308
|
event_log.write("leader_receiver.rediscover_missing", provider=provider, old_target=receiver.get("pane_id"))
|
|
309
|
+
event_log.write("leader_receiver.rebind_required", old_pane_id=receiver.get("pane_id"), reason=invalidation_reason, provider=provider, team_id=team_id, rediscovery_status="missing")
|
|
260
310
|
return {"status": "missing"}
|
|
261
311
|
|
|
262
312
|
|
|
263
313
|
def _target_matches_owner_identity(target: dict[str, Any], owner_identity: dict[str, Any]) -> bool:
|
|
314
|
+
expected_uuid = owner_identity.get("leader_session_uuid")
|
|
315
|
+
if expected_uuid:
|
|
316
|
+
actual_uuid = _target_leader_session_uuid(target)
|
|
317
|
+
if actual_uuid:
|
|
318
|
+
return actual_uuid == expected_uuid
|
|
264
319
|
env = target.get("leader_env") if isinstance(target.get("leader_env"), dict) else {}
|
|
265
320
|
return (
|
|
266
321
|
env.get("TEAM_AGENT_LEADER_PANE_ID") == (owner_identity.get("pane_id") or "")
|
|
@@ -269,14 +324,31 @@ def _target_matches_owner_identity(target: dict[str, Any], owner_identity: dict[
|
|
|
269
324
|
)
|
|
270
325
|
|
|
271
326
|
|
|
272
|
-
def
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
)
|
|
279
|
-
|
|
327
|
+
def _target_leader_session_uuid(target: dict[str, Any]) -> str:
|
|
328
|
+
env = target.get("leader_env") if isinstance(target.get("leader_env"), dict) else {}
|
|
329
|
+
return str(target.get("leader_session_uuid") or env.get("TEAM_AGENT_LEADER_SESSION_UUID") or "")
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
def _leader_uuid_for_bound_pane(receiver: dict[str, Any], pane_info: dict[str, Any]) -> str:
    """Find the leader session UUID associated with the currently bound pane.

    Lookup order:
      1. the UUID carried by ``pane_info`` itself,
      2. the UUID carried by ``receiver``,
      3. the entry returned by ``core_list_targets()`` whose ``pane_id`` equals
         ``pane_info["pane_id"]``.

    Returns ``""`` when the target listing reports a failure (``ok`` falsy) or no
    listed target matches the pane.
    """
    for source in (pane_info, receiver):
        known = _target_leader_session_uuid(source)
        if known:
            return known

    listing = core_list_targets()
    if not listing.get("ok"):
        return ""

    wanted = pane_info.get("pane_id")
    match = next(
        (entry for entry in listing.get("targets", []) if entry.get("pane_id") == wanted),
        None,
    )
    return _target_leader_session_uuid(match) if match is not None else ""
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
def _uuid_prefix(owner_identity: dict[str, Any] | None) -> str:
|
|
347
|
+
return str((owner_identity or {}).get("leader_session_uuid") or "")[:8]
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
def _receiver_from_target(target: dict[str, Any], provider: str, leader_uuid: str | None, owner_epoch: int | None = None) -> dict[str, Any]:
|
|
351
|
+
receiver = {
|
|
280
352
|
"mode": "direct_tmux",
|
|
281
353
|
"status": "attached",
|
|
282
354
|
"provider": provider,
|
|
@@ -289,8 +361,83 @@ def _rediscovered_receiver(
|
|
|
289
361
|
"pane_current_command": target["pane_current_command"],
|
|
290
362
|
"fingerprint": target.get("fingerprint") or _target_fingerprint(target),
|
|
291
363
|
"attached_at": datetime.now(timezone.utc).isoformat(),
|
|
292
|
-
"discovery": "stale_rediscovery_owner_identity" if owner_identity else "stale_rediscovery_unique_candidate",
|
|
293
364
|
}
|
|
365
|
+
if leader_uuid:
|
|
366
|
+
receiver["leader_session_uuid"] = leader_uuid
|
|
367
|
+
if owner_epoch is not None:
|
|
368
|
+
receiver["owner_epoch"] = owner_epoch
|
|
369
|
+
return receiver
|
|
370
|
+
|
|
371
|
+
|
|
372
|
+
def _broadcast_ambiguous_candidates(
    receiver: dict[str, Any],
    provider: str,
    candidates: list[dict[str, Any]],
    event_log: EventLog,
    owner_identity: dict[str, Any],
    team_id: str | None,
) -> dict[str, Any]:
    """Prompt every ambiguous candidate pane to claim leadership, once per debounce bucket.

    An ``incident_id`` is derived (first 16 hex chars of a SHA-256) from the team id,
    the sorted candidate pane ids and the current debounce bucket. If one of the
    events returned by ``event_log.tail(200)`` is a
    ``leader_receiver.ambiguous_candidates`` event with that same id, nothing is
    injected and the call reports ``deduped=True``.

    Otherwise a ``leader_receiver.ambiguous_candidates`` event is written, the claim
    prompt is injected into each candidate pane through ``_tmux_inject_text``, and one
    ``leader_receiver.ambiguous_candidate_queued`` event is written per candidate with
    the injection outcome.

    Returns:
        ``{"incident_id": <id>, "deduped": <bool>}``.
    """
    candidate_ids = sorted(str(entry.get("pane_id")) for entry in candidates)
    bucket = _ambiguous_debounce_bucket()
    digest_input = "\0".join([str(team_id or ""), *candidate_ids, bucket])
    incident_id = hashlib.sha256(digest_input.encode("utf-8")).hexdigest()[:16]

    # Same team + same candidate set + same bucket => already broadcast; stay quiet.
    for logged in event_log.tail(200):
        if (
            logged.get("event") == "leader_receiver.ambiguous_candidates"
            and logged.get("incident_id") == incident_id
        ):
            return {"incident_id": incident_id, "deduped": True}

    prompt = _ambiguous_candidate_prompt(team_id, len(candidates))
    event_log.write(
        "leader_receiver.ambiguous_candidates",
        incident_id=incident_id,
        old_pane_id=receiver.get("pane_id"),
        candidates=candidate_ids,
        provider=provider,
        team_id=team_id,
        uuid_prefix=_uuid_prefix(owner_identity),
        debounce_bucket=bucket,
    )

    for entry in candidates:
        pane_id = str(entry.get("pane_id") or "")
        # The label passed as the fourth argument is unique per incident and pane
        # (the tmux "%" prefix is stripped from the pane id).
        label = f"team-agent-leader-ambiguous-{incident_id}-{pane_id.strip('%')}"
        outcome = _tmux_inject_text(pane_id, prompt, "Enter", label, provider=provider)
        event_log.write(
            "leader_receiver.ambiguous_candidate_queued",
            incident_id=incident_id,
            pane_id=pane_id,
            ok=bool(outcome.get("ok")),
            error=outcome.get("error"),
        )

    return {"incident_id": incident_id, "deduped": False}
|
|
413
|
+
|
|
414
|
+
|
|
415
|
+
def _ambiguous_debounce_bucket() -> str:
    """Return the ISO-8601 UTC timestamp marking the start of the current debounce window.

    The current UTC epoch time is floored to a multiple of
    ``_AMBIGUOUS_DEBOUNCE_SECONDS``, so every call made inside the same window
    yields the same string.
    """
    window = _AMBIGUOUS_DEBOUNCE_SECONDS
    seconds_now = datetime.now(timezone.utc).timestamp()
    window_start = int(seconds_now // window) * window
    return datetime.fromtimestamp(window_start, timezone.utc).isoformat()
|
|
419
|
+
|
|
420
|
+
|
|
421
|
+
def _ambiguous_candidate_prompt(team_id: str | None, candidate_count: int) -> str:
|
|
422
|
+
others = max(candidate_count - 1, 0)
|
|
423
|
+
return (
|
|
424
|
+
f"Team `{team_id or 'current'}` has no bound leader. This window and {others} other window(s) all qualify. "
|
|
425
|
+
"To claim this window as the team leader, run: `team-agent claim-leader --confirm`. "
|
|
426
|
+
"Only the first such call wins; subsequent calls from other windows will be refused."
|
|
427
|
+
)
|
|
428
|
+
|
|
429
|
+
|
|
430
|
+
def _rediscovered_receiver(
|
|
431
|
+
receiver: dict[str, Any],
|
|
432
|
+
provider: str,
|
|
433
|
+
target: dict[str, Any],
|
|
434
|
+
event_log: EventLog,
|
|
435
|
+
owner_identity: dict[str, Any] | None,
|
|
436
|
+
invalidation_reason: str | None = None,
|
|
437
|
+
) -> dict[str, Any]:
|
|
438
|
+
leader_uuid = _target_leader_session_uuid(target) or (owner_identity or {}).get("leader_session_uuid") or receiver.get("leader_session_uuid")
|
|
439
|
+
updated = _receiver_from_target(target, provider, leader_uuid)
|
|
440
|
+
updated["discovery"] = "stale_rediscovery_owner_identity" if owner_identity else "stale_rediscovery_unique_candidate"
|
|
294
441
|
event_log.write(
|
|
295
442
|
"leader_receiver.rediscovered",
|
|
296
443
|
provider=provider,
|
|
@@ -299,6 +446,14 @@ def _rediscovered_receiver(
|
|
|
299
446
|
candidate_count=1,
|
|
300
447
|
owner_identity=owner_identity,
|
|
301
448
|
)
|
|
449
|
+
event_log.write(
|
|
450
|
+
"leader_receiver.rebind_applied",
|
|
451
|
+
old_pane_id=receiver.get("pane_id"),
|
|
452
|
+
new_pane_id=updated["pane_id"],
|
|
453
|
+
reason=invalidation_reason,
|
|
454
|
+
owner_identity=owner_identity,
|
|
455
|
+
uuid_prefix=_uuid_prefix(owner_identity),
|
|
456
|
+
)
|
|
302
457
|
return {"status": "updated", "receiver": updated, "owner_identity": owner_identity}
|
|
303
458
|
|
|
304
459
|
|
|
@@ -306,6 +461,26 @@ def _validate_leader_receiver(receiver: dict[str, Any]) -> dict[str, Any]:
|
|
|
306
461
|
pane_info = _runtime_tmux_pane_info(receiver.get("pane_id"))
|
|
307
462
|
if not pane_info:
|
|
308
463
|
return {"ok": False, "reason": "leader_pane_missing", "error": "tmux pane does not exist"}
|
|
464
|
+
provider = str(receiver.get("provider") or "codex")
|
|
465
|
+
if not _leader_command_looks_usable(pane_info.get("pane_current_command", ""), provider):
|
|
466
|
+
return {
|
|
467
|
+
"ok": False,
|
|
468
|
+
"reason": "leader_pane_wrong_command",
|
|
469
|
+
"error": f"pane command {pane_info.get('pane_current_command')!r} is not a leader host",
|
|
470
|
+
"pane": pane_info,
|
|
471
|
+
}
|
|
472
|
+
expected_uuid = receiver.get("leader_session_uuid")
|
|
473
|
+
if expected_uuid:
|
|
474
|
+
actual_uuid = _leader_uuid_for_bound_pane(receiver, pane_info)
|
|
475
|
+
if not actual_uuid:
|
|
476
|
+
return {"ok": False, "reason": "leader_uuid_missing", "error": "bound pane has no TEAM_AGENT_LEADER_SESSION_UUID", "pane": pane_info}
|
|
477
|
+
if actual_uuid != expected_uuid:
|
|
478
|
+
return {
|
|
479
|
+
"ok": False,
|
|
480
|
+
"reason": "leader_uuid_mismatch",
|
|
481
|
+
"error": "bound pane TEAM_AGENT_LEADER_SESSION_UUID does not match stored team owner",
|
|
482
|
+
"pane": pane_info,
|
|
483
|
+
}
|
|
309
484
|
capture = run_cmd(["tmux", "capture-pane", "-p", "-S", "-40", "-t", pane_info["pane_id"]], timeout=5)
|
|
310
485
|
if capture.returncode != 0:
|
|
311
486
|
return {
|
|
@@ -314,14 +489,7 @@ def _validate_leader_receiver(receiver: dict[str, Any]) -> dict[str, Any]:
|
|
|
314
489
|
"error": capture.stderr.strip() or "tmux capture-pane failed",
|
|
315
490
|
"pane": pane_info,
|
|
316
491
|
}
|
|
317
|
-
warning
|
|
318
|
-
provider = str(receiver.get("provider") or "codex")
|
|
319
|
-
if not _leader_command_looks_usable(pane_info.get("pane_current_command", ""), provider):
|
|
320
|
-
warning = (
|
|
321
|
-
f"pane command {pane_info.get('pane_current_command')!r} is not a typical {provider} host; "
|
|
322
|
-
"continuing because tmux capture works"
|
|
323
|
-
)
|
|
324
|
-
return {"ok": True, "pane": pane_info, "capture": capture.stdout, "warning": warning}
|
|
492
|
+
return {"ok": True, "pane": pane_info, "capture": capture.stdout, "warning": None}
|
|
325
493
|
|
|
326
494
|
|
|
327
495
|
def _leader_command_looks_usable(command: str, provider: str) -> bool:
|
|
@@ -330,7 +498,9 @@ def _leader_command_looks_usable(command: str, provider: str) -> bool:
|
|
|
330
498
|
command_name = Path(command).name
|
|
331
499
|
if provider == "codex":
|
|
332
500
|
return command_name in {"codex", "node", "nodejs"}
|
|
333
|
-
|
|
501
|
+
if provider in {"claude", "claude_code"}:
|
|
502
|
+
return command_name in {"claude", "claude.exe"}
|
|
503
|
+
return command_name in {"codex", "node", "nodejs", "claude", "claude.exe"}
|
|
334
504
|
|
|
335
505
|
|
|
336
506
|
def _choose_leader_submit_key(provider: str, capture_text: str) -> tuple[str, str]:
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from team_agent.events import EventLog
|
|
6
|
+
from team_agent.state import worker_sender_bypasses_owner_gate
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def apply_worker_sender_bypass(
    state: dict[str, Any],
    sender: str | None,
    target: Any,
    task_id: str | None,
    event_log: EventLog,
) -> bool:
    """Report whether ``sender`` bypasses the owner gate, logging an audit event if so.

    Delegates the decision to ``worker_sender_bypasses_owner_gate(state, sender)``.
    A truthy answer is recorded as ``env_team_agent_id`` on a
    ``send.bypassed_owner_gate_worker_sender`` event. ``target`` is logged only when
    it is a string; any other type is logged as ``None``.

    Returns:
        ``True`` when the bypass applies (and the event was written), else ``False``.
    """
    bypass_via = worker_sender_bypasses_owner_gate(state, sender)
    if bypass_via:
        event_log.write(
            "send.bypassed_owner_gate_worker_sender",
            sender=sender,
            env_team_agent_id=bypass_via,
            target=target if isinstance(target, str) else None,
            task_id=task_id,
        )
        return True
    return False
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
__all__ = ["apply_worker_sender_bypass"]
|
|
@@ -1,11 +1,14 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import json
|
|
4
|
+
from datetime import datetime, timezone
|
|
4
5
|
from pathlib import Path
|
|
5
6
|
from typing import Any
|
|
6
7
|
|
|
7
8
|
from team_agent.events import EventLog
|
|
8
9
|
from team_agent.message_store import MessageStore
|
|
10
|
+
from team_agent.message_store.leader_notification_log import peek_leader_notification
|
|
11
|
+
from team_agent.message_store.result_watchers import leader_notified_message_id_for_result
|
|
9
12
|
from team_agent.messaging.deps import send_message
|
|
10
13
|
from team_agent.messaging.internal_delivery import deliver_stored_message
|
|
11
14
|
|
|
@@ -22,7 +25,13 @@ def retry_result_deliveries(workspace: Path, event_log: EventLog) -> list[dict[s
|
|
|
22
25
|
row = store.result_by_id(str(watcher["result_id"]))
|
|
23
26
|
if not row:
|
|
24
27
|
continue
|
|
25
|
-
notified.extend(notify_result_watchers(
|
|
28
|
+
notified.extend(notify_result_watchers(
|
|
29
|
+
workspace,
|
|
30
|
+
_result_entry_from_row(row),
|
|
31
|
+
event_log,
|
|
32
|
+
watchers=[watcher],
|
|
33
|
+
dedupe_reason="rebind_retry",
|
|
34
|
+
))
|
|
26
35
|
return notified
|
|
27
36
|
|
|
28
37
|
|
|
@@ -31,6 +40,7 @@ def notify_result_watchers(
|
|
|
31
40
|
result: dict[str, Any],
|
|
32
41
|
event_log: EventLog,
|
|
33
42
|
watchers: list[dict[str, Any]] | None = None,
|
|
43
|
+
dedupe_reason: str | None = None,
|
|
34
44
|
) -> list[dict[str, Any]]:
|
|
35
45
|
store = MessageStore(workspace)
|
|
36
46
|
candidates = [
|
|
@@ -67,9 +77,44 @@ def notify_result_watchers(
|
|
|
67
77
|
}
|
|
68
78
|
)
|
|
69
79
|
attempts = result_delivery_attempts(event_log, primary["watcher_id"], str(result.get("result_id") or ""))
|
|
80
|
+
# Stage 12 (Gap 26 ∩ Gap 32 roundtable consolidation 2026-05-26): exactly-once dedupe
|
|
81
|
+
# lives in leader_notification_log keyed by (result_id, leader_session_uuid) and is
|
|
82
|
+
# consulted atomically at the injection boundary inside _send_to_leader_receiver. Here
|
|
83
|
+
# we add a read-only fast-path peek so concurrent notify_result_watchers calls for the
|
|
84
|
+
# same result short-circuit without spinning up a deliver_stored_message round-trip.
|
|
85
|
+
# The peek is NOT the dedupe primitive — the atomic INSERT OR IGNORE at injection is.
|
|
86
|
+
result_id_str = str(result.get("result_id") or "") or None
|
|
87
|
+
if result_id_str:
|
|
88
|
+
leader_uuid = _resolve_leader_session_uuid(workspace, primary.get("owner_team_id"))
|
|
89
|
+
if leader_uuid:
|
|
90
|
+
prior = peek_leader_notification(
|
|
91
|
+
store, result_id=result_id_str, leader_session_uuid=leader_uuid,
|
|
92
|
+
)
|
|
93
|
+
if prior:
|
|
94
|
+
notified.append(_mark_watcher_dedupe_skip(
|
|
95
|
+
store, event_log, primary, result, attempts,
|
|
96
|
+
prior["notified_message_id"],
|
|
97
|
+
dedupe_reason or "injection_log_already_notified",
|
|
98
|
+
notified_at=prior.get("notified_at"),
|
|
99
|
+
leader_session_uuid=leader_uuid,
|
|
100
|
+
))
|
|
101
|
+
return notified
|
|
102
|
+
# Legacy compat: watcher.notified_message_id set by a prior path (Gap 32 reversal of
|
|
103
|
+
# 78055bc, or any pre-Stage-12 code) also blocks redelivery. This preserves the
|
|
104
|
+
# Stage 11.9-11.12 era contract while the new gate (leader_notification_log) is the
|
|
105
|
+
# authoritative dedupe primitive going forward.
|
|
106
|
+
legacy_canonical = leader_notified_message_id_for_result(
|
|
107
|
+
store, primary.get("owner_team_id"), result_id_str,
|
|
108
|
+
)
|
|
109
|
+
if legacy_canonical:
|
|
110
|
+
notified.append(_mark_watcher_dedupe_skip(
|
|
111
|
+
store, event_log, primary, result, attempts,
|
|
112
|
+
legacy_canonical,
|
|
113
|
+
dedupe_reason or "rebind_retry",
|
|
114
|
+
))
|
|
115
|
+
return notified
|
|
70
116
|
existing = delivered_result_message(
|
|
71
|
-
store,
|
|
72
|
-
str(result.get("result_id") or ""),
|
|
117
|
+
store, str(result.get("result_id") or ""),
|
|
73
118
|
task_id=result.get("task_id"),
|
|
74
119
|
owner_team_id=primary.get("owner_team_id"),
|
|
75
120
|
)
|
|
@@ -83,6 +128,75 @@ def notify_result_watchers(
|
|
|
83
128
|
return notified
|
|
84
129
|
|
|
85
130
|
|
|
131
|
+
def _resolve_leader_session_uuid(workspace: Path, owner_team_id: str | None) -> str | None:
|
|
132
|
+
"""Helper: read the team's leader_session_uuid from runtime state for gate lookups."""
|
|
133
|
+
try:
|
|
134
|
+
from team_agent.messaging.deps import load_runtime_state, team_state_key
|
|
135
|
+
state = load_runtime_state(workspace)
|
|
136
|
+
if owner_team_id and isinstance(state.get("teams"), dict):
|
|
137
|
+
scoped = state["teams"].get(owner_team_id)
|
|
138
|
+
if isinstance(scoped, dict):
|
|
139
|
+
state = scoped
|
|
140
|
+
elif owner_team_id and team_state_key(state) != owner_team_id:
|
|
141
|
+
return None
|
|
142
|
+
owner = state.get("team_owner") or {}
|
|
143
|
+
return str(owner.get("leader_session_uuid") or "") or None
|
|
144
|
+
except Exception:
|
|
145
|
+
return None
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _infer_dedupe_reason(primary: dict[str, Any], store: MessageStore) -> str:
|
|
149
|
+
if primary.get("notified_message_id"):
|
|
150
|
+
return "rebind_retry"
|
|
151
|
+
return "watcher_duplicate"
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def _mark_watcher_dedupe_skip(
|
|
155
|
+
store: MessageStore,
|
|
156
|
+
event_log: EventLog,
|
|
157
|
+
watcher: dict[str, Any],
|
|
158
|
+
result: dict[str, Any],
|
|
159
|
+
attempts: int,
|
|
160
|
+
canonical_message_id: str,
|
|
161
|
+
reason: str,
|
|
162
|
+
*,
|
|
163
|
+
notified_at: str | None = None,
|
|
164
|
+
leader_session_uuid: str | None = None,
|
|
165
|
+
) -> dict[str, Any]:
|
|
166
|
+
original_message_id = watcher.get("notified_message_id")
|
|
167
|
+
# Stage 12: the canonical message_id (or sentinel from the gate) is auditing metadata
|
|
168
|
+
# here. The authoritative dedupe gate is leader_notification_log; this mark just keeps
|
|
169
|
+
# the watcher row from being re-picked by retry scans.
|
|
170
|
+
store.mark_result_watcher(
|
|
171
|
+
watcher["watcher_id"],
|
|
172
|
+
"notified",
|
|
173
|
+
result_id=result.get("result_id"),
|
|
174
|
+
notified_message_id=canonical_message_id,
|
|
175
|
+
)
|
|
176
|
+
event_log.write(
|
|
177
|
+
"leader_receiver.notification_dedupe_skip",
|
|
178
|
+
result_id=result.get("result_id"),
|
|
179
|
+
original_message_id=original_message_id,
|
|
180
|
+
suppressed_message_id=canonical_message_id,
|
|
181
|
+
reason=reason,
|
|
182
|
+
team_id=watcher.get("owner_team_id"),
|
|
183
|
+
watcher_id=watcher["watcher_id"],
|
|
184
|
+
task_id=result.get("task_id"),
|
|
185
|
+
agent_id=result.get("agent_id"),
|
|
186
|
+
attempt=attempts + 1,
|
|
187
|
+
leader_session_uuid=leader_session_uuid,
|
|
188
|
+
prior_notified_at=notified_at,
|
|
189
|
+
)
|
|
190
|
+
return {
|
|
191
|
+
"watcher_id": watcher["watcher_id"],
|
|
192
|
+
"result_id": result.get("result_id"),
|
|
193
|
+
"ok": True,
|
|
194
|
+
"message_id": canonical_message_id,
|
|
195
|
+
"deduped": True,
|
|
196
|
+
"dedupe_reason": reason,
|
|
197
|
+
}
|
|
198
|
+
|
|
199
|
+
|
|
86
200
|
def _dedupe_watchers_for_result(
|
|
87
201
|
watchers: list[dict[str, Any]],
|
|
88
202
|
) -> tuple[dict[str, Any], list[dict[str, Any]]]:
|
|
@@ -114,11 +228,19 @@ def _deliver_result_to_watcher(
|
|
|
114
228
|
return _mark_delivery_failed(store, event_log, watcher, result, attempts, str(exc))
|
|
115
229
|
status = "notified" if delivery.get("ok") else "notify_failed"
|
|
116
230
|
error = delivery.get("reason") or delivery.get("error")
|
|
231
|
+
# Stage 12: notified_message_id is now auditing metadata. The exactly-once contract
|
|
232
|
+
# lives in the leader_notification_log table consulted by _send_to_leader_receiver;
|
|
233
|
+
# whatever the gate suppresses comes back as ok=true deduped=true, and the watcher row
|
|
234
|
+
# records this as a successful notification with the canonical message_id.
|
|
235
|
+
persisted_message_id = (
|
|
236
|
+
delivery.get("canonical_message_id") if delivery.get("deduped")
|
|
237
|
+
else (delivery.get("message_id") if delivery.get("ok") else None)
|
|
238
|
+
)
|
|
117
239
|
store.mark_result_watcher(
|
|
118
240
|
watcher["watcher_id"],
|
|
119
241
|
status,
|
|
120
242
|
result_id=result.get("result_id"),
|
|
121
|
-
notified_message_id=
|
|
243
|
+
notified_message_id=persisted_message_id,
|
|
122
244
|
error=error,
|
|
123
245
|
)
|
|
124
246
|
event_log.write(
|
|
@@ -279,6 +401,99 @@ def watcher_matches_result(watcher: dict[str, Any], result: dict[str, Any]) -> b
|
|
|
279
401
|
return (not task_id or task_id == result.get("task_id")) and (not agent_id or agent_id == result.get("agent_id"))
|
|
280
402
|
|
|
281
403
|
|
|
404
|
+
def requeue_after_claim_leader(
    workspace: Path,
    store: MessageStore,
    event_log: EventLog,
    owner_team_id: str,
    claimed_pane_id: str,
    *,
    incident_ts: str | None = None,
) -> list[dict[str, Any]]:
    """Re-queue this team's un-notified result watchers after a leader claim.

    Post-claim hook (Gap 26 / Stage 11.10): every leader-bound notification for
    ``owner_team_id`` that has not been delivered yet is flipped to ``notify_failed``
    so the regular retry path picks it up again, then one retry pass is triggered.

    Selection rules, as implemented below:
      - only watchers returned by ``store.result_watchers(owner_team_id=...)``;
      - watchers with a truthy ``notified_message_id`` are skipped;
      - when ``incident_ts`` parses, a watcher whose timestamp (``completed_at``,
        falling back to ``created_at``) parses and is earlier than it is skipped;
        watchers with no parseable timestamp are NOT filtered out;
      - the watcher's current status is not consulted; it is only recorded as
        ``prior_state``.

    For each selected watcher a ``leader_receiver.claim_requeue`` event is written.
    If at least one watcher was selected, ``retry_result_deliveries(workspace,
    event_log)`` is called once; any exception it raises is logged as
    ``leader_receiver.claim_requeue_delivery_failed`` and swallowed, so the claim
    itself never fails because of a delivery problem.

    This function does not re-fetch rows or guard against a concurrent scheduled
    retry itself; per the Stage 11.12 note below, that race is left to the
    downstream dedupe gate.

    Args:
        workspace: Workspace passed through to ``retry_result_deliveries``.
        store: Message store providing ``result_watchers`` and ``mark_result_watcher``.
        event_log: Destination for audit events.
        owner_team_id: Team whose watchers are considered.
        claimed_pane_id: Newly claimed pane id; recorded in events only.
        incident_ts: Optional ISO-8601 lower bound for watcher activity.

    Returns:
        One ``{"watcher_id", "result_id", "prior_state"}`` dict per re-queued
        watcher (possibly empty).
    """
    # Stage 11.12: CAS re-fetch + claim_requeue_already_in_flight event retired. The atomic
    # UPSERT in notify_result_watchers (claim_leader_notification) is now the single race
    # gate. We mark eligible watchers to notify_failed and let retry_result_deliveries route
    # through the UPSERT — concurrent claim/scheduled-retry paths both pass through the
    # same atomic claim and only one fires deliver_attempt.
    incident_dt = _parse_iso(incident_ts)
    requeued: list[dict[str, Any]] = []
    for watcher in store.result_watchers(owner_team_id=owner_team_id):
        if watcher.get("notified_message_id"):
            continue
        latest_ts = _parse_iso(watcher.get("completed_at")) or _parse_iso(watcher.get("created_at"))
        # Filter only when both sides are known; unknown timestamps err toward re-queueing.
        if incident_dt and latest_ts and latest_ts < incident_dt:
            continue
        watcher_id = watcher["watcher_id"]
        prior_state = str(watcher.get("status") or "")
        store.mark_result_watcher(
            watcher_id, "notify_failed",
            result_id=watcher.get("result_id"),
        )
        event_log.write(
            "leader_receiver.claim_requeue",
            result_id=watcher.get("result_id"),
            watcher_id=watcher_id,
            prior_state=prior_state,
            requeued_at=datetime.now(timezone.utc).isoformat(),
            claimed_pane_id=claimed_pane_id,
            team_id=owner_team_id,
        )
        requeued.append({
            "watcher_id": watcher_id,
            "result_id": watcher.get("result_id"),
            "prior_state": prior_state,
        })
    if requeued:
        try:
            retry_result_deliveries(workspace, event_log)
        except Exception as exc:
            # Deliberate best-effort: the watchers stay in notify_failed for the next
            # retry pass, and the failure is surfaced in the event log.
            event_log.write(
                "leader_receiver.claim_requeue_delivery_failed",
                error=str(exc),
                watcher_ids=[r["watcher_id"] for r in requeued],
                team_id=owner_team_id,
                claimed_pane_id=claimed_pane_id,
            )
    return requeued
|
|
483
|
+
|
|
484
|
+
|
|
485
|
+
def _parse_iso(text: Any) -> datetime | None:
|
|
486
|
+
if not isinstance(text, str) or not text:
|
|
487
|
+
return None
|
|
488
|
+
try:
|
|
489
|
+
dt = datetime.fromisoformat(text.replace("Z", "+00:00"))
|
|
490
|
+
except ValueError:
|
|
491
|
+
return None
|
|
492
|
+
if dt.tzinfo is None:
|
|
493
|
+
dt = dt.replace(tzinfo=timezone.utc)
|
|
494
|
+
return dt
|
|
495
|
+
|
|
496
|
+
|
|
282
497
|
def format_result_watcher_notification(result: dict[str, Any]) -> str:
|
|
283
498
|
task_id = result.get("task_id") or "unknown task"
|
|
284
499
|
agent_id = result.get("agent_id") or "unknown agent"
|