@team-agent/installer 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/team_agent/cli/commands.py +18 -3
- package/src/team_agent/cli/parser.py +33 -1
- package/src/team_agent/coordinator/__main__.py +21 -2
- package/src/team_agent/coordinator/lifecycle.py +8 -0
- package/src/team_agent/diagnose/orphan_cleanup.py +193 -0
- package/src/team_agent/events.py +47 -0
- package/src/team_agent/leader/__init__.py +273 -60
- package/src/team_agent/lifecycle/agents.py +54 -2
- package/src/team_agent/lifecycle/operations.py +86 -9
- package/src/team_agent/message_store/leader_notification_log.py +132 -0
- package/src/team_agent/message_store/result_watchers.py +144 -1
- package/src/team_agent/message_store/schema.py +23 -0
- package/src/team_agent/messaging/idle_alerts.py +109 -9
- package/src/team_agent/messaging/leader.py +166 -6
- package/src/team_agent/messaging/leader_panes.py +193 -23
- package/src/team_agent/messaging/result_delivery.py +219 -4
- package/src/team_agent/messaging/results.py +12 -21
- package/src/team_agent/messaging/scheduler.py +12 -2
- package/src/team_agent/runtime.py +4 -4
- package/src/team_agent/rust_core.py +157 -3
- package/src/team_agent/state.py +153 -10
- package/src/team_agent/status/inbox.py +33 -3
|
@@ -1,11 +1,14 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import json
|
|
4
|
+
from datetime import datetime, timezone
|
|
4
5
|
from pathlib import Path
|
|
5
6
|
from typing import Any
|
|
6
7
|
|
|
7
8
|
from team_agent.events import EventLog
|
|
8
9
|
from team_agent.message_store import MessageStore
|
|
10
|
+
from team_agent.message_store.leader_notification_log import peek_leader_notification
|
|
11
|
+
from team_agent.message_store.result_watchers import leader_notified_message_id_for_result
|
|
9
12
|
from team_agent.messaging.deps import send_message
|
|
10
13
|
from team_agent.messaging.internal_delivery import deliver_stored_message
|
|
11
14
|
|
|
@@ -22,7 +25,13 @@ def retry_result_deliveries(workspace: Path, event_log: EventLog) -> list[dict[s
|
|
|
22
25
|
row = store.result_by_id(str(watcher["result_id"]))
|
|
23
26
|
if not row:
|
|
24
27
|
continue
|
|
25
|
-
notified.extend(notify_result_watchers(
|
|
28
|
+
notified.extend(notify_result_watchers(
|
|
29
|
+
workspace,
|
|
30
|
+
_result_entry_from_row(row),
|
|
31
|
+
event_log,
|
|
32
|
+
watchers=[watcher],
|
|
33
|
+
dedupe_reason="rebind_retry",
|
|
34
|
+
))
|
|
26
35
|
return notified
|
|
27
36
|
|
|
28
37
|
|
|
@@ -31,6 +40,7 @@ def notify_result_watchers(
|
|
|
31
40
|
result: dict[str, Any],
|
|
32
41
|
event_log: EventLog,
|
|
33
42
|
watchers: list[dict[str, Any]] | None = None,
|
|
43
|
+
dedupe_reason: str | None = None,
|
|
34
44
|
) -> list[dict[str, Any]]:
|
|
35
45
|
store = MessageStore(workspace)
|
|
36
46
|
candidates = [
|
|
@@ -67,9 +77,44 @@ def notify_result_watchers(
|
|
|
67
77
|
}
|
|
68
78
|
)
|
|
69
79
|
attempts = result_delivery_attempts(event_log, primary["watcher_id"], str(result.get("result_id") or ""))
|
|
80
|
+
# Stage 12 (Gap 26 ∩ Gap 32 roundtable consolidation 2026-05-26): exactly-once dedupe
|
|
81
|
+
# lives in leader_notification_log keyed by (result_id, leader_session_uuid) and is
|
|
82
|
+
# consulted atomically at the injection boundary inside _send_to_leader_receiver. Here
|
|
83
|
+
# we add a read-only fast-path peek so concurrent notify_result_watchers calls for the
|
|
84
|
+
# same result short-circuit without spinning up a deliver_stored_message round-trip.
|
|
85
|
+
# The peek is NOT the dedupe primitive — the atomic INSERT OR IGNORE at injection is.
|
|
86
|
+
result_id_str = str(result.get("result_id") or "") or None
|
|
87
|
+
if result_id_str:
|
|
88
|
+
leader_uuid = _resolve_leader_session_uuid(workspace, primary.get("owner_team_id"))
|
|
89
|
+
if leader_uuid:
|
|
90
|
+
prior = peek_leader_notification(
|
|
91
|
+
store, result_id=result_id_str, leader_session_uuid=leader_uuid,
|
|
92
|
+
)
|
|
93
|
+
if prior:
|
|
94
|
+
notified.append(_mark_watcher_dedupe_skip(
|
|
95
|
+
store, event_log, primary, result, attempts,
|
|
96
|
+
prior["notified_message_id"],
|
|
97
|
+
dedupe_reason or "injection_log_already_notified",
|
|
98
|
+
notified_at=prior.get("notified_at"),
|
|
99
|
+
leader_session_uuid=leader_uuid,
|
|
100
|
+
))
|
|
101
|
+
return notified
|
|
102
|
+
# Legacy compat: watcher.notified_message_id set by a prior path (Gap 32 reversal of
|
|
103
|
+
# 78055bc, or any pre-Stage-12 code) also blocks redelivery. This preserves the
|
|
104
|
+
# Stage 11.9-11.12 era contract while the new gate (leader_notification_log) is the
|
|
105
|
+
# authoritative dedupe primitive going forward.
|
|
106
|
+
legacy_canonical = leader_notified_message_id_for_result(
|
|
107
|
+
store, primary.get("owner_team_id"), result_id_str,
|
|
108
|
+
)
|
|
109
|
+
if legacy_canonical:
|
|
110
|
+
notified.append(_mark_watcher_dedupe_skip(
|
|
111
|
+
store, event_log, primary, result, attempts,
|
|
112
|
+
legacy_canonical,
|
|
113
|
+
dedupe_reason or "rebind_retry",
|
|
114
|
+
))
|
|
115
|
+
return notified
|
|
70
116
|
existing = delivered_result_message(
|
|
71
|
-
store,
|
|
72
|
-
str(result.get("result_id") or ""),
|
|
117
|
+
store, str(result.get("result_id") or ""),
|
|
73
118
|
task_id=result.get("task_id"),
|
|
74
119
|
owner_team_id=primary.get("owner_team_id"),
|
|
75
120
|
)
|
|
@@ -83,6 +128,75 @@ def notify_result_watchers(
|
|
|
83
128
|
return notified
|
|
84
129
|
|
|
85
130
|
|
|
131
|
+
def _resolve_leader_session_uuid(workspace: Path, owner_team_id: str | None) -> str | None:
    """Look up the ``leader_session_uuid`` recorded for a team in runtime state.

    The workspace runtime state is loaded and, when ``owner_team_id`` has a
    dict entry under ``teams``, narrowed to that entry. The value stored at
    ``team_owner.leader_session_uuid`` is returned as a string.

    Returns ``None`` when no non-empty value is recorded, when the un-scoped
    state is keyed to a different team, or when anything raises along the way
    (this is a best-effort lookup).
    """
    try:
        from team_agent.messaging.deps import load_runtime_state, team_state_key

        runtime_state = load_runtime_state(workspace)
        # NOTE(review): branch nesting was reconstructed from a flattened
        # listing; confirm the elif pairs with the outer team check.
        if owner_team_id:
            teams = runtime_state.get("teams")
            if isinstance(teams, dict):
                team_state = teams.get(owner_team_id)
                if isinstance(team_state, dict):
                    runtime_state = team_state
            elif team_state_key(runtime_state) != owner_team_id:
                return None
        team_owner = runtime_state.get("team_owner") or {}
        return str(team_owner.get("leader_session_uuid") or "") or None
    except Exception:
        return None
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _infer_dedupe_reason(primary: dict[str, Any], store: MessageStore) -> str:
|
|
149
|
+
if primary.get("notified_message_id"):
|
|
150
|
+
return "rebind_retry"
|
|
151
|
+
return "watcher_duplicate"
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def _mark_watcher_dedupe_skip(
    store: MessageStore,
    event_log: EventLog,
    watcher: dict[str, Any],
    result: dict[str, Any],
    attempts: int,
    canonical_message_id: str,
    reason: str,
    *,
    notified_at: str | None = None,
    leader_session_uuid: str | None = None,
) -> dict[str, Any]:
    """Record that a watcher notification was suppressed as a duplicate.

    Marks the watcher row ``"notified"`` with ``canonical_message_id``, writes
    a ``leader_receiver.notification_dedupe_skip`` event, and returns a
    delivery-style summary flagged ``deduped``.

    The message id stored on the watcher is audit metadata; marking the row
    keeps retry scans from picking it up again.
    """
    watcher_id = watcher["watcher_id"]
    result_id = result.get("result_id")
    # Capture the previous id before the row is overwritten below.
    previous_message_id = watcher.get("notified_message_id")

    store.mark_result_watcher(
        watcher_id,
        "notified",
        result_id=result_id,
        notified_message_id=canonical_message_id,
    )

    event_fields = {
        "result_id": result_id,
        "original_message_id": previous_message_id,
        "suppressed_message_id": canonical_message_id,
        "reason": reason,
        "team_id": watcher.get("owner_team_id"),
        "watcher_id": watcher_id,
        "task_id": result.get("task_id"),
        "agent_id": result.get("agent_id"),
        "attempt": attempts + 1,
        "leader_session_uuid": leader_session_uuid,
        "prior_notified_at": notified_at,
    }
    event_log.write("leader_receiver.notification_dedupe_skip", **event_fields)

    summary: dict[str, Any] = {
        "watcher_id": watcher_id,
        "result_id": result_id,
        "ok": True,
        "message_id": canonical_message_id,
        "deduped": True,
        "dedupe_reason": reason,
    }
    return summary
|
|
198
|
+
|
|
199
|
+
|
|
86
200
|
def _dedupe_watchers_for_result(
|
|
87
201
|
watchers: list[dict[str, Any]],
|
|
88
202
|
) -> tuple[dict[str, Any], list[dict[str, Any]]]:
|
|
@@ -114,11 +228,19 @@ def _deliver_result_to_watcher(
|
|
|
114
228
|
return _mark_delivery_failed(store, event_log, watcher, result, attempts, str(exc))
|
|
115
229
|
status = "notified" if delivery.get("ok") else "notify_failed"
|
|
116
230
|
error = delivery.get("reason") or delivery.get("error")
|
|
231
|
+
# Stage 12: notified_message_id is now auditing metadata. The exactly-once contract
|
|
232
|
+
# lives in the leader_notification_log table consulted by _send_to_leader_receiver;
|
|
233
|
+
# whatever the gate suppresses comes back as ok=true deduped=true, and the watcher row
|
|
234
|
+
# records this as a successful notification with the canonical message_id.
|
|
235
|
+
persisted_message_id = (
|
|
236
|
+
delivery.get("canonical_message_id") if delivery.get("deduped")
|
|
237
|
+
else (delivery.get("message_id") if delivery.get("ok") else None)
|
|
238
|
+
)
|
|
117
239
|
store.mark_result_watcher(
|
|
118
240
|
watcher["watcher_id"],
|
|
119
241
|
status,
|
|
120
242
|
result_id=result.get("result_id"),
|
|
121
|
-
notified_message_id=
|
|
243
|
+
notified_message_id=persisted_message_id,
|
|
122
244
|
error=error,
|
|
123
245
|
)
|
|
124
246
|
event_log.write(
|
|
@@ -279,6 +401,99 @@ def watcher_matches_result(watcher: dict[str, Any], result: dict[str, Any]) -> b
|
|
|
279
401
|
return (not task_id or task_id == result.get("task_id")) and (not agent_id or agent_id == result.get("agent_id"))
|
|
280
402
|
|
|
281
403
|
|
|
404
|
+
def requeue_after_claim_leader(
    workspace: Path,
    store: MessageStore,
    event_log: EventLog,
    owner_team_id: str,
    claimed_pane_id: str,
    *,
    incident_ts: str | None = None,
) -> list[dict[str, Any]]:
    """Post-claim hook: requeue this team's not-yet-notified result watchers.

    Every watcher returned by ``store.result_watchers(owner_team_id=...)`` is
    a candidate unless:

    - it already has a ``notified_message_id``; or
    - ``incident_ts`` parses to a timestamp and the watcher's own timestamp
      (``completed_at``, falling back to ``created_at``) is strictly earlier.
      Watchers with no parseable timestamp are never filtered out this way.

    The watcher's current status is not otherwise considered. Each candidate
    is marked ``"notify_failed"`` and a ``leader_receiver.claim_requeue``
    event is written. ``claimed_pane_id`` is only recorded in the emitted
    events.

    When at least one watcher was requeued, ``retry_result_deliveries`` is
    invoked once. An exception from it is logged as
    ``leader_receiver.claim_requeue_delivery_failed`` rather than propagated.

    Returns one record per requeued watcher (``watcher_id``, ``result_id``,
    ``prior_state``); the list may be empty.
    """
    # Stage 11.12: no re-fetch/compare-and-swap happens here. Watchers are
    # flipped to notify_failed and the retry path is left to settle any race
    # with a concurrently scheduled retry.
    cutoff = _parse_iso(incident_ts)
    requeued: list[dict[str, Any]] = []

    for row in store.result_watchers(owner_team_id=owner_team_id):
        if row.get("notified_message_id"):
            continue
        if cutoff:
            activity = _parse_iso(row.get("completed_at")) or _parse_iso(row.get("created_at"))
            if activity and activity < cutoff:
                continue

        record = {
            "watcher_id": row["watcher_id"],
            "result_id": row.get("result_id"),
            "prior_state": str(row.get("status") or ""),
        }
        store.mark_result_watcher(
            record["watcher_id"],
            "notify_failed",
            result_id=record["result_id"],
        )
        event_log.write(
            "leader_receiver.claim_requeue",
            result_id=record["result_id"],
            watcher_id=record["watcher_id"],
            prior_state=record["prior_state"],
            requeued_at=datetime.now(timezone.utc).isoformat(),
            claimed_pane_id=claimed_pane_id,
            team_id=owner_team_id,
        )
        requeued.append(record)

    if not requeued:
        return requeued

    try:
        retry_result_deliveries(workspace, event_log)
    except Exception as exc:
        event_log.write(
            "leader_receiver.claim_requeue_delivery_failed",
            error=str(exc),
            watcher_ids=[entry["watcher_id"] for entry in requeued],
            team_id=owner_team_id,
            claimed_pane_id=claimed_pane_id,
        )
    return requeued
|
|
483
|
+
|
|
484
|
+
|
|
485
|
+
def _parse_iso(text: Any) -> datetime | None:
|
|
486
|
+
if not isinstance(text, str) or not text:
|
|
487
|
+
return None
|
|
488
|
+
try:
|
|
489
|
+
dt = datetime.fromisoformat(text.replace("Z", "+00:00"))
|
|
490
|
+
except ValueError:
|
|
491
|
+
return None
|
|
492
|
+
if dt.tzinfo is None:
|
|
493
|
+
dt = dt.replace(tzinfo=timezone.utc)
|
|
494
|
+
return dt
|
|
495
|
+
|
|
496
|
+
|
|
282
497
|
def format_result_watcher_notification(result: dict[str, Any]) -> str:
|
|
283
498
|
task_id = result.get("task_id") or "unknown task"
|
|
284
499
|
agent_id = result.get("agent_id") or "unknown agent"
|
|
@@ -359,36 +359,27 @@ def _refresh_leader_receiver_or_flag_rebind(
|
|
|
359
359
|
receiver = state.get("leader_receiver") or {}
|
|
360
360
|
if receiver.get("mode") != "direct_tmux":
|
|
361
361
|
return state
|
|
362
|
-
|
|
362
|
+
owner_identity = state.get("team_owner") or None
|
|
363
|
+
receiver_for_validation = dict(receiver)
|
|
364
|
+
if owner_identity and owner_identity.get("leader_session_uuid") and not receiver_for_validation.get("leader_session_uuid"):
|
|
365
|
+
receiver_for_validation["leader_session_uuid"] = owner_identity["leader_session_uuid"]
|
|
366
|
+
validation = _validate_leader_receiver(receiver_for_validation)
|
|
363
367
|
if validation.get("ok"):
|
|
364
368
|
return state
|
|
365
|
-
|
|
366
|
-
|
|
369
|
+
rediscovered = _rediscover_leader_receiver(
|
|
370
|
+
receiver_for_validation,
|
|
371
|
+
event_log,
|
|
372
|
+
owner_identity,
|
|
373
|
+
invalidation_reason=validation.get("reason"),
|
|
374
|
+
team_id=team_state_key(state),
|
|
375
|
+
)
|
|
367
376
|
if rediscovered.get("status") == "updated":
|
|
368
377
|
state["leader_receiver"] = rediscovered["receiver"]
|
|
369
378
|
if persist:
|
|
370
379
|
save_runtime_state(workspace, state)
|
|
371
380
|
else:
|
|
372
381
|
save_team_scoped_state(workspace, state)
|
|
373
|
-
event_log.write(
|
|
374
|
-
"leader_receiver.rebind_applied",
|
|
375
|
-
old_pane_id=receiver.get("pane_id"),
|
|
376
|
-
new_pane_id=rediscovered["receiver"].get("pane_id"),
|
|
377
|
-
reason=validation.get("reason"),
|
|
378
|
-
source="report_result_notify",
|
|
379
|
-
owner_identity=owner_identity,
|
|
380
|
-
)
|
|
381
382
|
return state
|
|
382
|
-
event_log.write(
|
|
383
|
-
"leader_receiver.rebind_required",
|
|
384
|
-
old_pane_id=receiver.get("pane_id"),
|
|
385
|
-
reason=validation.get("reason"),
|
|
386
|
-
validation_error=validation.get("error"),
|
|
387
|
-
rediscovery_status=rediscovered.get("status"),
|
|
388
|
-
provider=receiver.get("provider"),
|
|
389
|
-
source="report_result_notify",
|
|
390
|
-
owner_identity=owner_identity,
|
|
391
|
-
)
|
|
392
383
|
return state
|
|
393
384
|
|
|
394
385
|
|
|
@@ -409,8 +409,18 @@ def _recent_restart_or_reset_event(event_log: EventLog, agent_id: str, since: da
|
|
|
409
409
|
for event in reversed(event_log.tail(200)):
|
|
410
410
|
if event.get("event") not in _RESTART_RESET_EVENTS:
|
|
411
411
|
continue
|
|
412
|
-
if event.get("agent_id") != agent_id
|
|
413
|
-
|
|
412
|
+
if event.get("agent_id") != agent_id:
|
|
413
|
+
agents_field = event.get("agents") or []
|
|
414
|
+
agent_ids: set[str] = set()
|
|
415
|
+
for entry in agents_field:
|
|
416
|
+
if isinstance(entry, str):
|
|
417
|
+
agent_ids.add(entry)
|
|
418
|
+
elif isinstance(entry, dict):
|
|
419
|
+
aid = entry.get("agent_id")
|
|
420
|
+
if isinstance(aid, str):
|
|
421
|
+
agent_ids.add(aid)
|
|
422
|
+
if agent_id not in agent_ids:
|
|
423
|
+
continue
|
|
414
424
|
try:
|
|
415
425
|
ts = datetime.fromisoformat(str(event.get("ts")))
|
|
416
426
|
except ValueError:
|
|
@@ -67,6 +67,8 @@ from team_agent.display import (
|
|
|
67
67
|
from team_agent.leader import (
|
|
68
68
|
attach_leader,
|
|
69
69
|
attach_leader_to_state as _attach_leader_to_state,
|
|
70
|
+
claim_leader,
|
|
71
|
+
leader_identity,
|
|
70
72
|
leader_session_name as _leader_session_name,
|
|
71
73
|
leader_start_plan,
|
|
72
74
|
start_leader,
|
|
@@ -438,12 +440,10 @@ for _name in (
|
|
|
438
440
|
assert hasattr(_launch_pkg, _name), f"team_agent.launch missing {_name}"
|
|
439
441
|
del _launch_pkg, _name
|
|
440
442
|
|
|
441
|
-
# Leader lane re-exports keep runtime
|
|
442
|
-
# runtime.leader_start_plan, runtime._attach_leader_to_state,
|
|
443
|
-
# runtime._leader_session_name resolving for CLI handlers and tests.
|
|
443
|
+
# Leader lane re-exports keep runtime leader helpers resolving for CLI handlers and tests.
|
|
444
444
|
import team_agent.leader as _leader_pkg
|
|
445
445
|
assert attach_leader is _leader_pkg.attach_leader
|
|
446
|
-
for _name in ("attach_leader", "attach_leader_to_state", "leader_session_name", "leader_start_plan", "start_leader"):
|
|
446
|
+
for _name in ("attach_leader", "attach_leader_to_state", "claim_leader", "leader_identity", "leader_session_name", "leader_start_plan", "start_leader"):
|
|
447
447
|
assert hasattr(_leader_pkg, _name), f"team_agent.leader missing {_name}"
|
|
448
448
|
del _leader_pkg, _name
|
|
449
449
|
from team_agent.task_graph import ready_tasks, update_task_status
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import json
|
|
4
|
+
import platform
|
|
4
5
|
import re
|
|
5
6
|
import shutil
|
|
6
7
|
import subprocess
|
|
@@ -10,6 +11,18 @@ from typing import Any
|
|
|
10
11
|
from team_agent.paths import repo_root
|
|
11
12
|
|
|
12
13
|
|
|
14
|
+
_LEADER_ENV_KEYS = (
|
|
15
|
+
"TEAM_AGENT_LEADER_SESSION_UUID",
|
|
16
|
+
"TEAM_AGENT_LEADER_PANE_ID",
|
|
17
|
+
"TEAM_AGENT_LEADER_PROVIDER",
|
|
18
|
+
"TEAM_AGENT_MACHINE_FINGERPRINT",
|
|
19
|
+
"TEAM_AGENT_LEADER_SESSION_UUID_OVERRIDE",
|
|
20
|
+
)
|
|
21
|
+
_LEADER_SHAPED_COMMANDS = {"codex", "claude", "claude.exe", "node", "nodejs"}
|
|
22
|
+
_PANE_ENV_SCAN_TIMEOUT_SECONDS = 2.0
|
|
23
|
+
_run_subprocess = subprocess.run # test-injectable indirection
|
|
24
|
+
|
|
25
|
+
|
|
13
26
|
def core_binary() -> Path | None:
|
|
14
27
|
configured = shutil.which("team-agent-core")
|
|
15
28
|
if configured:
|
|
@@ -105,13 +118,13 @@ def list_targets() -> dict[str, Any]:
|
|
|
105
118
|
result = call_core("list-targets")
|
|
106
119
|
if result.get("ok"):
|
|
107
120
|
return result
|
|
108
|
-
proc =
|
|
121
|
+
proc = _run_subprocess(
|
|
109
122
|
[
|
|
110
123
|
"tmux",
|
|
111
124
|
"list-panes",
|
|
112
125
|
"-a",
|
|
113
126
|
"-F",
|
|
114
|
-
"#{pane_id}\t#{session_name}\t#{window_index}\t#{window_name}\t#{pane_index}\t#{pane_tty}\t#{pane_current_command}\t#{pane_active}",
|
|
127
|
+
"#{pane_id}\t#{session_name}\t#{window_index}\t#{window_name}\t#{pane_index}\t#{pane_tty}\t#{pane_current_command}\t#{pane_active}\t#{pane_pid}",
|
|
115
128
|
],
|
|
116
129
|
text=True,
|
|
117
130
|
capture_output=True,
|
|
@@ -123,7 +136,7 @@ def list_targets() -> dict[str, Any]:
|
|
|
123
136
|
targets = []
|
|
124
137
|
for line in proc.stdout.splitlines():
|
|
125
138
|
parts = line.split("\t")
|
|
126
|
-
if len(parts)
|
|
139
|
+
if len(parts) not in {8, 9}:
|
|
127
140
|
continue
|
|
128
141
|
target = {
|
|
129
142
|
"pane_id": parts[0],
|
|
@@ -135,11 +148,152 @@ def list_targets() -> dict[str, Any]:
|
|
|
135
148
|
"pane_current_command": parts[6],
|
|
136
149
|
"pane_active": parts[7] == "1",
|
|
137
150
|
}
|
|
151
|
+
pane_pid = parts[8].strip() if len(parts) == 9 else ""
|
|
152
|
+
if pane_pid:
|
|
153
|
+
target["pane_pid"] = pane_pid
|
|
138
154
|
target["fingerprint"] = f"{target['session_name']}|{target['window_index']}|{target['pane_index']}|{target['pane_tty']}"
|
|
155
|
+
_attach_leader_env(target)
|
|
139
156
|
targets.append(target)
|
|
140
157
|
return {"ok": True, "targets": targets, "engine": "python_fallback", "fallback_reason": result.get("error")}
|
|
141
158
|
|
|
142
159
|
|
|
160
|
+
def _attach_leader_env(target: dict[str, Any]) -> None:
    """Annotate a pane ``target`` with leader-related environment variables.

    Reads the environment of the pane's ``pane_pid`` and copies the variables
    named in ``_LEADER_ENV_KEYS`` into ``target["leader_env"]``. When the pane
    process itself does not carry ``TEAM_AGENT_LEADER_SESSION_UUID``, its
    leader-shaped descendants are scanned in turn, filling only keys that are
    still missing, until the UUID is found.

    ``target["leader_env"]`` is set to ``None`` when there is no pane pid or
    the pane's environment cannot be read. ``target["leader_session_uuid"]``
    is set only when a non-empty UUID was found. Mutates ``target`` in place.
    """
    uuid_key = "TEAM_AGENT_LEADER_SESSION_UUID"
    pid = str(target.get("pane_pid") or "").strip()
    pane_env = _read_process_env(pid) if pid else None
    if pane_env is None:
        target["leader_env"] = None
        return

    collected: dict[str, str] = {}
    for name in _LEADER_ENV_KEYS:
        if name in pane_env:
            collected[name] = pane_env[name]

    if uuid_key not in collected:
        for descendant_pid in _walk_leader_shaped_children(pid):
            descendant_env = _read_process_env(descendant_pid)
            if descendant_env is None:
                continue
            for name in _LEADER_ENV_KEYS:
                if name in descendant_env:
                    # Values already collected take precedence.
                    collected.setdefault(name, descendant_env[name])
            if uuid_key in collected:
                break

    target["leader_env"] = collected
    session_uuid = collected.get(uuid_key)
    if session_uuid:
        target["leader_session_uuid"] = session_uuid
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def _read_process_env(pid: str) -> dict[str, str] | None:
    """Read the environment of process ``pid`` using a platform-specific reader.

    On Linux the ``/proc`` reader is used; every other platform goes through
    the ``ps``-based reader. Returns whatever the chosen reader returns.
    """
    reader = _read_proc_environ if platform.system() == "Linux" else _read_ps_eww_env
    return reader(pid)
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def _read_proc_environ(pid: str) -> dict[str, str] | None:
|
|
193
|
+
path = Path(f"/proc/{pid}/environ")
|
|
194
|
+
try:
|
|
195
|
+
raw = path.read_bytes()
|
|
196
|
+
except (FileNotFoundError, PermissionError, OSError):
|
|
197
|
+
return None
|
|
198
|
+
env: dict[str, str] = {}
|
|
199
|
+
for token in raw.split(b"\x00"):
|
|
200
|
+
if not token or b"=" not in token:
|
|
201
|
+
continue
|
|
202
|
+
try:
|
|
203
|
+
text = token.decode("utf-8", errors="replace")
|
|
204
|
+
except Exception:
|
|
205
|
+
continue
|
|
206
|
+
key, _, value = text.partition("=")
|
|
207
|
+
env[key] = value
|
|
208
|
+
return env
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def _read_ps_eww_env(pid: str) -> dict[str, str] | None:
    """Read a process environment by running ``ps -E -ww -p <pid>``.

    Returns the variables parsed from the ``ps`` output, or ``None`` when the
    command cannot be run, times out, exits non-zero, or prints nothing.
    """
    command = ["ps", "-E", "-ww", "-p", str(pid)]
    try:
        completed = _run_subprocess(
            command,
            text=True,
            capture_output=True,
            timeout=_PANE_ENV_SCAN_TIMEOUT_SECONDS,
            check=False,
        )
    except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
        return None
    if completed.returncode == 0 and completed.stdout:
        return _parse_ps_eww_output(completed.stdout, pid)
    return None
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def _parse_ps_eww_output(text: str, pid: str) -> dict[str, str]:
|
|
228
|
+
env: dict[str, str] = {}
|
|
229
|
+
lines = text.splitlines()
|
|
230
|
+
if len(lines) < 2:
|
|
231
|
+
return env
|
|
232
|
+
target_row = None
|
|
233
|
+
for line in lines[1:]:
|
|
234
|
+
stripped = line.lstrip()
|
|
235
|
+
if stripped.split(" ", 1)[0] == str(pid):
|
|
236
|
+
target_row = stripped
|
|
237
|
+
break
|
|
238
|
+
if target_row is None:
|
|
239
|
+
# Spark MEDIUM #2 (da436a3): never fall back to lines[1] — that row may belong to
|
|
240
|
+
# an unrelated process and would leak its env (incl. another team's
|
|
241
|
+
# TEAM_AGENT_LEADER_SESSION_UUID) into this pane's leader_env, corrupting rediscovery.
|
|
242
|
+
return env
|
|
243
|
+
for token in target_row.split():
|
|
244
|
+
if "=" not in token:
|
|
245
|
+
continue
|
|
246
|
+
key, _, value = token.partition("=")
|
|
247
|
+
if not key or " " in key:
|
|
248
|
+
continue
|
|
249
|
+
if not (key[0].isalpha() or key[0] == "_"):
|
|
250
|
+
continue
|
|
251
|
+
if not all(ch.isalnum() or ch == "_" for ch in key):
|
|
252
|
+
continue
|
|
253
|
+
env[key] = value
|
|
254
|
+
return env
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def _walk_leader_shaped_children(parent_pid: str) -> list[str]:
    """List leader-shaped descendant pids of ``parent_pid`` from a ``ps`` snapshot.

    Runs ``ps`` once for a ``pid ppid comm`` table and hands it to
    ``_select_leader_shaped_descendants``.

    Returns:
        The selected pids, or an empty list when ``ps`` cannot be run, times
        out, exits non-zero, or prints nothing.
    """
    try:
        completed = _run_subprocess(
            # -A selects every process. Without it, procps `ps` on Linux only
            # lists processes attached to the invoker's own terminal, so the
            # children of another tmux pane would never appear.
            ["ps", "-A", "-o", "pid=,ppid=,comm="],
            text=True,
            capture_output=True,
            timeout=_PANE_ENV_SCAN_TIMEOUT_SECONDS,
            check=False,
        )
    except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
        return []
    if completed.returncode != 0 or not completed.stdout:
        return []
    return _select_leader_shaped_descendants(completed.stdout, parent_pid)
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def _select_leader_shaped_descendants(ps_output: str, parent_pid: str) -> list[str]:
    """Pick descendants of ``parent_pid`` whose command name is leader-shaped.

    ``ps_output`` is expected to hold ``pid ppid command`` rows; rows with
    fewer than three fields are ignored. The command is reduced to its final
    path component before being compared with ``_LEADER_SHAPED_COMMANDS``.

    Returns the matching pids in the order their rows appear in ``ps_output``.
    """
    table: list[tuple[str, str, str]] = []
    children_of: dict[str, list[str]] = {}
    for line in ps_output.splitlines():
        fields = line.split()
        if len(fields) < 3:
            continue
        pid, ppid = fields[0], fields[1]
        # A command may contain spaces; re-join it before taking the basename.
        table.append((pid, ppid, Path(" ".join(fields[2:])).name))
        children_of.setdefault(ppid, []).append(pid)

    # Collect everything reachable from parent_pid through parent->child links.
    reachable: set[str] = set()
    pending = [str(parent_pid)]
    while pending:
        current = pending.pop()
        for child in children_of.get(current, ()):
            if child not in reachable:
                reachable.add(child)
                pending.append(child)

    selected: list[str] = []
    for pid, _ppid, command in table:
        if pid in reachable and command in _LEADER_SHAPED_COMMANDS:
            selected.append(pid)
    return selected
|
|
295
|
+
|
|
296
|
+
|
|
143
297
|
def contains_inline_secret(value: str) -> bool:
|
|
144
298
|
return (
|
|
145
299
|
_contains_secret_assignment(value)
|