@team-agent/installer 0.2.10 → 0.2.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/team_agent/coordinator/__main__.py +37 -1
- package/src/team_agent/coordinator/lifecycle.py +51 -3
- package/src/team_agent/idle_predicate.py +26 -8
- package/src/team_agent/idle_takeover_wiring.py +3 -0
- package/src/team_agent/messaging/activity_detector.py +10 -2
- package/src/team_agent/messaging/delivery.py +31 -0
- package/src/team_agent/provider_cli/claude.py +46 -0
- package/src/team_agent/provider_state/__init__.py +5 -0
- package/src/team_agent/runtime.py +7 -3
- package/src/team_agent/sessions/capture.py +2 -1
- package/src/team_agent/state.py +97 -6
package/package.json
CHANGED
|
@@ -39,6 +39,8 @@ def main(argv: list[str] | None = None) -> None:
|
|
|
39
39
|
|
|
40
40
|
interval = args.tick_interval if args.tick_interval is not None else _tick_interval(workspace)
|
|
41
41
|
initial_ppid = os.getppid()
|
|
42
|
+
failure_count = 0
|
|
43
|
+
last_failure_signature: tuple[str, str] | None = None
|
|
42
44
|
while not STOP:
|
|
43
45
|
# Stage 14 (Gap 37b) — orphan self-detection. If our original parent (test harness,
|
|
44
46
|
# shell, or supervisor) died, our ppid is reparented to 1 (or to a launchd shim on
|
|
@@ -55,7 +57,41 @@ def main(argv: list[str] | None = None) -> None:
|
|
|
55
57
|
workspace=str(workspace),
|
|
56
58
|
)
|
|
57
59
|
break
|
|
58
|
-
|
|
60
|
+
try:
|
|
61
|
+
result = runtime.coordinator_tick(workspace)
|
|
62
|
+
except Exception as exc:
|
|
63
|
+
failure_count += 1
|
|
64
|
+
signature = (type(exc).__name__, str(exc)[:200])
|
|
65
|
+
sleep_sec = min(interval * (2 ** min(failure_count - 1, 5)), 60.0)
|
|
66
|
+
if signature != last_failure_signature:
|
|
67
|
+
last_failure_signature = signature
|
|
68
|
+
event_log.write(
|
|
69
|
+
"coordinator.tick_error",
|
|
70
|
+
error=str(exc),
|
|
71
|
+
exc_type=type(exc).__name__,
|
|
72
|
+
consecutive_failures=failure_count,
|
|
73
|
+
next_sleep_sec=sleep_sec,
|
|
74
|
+
)
|
|
75
|
+
elif failure_count == 1 or failure_count % 12 == 0 or sleep_sec in {40.0, 60.0}:
|
|
76
|
+
event_log.write(
|
|
77
|
+
"coordinator.tick_error",
|
|
78
|
+
error=str(exc),
|
|
79
|
+
exc_type=type(exc).__name__,
|
|
80
|
+
consecutive_failures=failure_count,
|
|
81
|
+
next_sleep_sec=sleep_sec,
|
|
82
|
+
)
|
|
83
|
+
else:
|
|
84
|
+
event_log.write(
|
|
85
|
+
"coordinator.tick_error.suppressed",
|
|
86
|
+
consecutive_failures=failure_count,
|
|
87
|
+
next_sleep_sec=sleep_sec,
|
|
88
|
+
)
|
|
89
|
+
time.sleep(sleep_sec)
|
|
90
|
+
continue
|
|
91
|
+
if failure_count:
|
|
92
|
+
event_log.write("coordinator.tick_recovered", consecutive_failures=failure_count)
|
|
93
|
+
failure_count = 0
|
|
94
|
+
last_failure_signature = None
|
|
59
95
|
if result.get("stop") or args.once:
|
|
60
96
|
break
|
|
61
97
|
time.sleep(interval)
|
|
@@ -288,14 +288,18 @@ def coordinator_tick(workspace: Path) -> dict[str, Any]:
|
|
|
288
288
|
# Gap 32: the take-over reminder is driven by file-fact turn-state via the
|
|
289
289
|
# idle_takeover predicate (the legacy screen-scrape obligation path is retired).
|
|
290
290
|
_coord_meta = state.setdefault("coordinator", {})
|
|
291
|
+
idle_nodes = build_idle_nodes(state)
|
|
292
|
+
_record_unknown_idle_nodes(state, idle_nodes, event_log)
|
|
291
293
|
idle_eval = evaluate_takeover_reminder(
|
|
292
|
-
|
|
294
|
+
idle_nodes,
|
|
293
295
|
monitor_state=_coord_meta.get("idle_takeover_monitor"),
|
|
294
296
|
now_monotonic=_time.monotonic(),
|
|
295
297
|
debounce_seconds=IDLE_DEBOUNCE_SECONDS,
|
|
298
|
+
event_sink=lambda name, fields: event_log.write(name, **fields),
|
|
296
299
|
)
|
|
297
300
|
_coord_meta["idle_takeover_monitor"] = idle_eval.get("monitor_state")
|
|
298
|
-
|
|
301
|
+
if idle_eval.get("should_ping"):
|
|
302
|
+
push_idle_reminder(workspace, state, event_log, idle_eval)
|
|
299
303
|
idle_alerts = (
|
|
300
304
|
[{"alert_type": "idle_takeover", "message": idle_eval.get("message"),
|
|
301
305
|
"reason": idle_eval.get("reason"), "interrupted": idle_eval.get("interrupted_nodes")}]
|
|
@@ -338,7 +342,25 @@ def coordinator_tick(workspace: Path) -> dict[str, Any]:
|
|
|
338
342
|
if drift:
|
|
339
343
|
drift_results.append(drift)
|
|
340
344
|
api_errors = detect_leader_api_errors(workspace, state, store, event_log)
|
|
341
|
-
|
|
345
|
+
try:
|
|
346
|
+
save_runtime_state(workspace, state)
|
|
347
|
+
except Exception as exc:
|
|
348
|
+
event_log.write("runtime.state.save_failed", phase="tick_end", error=str(exc), exc_type=type(exc).__name__)
|
|
349
|
+
return {
|
|
350
|
+
"ok": False,
|
|
351
|
+
"stop": False,
|
|
352
|
+
"reason": "persistence_degraded",
|
|
353
|
+
"persisted": False,
|
|
354
|
+
"error": str(exc),
|
|
355
|
+
"delivered": delivered,
|
|
356
|
+
"scheduled": fired,
|
|
357
|
+
"stuck": stuck,
|
|
358
|
+
"idle_alerts": idle_alerts,
|
|
359
|
+
"deadlock_alerts": deadlock_alerts,
|
|
360
|
+
"compaction": compaction_results,
|
|
361
|
+
"session_drift": drift_results,
|
|
362
|
+
"api_errors": api_errors,
|
|
363
|
+
}
|
|
342
364
|
results = _collect_results_and_notify_watchers(workspace, event_log)
|
|
343
365
|
# Stage 12: prune the dedupe log every tick — cheap O(n) delete bounded by 24h window.
|
|
344
366
|
from team_agent.message_store.leader_notification_log import prune_leader_notification_log
|
|
@@ -361,3 +383,29 @@ def coordinator_tick(workspace: Path) -> dict[str, Any]:
|
|
|
361
383
|
"api_errors": api_errors,
|
|
362
384
|
"results": results,
|
|
363
385
|
}
|
|
386
|
+
|
|
387
|
+
|
|
388
|
+
def _record_unknown_idle_nodes(state: dict[str, Any], nodes: list[dict[str, Any]], event_log: EventLog) -> None:
    """Count consecutive calls in which each node is reported as ``"unknown"``.

    Counters are kept in ``state["coordinator"]["unknown_ticks"]``, keyed by
    node id. A node's counter is incremented each time it appears in *nodes*
    with ``state == "unknown"`` and is dropped as soon as it does not, so the
    stored value is always the length of the current unbroken run.

    An ``idle_takeover.unknown_persistent`` event is written once the run
    reaches 60 calls and again on every later counter value divisible by 12.

    Args:
        state: Runtime state dict; mutated in place.
        nodes: Node descriptors; only ``node_id``, ``state``, ``provider``,
            ``auth_mode`` and ``rollout_path`` are read.
        event_log: Object exposing ``write(name, **fields)``.
    """
    report_after = 60  # first report once the run is this long
    report_every = 12  # later reports only on multiples of this

    coordinator = state.setdefault("coordinator", {})
    unknown_ticks = coordinator.get("unknown_ticks")
    if not isinstance(unknown_ticks, dict):
        # setdefault() would hand back a persisted None/list unchanged and the
        # lookups below would then raise on every call; start over instead.
        unknown_ticks = {}
        coordinator["unknown_ticks"] = unknown_ticks

    current_unknown: set[str] = set()
    for node in nodes:
        node_id = str(node.get("node_id") or "")
        if not node_id or node.get("state") != "unknown":
            continue
        current_unknown.add(node_id)
        try:
            previous = int(unknown_ticks.get(node_id) or 0)
        except (TypeError, ValueError):
            # A malformed stored counter must not break the caller; restart the run.
            previous = 0
        count = previous + 1
        unknown_ticks[node_id] = count
        if count >= report_after and count % report_every == 0:
            event_log.write(
                "idle_takeover.unknown_persistent",
                node_id=node_id,
                provider=node.get("provider"),
                auth_mode=node.get("auth_mode"),
                consecutive_ticks=count,
                rollout_path=node.get("rollout_path"),
            )

    # Any node that was not unknown this time has broken its run: forget it.
    for node_id in list(unknown_ticks):
        if node_id not in current_unknown:
            unknown_ticks.pop(node_id, None)
|
|
@@ -46,10 +46,10 @@ def evaluate_takeover_reminder(
|
|
|
46
46
|
if node_state not in _IDLE_STATES:
|
|
47
47
|
state["all_idle_since"] = None
|
|
48
48
|
state["pinged_for_episode"] = None
|
|
49
|
-
return _result(False, None, f"node_{node_state or 'unknown'}", _interrupted(nodes), state)
|
|
49
|
+
return _result(False, None, f"node_{node_state or 'unknown'}", _interrupted(nodes), state, event_sink=event_sink, node=node)
|
|
50
50
|
|
|
51
51
|
if not nodes:
|
|
52
|
-
return _result(False, None, "no_nodes", [], state)
|
|
52
|
+
return _result(False, None, "no_nodes", [], state, event_sink=event_sink)
|
|
53
53
|
|
|
54
54
|
if state.get("all_idle_since") is None:
|
|
55
55
|
state["all_idle_since"] = now_monotonic
|
|
@@ -58,18 +58,18 @@ def evaluate_takeover_reminder(
|
|
|
58
58
|
interrupted = _interrupted(nodes)
|
|
59
59
|
|
|
60
60
|
if not state.get(_ARM_KEY):
|
|
61
|
-
return _result(False, None, "not_armed_no_worker_turn", interrupted, state)
|
|
61
|
+
return _result(False, None, "not_armed_no_worker_turn", interrupted, state, event_sink=event_sink)
|
|
62
62
|
if state.get(_SUPPRESS_KEY):
|
|
63
|
-
return _result(False, None, "acknowledged", interrupted, state)
|
|
63
|
+
return _result(False, None, "acknowledged", interrupted, state, event_sink=event_sink)
|
|
64
64
|
if elapsed < debounce_seconds:
|
|
65
|
-
return _result(False, None, "debounce_active", interrupted, state)
|
|
65
|
+
return _result(False, None, "debounce_active", interrupted, state, event_sink=event_sink)
|
|
66
66
|
if state.get("pinged_for_episode") == state.get("all_idle_since"):
|
|
67
|
-
return _result(False, None, "already_pinged_this_episode", interrupted, state)
|
|
67
|
+
return _result(False, None, "already_pinged_this_episode", interrupted, state, event_sink=event_sink)
|
|
68
68
|
|
|
69
69
|
state["pinged_for_episode"] = state["all_idle_since"]
|
|
70
70
|
message = _neutral_message(len(nodes), elapsed, interrupted)
|
|
71
71
|
_emit(event_sink, "idle_takeover.ping", nodes=len(nodes), elapsed_seconds=int(elapsed), interrupted=[i["node_id"] for i in interrupted])
|
|
72
|
-
return _result(True, message, "all_idle_debounce_elapsed", interrupted, state)
|
|
72
|
+
return _result(True, message, "all_idle_debounce_elapsed", interrupted, state, event_sink=event_sink)
|
|
73
73
|
|
|
74
74
|
|
|
75
75
|
def record_turn_open_after_delivery(
|
|
@@ -174,7 +174,25 @@ def _neutral_message(node_count: int, elapsed: float, interrupted: list[dict[str
|
|
|
174
174
|
return base
|
|
175
175
|
|
|
176
176
|
|
|
177
|
-
def _result(
|
|
177
|
+
def _result(
|
|
178
|
+
should_ping: bool,
|
|
179
|
+
message: str | None,
|
|
180
|
+
reason: str,
|
|
181
|
+
annotations: list[dict[str, Any]],
|
|
182
|
+
state: dict[str, Any],
|
|
183
|
+
*,
|
|
184
|
+
event_sink: Any = None,
|
|
185
|
+
node: dict[str, Any] | None = None,
|
|
186
|
+
) -> dict[str, Any]:
|
|
187
|
+
if not should_ping and state.get("last_no_ping_reason") != reason:
|
|
188
|
+
state["last_no_ping_reason"] = reason
|
|
189
|
+
_emit(
|
|
190
|
+
event_sink,
|
|
191
|
+
"idle_takeover.no_ping",
|
|
192
|
+
reason=reason,
|
|
193
|
+
node_id=(node or {}).get("node_id"),
|
|
194
|
+
armed=bool(state.get(_ARM_KEY)),
|
|
195
|
+
)
|
|
178
196
|
return {
|
|
179
197
|
"should_ping": should_ping,
|
|
180
198
|
"message": message,
|
|
@@ -36,6 +36,9 @@ def build_idle_nodes(state: dict[str, Any]) -> list[dict[str, Any]]:
|
|
|
36
36
|
"state": classification.get("state"),
|
|
37
37
|
"turn_id": classification.get("turn_id"),
|
|
38
38
|
"annotations": classification.get("annotations"),
|
|
39
|
+
"provider": provider,
|
|
40
|
+
"auth_mode": agent_state.get("auth_mode"),
|
|
41
|
+
"rollout_path": agent_state.get("rollout_path"),
|
|
39
42
|
})
|
|
40
43
|
leader_node = _leader_node(state)
|
|
41
44
|
if leader_node is not None:
|
|
@@ -170,7 +170,11 @@ def detect_compaction_degradation(
|
|
|
170
170
|
team_counts = state.setdefault("coordinator", {}).setdefault("compaction_counts", {}).setdefault(owner_team_id, {})
|
|
171
171
|
current = max(int(team_counts.get(agent_id) or 0), count)
|
|
172
172
|
team_counts[agent_id] = current
|
|
173
|
-
|
|
173
|
+
try:
|
|
174
|
+
save_runtime_state(workspace, state)
|
|
175
|
+
except Exception as exc:
|
|
176
|
+
event_log.write("runtime.state.save_failed", phase="compaction_detect", error=str(exc), exc_type=type(exc).__name__)
|
|
177
|
+
return {"ok": False, "event": "compaction_threshold_crossed.unpersisted", "agent_id": agent_id, "compaction_count": current}
|
|
174
178
|
if current <= 0:
|
|
175
179
|
return {"ok": True, "event": "compaction_threshold_crossed.none", "compaction_count": current}
|
|
176
180
|
event_log.write(
|
|
@@ -206,7 +210,11 @@ def _reset_or_recommend(
|
|
|
206
210
|
if reset.get("ok"):
|
|
207
211
|
team_counts = state.setdefault("coordinator", {}).setdefault("compaction_counts", {}).setdefault(owner_team_id, {})
|
|
208
212
|
team_counts[agent_id] = 0
|
|
209
|
-
|
|
213
|
+
try:
|
|
214
|
+
save_runtime_state(workspace, state)
|
|
215
|
+
except Exception as exc:
|
|
216
|
+
event_log.write("runtime.state.save_failed", phase="compaction_detect", error=str(exc), exc_type=type(exc).__name__)
|
|
217
|
+
return {"ok": False, "event": "compaction_threshold_crossed.unpersisted", "agent_id": agent_id, "compaction_count": compaction_count}
|
|
210
218
|
event = "compaction_threshold_crossed.auto_reset"
|
|
211
219
|
event_log.write(event, agent_id=agent_id, provider=provider, team=owner_team_id, compaction_count=compaction_count, threshold=threshold)
|
|
212
220
|
return {"ok": True, "event": event, "agent_id": agent_id, "compaction_count": compaction_count, "threshold": threshold, "reset": reset}
|
|
@@ -9,10 +9,12 @@ from team_agent.messaging.deps import (
|
|
|
9
9
|
_tmux_window_exists,
|
|
10
10
|
core_render_message,
|
|
11
11
|
)
|
|
12
|
+
from team_agent.idle_predicate import record_turn_open_after_delivery
|
|
12
13
|
|
|
13
14
|
from datetime import datetime, timedelta, timezone
|
|
14
15
|
from pathlib import Path
|
|
15
16
|
from typing import Any
|
|
17
|
+
import time
|
|
16
18
|
|
|
17
19
|
|
|
18
20
|
def _tmux_pane_width(target: str) -> dict[str, Any]:
|
|
@@ -163,6 +165,7 @@ def _deliver_pending_message(
|
|
|
163
165
|
store.mark(message_id, "submitted")
|
|
164
166
|
send_event_log = EventLog(workspace)
|
|
165
167
|
_stamp_first_send_at_if_leader_to_worker(state, row, send_event_log)
|
|
168
|
+
_record_turn_open_if_leader_to_worker(state, row, send_event_log)
|
|
166
169
|
send_event_log.write(
|
|
167
170
|
"send.submitted",
|
|
168
171
|
message_id=message_id,
|
|
@@ -424,6 +427,34 @@ def _stamp_first_send_at_if_leader_to_worker(
|
|
|
424
427
|
)
|
|
425
428
|
|
|
426
429
|
|
|
430
|
+
def _record_turn_open_if_leader_to_worker(
    state: dict[str, Any],
    row: dict[str, Any],
    event_log: EventLog,
) -> None:
    """Update the idle-takeover monitor when the leader sends to a known agent.

    Does nothing unless *row* has a recipient, its sender is ``"leader"``,
    ``"Leader"`` or the id stored under ``state["leader"]["id"]``, and the
    recipient has a dict entry in ``state["agents"]``. Otherwise the value
    returned by ``record_turn_open_after_delivery`` replaces
    ``state["coordinator"]["idle_takeover_monitor"]``.

    Args:
        state: Runtime state dict; mutated in place.
        row: Message row; ``sender``, ``recipient``, ``message_id`` and
            ``task_id`` are read.
        event_log: Receives the events emitted by the monitor update.
    """
    origin = str(row.get("sender") or "")
    target = str(row.get("recipient") or "")
    if not target:
        return

    configured_leader = str((state.get("leader") or {}).get("id") or "leader")
    if origin not in {"leader", "Leader", configured_leader}:
        return

    roster = state.get("agents")
    if not (isinstance(roster, dict) and isinstance(roster.get(target), dict)):
        return

    def _forward(name, fields):
        # Adapt the (name, fields) sink signature to EventLog.write(name, **fields).
        return event_log.write(name, **fields)

    coordinator = state.setdefault("coordinator", {})
    message_id = str(row.get("message_id") or "")
    task_id = str(row.get("task_id") or "")
    # The task id is preferred as the turn id, then the message id, else None.
    turn_id = task_id or message_id or None
    updated_monitor = record_turn_open_after_delivery(
        coordinator.get("idle_takeover_monitor"),
        node_id=target,
        turn_id=turn_id,
        delivered_message_id=message_id or None,
        now_monotonic=time.monotonic(),
        event_sink=_forward,
    )
    coordinator["idle_takeover_monitor"] = updated_monitor
|
|
456
|
+
|
|
457
|
+
|
|
427
458
|
def _wait_for_trust_prompt_dismissal(target: str, *, timeout: float = 3.0, poll_interval: float = 0.1) -> bool:
|
|
428
459
|
"""Spark MEDIUM #4: bounded poll for trust prompt dismissal. Returns True once
|
|
429
460
|
the pane no longer matches detect_non_input_scrollback, False if the prompt
|
|
@@ -104,6 +104,10 @@ class ClaudeCodeAdapter(ProviderAdapter):
|
|
|
104
104
|
"attribution_confidence": match["confidence"],
|
|
105
105
|
"spawn_cwd": str(cwd),
|
|
106
106
|
}
|
|
107
|
+
if spawn_context.get("auth_mode") == "compatible_api":
|
|
108
|
+
fallback = find_compatible_api_claude_transcript_fallback(root, Path(str(cwd)), start, agent_id)
|
|
109
|
+
if fallback:
|
|
110
|
+
return fallback
|
|
107
111
|
if time.monotonic() >= deadline:
|
|
108
112
|
return None
|
|
109
113
|
time.sleep(0.2)
|
|
@@ -327,6 +331,48 @@ def find_claude_transcript(
|
|
|
327
331
|
return candidates[0]
|
|
328
332
|
|
|
329
333
|
|
|
334
|
+
def find_compatible_api_claude_transcript_fallback(
    root: Path,
    cwd: Path,
    spawn_time: datetime,
    agent_id: str,
) -> dict[str, Any] | None:
    """Pick a transcript file for *cwd* purely from file modification times.

    Scans ``*.jsonl`` files in the directories returned by
    ``claude_project_dirs(root, cwd)``, keeps the five most recently modified,
    and returns the newest non-empty one whose mtime lies between
    ``spawn_time - 5s`` and the current time.

    Args:
        root: Projects root; when it does not exist the result is ``None``.
        cwd: Working directory the session was spawned in.
        spawn_time: Spawn timestamp. It is compared against timezone-aware UTC
            values, so it is presumably timezone-aware as well (TODO confirm
            against the caller).
        agent_id: Accepted for signature parity; not used by this heuristic.

    Returns:
        A capture dict with ``session_id`` set to ``None``, ``captured_via``
        ``"fs_mtime_fallback"`` and ``attribution_confidence`` ``"low"``, or
        ``None`` when no file qualifies.
    """
    del agent_id  # nothing in a bare mtime scan can be attributed to an agent
    if not root.exists():
        return None

    # Stat every candidate exactly once. A file that disappears between the
    # directory scan and the stat is skipped on its own instead of aborting
    # the whole lookup.
    stamped: list[tuple[float, int, Path]] = []
    for directory in claude_project_dirs(root, cwd):
        try:
            for path in directory.glob("*.jsonl"):
                if not path.is_file():
                    continue
                try:
                    info = path.stat()
                except OSError:
                    continue
                stamped.append((info.st_mtime, info.st_size, path))
        except OSError:
            # Unreadable directory: keep what was gathered so far and move on.
            continue

    stamped.sort(key=lambda entry: entry[0], reverse=True)

    lower_bound = spawn_time - timedelta(seconds=5)
    # Taken after the stats so a transcript written during this call is not
    # rejected as "modified in the future".
    upper_bound = datetime.now(timezone.utc)

    for mtime, size, path in stamped[:5]:
        if size <= 0:
            continue
        timestamp = datetime.fromtimestamp(mtime, timezone.utc)
        if timestamp < lower_bound or timestamp > upper_bound:
            continue
        return {
            "session_id": None,
            "rollout_path": str(path),
            "captured_at": datetime.now(timezone.utc).isoformat(),
            "captured_via": "fs_mtime_fallback",
            "attribution_confidence": "low",
            "spawn_cwd": str(cwd),
        }
    return None
|
|
374
|
+
|
|
375
|
+
|
|
330
376
|
def claude_project_dirs(root: Path, cwd: Path) -> list[Path]:
|
|
331
377
|
return [directory for directory in _unique_paths([claude_project_dir(root, cwd), claude_legacy_project_dir(root, cwd)]) if directory.exists()]
|
|
332
378
|
|
|
@@ -63,6 +63,7 @@ def read_fault_facts(provider: str, records: list[dict[str, Any]]) -> list[dict[
|
|
|
63
63
|
|
|
64
64
|
|
|
65
65
|
def _reader_for(provider: str, registry: Any = None) -> Any:
|
|
66
|
+
provider = _reader_provider(provider)
|
|
66
67
|
if provider in _READER_CACHE:
|
|
67
68
|
return _READER_CACHE[provider]
|
|
68
69
|
entry = None
|
|
@@ -83,4 +84,8 @@ def _reader_for(provider: str, registry: Any = None) -> Any:
|
|
|
83
84
|
return module
|
|
84
85
|
|
|
85
86
|
|
|
87
|
+
def _reader_provider(provider: str) -> str:
    """Return the provider name under which the reader is looked up.

    ``"claude_code"`` is rewritten to ``"claude"``; any other value is
    returned unchanged.
    """
    if provider == "claude_code":
        return "claude"
    return provider
|
|
89
|
+
|
|
90
|
+
|
|
86
91
|
__all__ = ["read_turn_state", "read_fault_facts", "get_provider_registry"]
|
|
@@ -950,17 +950,20 @@ def _runtime_lock(workspace: Path, name: str, timeout: float = 5.0):
|
|
|
950
950
|
lock_path = runtime_dir(workspace) / f"{name}.lock"
|
|
951
951
|
lock_path.parent.mkdir(parents=True, exist_ok=True)
|
|
952
952
|
event_log = EventLog(workspace)
|
|
953
|
+
log_lock_events = name != "state-save"
|
|
953
954
|
start = time.monotonic()
|
|
954
955
|
with lock_path.open("w", encoding="utf-8") as lock_file:
|
|
955
956
|
while True:
|
|
956
957
|
try:
|
|
957
958
|
fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
|
|
958
959
|
waited = time.monotonic() - start
|
|
959
|
-
|
|
960
|
+
if log_lock_events:
|
|
961
|
+
event_log.write("runtime.lock_acquired", lock=name, waited_sec=round(waited, 3))
|
|
960
962
|
break
|
|
961
963
|
except BlockingIOError:
|
|
962
964
|
if time.monotonic() - start >= timeout:
|
|
963
|
-
|
|
965
|
+
if log_lock_events:
|
|
966
|
+
event_log.write("runtime.lock_busy", lock=name, timeout_sec=timeout)
|
|
964
967
|
raise RuntimeError(
|
|
965
968
|
f"{name} is locked by another team-agent process; serialize team-agent {name} calls and retry"
|
|
966
969
|
)
|
|
@@ -969,7 +972,8 @@ def _runtime_lock(workspace: Path, name: str, timeout: float = 5.0):
|
|
|
969
972
|
yield
|
|
970
973
|
finally:
|
|
971
974
|
fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN)
|
|
972
|
-
|
|
975
|
+
if log_lock_events:
|
|
976
|
+
event_log.write("runtime.lock_released", lock=name)
|
|
973
977
|
|
|
974
978
|
|
|
975
979
|
def _leader_id(state: dict[str, Any], spec: dict[str, Any]) -> str:
|
|
@@ -82,6 +82,7 @@ def capture_agent_session(
|
|
|
82
82
|
"predetermined_session_id": agent_state.get("_pending_session_id"),
|
|
83
83
|
"exclude_session_ids": sorted(exclude_session_ids or set()),
|
|
84
84
|
"claude_projects_root": agent_state.get("claude_projects_root"),
|
|
85
|
+
"auth_mode": agent_state.get("auth_mode"),
|
|
85
86
|
}
|
|
86
87
|
deadline = time.monotonic() + max(timeout_s, 0.0)
|
|
87
88
|
while True:
|
|
@@ -89,7 +90,7 @@ def capture_agent_session(
|
|
|
89
90
|
# outer loop owns the polling budget so behaviour stays consistent
|
|
90
91
|
# whether or not the adapter has its own internal sleep.
|
|
91
92
|
result = adapter.capture_session_id(agent_id, spawn_context, timeout_s=0)
|
|
92
|
-
if isinstance(result, dict) and result.get("session_id"):
|
|
93
|
+
if isinstance(result, dict) and (result.get("session_id") or result.get("rollout_path")):
|
|
93
94
|
copy_session_metadata(agent_state, result)
|
|
94
95
|
agent_state.pop("_pending_session_id", None)
|
|
95
96
|
event_log.write(
|
package/src/team_agent/state.py
CHANGED
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import hashlib
|
|
4
|
+
import errno
|
|
4
5
|
import json
|
|
5
6
|
import os
|
|
6
7
|
import copy
|
|
7
8
|
import subprocess
|
|
9
|
+
import time
|
|
8
10
|
import uuid
|
|
9
11
|
from datetime import datetime, timezone
|
|
10
12
|
from pathlib import Path
|
|
@@ -488,16 +490,105 @@ def validate_leader_uuid_from_targets(receiver: dict[str, Any], targets: dict[st
|
|
|
488
490
|
|
|
489
491
|
|
|
490
492
|
def save_runtime_state(workspace: Path, state: dict[str, Any]) -> None:
    """Write *state* to the workspace runtime-state file unless nothing changed.

    The write is skipped when *state* equals the cached copy of the last saved
    state (checked before and after identity migration) or equals the
    normalized, migrated contents already on disk. Otherwise the JSON payload
    is written to a temporary sibling file and moved over the target with
    ``os.replace`` while holding the ``"state-save"`` runtime lock. A
    retryable replace failure is retried after 0.05s, 0.2s and 0.5s; if the
    fourth attempt also fails, ``_self_heal_runtime_state`` takes over.

    Args:
        workspace: Workspace whose runtime state is being saved.
        state: State dict; passed to ``_migrate_state_identity`` before it is
            compared and serialized.

    Raises:
        OSError: A write/replace failure that is not classified as retryable.
        Exception: Whatever acquiring the lock or the self-heal step raises.
    """
    path = runtime_state_path(workspace)
    cache_key = str(path)

    # Cheapest exit: identical to the snapshot taken at the last save.
    snapshot = _RUNTIME_STATE_CACHE.get(cache_key)
    if snapshot is not None and state == snapshot:
        return

    # Migration presumably rewrites ``state`` in place (its return value is
    # unused), so the cached snapshot is compared a second time afterwards.
    _migrate_state_identity(state, workspace)
    snapshot = _RUNTIME_STATE_CACHE.get(cache_key)
    if snapshot is not None and state == snapshot:
        return

    if path.exists():
        try:
            on_disk = json.loads(path.read_text(encoding="utf-8"))
            normalize_agent_session_state(on_disk)
            _migrate_state_identity(on_disk, workspace)
            if state == on_disk:
                _RUNTIME_STATE_CACHE[cache_key] = copy.deepcopy(state)
                return
        except Exception:
            # Best effort only: if the existing file cannot be read or
            # compared, fall through and rewrite it.
            pass

    from team_agent.runtime import _runtime_lock

    with _runtime_lock(workspace, "state-save", timeout=2.0):
        path.parent.mkdir(parents=True, exist_ok=True)
        payload = json.dumps(state, indent=2, ensure_ascii=False)
        delays = [0.05, 0.2, 0.5]
        final_attempt = len(delays)
        for attempt in range(final_attempt + 1):
            # Fresh, unique temp name per attempt (pid + random hex).
            tmp_path = path.with_name(f"{path.name}.{os.getpid()}.{uuid.uuid4().hex}.tmp")
            try:
                tmp_path.write_text(payload, encoding="utf-8")
                os.replace(tmp_path, path)
                _RUNTIME_STATE_CACHE[cache_key] = copy.deepcopy(state)
                return
            except OSError as exc:  # PermissionError is an OSError subclass
                if not _retryable_replace_error(exc):
                    raise
                if attempt >= final_attempt:
                    # Retries exhausted: hand over to the self-heal path.
                    _self_heal_runtime_state(workspace, path, payload, state, attempt + 1, exc)
                    return
                from team_agent.events import EventLog

                EventLog(workspace).write(
                    "runtime.state.save_retry",
                    attempt=attempt + 1,
                    errno=getattr(exc, "errno", None),
                    errno_name=errno.errorcode.get(getattr(exc, "errno", 0), None),
                    error=str(exc),
                )
                time.sleep(delays[attempt])
            finally:
                # After a successful replace the temp name no longer exists;
                # after a failure this removes the leftover.
                tmp_path.unlink(missing_ok=True)
|
|
540
|
+
|
|
541
|
+
|
|
542
|
+
def _retryable_replace_error(exc: BaseException) -> bool:
    """Tell whether a failed write/replace is worth retrying.

    True for any ``PermissionError`` and for an ``OSError`` whose ``errno`` is
    ``EACCES``, ``EPERM`` or ``EBUSY``; False for everything else.
    """
    if isinstance(exc, PermissionError):
        return True
    if not isinstance(exc, OSError):
        return False
    return getattr(exc, "errno", None) in {errno.EACCES, errno.EPERM, errno.EBUSY}
|
|
546
|
+
|
|
547
|
+
|
|
548
|
+
def _self_heal_runtime_state(
    workspace: Path,
    path: Path,
    payload: str,
    state: dict[str, Any],
    attempts_used: int,
    original_exc: BaseException,
) -> None:
    """Replace the runtime-state file by moving the old one aside first.

    The payload is written to a temporary sibling, the existing file (if any)
    is renamed to ``<name>.bak.<pid>``, and the temporary file is renamed into
    place. If any of those steps fails, the backup is moved back when one was
    made, a ``runtime.state.save_failed`` event is written and the exception
    is re-raised. On success the state cache is refreshed and a
    ``runtime.state.self_healed`` event is written.

    Args:
        workspace: Workspace used to open the event log.
        path: Target runtime-state file.
        payload: Serialized state to write.
        state: State dict cached (deep-copied) after a successful write.
        attempts_used: Number of ordinary attempts made before this call.
        original_exc: The exception that exhausted the ordinary attempts; its
            ``errno`` is reported when the new failure has none.

    Raises:
        Exception: Re-raises whatever the write/rename steps raised.
    """
    from team_agent.events import EventLog

    event_log = EventLog(workspace)
    heal_tmp = path.with_name(f"{path.name}.{os.getpid()}.{uuid.uuid4().hex}.heal.tmp")
    backup = path.with_name(f"{path.name}.bak.{os.getpid()}")
    backup_created = False
    try:
        heal_tmp.write_text(payload, encoding="utf-8")
        try:
            os.replace(path, backup)
            backup_created = True
        except FileNotFoundError:
            # Nothing to move aside; there is also nothing to roll back to.
            backup_created = False
        os.replace(heal_tmp, path)
    except Exception as exc:
        if backup_created:
            try:
                os.replace(backup, path)
            except Exception as restore_exc:
                event_log.write("runtime.state.self_heal_restore_failed", error=str(restore_exc))
        event_log.write(
            "runtime.state.save_failed",
            phase="save_runtime_state",
            final_errno=getattr(exc, "errno", getattr(original_exc, "errno", None)),
            error=str(exc),
            retries_used=max(0, attempts_used - 1),
        )
        raise
    finally:
        heal_tmp.unlink(missing_ok=True)

    # Bookkeeping happens only after the new file is in place and outside the
    # rollback scope: a failure here must not move the stale backup back over
    # a state file that was written successfully.
    # NOTE(review): the backup file is left on disk after a successful heal —
    # confirm whether that is intended.
    _RUNTIME_STATE_CACHE[str(path)] = copy.deepcopy(state)
    event_log.write(
        "runtime.state.self_healed",
        inode_rebuilt=True,
        attempts_used=attempts_used,
        replace_retries=max(0, attempts_used - 1),
    )
|
|
501
592
|
|
|
502
593
|
|
|
503
594
|
def save_team_scoped_state(workspace: Path, team_state: dict[str, Any]) -> None:
|