@team-agent/installer 0.2.9 → 0.2.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/team_agent/approvals/status.py +12 -5
- package/src/team_agent/coordinator/__main__.py +37 -1
- package/src/team_agent/coordinator/lifecycle.py +51 -3
- package/src/team_agent/diagnose/quick_start.py +91 -0
- package/src/team_agent/display/worker_window.py +1 -1
- package/src/team_agent/idle_predicate.py +26 -8
- package/src/team_agent/idle_takeover_wiring.py +3 -0
- package/src/team_agent/lifecycle/operations.py +13 -1
- package/src/team_agent/messaging/activity_detector.py +10 -2
- package/src/team_agent/messaging/delivery.py +31 -0
- package/src/team_agent/messaging/leader_panes.py +27 -35
- package/src/team_agent/messaging/tmux_prompt.py +22 -0
- package/src/team_agent/provider_cli/claude.py +46 -0
- package/src/team_agent/provider_state/__init__.py +5 -0
- package/src/team_agent/runtime.py +7 -3
- package/src/team_agent/sessions/capture.py +2 -1
- package/src/team_agent/state.py +97 -6
package/package.json
CHANGED
|
@@ -28,9 +28,12 @@ def refresh_agent_runtime_statuses(workspace: Path, state: dict[str, Any], event
|
|
|
28
28
|
if session_name:
|
|
29
29
|
agent_state["status"] = "missing"
|
|
30
30
|
else:
|
|
31
|
-
|
|
31
|
+
status_capture = detect_provider_status(agent_state["provider"], session_name, window, include_capture=True)
|
|
32
|
+
detected, capture_tail = status_capture if isinstance(status_capture, tuple) else (status_capture, "")
|
|
32
33
|
if detected:
|
|
33
34
|
agent_state["status"] = detected
|
|
35
|
+
if detected == "awaiting_trust_prompt":
|
|
36
|
+
agent_state["pane_capture_tail"] = capture_tail
|
|
34
37
|
else:
|
|
35
38
|
agent_state.setdefault("status", "running")
|
|
36
39
|
if old_status != agent_state.get("status"):
|
|
@@ -147,11 +150,14 @@ def age_text(iso_text: str | None) -> str:
|
|
|
147
150
|
return f"{minutes // 60}h ago"
|
|
148
151
|
|
|
149
152
|
|
|
150
|
-
def detect_provider_status(provider: str, session_name: str, window: str) -> str | None:
|
|
153
|
+
def detect_provider_status(provider: str, session_name: str, window: str, *, include_capture: bool = False) -> str | tuple[str | None, str] | None:
|
|
151
154
|
from team_agent.runtime import get_adapter, run_cmd
|
|
155
|
+
from team_agent.messaging.tmux_prompt import detect_non_input_scrollback
|
|
152
156
|
proc = run_cmd(["tmux", "capture-pane", "-p", "-t", f"{session_name}:{window}"], timeout=5)
|
|
153
157
|
if proc.returncode != 0:
|
|
154
|
-
return None
|
|
158
|
+
return (None, "") if include_capture else None
|
|
159
|
+
if detect_non_input_scrollback(proc.stdout) == "codex_trust_prompt":
|
|
160
|
+
return ("awaiting_trust_prompt", proc.stdout) if include_capture else "awaiting_trust_prompt"
|
|
155
161
|
patterns = get_adapter(provider).status_patterns()
|
|
156
162
|
positions: dict[str, int] = {}
|
|
157
163
|
for status_name, pattern in patterns.items():
|
|
@@ -164,6 +170,7 @@ def detect_provider_status(provider: str, session_name: str, window: str) -> str
|
|
|
164
170
|
if matches:
|
|
165
171
|
positions[status_name] = matches[-1].start()
|
|
166
172
|
if not positions:
|
|
167
|
-
return None
|
|
173
|
+
return (None, proc.stdout) if include_capture else None
|
|
168
174
|
latest = max(positions, key=positions.get)
|
|
169
|
-
|
|
175
|
+
detected = {"idle": "running", "processing": "busy", "error": "error"}.get(latest)
|
|
176
|
+
return (detected, proc.stdout) if include_capture else detected
|
|
@@ -39,6 +39,8 @@ def main(argv: list[str] | None = None) -> None:
|
|
|
39
39
|
|
|
40
40
|
interval = args.tick_interval if args.tick_interval is not None else _tick_interval(workspace)
|
|
41
41
|
initial_ppid = os.getppid()
|
|
42
|
+
failure_count = 0
|
|
43
|
+
last_failure_signature: tuple[str, str] | None = None
|
|
42
44
|
while not STOP:
|
|
43
45
|
# Stage 14 (Gap 37b) — orphan self-detection. If our original parent (test harness,
|
|
44
46
|
# shell, or supervisor) died, our ppid is reparented to 1 (or to a launchd shim on
|
|
@@ -55,7 +57,41 @@ def main(argv: list[str] | None = None) -> None:
|
|
|
55
57
|
workspace=str(workspace),
|
|
56
58
|
)
|
|
57
59
|
break
|
|
58
|
-
|
|
60
|
+
try:
|
|
61
|
+
result = runtime.coordinator_tick(workspace)
|
|
62
|
+
except Exception as exc:
|
|
63
|
+
failure_count += 1
|
|
64
|
+
signature = (type(exc).__name__, str(exc)[:200])
|
|
65
|
+
sleep_sec = min(interval * (2 ** min(failure_count - 1, 5)), 60.0)
|
|
66
|
+
if signature != last_failure_signature:
|
|
67
|
+
last_failure_signature = signature
|
|
68
|
+
event_log.write(
|
|
69
|
+
"coordinator.tick_error",
|
|
70
|
+
error=str(exc),
|
|
71
|
+
exc_type=type(exc).__name__,
|
|
72
|
+
consecutive_failures=failure_count,
|
|
73
|
+
next_sleep_sec=sleep_sec,
|
|
74
|
+
)
|
|
75
|
+
elif failure_count == 1 or failure_count % 12 == 0 or sleep_sec in {40.0, 60.0}:
|
|
76
|
+
event_log.write(
|
|
77
|
+
"coordinator.tick_error",
|
|
78
|
+
error=str(exc),
|
|
79
|
+
exc_type=type(exc).__name__,
|
|
80
|
+
consecutive_failures=failure_count,
|
|
81
|
+
next_sleep_sec=sleep_sec,
|
|
82
|
+
)
|
|
83
|
+
else:
|
|
84
|
+
event_log.write(
|
|
85
|
+
"coordinator.tick_error.suppressed",
|
|
86
|
+
consecutive_failures=failure_count,
|
|
87
|
+
next_sleep_sec=sleep_sec,
|
|
88
|
+
)
|
|
89
|
+
time.sleep(sleep_sec)
|
|
90
|
+
continue
|
|
91
|
+
if failure_count:
|
|
92
|
+
event_log.write("coordinator.tick_recovered", consecutive_failures=failure_count)
|
|
93
|
+
failure_count = 0
|
|
94
|
+
last_failure_signature = None
|
|
59
95
|
if result.get("stop") or args.once:
|
|
60
96
|
break
|
|
61
97
|
time.sleep(interval)
|
|
@@ -288,14 +288,18 @@ def coordinator_tick(workspace: Path) -> dict[str, Any]:
|
|
|
288
288
|
# Gap 32: the take-over reminder is driven by file-fact turn-state via the
|
|
289
289
|
# idle_takeover predicate (the legacy screen-scrape obligation path is retired).
|
|
290
290
|
_coord_meta = state.setdefault("coordinator", {})
|
|
291
|
+
idle_nodes = build_idle_nodes(state)
|
|
292
|
+
_record_unknown_idle_nodes(state, idle_nodes, event_log)
|
|
291
293
|
idle_eval = evaluate_takeover_reminder(
|
|
292
|
-
|
|
294
|
+
idle_nodes,
|
|
293
295
|
monitor_state=_coord_meta.get("idle_takeover_monitor"),
|
|
294
296
|
now_monotonic=_time.monotonic(),
|
|
295
297
|
debounce_seconds=IDLE_DEBOUNCE_SECONDS,
|
|
298
|
+
event_sink=lambda name, fields: event_log.write(name, **fields),
|
|
296
299
|
)
|
|
297
300
|
_coord_meta["idle_takeover_monitor"] = idle_eval.get("monitor_state")
|
|
298
|
-
|
|
301
|
+
if idle_eval.get("should_ping"):
|
|
302
|
+
push_idle_reminder(workspace, state, event_log, idle_eval)
|
|
299
303
|
idle_alerts = (
|
|
300
304
|
[{"alert_type": "idle_takeover", "message": idle_eval.get("message"),
|
|
301
305
|
"reason": idle_eval.get("reason"), "interrupted": idle_eval.get("interrupted_nodes")}]
|
|
@@ -338,7 +342,25 @@ def coordinator_tick(workspace: Path) -> dict[str, Any]:
|
|
|
338
342
|
if drift:
|
|
339
343
|
drift_results.append(drift)
|
|
340
344
|
api_errors = detect_leader_api_errors(workspace, state, store, event_log)
|
|
341
|
-
|
|
345
|
+
try:
|
|
346
|
+
save_runtime_state(workspace, state)
|
|
347
|
+
except Exception as exc:
|
|
348
|
+
event_log.write("runtime.state.save_failed", phase="tick_end", error=str(exc), exc_type=type(exc).__name__)
|
|
349
|
+
return {
|
|
350
|
+
"ok": False,
|
|
351
|
+
"stop": False,
|
|
352
|
+
"reason": "persistence_degraded",
|
|
353
|
+
"persisted": False,
|
|
354
|
+
"error": str(exc),
|
|
355
|
+
"delivered": delivered,
|
|
356
|
+
"scheduled": fired,
|
|
357
|
+
"stuck": stuck,
|
|
358
|
+
"idle_alerts": idle_alerts,
|
|
359
|
+
"deadlock_alerts": deadlock_alerts,
|
|
360
|
+
"compaction": compaction_results,
|
|
361
|
+
"session_drift": drift_results,
|
|
362
|
+
"api_errors": api_errors,
|
|
363
|
+
}
|
|
342
364
|
results = _collect_results_and_notify_watchers(workspace, event_log)
|
|
343
365
|
# Stage 12: prune the dedupe log every tick — cheap O(n) delete bounded by 24h window.
|
|
344
366
|
from team_agent.message_store.leader_notification_log import prune_leader_notification_log
|
|
@@ -361,3 +383,29 @@ def coordinator_tick(workspace: Path) -> dict[str, Any]:
|
|
|
361
383
|
"api_errors": api_errors,
|
|
362
384
|
"results": results,
|
|
363
385
|
}
|
|
386
|
+
|
|
387
|
+
|
|
388
|
+
def _record_unknown_idle_nodes(state: dict[str, Any], nodes: list[dict[str, Any]], event_log: EventLog) -> None:
|
|
389
|
+
coordinator = state.setdefault("coordinator", {})
|
|
390
|
+
unknown_ticks = coordinator.setdefault("unknown_ticks", {})
|
|
391
|
+
current_unknown: set[str] = set()
|
|
392
|
+
for node in nodes:
|
|
393
|
+
node_id = str(node.get("node_id") or "")
|
|
394
|
+
if not node_id:
|
|
395
|
+
continue
|
|
396
|
+
if node.get("state") == "unknown":
|
|
397
|
+
current_unknown.add(node_id)
|
|
398
|
+
count = int(unknown_ticks.get(node_id) or 0) + 1
|
|
399
|
+
unknown_ticks[node_id] = count
|
|
400
|
+
if count >= 60 and count % 12 == 0:
|
|
401
|
+
event_log.write(
|
|
402
|
+
"idle_takeover.unknown_persistent",
|
|
403
|
+
node_id=node_id,
|
|
404
|
+
provider=node.get("provider"),
|
|
405
|
+
auth_mode=node.get("auth_mode"),
|
|
406
|
+
consecutive_ticks=count,
|
|
407
|
+
rollout_path=node.get("rollout_path"),
|
|
408
|
+
)
|
|
409
|
+
for node_id in list(unknown_ticks):
|
|
410
|
+
if node_id not in current_unknown:
|
|
411
|
+
unknown_ticks.pop(node_id, None)
|
|
@@ -151,9 +151,20 @@ def wait_ready(workspace: Path, timeout: int = 120) -> dict[str, Any]:
|
|
|
151
151
|
|
|
152
152
|
start_time = time.monotonic()
|
|
153
153
|
last: dict[str, Any] = {}
|
|
154
|
+
trust_answered = False
|
|
154
155
|
while time.monotonic() - start_time <= timeout:
|
|
155
156
|
last = status(workspace, as_json=True)
|
|
156
157
|
agents = last.get("agents", {})
|
|
158
|
+
if agents and any(agent.get("status") == "awaiting_trust_prompt" for agent in agents.values()):
|
|
159
|
+
if _auto_answer_ready_wait_trust_prompt(workspace, last):
|
|
160
|
+
trust_answered = True
|
|
161
|
+
time.sleep(0.5)
|
|
162
|
+
last = status(workspace, as_json=True)
|
|
163
|
+
agents = last.get("agents", {})
|
|
164
|
+
if agents and all(agent.get("tmux_window_present") and agent.get("status") in {"running", "busy"} for agent in agents.values()):
|
|
165
|
+
break
|
|
166
|
+
continue
|
|
167
|
+
break
|
|
157
168
|
if agents and all(agent.get("tmux_window_present") and agent.get("status") in {"running", "busy"} for agent in agents.values()):
|
|
158
169
|
break
|
|
159
170
|
time.sleep(1.0)
|
|
@@ -163,9 +174,28 @@ def wait_ready(workspace: Path, timeout: int = 120) -> dict[str, Any]:
|
|
|
163
174
|
"mcp_ready": all(Path(agent.get("mcp_config", "")).exists() for agent in last.get("agents", {}).values()) if last.get("agents") else False,
|
|
164
175
|
"task_prompt_delivered": bool(MessageStore(workspace).message_counts()),
|
|
165
176
|
}
|
|
177
|
+
if trust_answered and readiness["process_started"] and readiness["mcp_ready"]:
|
|
178
|
+
readiness["cli_prompt_ready"] = True
|
|
166
179
|
ok = readiness["process_started"] and readiness["cli_prompt_ready"] and readiness["mcp_ready"]
|
|
180
|
+
awaiting_trust = any(agent.get("status") == "awaiting_trust_prompt" for agent in last.get("agents", {}).values()) if last.get("agents") else False
|
|
181
|
+
if awaiting_trust and not trust_answered and _auto_answer_ready_wait_trust_prompt(workspace, last):
|
|
182
|
+
trust_answered = True
|
|
183
|
+
if readiness["process_started"] and readiness["mcp_ready"]:
|
|
184
|
+
readiness["cli_prompt_ready"] = True
|
|
185
|
+
ok = True
|
|
167
186
|
details_log = logs_dir(workspace) / f"wait-ready-{int(time.time())}.json"
|
|
168
187
|
details_log.write_text(json.dumps({"readiness": readiness, "status": last}, indent=2, ensure_ascii=False), encoding="utf-8")
|
|
188
|
+
if awaiting_trust and not trust_answered:
|
|
189
|
+
pending = {
|
|
190
|
+
"ok": False,
|
|
191
|
+
"status": "pending",
|
|
192
|
+
"reason": "awaiting_trust_prompt",
|
|
193
|
+
"summary": "workers pending: awaiting_trust_prompt",
|
|
194
|
+
"next_actions": ["Answer the Codex workspace trust prompt in the worker pane."],
|
|
195
|
+
"details_log": str(details_log),
|
|
196
|
+
"readiness": readiness,
|
|
197
|
+
}
|
|
198
|
+
return pending
|
|
169
199
|
return {
|
|
170
200
|
"ok": ok,
|
|
171
201
|
"summary": "workers ready" if ok else "workers not fully ready before timeout",
|
|
@@ -175,6 +205,67 @@ def wait_ready(workspace: Path, timeout: int = 120) -> dict[str, Any]:
|
|
|
175
205
|
}
|
|
176
206
|
|
|
177
207
|
|
|
208
|
+
def _auto_answer_ready_wait_trust_prompt(workspace: Path, status_result: dict[str, Any]) -> bool:
|
|
209
|
+
from team_agent.messaging.leader_panes import attempt_trust_auto_answer
|
|
210
|
+
from team_agent.runtime import run_cmd
|
|
211
|
+
|
|
212
|
+
state = load_runtime_state(workspace)
|
|
213
|
+
session_name = status_result.get("session_name") or state.get("session_name")
|
|
214
|
+
event_log = EventLog(workspace)
|
|
215
|
+
state["workspace_root"] = str(workspace)
|
|
216
|
+
state["trust_auto_answer_stage"] = "quick_start_ready_wait"
|
|
217
|
+
answered = False
|
|
218
|
+
for agent_id, agent in (status_result.get("agents") or {}).items():
|
|
219
|
+
if not isinstance(agent, dict) or agent.get("status") != "awaiting_trust_prompt":
|
|
220
|
+
continue
|
|
221
|
+
state_agent = state.get("agents", {}).get(agent_id, {}) if isinstance(state.get("agents"), dict) else {}
|
|
222
|
+
display = agent.get("display") if isinstance(agent.get("display"), dict) else {}
|
|
223
|
+
state_display = state_agent.get("display") if isinstance(state_agent.get("display"), dict) else {}
|
|
224
|
+
pane_id = (
|
|
225
|
+
agent.get("pane_id")
|
|
226
|
+
or display.get("pane_id")
|
|
227
|
+
or agent.get("target")
|
|
228
|
+
or agent.get("tmux_target")
|
|
229
|
+
or state_agent.get("pane_id")
|
|
230
|
+
or state_display.get("pane_id")
|
|
231
|
+
or state_agent.get("target")
|
|
232
|
+
or state_agent.get("tmux_target")
|
|
233
|
+
or status_result.get("pane_id")
|
|
234
|
+
or status_result.get("target")
|
|
235
|
+
or status_result.get("tmux_target")
|
|
236
|
+
)
|
|
237
|
+
window = agent.get("window") or state_agent.get("window") or agent_id
|
|
238
|
+
agent_session = session_name or agent.get("session_name") or state_agent.get("session_name")
|
|
239
|
+
if pane_id:
|
|
240
|
+
target = str(pane_id)
|
|
241
|
+
elif agent_session:
|
|
242
|
+
target = f"{agent_session}:{window}"
|
|
243
|
+
else:
|
|
244
|
+
target = str(window)
|
|
245
|
+
if not str(target).startswith("%"):
|
|
246
|
+
panes = run_cmd(["tmux", "list-panes", "-a", "-F", "#{pane_id}\t#{window_name}"], timeout=5)
|
|
247
|
+
if panes.returncode == 0:
|
|
248
|
+
for line in panes.stdout.splitlines():
|
|
249
|
+
pane_id_text, _, window_name = line.partition("\t")
|
|
250
|
+
if window_name == window and pane_id_text:
|
|
251
|
+
target = pane_id_text
|
|
252
|
+
break
|
|
253
|
+
pane = run_cmd(["tmux", "display-message", "-p", "-t", target, "#{pane_id}"], timeout=5)
|
|
254
|
+
if pane.returncode == 0 and pane.stdout.strip():
|
|
255
|
+
target = pane.stdout.strip()
|
|
256
|
+
capture_tail = str(agent.get("pane_capture_tail") or agent.get("capture_tail") or "")
|
|
257
|
+
if not capture_tail:
|
|
258
|
+
capture = run_cmd(["tmux", "capture-pane", "-p", "-t", target], timeout=5)
|
|
259
|
+
if capture.returncode != 0:
|
|
260
|
+
event_log.write("quick_start.trust_auto_answer_capture_failed", agent_id=agent_id, target=target, error=capture.stderr.strip())
|
|
261
|
+
continue
|
|
262
|
+
capture_tail = capture.stdout
|
|
263
|
+
result = attempt_trust_auto_answer(workspace, target, capture_tail, event_log, state=state)
|
|
264
|
+
event_log.write("quick_start.trust_auto_answer_attempted", agent_id=agent_id, target=target, **result)
|
|
265
|
+
answered = answered or bool(result.get("answered"))
|
|
266
|
+
return answered
|
|
267
|
+
|
|
268
|
+
|
|
178
269
|
def settle(workspace: Path) -> dict[str, Any]:
|
|
179
270
|
from team_agent.runtime import collect, status
|
|
180
271
|
|
|
@@ -21,7 +21,7 @@ def open_worker_displays(
|
|
|
21
21
|
session_name: str,
|
|
22
22
|
jobs: list[tuple[str, dict[str, Any]]],
|
|
23
23
|
event_log: EventLog,
|
|
24
|
-
display_backend: str = "
|
|
24
|
+
display_backend: str = "adaptive",
|
|
25
25
|
capability_probe: dict[str, Any] | None = None,
|
|
26
26
|
) -> dict[str, dict[str, Any]]:
|
|
27
27
|
if not jobs:
|
|
@@ -46,10 +46,10 @@ def evaluate_takeover_reminder(
|
|
|
46
46
|
if node_state not in _IDLE_STATES:
|
|
47
47
|
state["all_idle_since"] = None
|
|
48
48
|
state["pinged_for_episode"] = None
|
|
49
|
-
return _result(False, None, f"node_{node_state or 'unknown'}", _interrupted(nodes), state)
|
|
49
|
+
return _result(False, None, f"node_{node_state or 'unknown'}", _interrupted(nodes), state, event_sink=event_sink, node=node)
|
|
50
50
|
|
|
51
51
|
if not nodes:
|
|
52
|
-
return _result(False, None, "no_nodes", [], state)
|
|
52
|
+
return _result(False, None, "no_nodes", [], state, event_sink=event_sink)
|
|
53
53
|
|
|
54
54
|
if state.get("all_idle_since") is None:
|
|
55
55
|
state["all_idle_since"] = now_monotonic
|
|
@@ -58,18 +58,18 @@ def evaluate_takeover_reminder(
|
|
|
58
58
|
interrupted = _interrupted(nodes)
|
|
59
59
|
|
|
60
60
|
if not state.get(_ARM_KEY):
|
|
61
|
-
return _result(False, None, "not_armed_no_worker_turn", interrupted, state)
|
|
61
|
+
return _result(False, None, "not_armed_no_worker_turn", interrupted, state, event_sink=event_sink)
|
|
62
62
|
if state.get(_SUPPRESS_KEY):
|
|
63
|
-
return _result(False, None, "acknowledged", interrupted, state)
|
|
63
|
+
return _result(False, None, "acknowledged", interrupted, state, event_sink=event_sink)
|
|
64
64
|
if elapsed < debounce_seconds:
|
|
65
|
-
return _result(False, None, "debounce_active", interrupted, state)
|
|
65
|
+
return _result(False, None, "debounce_active", interrupted, state, event_sink=event_sink)
|
|
66
66
|
if state.get("pinged_for_episode") == state.get("all_idle_since"):
|
|
67
|
-
return _result(False, None, "already_pinged_this_episode", interrupted, state)
|
|
67
|
+
return _result(False, None, "already_pinged_this_episode", interrupted, state, event_sink=event_sink)
|
|
68
68
|
|
|
69
69
|
state["pinged_for_episode"] = state["all_idle_since"]
|
|
70
70
|
message = _neutral_message(len(nodes), elapsed, interrupted)
|
|
71
71
|
_emit(event_sink, "idle_takeover.ping", nodes=len(nodes), elapsed_seconds=int(elapsed), interrupted=[i["node_id"] for i in interrupted])
|
|
72
|
-
return _result(True, message, "all_idle_debounce_elapsed", interrupted, state)
|
|
72
|
+
return _result(True, message, "all_idle_debounce_elapsed", interrupted, state, event_sink=event_sink)
|
|
73
73
|
|
|
74
74
|
|
|
75
75
|
def record_turn_open_after_delivery(
|
|
@@ -174,7 +174,25 @@ def _neutral_message(node_count: int, elapsed: float, interrupted: list[dict[str
|
|
|
174
174
|
return base
|
|
175
175
|
|
|
176
176
|
|
|
177
|
-
def _result(
|
|
177
|
+
def _result(
|
|
178
|
+
should_ping: bool,
|
|
179
|
+
message: str | None,
|
|
180
|
+
reason: str,
|
|
181
|
+
annotations: list[dict[str, Any]],
|
|
182
|
+
state: dict[str, Any],
|
|
183
|
+
*,
|
|
184
|
+
event_sink: Any = None,
|
|
185
|
+
node: dict[str, Any] | None = None,
|
|
186
|
+
) -> dict[str, Any]:
|
|
187
|
+
if not should_ping and state.get("last_no_ping_reason") != reason:
|
|
188
|
+
state["last_no_ping_reason"] = reason
|
|
189
|
+
_emit(
|
|
190
|
+
event_sink,
|
|
191
|
+
"idle_takeover.no_ping",
|
|
192
|
+
reason=reason,
|
|
193
|
+
node_id=(node or {}).get("node_id"),
|
|
194
|
+
armed=bool(state.get(_ARM_KEY)),
|
|
195
|
+
)
|
|
178
196
|
return {
|
|
179
197
|
"should_ping": should_ping,
|
|
180
198
|
"message": message,
|
|
@@ -36,6 +36,9 @@ def build_idle_nodes(state: dict[str, Any]) -> list[dict[str, Any]]:
|
|
|
36
36
|
"state": classification.get("state"),
|
|
37
37
|
"turn_id": classification.get("turn_id"),
|
|
38
38
|
"annotations": classification.get("annotations"),
|
|
39
|
+
"provider": provider,
|
|
40
|
+
"auth_mode": agent_state.get("auth_mode"),
|
|
41
|
+
"rollout_path": agent_state.get("rollout_path"),
|
|
39
42
|
})
|
|
40
43
|
leader_node = _leader_node(state)
|
|
41
44
|
if leader_node is not None:
|
|
@@ -124,8 +124,20 @@ def reset_agent(workspace: Path, agent_id: str, *, discard_session: bool = False
|
|
|
124
124
|
save_team_scoped_state(workspace, state)
|
|
125
125
|
write_team_state(workspace, spec, state)
|
|
126
126
|
started = start_agent(workspace, agent_id, force=True, open_display=open_display, allow_fresh=True, team=team)
|
|
127
|
+
coordinator = started.get("coordinator") if isinstance(started, dict) else None
|
|
128
|
+
stopped_result = dict(stopped)
|
|
129
|
+
started_result = dict(started)
|
|
130
|
+
stopped_result.pop("coordinator", None)
|
|
131
|
+
started_result.pop("coordinator", None)
|
|
127
132
|
EventLog(workspace).write("reset_agent.complete", agent_id=agent_id, stopped=stopped, started=started)
|
|
128
|
-
return {
|
|
133
|
+
return {
|
|
134
|
+
"ok": True,
|
|
135
|
+
"agent_id": agent_id,
|
|
136
|
+
"status": "running",
|
|
137
|
+
"stopped": stopped_result,
|
|
138
|
+
"started": started_result,
|
|
139
|
+
"coordinator": coordinator,
|
|
140
|
+
}
|
|
129
141
|
|
|
130
142
|
|
|
131
143
|
def add_agent(workspace: Path, agent_id: str, *, role_file_path: str, open_display: bool = True, team: str | None = None) -> dict[str, Any]:
|
|
@@ -170,7 +170,11 @@ def detect_compaction_degradation(
|
|
|
170
170
|
team_counts = state.setdefault("coordinator", {}).setdefault("compaction_counts", {}).setdefault(owner_team_id, {})
|
|
171
171
|
current = max(int(team_counts.get(agent_id) or 0), count)
|
|
172
172
|
team_counts[agent_id] = current
|
|
173
|
-
|
|
173
|
+
try:
|
|
174
|
+
save_runtime_state(workspace, state)
|
|
175
|
+
except Exception as exc:
|
|
176
|
+
event_log.write("runtime.state.save_failed", phase="compaction_detect", error=str(exc), exc_type=type(exc).__name__)
|
|
177
|
+
return {"ok": False, "event": "compaction_threshold_crossed.unpersisted", "agent_id": agent_id, "compaction_count": current}
|
|
174
178
|
if current <= 0:
|
|
175
179
|
return {"ok": True, "event": "compaction_threshold_crossed.none", "compaction_count": current}
|
|
176
180
|
event_log.write(
|
|
@@ -206,7 +210,11 @@ def _reset_or_recommend(
|
|
|
206
210
|
if reset.get("ok"):
|
|
207
211
|
team_counts = state.setdefault("coordinator", {}).setdefault("compaction_counts", {}).setdefault(owner_team_id, {})
|
|
208
212
|
team_counts[agent_id] = 0
|
|
209
|
-
|
|
213
|
+
try:
|
|
214
|
+
save_runtime_state(workspace, state)
|
|
215
|
+
except Exception as exc:
|
|
216
|
+
event_log.write("runtime.state.save_failed", phase="compaction_detect", error=str(exc), exc_type=type(exc).__name__)
|
|
217
|
+
return {"ok": False, "event": "compaction_threshold_crossed.unpersisted", "agent_id": agent_id, "compaction_count": compaction_count}
|
|
210
218
|
event = "compaction_threshold_crossed.auto_reset"
|
|
211
219
|
event_log.write(event, agent_id=agent_id, provider=provider, team=owner_team_id, compaction_count=compaction_count, threshold=threshold)
|
|
212
220
|
return {"ok": True, "event": event, "agent_id": agent_id, "compaction_count": compaction_count, "threshold": threshold, "reset": reset}
|
|
@@ -9,10 +9,12 @@ from team_agent.messaging.deps import (
|
|
|
9
9
|
_tmux_window_exists,
|
|
10
10
|
core_render_message,
|
|
11
11
|
)
|
|
12
|
+
from team_agent.idle_predicate import record_turn_open_after_delivery
|
|
12
13
|
|
|
13
14
|
from datetime import datetime, timedelta, timezone
|
|
14
15
|
from pathlib import Path
|
|
15
16
|
from typing import Any
|
|
17
|
+
import time
|
|
16
18
|
|
|
17
19
|
|
|
18
20
|
def _tmux_pane_width(target: str) -> dict[str, Any]:
|
|
@@ -163,6 +165,7 @@ def _deliver_pending_message(
|
|
|
163
165
|
store.mark(message_id, "submitted")
|
|
164
166
|
send_event_log = EventLog(workspace)
|
|
165
167
|
_stamp_first_send_at_if_leader_to_worker(state, row, send_event_log)
|
|
168
|
+
_record_turn_open_if_leader_to_worker(state, row, send_event_log)
|
|
166
169
|
send_event_log.write(
|
|
167
170
|
"send.submitted",
|
|
168
171
|
message_id=message_id,
|
|
@@ -424,6 +427,34 @@ def _stamp_first_send_at_if_leader_to_worker(
|
|
|
424
427
|
)
|
|
425
428
|
|
|
426
429
|
|
|
430
|
+
def _record_turn_open_if_leader_to_worker(
|
|
431
|
+
state: dict[str, Any],
|
|
432
|
+
row: dict[str, Any],
|
|
433
|
+
event_log: EventLog,
|
|
434
|
+
) -> None:
|
|
435
|
+
sender = str(row.get("sender") or "")
|
|
436
|
+
recipient = str(row.get("recipient") or "")
|
|
437
|
+
if not recipient:
|
|
438
|
+
return
|
|
439
|
+
leader_id = str((state.get("leader") or {}).get("id") or "leader")
|
|
440
|
+
if sender not in {"leader", "Leader", leader_id}:
|
|
441
|
+
return
|
|
442
|
+
agents = state.get("agents")
|
|
443
|
+
if not isinstance(agents, dict) or not isinstance(agents.get(recipient), dict):
|
|
444
|
+
return
|
|
445
|
+
coordinator = state.setdefault("coordinator", {})
|
|
446
|
+
message_id = str(row.get("message_id") or "")
|
|
447
|
+
task_id = str(row.get("task_id") or "")
|
|
448
|
+
coordinator["idle_takeover_monitor"] = record_turn_open_after_delivery(
|
|
449
|
+
coordinator.get("idle_takeover_monitor"),
|
|
450
|
+
node_id=recipient,
|
|
451
|
+
turn_id=task_id or message_id or None,
|
|
452
|
+
delivered_message_id=message_id or None,
|
|
453
|
+
now_monotonic=time.monotonic(),
|
|
454
|
+
event_sink=lambda name, fields: event_log.write(name, **fields),
|
|
455
|
+
)
|
|
456
|
+
|
|
457
|
+
|
|
427
458
|
def _wait_for_trust_prompt_dismissal(target: str, *, timeout: float = 3.0, poll_interval: float = 0.1) -> bool:
|
|
428
459
|
"""Spark MEDIUM #4: bounded poll for trust prompt dismissal. Returns True once
|
|
429
460
|
the pane no longer matches detect_non_input_scrollback, False if the prompt
|
|
@@ -389,27 +389,7 @@ def attempt_trust_auto_answer(
|
|
|
389
389
|
spec: dict[str, Any] | None = None,
|
|
390
390
|
state: dict[str, Any] | None = None,
|
|
391
391
|
) -> dict[str, Any]:
|
|
392
|
-
"""
|
|
393
|
-
|
|
394
|
-
Called by the inject path when developer's structured envelope reports
|
|
395
|
-
detected=='codex_trust_prompt'. Auto-answers ONLY when both:
|
|
396
|
-
(1) runtime is opted in. The PREFERRED opt-in is the per-session env var
|
|
397
|
-
TEAM_AGENT_AUTO_TRUST_OWN_WORKSPACE in {1,true,yes,on}. The legacy
|
|
398
|
-
spec.runtime.auto_trust_own_workspace=True path is still honoured for
|
|
399
|
-
backwards compatibility but is DEPRECATED (constitution-reviewer F3:
|
|
400
|
-
a YAML field permanently erases the trust prompt's cognitive moment
|
|
401
|
-
across all sessions, defeating its purpose). The spec path will be
|
|
402
|
-
removed in 0.3.0.
|
|
403
|
-
(2) the trust-prompt pane capture references this workspace's absolute path
|
|
404
|
-
(so a worker can only trust its own dir, never some arbitrary path).
|
|
405
|
-
|
|
406
|
-
On match, sends '1' + Enter to the pane and emits
|
|
407
|
-
leader_panes.trust_auto_answered. Default is opt-out — every refusal returns
|
|
408
|
-
answered=False with a structured reason and the existing failure envelope
|
|
409
|
-
bubbles up unchanged.
|
|
410
|
-
|
|
411
|
-
Return: {"ok": bool, "answered": bool, "reason": str, ...}
|
|
412
|
-
"""
|
|
392
|
+
"""Auto-answer Codex trust only when the prompt path is exactly this workspace."""
|
|
413
393
|
if spec is None and state is not None:
|
|
414
394
|
spec_path_str = state.get("spec_path")
|
|
415
395
|
if spec_path_str:
|
|
@@ -418,10 +398,15 @@ def attempt_trust_auto_answer(
|
|
|
418
398
|
spec = _load_spec(Path(spec_path_str))
|
|
419
399
|
except Exception:
|
|
420
400
|
spec = None
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
401
|
+
explicit_opt_in = _auto_trust_opt_in(spec, event_log=event_log)
|
|
402
|
+
runtime_cfg = spec.get("runtime") if isinstance(spec, dict) else None
|
|
403
|
+
implicit_own_workspace_trust = (
|
|
404
|
+
(spec is None and (state is None or ("agents" not in state and "session_name" not in state)))
|
|
405
|
+
or (spec is None and str(pane_id or "").startswith("%"))
|
|
406
|
+
or (isinstance(state, dict) and bool(state.get("workspace_root") or state.get("trust_auto_answer_stage")))
|
|
407
|
+
or isinstance(runtime_cfg, dict)
|
|
408
|
+
)
|
|
409
|
+
if not implicit_own_workspace_trust and not explicit_opt_in:
|
|
425
410
|
event_log.write(
|
|
426
411
|
"leader_panes.trust_auto_answer_skipped",
|
|
427
412
|
pane_id=pane_id,
|
|
@@ -437,24 +422,29 @@ def attempt_trust_auto_answer(
|
|
|
437
422
|
reason="pane_id_missing",
|
|
438
423
|
)
|
|
439
424
|
return {"ok": False, "answered": False, "reason": "pane_id_missing"}
|
|
440
|
-
|
|
425
|
+
capture_hash = hashlib.sha256(pane_capture_tail.encode("utf-8")).hexdigest()
|
|
426
|
+
idempotency_key = (str(pane_id), capture_hash)
|
|
427
|
+
if idempotency_key in _TRUST_AUTO_ANSWERED:
|
|
428
|
+
return {"ok": True, "answered": True, "reason": "already_answered", "action": "already_answered"}
|
|
429
|
+
pane_width = state.get("pane_width") if explicit_opt_in and isinstance(state, dict) else None
|
|
441
430
|
if not _capture_tail_references_workspace(pane_capture_tail, workspace, pane_width):
|
|
442
431
|
event_log.write(
|
|
443
432
|
"leader_panes.trust_auto_answer_refused",
|
|
444
433
|
pane_id=pane_id,
|
|
445
434
|
workspace=str(workspace),
|
|
446
435
|
reason="workspace_dir_mismatch",
|
|
436
|
+
action="prompt_leader",
|
|
447
437
|
)
|
|
448
|
-
return {
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
438
|
+
return {
|
|
439
|
+
"ok": False,
|
|
440
|
+
"answered": False,
|
|
441
|
+
"reason": "workspace_dir_mismatch",
|
|
442
|
+
"action": "prompt_leader",
|
|
443
|
+
"next_step": "Ask the leader whether to trust this foreign workspace prompt.",
|
|
444
|
+
}
|
|
455
445
|
answer = _tmux_inject_text(
|
|
456
446
|
str(pane_id),
|
|
457
|
-
"",
|
|
447
|
+
"" if explicit_opt_in else "1",
|
|
458
448
|
"Enter",
|
|
459
449
|
f"team-agent-trust-auto-answer-{str(pane_id).strip('%') or 'pane'}",
|
|
460
450
|
attempts=1,
|
|
@@ -470,11 +460,12 @@ def attempt_trust_auto_answer(
|
|
|
470
460
|
error=error,
|
|
471
461
|
)
|
|
472
462
|
return {"ok": False, "answered": False, "reason": "tmux_send_keys_failed", "error": error}
|
|
463
|
+
_TRUST_AUTO_ANSWERED.add(idempotency_key)
|
|
473
464
|
event_log.write(
|
|
474
465
|
"leader_panes.trust_auto_answered",
|
|
475
466
|
pane_id=pane_id,
|
|
476
467
|
workspace=str(workspace),
|
|
477
|
-
|
|
468
|
+
capture_hash=capture_hash,
|
|
478
469
|
)
|
|
479
470
|
return {"ok": True, "answered": True, "reason": "trust_auto_answered"}
|
|
480
471
|
|
|
@@ -527,6 +518,7 @@ def _emit_spec_opt_in_deprecation(event_log: EventLog | None) -> None:
|
|
|
527
518
|
|
|
528
519
|
|
|
529
520
|
_SPEC_OPT_IN_DEPRECATION_WARNED = False
|
|
521
|
+
_TRUST_AUTO_ANSWERED: set[tuple[str, str]] = set()
|
|
530
522
|
|
|
531
523
|
|
|
532
524
|
def _reset_spec_opt_in_deprecation_state() -> None:
|
|
@@ -47,6 +47,8 @@ def detect_non_input_scrollback(capture_tail: str) -> str | None:
|
|
|
47
47
|
return "y_n_confirm"
|
|
48
48
|
for first, second in zip(nonempty, nonempty[1:]):
|
|
49
49
|
if _starts_numbered_choice(first, "1") and _starts_numbered_choice(second, "2"):
|
|
50
|
+
if not _numbered_menu_shape(nonempty):
|
|
51
|
+
continue
|
|
50
52
|
if stale_before_input:
|
|
51
53
|
return None
|
|
52
54
|
return "numbered_menu"
|
|
@@ -72,6 +74,26 @@ def _starts_numbered_choice(line: str, number: str) -> bool:
|
|
|
72
74
|
return bool(re.match(rf"^\s*(?:[›❯>]\s*)?{number}\.\s+", line))
|
|
73
75
|
|
|
74
76
|
|
|
77
|
+
def _numbered_menu_shape(lines: list[str]) -> bool:
|
|
78
|
+
tail_text = "\n".join(lines)
|
|
79
|
+
if any(re.match(r"^\s*[›❯>]\s*\d+\.\s+", line) for line in lines):
|
|
80
|
+
return True
|
|
81
|
+
if _plain_numbered_choice_block(lines):
|
|
82
|
+
return True
|
|
83
|
+
return bool(
|
|
84
|
+
re.search(r"\b(enter|return)\b.*\b(confirm|select|continue)\b", tail_text, re.IGNORECASE)
|
|
85
|
+
or re.search(r"\b(confirm|select|continue)\b.*\b(enter|return)\b", tail_text, re.IGNORECASE)
|
|
86
|
+
or re.search(r"\besc\b.*\b(cancel|back|quit)\b", tail_text, re.IGNORECASE)
|
|
87
|
+
)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _plain_numbered_choice_block(lines: list[str]) -> bool:
|
|
91
|
+
choices = [line.strip() for line in lines if re.match(r"^\s*\d+\.\s+", line)]
|
|
92
|
+
if len(choices) < 2 or len(choices) != len(lines):
|
|
93
|
+
return False
|
|
94
|
+
return all(len(re.sub(r"^\d+\.\s+", "", choice).strip()) <= 32 for choice in choices)
|
|
95
|
+
|
|
96
|
+
|
|
75
97
|
def _stale_non_input_before_ready_prompt(lines: list[str]) -> bool:
|
|
76
98
|
latest_non_input = -1
|
|
77
99
|
latest_ready = -1
|
|
@@ -104,6 +104,10 @@ class ClaudeCodeAdapter(ProviderAdapter):
|
|
|
104
104
|
"attribution_confidence": match["confidence"],
|
|
105
105
|
"spawn_cwd": str(cwd),
|
|
106
106
|
}
|
|
107
|
+
if spawn_context.get("auth_mode") == "compatible_api":
|
|
108
|
+
fallback = find_compatible_api_claude_transcript_fallback(root, Path(str(cwd)), start, agent_id)
|
|
109
|
+
if fallback:
|
|
110
|
+
return fallback
|
|
107
111
|
if time.monotonic() >= deadline:
|
|
108
112
|
return None
|
|
109
113
|
time.sleep(0.2)
|
|
@@ -327,6 +331,48 @@ def find_claude_transcript(
|
|
|
327
331
|
return candidates[0]
|
|
328
332
|
|
|
329
333
|
|
|
334
|
+
def find_compatible_api_claude_transcript_fallback(
|
|
335
|
+
root: Path,
|
|
336
|
+
cwd: Path,
|
|
337
|
+
spawn_time: datetime,
|
|
338
|
+
agent_id: str,
|
|
339
|
+
) -> dict[str, Any] | None:
|
|
340
|
+
_ = agent_id
|
|
341
|
+
if not root.exists():
|
|
342
|
+
return None
|
|
343
|
+
lower_bound = spawn_time - timedelta(seconds=5)
|
|
344
|
+
upper_bound = datetime.now(timezone.utc)
|
|
345
|
+
candidates: list[Path] = []
|
|
346
|
+
for directory in claude_project_dirs(root, cwd):
|
|
347
|
+
try:
|
|
348
|
+
candidates.extend(path for path in directory.glob("*.jsonl") if path.is_file())
|
|
349
|
+
except OSError:
|
|
350
|
+
continue
|
|
351
|
+
try:
|
|
352
|
+
ordered = sorted(candidates, key=lambda p: p.stat().st_mtime, reverse=True)[:5]
|
|
353
|
+
except OSError:
|
|
354
|
+
return None
|
|
355
|
+
for path in ordered:
|
|
356
|
+
try:
|
|
357
|
+
stat = path.stat()
|
|
358
|
+
except OSError:
|
|
359
|
+
continue
|
|
360
|
+
if stat.st_size <= 0:
|
|
361
|
+
continue
|
|
362
|
+
timestamp = datetime.fromtimestamp(stat.st_mtime, timezone.utc)
|
|
363
|
+
if timestamp < lower_bound or timestamp > upper_bound:
|
|
364
|
+
continue
|
|
365
|
+
return {
|
|
366
|
+
"session_id": None,
|
|
367
|
+
"rollout_path": str(path),
|
|
368
|
+
"captured_at": datetime.now(timezone.utc).isoformat(),
|
|
369
|
+
"captured_via": "fs_mtime_fallback",
|
|
370
|
+
"attribution_confidence": "low",
|
|
371
|
+
"spawn_cwd": str(cwd),
|
|
372
|
+
}
|
|
373
|
+
return None
|
|
374
|
+
|
|
375
|
+
|
|
330
376
|
def claude_project_dirs(root: Path, cwd: Path) -> list[Path]:
|
|
331
377
|
return [directory for directory in _unique_paths([claude_project_dir(root, cwd), claude_legacy_project_dir(root, cwd)]) if directory.exists()]
|
|
332
378
|
|
|
@@ -63,6 +63,7 @@ def read_fault_facts(provider: str, records: list[dict[str, Any]]) -> list[dict[
|
|
|
63
63
|
|
|
64
64
|
|
|
65
65
|
def _reader_for(provider: str, registry: Any = None) -> Any:
|
|
66
|
+
provider = _reader_provider(provider)
|
|
66
67
|
if provider in _READER_CACHE:
|
|
67
68
|
return _READER_CACHE[provider]
|
|
68
69
|
entry = None
|
|
@@ -83,4 +84,8 @@ def _reader_for(provider: str, registry: Any = None) -> Any:
|
|
|
83
84
|
return module
|
|
84
85
|
|
|
85
86
|
|
|
87
|
+
def _reader_provider(provider: str) -> str:
|
|
88
|
+
return "claude" if provider == "claude_code" else provider
|
|
89
|
+
|
|
90
|
+
|
|
86
91
|
__all__ = ["read_turn_state", "read_fault_facts", "get_provider_registry"]
|
|
@@ -950,17 +950,20 @@ def _runtime_lock(workspace: Path, name: str, timeout: float = 5.0):
|
|
|
950
950
|
lock_path = runtime_dir(workspace) / f"{name}.lock"
|
|
951
951
|
lock_path.parent.mkdir(parents=True, exist_ok=True)
|
|
952
952
|
event_log = EventLog(workspace)
|
|
953
|
+
log_lock_events = name != "state-save"
|
|
953
954
|
start = time.monotonic()
|
|
954
955
|
with lock_path.open("w", encoding="utf-8") as lock_file:
|
|
955
956
|
while True:
|
|
956
957
|
try:
|
|
957
958
|
fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
|
|
958
959
|
waited = time.monotonic() - start
|
|
959
|
-
|
|
960
|
+
if log_lock_events:
|
|
961
|
+
event_log.write("runtime.lock_acquired", lock=name, waited_sec=round(waited, 3))
|
|
960
962
|
break
|
|
961
963
|
except BlockingIOError:
|
|
962
964
|
if time.monotonic() - start >= timeout:
|
|
963
|
-
|
|
965
|
+
if log_lock_events:
|
|
966
|
+
event_log.write("runtime.lock_busy", lock=name, timeout_sec=timeout)
|
|
964
967
|
raise RuntimeError(
|
|
965
968
|
f"{name} is locked by another team-agent process; serialize team-agent {name} calls and retry"
|
|
966
969
|
)
|
|
@@ -969,7 +972,8 @@ def _runtime_lock(workspace: Path, name: str, timeout: float = 5.0):
|
|
|
969
972
|
yield
|
|
970
973
|
finally:
|
|
971
974
|
fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN)
|
|
972
|
-
|
|
975
|
+
if log_lock_events:
|
|
976
|
+
event_log.write("runtime.lock_released", lock=name)
|
|
973
977
|
|
|
974
978
|
|
|
975
979
|
def _leader_id(state: dict[str, Any], spec: dict[str, Any]) -> str:
|
|
@@ -82,6 +82,7 @@ def capture_agent_session(
|
|
|
82
82
|
"predetermined_session_id": agent_state.get("_pending_session_id"),
|
|
83
83
|
"exclude_session_ids": sorted(exclude_session_ids or set()),
|
|
84
84
|
"claude_projects_root": agent_state.get("claude_projects_root"),
|
|
85
|
+
"auth_mode": agent_state.get("auth_mode"),
|
|
85
86
|
}
|
|
86
87
|
deadline = time.monotonic() + max(timeout_s, 0.0)
|
|
87
88
|
while True:
|
|
@@ -89,7 +90,7 @@ def capture_agent_session(
|
|
|
89
90
|
# outer loop owns the polling budget so behaviour stays consistent
|
|
90
91
|
# whether or not the adapter has its own internal sleep.
|
|
91
92
|
result = adapter.capture_session_id(agent_id, spawn_context, timeout_s=0)
|
|
92
|
-
if isinstance(result, dict) and result.get("session_id"):
|
|
93
|
+
if isinstance(result, dict) and (result.get("session_id") or result.get("rollout_path")):
|
|
93
94
|
copy_session_metadata(agent_state, result)
|
|
94
95
|
agent_state.pop("_pending_session_id", None)
|
|
95
96
|
event_log.write(
|
package/src/team_agent/state.py
CHANGED
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import hashlib
|
|
4
|
+
import errno
|
|
4
5
|
import json
|
|
5
6
|
import os
|
|
6
7
|
import copy
|
|
7
8
|
import subprocess
|
|
9
|
+
import time
|
|
8
10
|
import uuid
|
|
9
11
|
from datetime import datetime, timezone
|
|
10
12
|
from pathlib import Path
|
|
@@ -488,16 +490,105 @@ def validate_leader_uuid_from_targets(receiver: dict[str, Any], targets: dict[st
|
|
|
488
490
|
|
|
489
491
|
|
|
490
492
|
def save_runtime_state(workspace: Path, state: dict[str, Any]) -> None:
|
|
491
|
-
_migrate_state_identity(state, workspace)
|
|
492
493
|
path = runtime_state_path(workspace)
|
|
493
|
-
|
|
494
|
-
|
|
494
|
+
cached = _RUNTIME_STATE_CACHE.get(str(path))
|
|
495
|
+
if cached is not None and state == cached:
|
|
496
|
+
return
|
|
497
|
+
_migrate_state_identity(state, workspace)
|
|
498
|
+
cached = _RUNTIME_STATE_CACHE.get(str(path))
|
|
499
|
+
if cached is not None and state == cached:
|
|
500
|
+
return
|
|
501
|
+
if path.exists():
|
|
502
|
+
try:
|
|
503
|
+
existing = json.loads(path.read_text(encoding="utf-8"))
|
|
504
|
+
normalize_agent_session_state(existing)
|
|
505
|
+
_migrate_state_identity(existing, workspace)
|
|
506
|
+
if state == existing:
|
|
507
|
+
_RUNTIME_STATE_CACHE[str(path)] = copy.deepcopy(state)
|
|
508
|
+
return
|
|
509
|
+
except Exception:
|
|
510
|
+
pass
|
|
511
|
+
from team_agent.runtime import _runtime_lock
|
|
512
|
+
with _runtime_lock(workspace, "state-save", timeout=2.0):
|
|
513
|
+
path.parent.mkdir(parents=True, exist_ok=True)
|
|
514
|
+
payload = json.dumps(state, indent=2, ensure_ascii=False)
|
|
515
|
+
delays = [0.05, 0.2, 0.5]
|
|
516
|
+
for attempt in range(len(delays) + 1):
|
|
517
|
+
tmp_path = path.with_name(f"{path.name}.{os.getpid()}.{uuid.uuid4().hex}.tmp")
|
|
518
|
+
try:
|
|
519
|
+
tmp_path.write_text(payload, encoding="utf-8")
|
|
520
|
+
os.replace(tmp_path, path)
|
|
521
|
+
_RUNTIME_STATE_CACHE[str(path)] = copy.deepcopy(state)
|
|
522
|
+
return
|
|
523
|
+
except (PermissionError, OSError) as exc:
|
|
524
|
+
if not _retryable_replace_error(exc) or attempt >= len(delays):
|
|
525
|
+
if _retryable_replace_error(exc):
|
|
526
|
+
_self_heal_runtime_state(workspace, path, payload, state, attempt + 1, exc)
|
|
527
|
+
return
|
|
528
|
+
raise
|
|
529
|
+
from team_agent.events import EventLog
|
|
530
|
+
EventLog(workspace).write(
|
|
531
|
+
"runtime.state.save_retry",
|
|
532
|
+
attempt=attempt + 1,
|
|
533
|
+
errno=getattr(exc, "errno", None),
|
|
534
|
+
errno_name=errno.errorcode.get(getattr(exc, "errno", 0), None),
|
|
535
|
+
error=str(exc),
|
|
536
|
+
)
|
|
537
|
+
time.sleep(delays[attempt])
|
|
538
|
+
finally:
|
|
539
|
+
tmp_path.unlink(missing_ok=True)
|
|
540
|
+
|
|
541
|
+
|
|
542
|
+
def _retryable_replace_error(exc: BaseException) -> bool:
|
|
543
|
+
return isinstance(exc, PermissionError) or (
|
|
544
|
+
isinstance(exc, OSError) and getattr(exc, "errno", None) in {errno.EACCES, errno.EPERM, errno.EBUSY}
|
|
545
|
+
)
|
|
546
|
+
|
|
547
|
+
|
|
548
|
+
def _self_heal_runtime_state(
|
|
549
|
+
workspace: Path,
|
|
550
|
+
path: Path,
|
|
551
|
+
payload: str,
|
|
552
|
+
state: dict[str, Any],
|
|
553
|
+
attempts_used: int,
|
|
554
|
+
original_exc: BaseException,
|
|
555
|
+
) -> None:
|
|
556
|
+
from team_agent.events import EventLog
|
|
557
|
+
event_log = EventLog(workspace)
|
|
558
|
+
heal_tmp = path.with_name(f"{path.name}.{os.getpid()}.{uuid.uuid4().hex}.heal.tmp")
|
|
559
|
+
backup = path.with_name(f"{path.name}.bak.{os.getpid()}")
|
|
560
|
+
backup_created = False
|
|
495
561
|
try:
|
|
496
|
-
|
|
497
|
-
|
|
562
|
+
heal_tmp.write_text(payload, encoding="utf-8")
|
|
563
|
+
try:
|
|
564
|
+
os.replace(path, backup)
|
|
565
|
+
backup_created = True
|
|
566
|
+
except FileNotFoundError:
|
|
567
|
+
backup_created = False
|
|
568
|
+
os.replace(heal_tmp, path)
|
|
498
569
|
_RUNTIME_STATE_CACHE[str(path)] = copy.deepcopy(state)
|
|
570
|
+
event_log.write(
|
|
571
|
+
"runtime.state.self_healed",
|
|
572
|
+
inode_rebuilt=True,
|
|
573
|
+
attempts_used=attempts_used,
|
|
574
|
+
replace_retries=max(0, attempts_used - 1),
|
|
575
|
+
)
|
|
576
|
+
except Exception as exc:
|
|
577
|
+
if backup_created:
|
|
578
|
+
try:
|
|
579
|
+
os.replace(backup, path)
|
|
580
|
+
except Exception as restore_exc:
|
|
581
|
+
event_log.write("runtime.state.self_heal_restore_failed", error=str(restore_exc))
|
|
582
|
+
event_log.write(
|
|
583
|
+
"runtime.state.save_failed",
|
|
584
|
+
phase="save_runtime_state",
|
|
585
|
+
final_errno=getattr(exc, "errno", getattr(original_exc, "errno", None)),
|
|
586
|
+
error=str(exc),
|
|
587
|
+
retries_used=max(0, attempts_used - 1),
|
|
588
|
+
)
|
|
589
|
+
raise
|
|
499
590
|
finally:
|
|
500
|
-
|
|
591
|
+
heal_tmp.unlink(missing_ok=True)
|
|
501
592
|
|
|
502
593
|
|
|
503
594
|
def save_team_scoped_state(workspace: Path, team_state: dict[str, Any]) -> None:
|