@team-agent/installer 0.2.10 → 0.2.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@team-agent/installer",
3
- "version": "0.2.10",
3
+ "version": "0.2.11",
4
4
  "description": "npx installer for Team Agent",
5
5
  "keywords": [
6
6
  "codex",
@@ -39,6 +39,8 @@ def main(argv: list[str] | None = None) -> None:
39
39
 
40
40
  interval = args.tick_interval if args.tick_interval is not None else _tick_interval(workspace)
41
41
  initial_ppid = os.getppid()
42
+ failure_count = 0
43
+ last_failure_signature: tuple[str, str] | None = None
42
44
  while not STOP:
43
45
  # Stage 14 (Gap 37b) — orphan self-detection. If our original parent (test harness,
44
46
  # shell, or supervisor) died, our ppid is reparented to 1 (or to a launchd shim on
@@ -55,7 +57,41 @@ def main(argv: list[str] | None = None) -> None:
55
57
  workspace=str(workspace),
56
58
  )
57
59
  break
58
- result = runtime.coordinator_tick(workspace)
60
+ try:
61
+ result = runtime.coordinator_tick(workspace)
62
+ except Exception as exc:
63
+ failure_count += 1
64
+ signature = (type(exc).__name__, str(exc)[:200])
65
+ sleep_sec = min(interval * (2 ** min(failure_count - 1, 5)), 60.0)
66
+ if signature != last_failure_signature:
67
+ last_failure_signature = signature
68
+ event_log.write(
69
+ "coordinator.tick_error",
70
+ error=str(exc),
71
+ exc_type=type(exc).__name__,
72
+ consecutive_failures=failure_count,
73
+ next_sleep_sec=sleep_sec,
74
+ )
75
+ elif failure_count == 1 or failure_count % 12 == 0 or sleep_sec in {40.0, 60.0}:
76
+ event_log.write(
77
+ "coordinator.tick_error",
78
+ error=str(exc),
79
+ exc_type=type(exc).__name__,
80
+ consecutive_failures=failure_count,
81
+ next_sleep_sec=sleep_sec,
82
+ )
83
+ else:
84
+ event_log.write(
85
+ "coordinator.tick_error.suppressed",
86
+ consecutive_failures=failure_count,
87
+ next_sleep_sec=sleep_sec,
88
+ )
89
+ time.sleep(sleep_sec)
90
+ continue
91
+ if failure_count:
92
+ event_log.write("coordinator.tick_recovered", consecutive_failures=failure_count)
93
+ failure_count = 0
94
+ last_failure_signature = None
59
95
  if result.get("stop") or args.once:
60
96
  break
61
97
  time.sleep(interval)
@@ -288,14 +288,18 @@ def coordinator_tick(workspace: Path) -> dict[str, Any]:
288
288
  # Gap 32: the take-over reminder is driven by file-fact turn-state via the
289
289
  # idle_takeover predicate (the legacy screen-scrape obligation path is retired).
290
290
  _coord_meta = state.setdefault("coordinator", {})
291
+ idle_nodes = build_idle_nodes(state)
292
+ _record_unknown_idle_nodes(state, idle_nodes, event_log)
291
293
  idle_eval = evaluate_takeover_reminder(
292
- build_idle_nodes(state),
294
+ idle_nodes,
293
295
  monitor_state=_coord_meta.get("idle_takeover_monitor"),
294
296
  now_monotonic=_time.monotonic(),
295
297
  debounce_seconds=IDLE_DEBOUNCE_SECONDS,
298
+ event_sink=lambda name, fields: event_log.write(name, **fields),
296
299
  )
297
300
  _coord_meta["idle_takeover_monitor"] = idle_eval.get("monitor_state")
298
- push_idle_reminder(workspace, state, event_log, idle_eval)
301
+ if idle_eval.get("should_ping"):
302
+ push_idle_reminder(workspace, state, event_log, idle_eval)
299
303
  idle_alerts = (
300
304
  [{"alert_type": "idle_takeover", "message": idle_eval.get("message"),
301
305
  "reason": idle_eval.get("reason"), "interrupted": idle_eval.get("interrupted_nodes")}]
@@ -338,7 +342,25 @@ def coordinator_tick(workspace: Path) -> dict[str, Any]:
338
342
  if drift:
339
343
  drift_results.append(drift)
340
344
  api_errors = detect_leader_api_errors(workspace, state, store, event_log)
341
- save_runtime_state(workspace, state)
345
+ try:
346
+ save_runtime_state(workspace, state)
347
+ except Exception as exc:
348
+ event_log.write("runtime.state.save_failed", phase="tick_end", error=str(exc), exc_type=type(exc).__name__)
349
+ return {
350
+ "ok": False,
351
+ "stop": False,
352
+ "reason": "persistence_degraded",
353
+ "persisted": False,
354
+ "error": str(exc),
355
+ "delivered": delivered,
356
+ "scheduled": fired,
357
+ "stuck": stuck,
358
+ "idle_alerts": idle_alerts,
359
+ "deadlock_alerts": deadlock_alerts,
360
+ "compaction": compaction_results,
361
+ "session_drift": drift_results,
362
+ "api_errors": api_errors,
363
+ }
342
364
  results = _collect_results_and_notify_watchers(workspace, event_log)
343
365
  # Stage 12: prune the dedupe log every tick — cheap O(n) delete bounded by 24h window.
344
366
  from team_agent.message_store.leader_notification_log import prune_leader_notification_log
@@ -361,3 +383,29 @@ def coordinator_tick(workspace: Path) -> dict[str, Any]:
361
383
  "api_errors": api_errors,
362
384
  "results": results,
363
385
  }
386
+
387
+
388
+ def _record_unknown_idle_nodes(state: dict[str, Any], nodes: list[dict[str, Any]], event_log: EventLog) -> None:
389
+ coordinator = state.setdefault("coordinator", {})
390
+ unknown_ticks = coordinator.setdefault("unknown_ticks", {})
391
+ current_unknown: set[str] = set()
392
+ for node in nodes:
393
+ node_id = str(node.get("node_id") or "")
394
+ if not node_id:
395
+ continue
396
+ if node.get("state") == "unknown":
397
+ current_unknown.add(node_id)
398
+ count = int(unknown_ticks.get(node_id) or 0) + 1
399
+ unknown_ticks[node_id] = count
400
+ if count >= 60 and count % 12 == 0:
401
+ event_log.write(
402
+ "idle_takeover.unknown_persistent",
403
+ node_id=node_id,
404
+ provider=node.get("provider"),
405
+ auth_mode=node.get("auth_mode"),
406
+ consecutive_ticks=count,
407
+ rollout_path=node.get("rollout_path"),
408
+ )
409
+ for node_id in list(unknown_ticks):
410
+ if node_id not in current_unknown:
411
+ unknown_ticks.pop(node_id, None)
@@ -46,10 +46,10 @@ def evaluate_takeover_reminder(
46
46
  if node_state not in _IDLE_STATES:
47
47
  state["all_idle_since"] = None
48
48
  state["pinged_for_episode"] = None
49
- return _result(False, None, f"node_{node_state or 'unknown'}", _interrupted(nodes), state)
49
+ return _result(False, None, f"node_{node_state or 'unknown'}", _interrupted(nodes), state, event_sink=event_sink, node=node)
50
50
 
51
51
  if not nodes:
52
- return _result(False, None, "no_nodes", [], state)
52
+ return _result(False, None, "no_nodes", [], state, event_sink=event_sink)
53
53
 
54
54
  if state.get("all_idle_since") is None:
55
55
  state["all_idle_since"] = now_monotonic
@@ -58,18 +58,18 @@ def evaluate_takeover_reminder(
58
58
  interrupted = _interrupted(nodes)
59
59
 
60
60
  if not state.get(_ARM_KEY):
61
- return _result(False, None, "not_armed_no_worker_turn", interrupted, state)
61
+ return _result(False, None, "not_armed_no_worker_turn", interrupted, state, event_sink=event_sink)
62
62
  if state.get(_SUPPRESS_KEY):
63
- return _result(False, None, "acknowledged", interrupted, state)
63
+ return _result(False, None, "acknowledged", interrupted, state, event_sink=event_sink)
64
64
  if elapsed < debounce_seconds:
65
- return _result(False, None, "debounce_active", interrupted, state)
65
+ return _result(False, None, "debounce_active", interrupted, state, event_sink=event_sink)
66
66
  if state.get("pinged_for_episode") == state.get("all_idle_since"):
67
- return _result(False, None, "already_pinged_this_episode", interrupted, state)
67
+ return _result(False, None, "already_pinged_this_episode", interrupted, state, event_sink=event_sink)
68
68
 
69
69
  state["pinged_for_episode"] = state["all_idle_since"]
70
70
  message = _neutral_message(len(nodes), elapsed, interrupted)
71
71
  _emit(event_sink, "idle_takeover.ping", nodes=len(nodes), elapsed_seconds=int(elapsed), interrupted=[i["node_id"] for i in interrupted])
72
- return _result(True, message, "all_idle_debounce_elapsed", interrupted, state)
72
+ return _result(True, message, "all_idle_debounce_elapsed", interrupted, state, event_sink=event_sink)
73
73
 
74
74
 
75
75
  def record_turn_open_after_delivery(
@@ -174,7 +174,25 @@ def _neutral_message(node_count: int, elapsed: float, interrupted: list[dict[str
174
174
  return base
175
175
 
176
176
 
177
- def _result(should_ping: bool, message: str | None, reason: str, annotations: list[dict[str, Any]], state: dict[str, Any]) -> dict[str, Any]:
177
+ def _result(
178
+ should_ping: bool,
179
+ message: str | None,
180
+ reason: str,
181
+ annotations: list[dict[str, Any]],
182
+ state: dict[str, Any],
183
+ *,
184
+ event_sink: Any = None,
185
+ node: dict[str, Any] | None = None,
186
+ ) -> dict[str, Any]:
187
+ if not should_ping and state.get("last_no_ping_reason") != reason:
188
+ state["last_no_ping_reason"] = reason
189
+ _emit(
190
+ event_sink,
191
+ "idle_takeover.no_ping",
192
+ reason=reason,
193
+ node_id=(node or {}).get("node_id"),
194
+ armed=bool(state.get(_ARM_KEY)),
195
+ )
178
196
  return {
179
197
  "should_ping": should_ping,
180
198
  "message": message,
@@ -36,6 +36,9 @@ def build_idle_nodes(state: dict[str, Any]) -> list[dict[str, Any]]:
36
36
  "state": classification.get("state"),
37
37
  "turn_id": classification.get("turn_id"),
38
38
  "annotations": classification.get("annotations"),
39
+ "provider": provider,
40
+ "auth_mode": agent_state.get("auth_mode"),
41
+ "rollout_path": agent_state.get("rollout_path"),
39
42
  })
40
43
  leader_node = _leader_node(state)
41
44
  if leader_node is not None:
@@ -170,7 +170,11 @@ def detect_compaction_degradation(
170
170
  team_counts = state.setdefault("coordinator", {}).setdefault("compaction_counts", {}).setdefault(owner_team_id, {})
171
171
  current = max(int(team_counts.get(agent_id) or 0), count)
172
172
  team_counts[agent_id] = current
173
- save_runtime_state(workspace, state)
173
+ try:
174
+ save_runtime_state(workspace, state)
175
+ except Exception as exc:
176
+ event_log.write("runtime.state.save_failed", phase="compaction_detect", error=str(exc), exc_type=type(exc).__name__)
177
+ return {"ok": False, "event": "compaction_threshold_crossed.unpersisted", "agent_id": agent_id, "compaction_count": current}
174
178
  if current <= 0:
175
179
  return {"ok": True, "event": "compaction_threshold_crossed.none", "compaction_count": current}
176
180
  event_log.write(
@@ -206,7 +210,11 @@ def _reset_or_recommend(
206
210
  if reset.get("ok"):
207
211
  team_counts = state.setdefault("coordinator", {}).setdefault("compaction_counts", {}).setdefault(owner_team_id, {})
208
212
  team_counts[agent_id] = 0
209
- save_runtime_state(workspace, state)
213
+ try:
214
+ save_runtime_state(workspace, state)
215
+ except Exception as exc:
216
+ event_log.write("runtime.state.save_failed", phase="compaction_detect", error=str(exc), exc_type=type(exc).__name__)
217
+ return {"ok": False, "event": "compaction_threshold_crossed.unpersisted", "agent_id": agent_id, "compaction_count": compaction_count}
210
218
  event = "compaction_threshold_crossed.auto_reset"
211
219
  event_log.write(event, agent_id=agent_id, provider=provider, team=owner_team_id, compaction_count=compaction_count, threshold=threshold)
212
220
  return {"ok": True, "event": event, "agent_id": agent_id, "compaction_count": compaction_count, "threshold": threshold, "reset": reset}
@@ -9,10 +9,12 @@ from team_agent.messaging.deps import (
9
9
  _tmux_window_exists,
10
10
  core_render_message,
11
11
  )
12
+ from team_agent.idle_predicate import record_turn_open_after_delivery
12
13
 
13
14
  from datetime import datetime, timedelta, timezone
14
15
  from pathlib import Path
15
16
  from typing import Any
17
+ import time
16
18
 
17
19
 
18
20
  def _tmux_pane_width(target: str) -> dict[str, Any]:
@@ -163,6 +165,7 @@ def _deliver_pending_message(
163
165
  store.mark(message_id, "submitted")
164
166
  send_event_log = EventLog(workspace)
165
167
  _stamp_first_send_at_if_leader_to_worker(state, row, send_event_log)
168
+ _record_turn_open_if_leader_to_worker(state, row, send_event_log)
166
169
  send_event_log.write(
167
170
  "send.submitted",
168
171
  message_id=message_id,
@@ -424,6 +427,34 @@ def _stamp_first_send_at_if_leader_to_worker(
424
427
  )
425
428
 
426
429
 
430
+ def _record_turn_open_if_leader_to_worker(
431
+ state: dict[str, Any],
432
+ row: dict[str, Any],
433
+ event_log: EventLog,
434
+ ) -> None:
435
+ sender = str(row.get("sender") or "")
436
+ recipient = str(row.get("recipient") or "")
437
+ if not recipient:
438
+ return
439
+ leader_id = str((state.get("leader") or {}).get("id") or "leader")
440
+ if sender not in {"leader", "Leader", leader_id}:
441
+ return
442
+ agents = state.get("agents")
443
+ if not isinstance(agents, dict) or not isinstance(agents.get(recipient), dict):
444
+ return
445
+ coordinator = state.setdefault("coordinator", {})
446
+ message_id = str(row.get("message_id") or "")
447
+ task_id = str(row.get("task_id") or "")
448
+ coordinator["idle_takeover_monitor"] = record_turn_open_after_delivery(
449
+ coordinator.get("idle_takeover_monitor"),
450
+ node_id=recipient,
451
+ turn_id=task_id or message_id or None,
452
+ delivered_message_id=message_id or None,
453
+ now_monotonic=time.monotonic(),
454
+ event_sink=lambda name, fields: event_log.write(name, **fields),
455
+ )
456
+
457
+
427
458
  def _wait_for_trust_prompt_dismissal(target: str, *, timeout: float = 3.0, poll_interval: float = 0.1) -> bool:
428
459
  """Spark MEDIUM #4: bounded poll for trust prompt dismissal. Returns True once
429
460
  the pane no longer matches detect_non_input_scrollback, False if the prompt
@@ -104,6 +104,10 @@ class ClaudeCodeAdapter(ProviderAdapter):
104
104
  "attribution_confidence": match["confidence"],
105
105
  "spawn_cwd": str(cwd),
106
106
  }
107
+ if spawn_context.get("auth_mode") == "compatible_api":
108
+ fallback = find_compatible_api_claude_transcript_fallback(root, Path(str(cwd)), start, agent_id)
109
+ if fallback:
110
+ return fallback
107
111
  if time.monotonic() >= deadline:
108
112
  return None
109
113
  time.sleep(0.2)
@@ -327,6 +331,48 @@ def find_claude_transcript(
327
331
  return candidates[0]
328
332
 
329
333
 
334
+ def find_compatible_api_claude_transcript_fallback(
335
+ root: Path,
336
+ cwd: Path,
337
+ spawn_time: datetime,
338
+ agent_id: str,
339
+ ) -> dict[str, Any] | None:
340
+ _ = agent_id
341
+ if not root.exists():
342
+ return None
343
+ lower_bound = spawn_time - timedelta(seconds=5)
344
+ upper_bound = datetime.now(timezone.utc)
345
+ candidates: list[Path] = []
346
+ for directory in claude_project_dirs(root, cwd):
347
+ try:
348
+ candidates.extend(path for path in directory.glob("*.jsonl") if path.is_file())
349
+ except OSError:
350
+ continue
351
+ try:
352
+ ordered = sorted(candidates, key=lambda p: p.stat().st_mtime, reverse=True)[:5]
353
+ except OSError:
354
+ return None
355
+ for path in ordered:
356
+ try:
357
+ stat = path.stat()
358
+ except OSError:
359
+ continue
360
+ if stat.st_size <= 0:
361
+ continue
362
+ timestamp = datetime.fromtimestamp(stat.st_mtime, timezone.utc)
363
+ if timestamp < lower_bound or timestamp > upper_bound:
364
+ continue
365
+ return {
366
+ "session_id": None,
367
+ "rollout_path": str(path),
368
+ "captured_at": datetime.now(timezone.utc).isoformat(),
369
+ "captured_via": "fs_mtime_fallback",
370
+ "attribution_confidence": "low",
371
+ "spawn_cwd": str(cwd),
372
+ }
373
+ return None
374
+
375
+
330
376
  def claude_project_dirs(root: Path, cwd: Path) -> list[Path]:
331
377
  return [directory for directory in _unique_paths([claude_project_dir(root, cwd), claude_legacy_project_dir(root, cwd)]) if directory.exists()]
332
378
 
@@ -63,6 +63,7 @@ def read_fault_facts(provider: str, records: list[dict[str, Any]]) -> list[dict[
63
63
 
64
64
 
65
65
  def _reader_for(provider: str, registry: Any = None) -> Any:
66
+ provider = _reader_provider(provider)
66
67
  if provider in _READER_CACHE:
67
68
  return _READER_CACHE[provider]
68
69
  entry = None
@@ -83,4 +84,8 @@ def _reader_for(provider: str, registry: Any = None) -> Any:
83
84
  return module
84
85
 
85
86
 
87
+ def _reader_provider(provider: str) -> str:
88
+ return "claude" if provider == "claude_code" else provider
89
+
90
+
86
91
  __all__ = ["read_turn_state", "read_fault_facts", "get_provider_registry"]
@@ -950,17 +950,20 @@ def _runtime_lock(workspace: Path, name: str, timeout: float = 5.0):
950
950
  lock_path = runtime_dir(workspace) / f"{name}.lock"
951
951
  lock_path.parent.mkdir(parents=True, exist_ok=True)
952
952
  event_log = EventLog(workspace)
953
+ log_lock_events = name != "state-save"
953
954
  start = time.monotonic()
954
955
  with lock_path.open("w", encoding="utf-8") as lock_file:
955
956
  while True:
956
957
  try:
957
958
  fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
958
959
  waited = time.monotonic() - start
959
- event_log.write("runtime.lock_acquired", lock=name, waited_sec=round(waited, 3))
960
+ if log_lock_events:
961
+ event_log.write("runtime.lock_acquired", lock=name, waited_sec=round(waited, 3))
960
962
  break
961
963
  except BlockingIOError:
962
964
  if time.monotonic() - start >= timeout:
963
- event_log.write("runtime.lock_busy", lock=name, timeout_sec=timeout)
965
+ if log_lock_events:
966
+ event_log.write("runtime.lock_busy", lock=name, timeout_sec=timeout)
964
967
  raise RuntimeError(
965
968
  f"{name} is locked by another team-agent process; serialize team-agent {name} calls and retry"
966
969
  )
@@ -969,7 +972,8 @@ def _runtime_lock(workspace: Path, name: str, timeout: float = 5.0):
969
972
  yield
970
973
  finally:
971
974
  fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN)
972
- event_log.write("runtime.lock_released", lock=name)
975
+ if log_lock_events:
976
+ event_log.write("runtime.lock_released", lock=name)
973
977
 
974
978
 
975
979
  def _leader_id(state: dict[str, Any], spec: dict[str, Any]) -> str:
@@ -82,6 +82,7 @@ def capture_agent_session(
82
82
  "predetermined_session_id": agent_state.get("_pending_session_id"),
83
83
  "exclude_session_ids": sorted(exclude_session_ids or set()),
84
84
  "claude_projects_root": agent_state.get("claude_projects_root"),
85
+ "auth_mode": agent_state.get("auth_mode"),
85
86
  }
86
87
  deadline = time.monotonic() + max(timeout_s, 0.0)
87
88
  while True:
@@ -89,7 +90,7 @@ def capture_agent_session(
89
90
  # outer loop owns the polling budget so behaviour stays consistent
90
91
  # whether or not the adapter has its own internal sleep.
91
92
  result = adapter.capture_session_id(agent_id, spawn_context, timeout_s=0)
92
- if isinstance(result, dict) and result.get("session_id"):
93
+ if isinstance(result, dict) and (result.get("session_id") or result.get("rollout_path")):
93
94
  copy_session_metadata(agent_state, result)
94
95
  agent_state.pop("_pending_session_id", None)
95
96
  event_log.write(
@@ -1,10 +1,12 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import hashlib
4
+ import errno
4
5
  import json
5
6
  import os
6
7
  import copy
7
8
  import subprocess
9
+ import time
8
10
  import uuid
9
11
  from datetime import datetime, timezone
10
12
  from pathlib import Path
@@ -488,16 +490,105 @@ def validate_leader_uuid_from_targets(receiver: dict[str, Any], targets: dict[st
488
490
 
489
491
 
490
492
  def save_runtime_state(workspace: Path, state: dict[str, Any]) -> None:
491
- _migrate_state_identity(state, workspace)
492
493
  path = runtime_state_path(workspace)
493
- path.parent.mkdir(parents=True, exist_ok=True)
494
- tmp_path = path.with_name(f"{path.name}.{os.getpid()}.{uuid.uuid4().hex}.tmp")
494
+ cached = _RUNTIME_STATE_CACHE.get(str(path))
495
+ if cached is not None and state == cached:
496
+ return
497
+ _migrate_state_identity(state, workspace)
498
+ cached = _RUNTIME_STATE_CACHE.get(str(path))
499
+ if cached is not None and state == cached:
500
+ return
501
+ if path.exists():
502
+ try:
503
+ existing = json.loads(path.read_text(encoding="utf-8"))
504
+ normalize_agent_session_state(existing)
505
+ _migrate_state_identity(existing, workspace)
506
+ if state == existing:
507
+ _RUNTIME_STATE_CACHE[str(path)] = copy.deepcopy(state)
508
+ return
509
+ except Exception:
510
+ pass
511
+ from team_agent.runtime import _runtime_lock
512
+ with _runtime_lock(workspace, "state-save", timeout=2.0):
513
+ path.parent.mkdir(parents=True, exist_ok=True)
514
+ payload = json.dumps(state, indent=2, ensure_ascii=False)
515
+ delays = [0.05, 0.2, 0.5]
516
+ for attempt in range(len(delays) + 1):
517
+ tmp_path = path.with_name(f"{path.name}.{os.getpid()}.{uuid.uuid4().hex}.tmp")
518
+ try:
519
+ tmp_path.write_text(payload, encoding="utf-8")
520
+ os.replace(tmp_path, path)
521
+ _RUNTIME_STATE_CACHE[str(path)] = copy.deepcopy(state)
522
+ return
523
+ except (PermissionError, OSError) as exc:
524
+ if not _retryable_replace_error(exc) or attempt >= len(delays):
525
+ if _retryable_replace_error(exc):
526
+ _self_heal_runtime_state(workspace, path, payload, state, attempt + 1, exc)
527
+ return
528
+ raise
529
+ from team_agent.events import EventLog
530
+ EventLog(workspace).write(
531
+ "runtime.state.save_retry",
532
+ attempt=attempt + 1,
533
+ errno=getattr(exc, "errno", None),
534
+ errno_name=errno.errorcode.get(getattr(exc, "errno", 0), None),
535
+ error=str(exc),
536
+ )
537
+ time.sleep(delays[attempt])
538
+ finally:
539
+ tmp_path.unlink(missing_ok=True)
540
+
541
+
542
+ def _retryable_replace_error(exc: BaseException) -> bool:
543
+ return isinstance(exc, PermissionError) or (
544
+ isinstance(exc, OSError) and getattr(exc, "errno", None) in {errno.EACCES, errno.EPERM, errno.EBUSY}
545
+ )
546
+
547
+
548
+ def _self_heal_runtime_state(
549
+ workspace: Path,
550
+ path: Path,
551
+ payload: str,
552
+ state: dict[str, Any],
553
+ attempts_used: int,
554
+ original_exc: BaseException,
555
+ ) -> None:
556
+ from team_agent.events import EventLog
557
+ event_log = EventLog(workspace)
558
+ heal_tmp = path.with_name(f"{path.name}.{os.getpid()}.{uuid.uuid4().hex}.heal.tmp")
559
+ backup = path.with_name(f"{path.name}.bak.{os.getpid()}")
560
+ backup_created = False
495
561
  try:
496
- tmp_path.write_text(json.dumps(state, indent=2, ensure_ascii=False), encoding="utf-8")
497
- os.replace(tmp_path, path)
562
+ heal_tmp.write_text(payload, encoding="utf-8")
563
+ try:
564
+ os.replace(path, backup)
565
+ backup_created = True
566
+ except FileNotFoundError:
567
+ backup_created = False
568
+ os.replace(heal_tmp, path)
498
569
  _RUNTIME_STATE_CACHE[str(path)] = copy.deepcopy(state)
570
+ event_log.write(
571
+ "runtime.state.self_healed",
572
+ inode_rebuilt=True,
573
+ attempts_used=attempts_used,
574
+ replace_retries=max(0, attempts_used - 1),
575
+ )
576
+ except Exception as exc:
577
+ if backup_created:
578
+ try:
579
+ os.replace(backup, path)
580
+ except Exception as restore_exc:
581
+ event_log.write("runtime.state.self_heal_restore_failed", error=str(restore_exc))
582
+ event_log.write(
583
+ "runtime.state.save_failed",
584
+ phase="save_runtime_state",
585
+ final_errno=getattr(exc, "errno", getattr(original_exc, "errno", None)),
586
+ error=str(exc),
587
+ retries_used=max(0, attempts_used - 1),
588
+ )
589
+ raise
499
590
  finally:
500
- tmp_path.unlink(missing_ok=True)
591
+ heal_tmp.unlink(missing_ok=True)
501
592
 
502
593
 
503
594
  def save_team_scoped_state(workspace: Path, team_state: dict[str, Any]) -> None: