nexo-brain 7.31.0 → 7.31.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "nexo-brain",
3
- "version": "7.31.0",
3
+ "version": "7.31.2",
4
4
  "description": "Local cognitive runtime for Claude Code \u2014 persistent memory, overnight learning, doctor diagnostics, personal scripts, recovery-aware jobs, startup preflight, and optional dashboard/power helper.",
5
5
  "author": {
6
6
  "name": "NEXO Brain",
package/README.md CHANGED
@@ -18,7 +18,9 @@
18
18
 
19
19
  [Watch the overview video](https://nexo-brain.com/watch/) · [Watch on YouTube](https://www.youtube.com/watch?v=i2lkGhKyVqI) · [Open the infographic](https://nexo-brain.com/assets/nexo-brain-infographic-v5.png)
20
20
 
21
- Version `7.31.0` is the current packaged-runtime line. Minor release over v7.30.33 - the recommended Claude Code model moves from Opus 4.8 to Fable 5 with max reasoning (`claude-fable-5`) across all four main resonance tiers (the `muy_bajo` tier keeps Haiku for cheap internal classifiers and Codex stays on GPT-5.5), existing installs riding NEXO defaults auto-migrate on update while customized models are respected, and learning housekeeping no longer aborts when the embedding backend is missing.
21
+ Version `7.31.2` is the current packaged-runtime line. Patch release over v7.31.1 - the session ID becomes a durable identity (sessions survive quiet work periods; physical cleanup at 24h), runtime residents are isolated per generation so two installs can never kill each other's resident, and obsolete residents retire themselves once clients disconnect.
22
+
23
+ Previously in `7.31.1`: patch release over v7.31.0 - headless automations pause and queue when the selected engine is unavailable (credits, rate limits, expired auth) and resume automatically with one operator notice in their language; protocol nudge shaping ships in shadow mode; and the client config push stops writing an invalid `mcp__*` permission rule to Claude Code settings.
22
24
 
23
25
  Previously in `7.30.33`: patch release over v7.30.32 - personal agent/script status now keeps the newest real run between manual executions and cron history, so a successful manual agent run cannot be hidden behind an older scheduled failure.
24
26
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "nexo-brain",
3
- "version": "7.31.0",
3
+ "version": "7.31.2",
4
4
  "mcpName": "io.github.wazionapps/nexo",
5
5
  "description": "NEXO Brain — Shared brain for AI agents. Persistent memory, semantic RAG, natural forgetting, metacognitive guard, trust scoring, 150+ MCP tools. Works with Claude Code, Codex, Claude Desktop & any MCP client. 100% local, free.",
6
6
  "homepage": "https://nexo-brain.com",
@@ -1205,6 +1205,14 @@ def run_automation_prompt(
1205
1205
  f"{selected_backend} automation backend selected but launcher is not installed; fallback blocked."
1206
1206
  )
1207
1207
 
1208
+ # Phase 1.6 — provider circuit breaker. "Installed" is not "available":
1209
+ # with credits exhausted / rate limited / auth expired, every headless
1210
+ # cron used to launch a session that died mid-flight, burned its retry
1211
+ # budget and escalated to the operator per-item. The breaker fails fast
1212
+ # with a queue-me signal instead; one probe per retry window re-tests.
1213
+ from provider_circuit_breaker import raise_if_unavailable
1214
+ raise_if_unavailable(selected_backend)
1215
+
1208
1216
  # Resonance map decides (model, effort) for every call. ``caller`` is
1209
1217
  # MANDATORY — every script that invokes the automation backend must be
1210
1218
  # registered in src/resonance_map.py so its reasoning budget is a
@@ -1414,6 +1422,7 @@ def run_automation_prompt(
1414
1422
  stderr = result.stderr or ""
1415
1423
  if not recorded:
1416
1424
  stderr = _append_stderr(stderr, record_error)
1425
+ _record_provider_breaker_outcome(selected_backend, result.returncode, final_stdout, stderr)
1417
1426
  return subprocess.CompletedProcess(
1418
1427
  cmd,
1419
1428
  result.returncode,
@@ -1490,6 +1499,7 @@ def run_automation_prompt(
1490
1499
  stderr = result.stderr or ""
1491
1500
  if not recorded:
1492
1501
  stderr = _append_stderr(stderr, record_error)
1502
+ _record_provider_breaker_outcome(selected_backend, result.returncode, final_stdout, stderr)
1493
1503
  return subprocess.CompletedProcess(
1494
1504
  cmd,
1495
1505
  result.returncode,
@@ -1500,6 +1510,22 @@ def run_automation_prompt(
1500
1510
  raise AutomationBackendUnavailableError(f"Unsupported automation backend: {selected_backend}")
1501
1511
 
1502
1512
 
1513
+ def _record_provider_breaker_outcome(backend: str, returncode: int | None, stdout: str, stderr: str) -> None:
1514
+ """Fase 1.6 — feed the circuit breaker after every headless session.
1515
+
1516
+ Success closes the breaker; classified failures (credits/rate-limit/auth)
1517
+ open it immediately so the NEXT cron fails fast and queues instead of
1518
+ launching another doomed session. Best-effort: breaker bookkeeping must
1519
+ never mask the session result.
1520
+ """
1521
+ try:
1522
+ from provider_circuit_breaker import classify_session_failure, record_session_outcome
1523
+ reason = classify_session_failure(returncode, stdout or "", stderr or "")
1524
+ record_session_outcome(backend, ok=(reason is None), reason=reason)
1525
+ except Exception:
1526
+ pass
1527
+
1528
+
1503
1529
  def probe_automation_backend(
1504
1530
  *,
1505
1531
  backend: str | None = None,
@@ -1012,7 +1012,16 @@ def _load_toml_object(path: Path) -> dict:
1012
1012
def _write_toml_object(path: Path, payload: dict) -> None:
    """Serialize ``payload`` as TOML into ``path``, skipping no-op rewrites.

    The parent directory is created when missing. The text is produced by
    ``_emit_toml_table`` and always ends with exactly one trailing newline.

    Args:
        path: Destination file.
        payload: Mapping rendered by ``_emit_toml_table``.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            written (a failure to *read* the existing file is not an error).
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    lines = _emit_toml_table(payload)
    content = "\n".join(lines).rstrip() + "\n"
    # v7.31.x (Phase 1) — write-if-changed: rewriting an identical config.toml
    # on every Desktop update churns mtime/content signatures and can
    # re-trigger Codex's hook/trust confirmation prompt for the operator.
    #
    # TOML documents are UTF-8 by specification, so the encoding is explicit
    # instead of depending on the platform locale.
    try:
        if path.is_file() and path.read_text(encoding="utf-8") == content:
            return
    except (OSError, UnicodeError):
        pass  # unreadable or undecodable existing file -> fall through to a clean write
    path.write_text(content, encoding="utf-8")
1016
1025
 
1017
1026
 
1018
1027
  def _sync_codex_managed_config(
@@ -1527,6 +1536,12 @@ def _claude_desktop_managed_metadata(server_config: dict, *, operator_name: str)
1527
1536
  # (followup-runner, email-monitor, deep-sleep, etc.) to work without
1528
1537
  # interactive approval prompts. Without this, Claude Code headless invocations
1529
1538
  # stall waiting for MCP tool approvals.
1539
+ #
1540
+ # v7.31.x (Phase 1) — "mcp__*" is NOT a valid Claude Code allow rule (allow
1541
+ # patterns must name a literal mcp__<server>__ scope; only deny/ask accept
1542
+ # bare wildcards). Claude Code skips it and shows a Settings Warning on every
1543
+ # launch. List the NEXO-managed servers explicitly instead; user-added
1544
+ # servers belong to the user's own config, not to this template.
1530
1545
  _NEXO_HEADLESS_ALLOWLIST = (
1531
1546
  "Bash",
1532
1547
  "Read",
@@ -1539,6 +1554,16 @@ _NEXO_HEADLESS_ALLOWLIST = (
1539
1554
  "NotebookEdit",
1540
1555
  "WebSearch",
1541
1556
  "WebFetch",
1557
+ "mcp__nexo__*",
1558
+ "mcp__nexo_chrome_control__*",
1559
+ "mcp__nexo_desktop_control__*",
1560
+ "mcp__nexo_power_control__*",
1561
+ )
1562
+
1563
+ # Entries previously pushed by this template that Claude Code rejects as
1564
+ # invalid. The sync REMOVES them so already-contaminated installs stop
1565
+ # showing the launch warning. Safe: Claude Code was skipping them anyway.
1566
+ _NEXO_INVALID_ALLOWLIST_ENTRIES = (
1542
1567
  "mcp__*",
1543
1568
  )
1544
1569
 
@@ -1558,6 +1583,12 @@ def _ensure_headless_permissions(payload: dict) -> None:
1558
1583
  allow_list = []
1559
1584
  permissions["allow"] = allow_list
1560
1585
 
1586
+ # v7.31.x (Phase 1) — migrate away invalid entries this template used to
1587
+ # push (Claude Code skips them and warns on every launch).
1588
+ for invalid in _NEXO_INVALID_ALLOWLIST_ENTRIES:
1589
+ while invalid in allow_list:
1590
+ allow_list.remove(invalid)
1591
+
1561
1592
  existing = {str(item) for item in allow_list if isinstance(item, str)}
1562
1593
  for entry in _NEXO_HEADLESS_ALLOWLIST:
1563
1594
  if entry not in existing:
package/src/db/_core.py CHANGED
@@ -36,10 +36,18 @@ _data_dir = os.path.dirname(DB_PATH)
36
36
  os.makedirs(_data_dir, exist_ok=True)
37
37
 
38
38
  # TTLs in seconds (match session-coord.sh behavior)
39
- SESSION_STALE_SECONDS = 900 # 15 min (documented TTL)
39
+ SESSION_STALE_SECONDS = 900 # 15 min (documented TTL) — visibility horizon only
40
40
  MESSAGE_TTL_SECONDS = 3600 # 1 hour
41
41
  QUESTION_TTL_SECONDS = 600 # 10 min
42
42
 
43
+ # Phase 2.1 — purge horizon, deliberately FAR above the visibility TTL.
44
+ # SESSION_STALE_SECONDS only governs what counts as "active" in listings;
45
+ # physically DELETING rows at 15 min destroyed the continuity of any session
46
+ # that spent >15 min in code tools without touching a nexo_* tool (incident
47
+ # 10-jun: two working sessions lost mid-task, orphaning their open protocol
48
+ # tasks). Rows now survive 24h so revival/heartbeat can find them.
49
+ SESSION_PURGE_SECONDS = 24 * 3600
50
+
43
51
  # Single shared connection per process with write serialization.
44
52
  # SQLite allows only one writer at a time. Using a shared connection with
45
53
  # check_same_thread=False and a write lock ensures:
@@ -2,7 +2,7 @@ from __future__ import annotations
2
2
  """NEXO DB — Sessions module."""
3
3
  import time, secrets, string, sqlite3
4
4
  from datetime import datetime
5
- from db._core import get_db, _gen_id, now_epoch, local_time_str, SESSION_STALE_SECONDS, MESSAGE_TTL_SECONDS, QUESTION_TTL_SECONDS
5
+ from db._core import get_db, _gen_id, now_epoch, local_time_str, SESSION_STALE_SECONDS, SESSION_PURGE_SECONDS, MESSAGE_TTL_SECONDS, QUESTION_TTL_SECONDS
6
6
 
7
7
  # ── Session operations ──────────────────────────────────────────────
8
8
 
@@ -166,9 +166,17 @@ def get_active_sessions() -> list[dict]:
166
166
 
167
167
 
168
168
  def clean_stale_sessions() -> int:
169
- """Remove stale sessions. Returns count removed."""
169
+ """Purge sessions older than the PURGE horizon. Returns count removed.
170
+
171
+ Phase 2.1 — this used to delete at SESSION_STALE_SECONDS (15 min), which
172
+ destroyed any session that worked quietly in code tools for a while: the
173
+ next session/cron to start would erase it, its next nexo_track failed
174
+ with "Session not found" and its open protocol tasks were orphaned.
175
+ Deletion now happens at SESSION_PURGE_SECONDS (24h); the 15-min TTL keeps
176
+ governing VISIBILITY (get_active_sessions/search_sessions) unchanged.
177
+ """
170
178
  conn = get_db()
171
- cutoff = now_epoch() - SESSION_STALE_SECONDS
179
+ cutoff = now_epoch() - SESSION_PURGE_SECONDS
172
180
  stale = conn.execute(
173
181
  "SELECT sid FROM sessions WHERE last_update_epoch <= ?", (cutoff,)
174
182
  ).fetchall()
@@ -309,13 +317,42 @@ def search_sessions(keyword: str) -> list[dict]:
309
317
 
310
318
  # ── File tracking ───────────────────────────────────────────────────
311
319
 
320
def revive_session(sid: str, task_hint: str = "(revived session)") -> bool:
    """Phase 2.1 — make sure a session row exists for ``sid``.

    The SID is validated with ``_validate_sid`` and a row stamped with the
    current epoch is inserted using ``INSERT OR IGNORE``, so an existing row
    is left untouched.

    Args:
        sid: Session identifier to revive.
        task_hint: Text stored in the ``task`` column of a newly created row.

    Returns:
        True when the insert reported at least one affected row, i.e. a row
        was actually created; False otherwise.
    """
    valid_sid = _validate_sid(sid)
    db = get_db()
    stamp = now_epoch()
    insert_sql = (
        "INSERT OR IGNORE INTO sessions (sid, task, started_epoch, last_update_epoch, local_time) "
        "VALUES (?, ?, ?, ?, ?)"
    )
    row_values = (valid_sid, task_hint, stamp, stamp, local_time_str())
    outcome = db.execute(insert_sql, row_values)
    db.commit()
    created = outcome.rowcount > 0
    return created
339
+
340
+
312
341
  def track_files(sid: str, paths: list[str]) -> dict:
313
- """Track files for a session. Returns conflicts if any."""
342
+ """Track files for a session. Returns conflicts if any.
343
+
344
+ Phase 2.1 — a valid SID whose row vanished is REVIVED instead of being
345
+ told "Session not found. Register first." (the heartbeat already revived
346
+ missing sessions; this layer was internally inconsistent). The result
347
+ carries ``revived: True`` so callers can log the recovery.
348
+ """
349
+ sid = _validate_sid(sid)
314
350
  conn = get_db()
315
351
  now = now_epoch()
352
+ revived = False
316
353
  session = conn.execute("SELECT sid FROM sessions WHERE sid = ?", (sid,)).fetchone()
317
354
  if not session:
318
- return {"error": f"Session {sid} not found. Register first."}
355
+ revived = revive_session(sid, task_hint="(revived by nexo_track)")
319
356
 
320
357
  for path in paths:
321
358
  conn.execute(
@@ -324,7 +361,10 @@ def track_files(sid: str, paths: list[str]) -> dict:
324
361
  )
325
362
  conn.commit()
326
363
  conflicts = _check_conflicts(conn, sid)
327
- return {"tracked": paths, "conflicts": conflicts}
364
+ result = {"tracked": paths, "conflicts": conflicts}
365
+ if revived:
366
+ result["revived"] = True
367
+ return result
328
368
 
329
369
 
330
370
  def untrack_files(sid: str, paths: list[str] | None = None):
@@ -42,6 +42,22 @@ PROTOCOL_SKIP_TOOLS = {
42
42
  "nexo_rules_check",
43
43
  }
44
44
  ACTION_TASK_TYPES = {"edit", "execute", "delegate"}
45
+
46
+ # Phase 1.5 (SPEC-FIABILIDAD-FASES-2026-06) — protocol nudge shaping.
47
+ # The "Non-trivial work without nexo_task_open" warning fired on EVERY
48
+ # non-trivial tool call from tool #1 (no threshold, no rate limit, no
49
+ # session-type awareness) — measurable as noise that gets ignored. Shaping:
50
+ # - threshold: only nudge after N consecutive non-trivial tools w/o task
51
+ # - cooldown: once nudged, stay quiet for a window
52
+ # - headless: runner sessions are covered by HeadlessEnforcer already
53
+ # (enforcement_engine.py, threshold 4/2 + cooldown) — skip the nudge
54
+ # Mode is SHADOW by default: visible behaviour is UNCHANGED; decisions are
55
+ # logged to runtime/logs/protocol-nudge-shadow.ndjson so the threshold can
56
+ # be calibrated with real data before flipping NEXO_PROTOCOL_NUDGE_MODE to
57
+ # "active". ("off" disables shaping bookkeeping entirely.)
58
+ PROTOCOL_NUDGE_MODE = str(os.environ.get("NEXO_PROTOCOL_NUDGE_MODE", "shadow")).strip().lower()
59
+ PROTOCOL_NUDGE_THRESHOLD = max(1, int(os.environ.get("NEXO_PROTOCOL_NUDGE_THRESHOLD", "6") or 6))
60
+ PROTOCOL_NUDGE_COOLDOWN_S = max(0, int(os.environ.get("NEXO_PROTOCOL_NUDGE_COOLDOWN_S", "300") or 300))
45
61
  NEXO_CODE_ROOT = Path(os.environ.get("NEXO_CODE", str(Path(__file__).resolve().parent))).expanduser().resolve()
46
62
  LIVE_REPO_ROOT = NEXO_CODE_ROOT.parent if NEXO_CODE_ROOT.name == "src" else NEXO_CODE_ROOT
47
63
  PUBLIC_REPO_DIRS = {
@@ -1198,6 +1214,110 @@ def _append_protocol_warning(warnings: list[dict], message: str) -> None:
1198
1214
  warnings.append({"message": clean})
1199
1215
 
1200
1216
 
1217
+ def _protocol_nudge_state_path() -> Path:
1218
+ base = Path(os.environ.get("NEXO_HOME") or (Path.home() / ".nexo"))
1219
+ return base / "runtime" / "data" / "protocol-nudge-state.json"
1220
+
1221
+
1222
+ def _protocol_nudge_shadow_log_path() -> Path:
1223
+ base = Path(os.environ.get("NEXO_HOME") or (Path.home() / ".nexo"))
1224
+ return base / "runtime" / "logs" / "protocol-nudge-shadow.ndjson"
1225
+
1226
+
1227
+ def _shape_protocol_nudge(sid: str) -> dict:
1228
+ """Phase 1.5 — decide whether the no-task nudge SHOULD fire under shaping.
1229
+
1230
+ Pure bookkeeping + decision; never raises (a broken state file must not
1231
+ break the hook). Returns {would_emit, reason, streak}.
1232
+ """
1233
+ import json as _json
1234
+ import time as _time
1235
+
1236
+ headless = (
1237
+ str(os.environ.get("NEXO_AUTOMATION", "")).strip() == "1"
1238
+ or str(os.environ.get("NEXO_HEADLESS", "")).strip() == "1"
1239
+ )
1240
+ if headless:
1241
+ return {"would_emit": False, "reason": "headless-covered-by-enforcer", "streak": 0}
1242
+
1243
+ state_path = _protocol_nudge_state_path()
1244
+ state: dict = {}
1245
+ try:
1246
+ state = _json.loads(state_path.read_text(encoding="utf-8"))
1247
+ if not isinstance(state, dict):
1248
+ state = {}
1249
+ except Exception:
1250
+ state = {}
1251
+
1252
+ now = _time.time()
1253
+ # Drop stale sessions (>48h) so the file cannot grow without bound.
1254
+ state = {
1255
+ key: value for key, value in state.items()
1256
+ if isinstance(value, dict) and (now - float(value.get("updated_at") or 0)) < 48 * 3600
1257
+ }
1258
+ entry = state.get(sid) or {}
1259
+ streak = int(entry.get("streak") or 0) + 1
1260
+ last_nudge_at = float(entry.get("last_nudge_at") or 0)
1261
+ entry.update({"streak": streak, "updated_at": now})
1262
+
1263
+ if streak < PROTOCOL_NUDGE_THRESHOLD:
1264
+ decision = {"would_emit": False, "reason": "under-threshold", "streak": streak}
1265
+ elif last_nudge_at and (now - last_nudge_at) < PROTOCOL_NUDGE_COOLDOWN_S:
1266
+ decision = {"would_emit": False, "reason": "cooldown", "streak": streak}
1267
+ else:
1268
+ entry["last_nudge_at"] = now
1269
+ decision = {"would_emit": True, "reason": "threshold-reached", "streak": streak}
1270
+
1271
+ state[sid] = entry
1272
+ try:
1273
+ state_path.parent.mkdir(parents=True, exist_ok=True)
1274
+ tmp = state_path.with_suffix(".json.tmp")
1275
+ tmp.write_text(_json.dumps(state, ensure_ascii=False) + "\n", encoding="utf-8")
1276
+ os.replace(tmp, state_path)
1277
+ except Exception:
1278
+ pass
1279
+ return decision
1280
+
1281
+
1282
+ def _reset_protocol_nudge_streak(sid: str) -> None:
1283
+ """A session with an open task is compliant — its streak restarts."""
1284
+ import json as _json
1285
+
1286
+ if PROTOCOL_NUDGE_MODE == "off" or not sid:
1287
+ return
1288
+ state_path = _protocol_nudge_state_path()
1289
+ try:
1290
+ state = _json.loads(state_path.read_text(encoding="utf-8"))
1291
+ if not isinstance(state, dict) or sid not in state:
1292
+ return
1293
+ state[sid]["streak"] = 0
1294
+ tmp = state_path.with_suffix(".json.tmp")
1295
+ tmp.write_text(_json.dumps(state, ensure_ascii=False) + "\n", encoding="utf-8")
1296
+ os.replace(tmp, state_path)
1297
+ except Exception:
1298
+ pass
1299
+
1300
+
1301
+ def _log_protocol_nudge_shadow(sid: str, decision: dict, emitted_today: bool) -> None:
1302
+ import json as _json
1303
+ import time as _time
1304
+
1305
+ try:
1306
+ path = _protocol_nudge_shadow_log_path()
1307
+ path.parent.mkdir(parents=True, exist_ok=True)
1308
+ with path.open("a", encoding="utf-8") as handle:
1309
+ handle.write(_json.dumps({
1310
+ "ts": _time.time(),
1311
+ "sid": sid,
1312
+ "mode": PROTOCOL_NUDGE_MODE,
1313
+ "threshold": PROTOCOL_NUDGE_THRESHOLD,
1314
+ "decision": decision,
1315
+ "legacy_warning_emitted": emitted_today,
1316
+ }, ensure_ascii=False) + "\n")
1317
+ except Exception:
1318
+ pass
1319
+
1320
+
1201
1321
  def _collect_protocol_warnings(conn, *, sid: str, tool_name: str) -> list[dict]:
1202
1322
  short_name = _short_tool_name(tool_name)
1203
1323
  if short_name in PROTOCOL_SKIP_TOOLS or short_name not in NON_TRIVIAL_PROTOCOL_TOOLS:
@@ -1214,6 +1334,17 @@ def _collect_protocol_warnings(conn, *, sid: str, tool_name: str) -> list[dict]:
1214
1334
  task = _find_any_open_task(conn, sid)
1215
1335
  has_guard = _session_has_guard_check(conn, sid)
1216
1336
  if not task:
1337
+ # Phase 1.5 — shaping decision. In SHADOW mode (default) the visible
1338
+ # behaviour below is untouched and the decision is only logged so the
1339
+ # threshold can be calibrated; in ACTIVE mode the shaping governs
1340
+ # (headless skip, streak threshold, cooldown); "off" disables both.
1341
+ nudge = None
1342
+ if PROTOCOL_NUDGE_MODE in {"shadow", "active"}:
1343
+ nudge = _shape_protocol_nudge(sid)
1344
+ if PROTOCOL_NUDGE_MODE == "active" and nudge and not nudge["would_emit"]:
1345
+ _log_protocol_nudge_shadow(sid, nudge, emitted_today=False)
1346
+ return warnings
1347
+
1217
1348
  guard_note = (
1218
1349
  render_core_prompt("hook-protocol-warning-task-open-guard-note")
1219
1350
  if short_name in {"Read", "Bash", "Grep", "Glob"} and not has_guard
@@ -1230,8 +1361,12 @@ def _collect_protocol_warnings(conn, *, sid: str, tool_name: str) -> list[dict]:
1230
1361
  warnings,
1231
1362
  render_core_prompt("hook-protocol-warning-heartbeat-close-evidence"),
1232
1363
  )
1364
+ if PROTOCOL_NUDGE_MODE == "shadow" and nudge is not None:
1365
+ _log_protocol_nudge_shadow(sid, nudge, emitted_today=True)
1233
1366
  return warnings
1234
1367
 
1368
+ _reset_protocol_nudge_streak(sid)
1369
+
1235
1370
  task_id = str(task.get("task_id") or "").strip()
1236
1371
  if str(task.get("task_type") or "").strip() in ACTION_TASK_TYPES and not (task.get("opened_with_guard") or has_guard):
1237
1372
  _append_protocol_warning(
@@ -0,0 +1,230 @@
1
+ """Provider circuit breaker — Phase 1.6 (SPEC-FIABILIDAD-FASES-2026-06).
2
+
3
+ Incident (2026-06-10, operator report): when the selected engine (Claude or
4
+ Codex) is unavailable — credits exhausted, rate limited, auth expired — every
5
+ headless cron (email-monitor, deep-sleep, evolution, catch-up, followups…)
6
+ still launched a session that died mid-flight, burned its retry budget, then
7
+ escalated to the operator by email (in English, regardless of the configured
8
+ language). Work was lost or degraded to manual across the whole system.
9
+
10
+ This module gives the single launch path (agent_runner.run_automation_prompt)
11
+ a shared, persisted circuit breaker:
12
+
13
+ - ``check_provider_available(backend)`` — gate BEFORE launching.
14
+ - ``classify_session_failure(...)`` — map a dead session to a cause.
15
+ - ``record_session_outcome(backend, …)`` — close on success, open on
16
+ classified failures (credits/rate-limit/auth open immediately; generic
17
+ failures only after N consecutive).
18
+ - ``should_notify_operator(backend)`` — True exactly once per opening, so
19
+ the operator gets ONE notice instead of one per queued item.
20
+
21
+ State lives in ``$NEXO_HOME/runtime/data/provider-circuit-breaker.json`` so
22
+ every cron process shares the same view. Writes are atomic (tmp + replace).
23
+ The breaker FAILS OPEN on its own errors: a broken state file must never
24
+ block automations.
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ import json
30
+ import os
31
+ import re
32
+ import time
33
+ from pathlib import Path
34
+
35
# Failure classes that open the breaker on FIRST sight: retrying cannot help
# until the underlying condition clears.
HARD_OPEN_REASONS = {"credits", "rate_limit", "auth"}

# Generic failures (network blips, crashes) need this many consecutive hits
# before the breaker opens — one flaky session must not pause the fleet.
GENERIC_OPEN_THRESHOLD = 3

# How long the breaker stays open before allowing ONE half-open probe call.
# The same window is used as the probe lease: once a probe is granted, other
# callers keep waiting for this long unless the probe's outcome is recorded.
DEFAULT_RETRY_AFTER_S = {
    "credits": 30 * 60,     # credit top-ups/renewals are slow; probe every 30m
    "rate_limit": 15 * 60,  # unless the provider told us a reset time
    "auth": 60 * 60,        # needs operator action; probe hourly anyway
    "generic": 10 * 60,
}

# Checked in order; the first pattern found in stdout+stderr wins.
_FAILURE_PATTERNS = (
    ("credits", re.compile(
        r"credit balance is too low|insufficient[_ ]quota|exceeded your current quota"
        r"|billing hard limit|out of credits|usage limit reached|plan limits",
        re.I)),
    ("rate_limit", re.compile(
        r"rate[_ -]?limit|too many requests|\b429\b|overloaded[_ ]error|\b529\b"
        r"|server overloaded|capacity constraints",
        re.I)),
    ("auth", re.compile(
        r"authentication[_ ]error|\b401\b|unauthorized|oauth token (has )?expired"
        r"|invalid api key|api key not (found|valid)|please run /login|token_revoked",
        re.I)),
)


def _state_path() -> Path:
    """Return the shared breaker state file under ``$NEXO_HOME`` (or ``~/.nexo``).

    A leading ``~`` in ``NEXO_HOME`` is expanded so a value that never went
    through a shell does not create a literal ``~`` directory.
    """
    configured = os.environ.get("NEXO_HOME")
    base = Path(configured).expanduser() if configured else Path.home() / ".nexo"
    return base / "runtime" / "data" / "provider-circuit-breaker.json"


def _now() -> float:
    """Return the current wall-clock time as epoch seconds."""
    return time.time()


def _as_float(value: object) -> float:
    """Coerce a persisted value to a finite float; anything unusable becomes 0.0."""
    try:
        number = float(value or 0)
    except (TypeError, ValueError, OverflowError):
        return 0.0
    if number != number or number in (float("inf"), float("-inf")):
        return 0.0
    return number


def _as_int(value: object) -> int:
    """Coerce a persisted value to int; anything unusable becomes 0."""
    try:
        return int(value or 0)
    except (TypeError, ValueError, OverflowError):
        return 0


def _load_state() -> dict:
    """Read the state file; a missing, unreadable or malformed file yields ``{}``."""
    try:
        data = json.loads(_state_path().read_text(encoding="utf-8"))
    except Exception:
        return {}
    return data if isinstance(data, dict) else {}


def _save_state(state: dict) -> None:
    """Persist ``state`` atomically (temp file + replace). Never raises."""
    tmp = None
    try:
        path = _state_path()
        path.parent.mkdir(parents=True, exist_ok=True)
        # Per-process temp name: several cron processes share this file and
        # must not write into (or rename away) each other's temp file.
        tmp = path.with_name(f"{path.name}.{os.getpid()}.tmp")
        tmp.write_text(json.dumps(state, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
        os.replace(tmp, path)
    except Exception:
        # The breaker must never break the caller; just avoid leaving litter.
        if tmp is not None:
            try:
                tmp.unlink()
            except OSError:
                pass


def _entry(state: dict, backend: str) -> dict:
    """Return the mutable entry for ``backend``, creating an empty one if needed."""
    entry = state.get(backend)
    if not isinstance(entry, dict):
        entry = {}
        state[backend] = entry
    return entry


class ProviderTemporarilyUnavailableError(RuntimeError):
    """The selected provider is currently unusable (credits/rate limit/auth).

    Callers should QUEUE/DEFER their work without burning retry budgets; the
    breaker re-probes automatically once ``retry_after`` passes.

    Attributes:
        backend: Backend name the breaker is open for.
        reason: Failure class that opened the breaker.
        retry_after_ts: Epoch seconds of the next allowed probe, or ``None``.
    """

    def __init__(self, backend: str, reason: str, retry_after_ts: float | None):
        self.backend = backend
        self.reason = reason
        self.retry_after_ts = retry_after_ts
        wait = ""
        if retry_after_ts:
            try:
                stamp = time.strftime("%H:%M", time.localtime(retry_after_ts))
            except (OverflowError, OSError, ValueError, TypeError):
                stamp = ""  # out-of-range timestamp: omit the hint, keep the error
            if stamp:
                wait = f"; next probe after {stamp}"
        super().__init__(
            f"provider '{backend}' temporarily unavailable (reason: {reason}){wait}. "
            "Work should be queued, not retried blindly."
        )

    def __reduce__(self):
        # ``args`` only holds the rendered message, so default unpickling would
        # call ``__init__`` with one argument and fail. Rebuild from the fields.
        return (type(self), (self.backend, self.reason, self.retry_after_ts))


def classify_session_failure(returncode: int | None, stdout: str = "", stderr: str = "") -> str | None:
    """Map a finished/dead session to a failure class, or None if it looks fine.

    A zero return code returns None without inspecting the output. Any other
    return code (including ``None``) is matched against the known
    unavailability patterns in stdout and stderr; the first match wins and
    no match yields ``"generic"`` so the threshold logic decides.
    """
    if returncode == 0:
        return None
    haystack = f"{stdout or ''}\n{stderr or ''}"
    for reason, pattern in _FAILURE_PATTERNS:
        if pattern.search(haystack):
            return reason
    return "generic"


def check_provider_available(backend: str) -> tuple[bool, dict]:
    """Gate to call BEFORE launching the provider.

    Returns ``(True, entry)`` when the breaker is not open, or when it is open
    and the wait window has elapsed — in which case the caller's attempt IS
    the half-open probe (its outcome will close or re-open the breaker via
    ``record_session_outcome``). Returns ``(False, entry)`` while open and
    inside the wait window.

    Granting a probe pushes ``retry_after`` forward by the default window of
    the recorded reason, so the next callers keep waiting instead of all
    launching at once. An open entry whose ``retry_after`` is missing or
    unusable is treated as elapsed, so corrupted state cannot block forever.

    NOTE: there is no cross-process lock; two processes that check at the
    same instant can both be granted the probe.
    """
    state = _load_state()
    entry = _entry(state, backend)
    if entry.get("state") != "open":
        return True, entry
    now = _now()
    retry_after = _as_float(entry.get("retry_after"))
    if retry_after and now < retry_after:
        return False, entry
    lease = DEFAULT_RETRY_AFTER_S.get(str(entry.get("reason")), DEFAULT_RETRY_AFTER_S["generic"])
    entry["half_open_probe_at"] = now
    entry["retry_after"] = now + float(lease)
    _save_state(state)
    return True, entry


def raise_if_unavailable(backend: str) -> None:
    """Raise ``ProviderTemporarilyUnavailableError`` when the gate is closed.

    Returns normally when ``check_provider_available`` allows the launch.
    """
    ok, entry = check_provider_available(backend)
    if ok:
        return
    raise ProviderTemporarilyUnavailableError(
        backend,
        str(entry.get("reason") or "unknown"),
        _as_float(entry.get("retry_after")) or None,
    )


def record_session_outcome(
    backend: str,
    *,
    ok: bool,
    reason: str | None = None,
    retry_after_s: float | None = None,
) -> dict:
    """Update the breaker after a session finished (or died).

    Args:
        backend: Backend name.
        ok: True for a successful session; this closes the breaker and
            resets the failure counter.
        reason: Failure class, normally from ``classify_session_failure``;
            a missing reason counts as ``"generic"``.
        retry_after_s: Provider-reported wait in seconds. Values that are
            missing, non-numeric or not positive fall back to the default
            window for the reason.

    Returns:
        The stored entry for ``backend`` after the update.
    """
    state = _load_state()
    entry = _entry(state, backend)
    if ok:
        was_open = entry.get("state") == "open"
        state[backend] = {
            "state": "closed",
            "consecutive_failures": 0,
            "closed_at": _now(),
            "recovered_from": entry.get("reason") if was_open else None,
        }
        _save_state(state)
        return state[backend]

    failure_reason = reason or "generic"
    consecutive = _as_int(entry.get("consecutive_failures")) + 1
    entry["consecutive_failures"] = consecutive
    should_open = failure_reason in HARD_OPEN_REASONS or consecutive >= GENERIC_OPEN_THRESHOLD
    if should_open:
        wait = _as_float(retry_after_s)
        if wait <= 0:
            wait = float(DEFAULT_RETRY_AFTER_S.get(failure_reason, DEFAULT_RETRY_AFTER_S["generic"]))
        already_open = entry.get("state") == "open"
        now = _now()
        entry.update({
            "state": "open",
            "reason": failure_reason,
            "opened_at": entry.get("opened_at") if already_open else now,
            "retry_after": now + wait,
        })
        if not already_open:
            # A fresh opening re-arms the one-notice-per-opening latch.
            entry["operator_notified_at"] = None
    _save_state(state)
    return entry


def should_notify_operator(backend: str) -> bool:
    """True exactly once per opening — callers use it to send ONE notice.

    The first call while the breaker is open stamps ``operator_notified_at``
    and returns True; later calls return False until the breaker opens again.
    """
    state = _load_state()
    entry = _entry(state, backend)
    if entry.get("state") != "open":
        return False
    if entry.get("operator_notified_at"):
        return False
    entry["operator_notified_at"] = _now()
    _save_state(state)
    return True


def breaker_status() -> dict:
    """Read-only snapshot for doctors/diagnostics."""
    return _load_state()
@@ -34,6 +34,16 @@ STATE_FILE = "runtime-service.json"
34
34
  LOCK_FILE = "runtime-service.lock"
35
35
  LOG_FILE = "runtime-service.log"
36
36
 
37
+ # Phase 2.1/2.2 — state isolation per runtime generation. Two different Brain
38
+ # installs (e.g. the managed ~/.nexo/core runtime and an npm-global one)
39
+ # used to share ONE state file: each side saw a "stale_runtime" resident and
40
+ # KILLED the other's in an endless ping-pong (1,314 resident restarts logged
41
+ # on the operator machine; every restart forced the next conversation to pay
42
+ # a 10-48s cold Brain boot and expired client sessions). With the state file
43
+ # keyed by runtime generation, a runtime can only ever see — and manage —
44
+ # its OWN resident. Foreign residents become invisible instead of killable.
45
+
46
+
37
47
 
38
48
  def env_flag(name: str, *, default: bool = False) -> bool:
39
49
  value = os.environ.get(name)
@@ -55,12 +65,29 @@ def service_url(host: str | None = None, port: int | None = None, path: str | No
55
65
  return f"http://{host or service_host()}:{int(port or service_port())}{path or service_path()}"
56
66
 
57
67
 
58
- def service_state_path() -> Path:
68
+ def _generation_state_token(generation: str) -> str:
69
+ """Stable filesystem-safe token for a runtime generation."""
70
+ text = str(generation or "unknown").strip() or "unknown"
71
+ return hashlib.sha256(text.encode("utf-8")).hexdigest()[:12]
72
+
73
+
74
def _current_generation_token() -> str:
    """Return the state-file token for the running runtime's generation.

    Reads ``runtime_generation`` from ``current_runtime_identity()``
    (defaulting to ``"unknown"``) and hashes it with
    ``_generation_state_token``.
    """
    identity = current_runtime_identity()
    generation = identity.get("runtime_generation", "unknown")
    return _generation_state_token(generation)
76
+
77
+
78
def _legacy_service_state_path() -> Path:
    """Return the pre-generation state file path (``STATE_FILE``).

    The runtime state directory is created if it does not exist yet.
    """
    state_dir = paths.runtime_state_dir()
    state_dir.mkdir(parents=True, exist_ok=True)
    legacy_file = state_dir / STATE_FILE
    return legacy_file
62
82
 
63
83
 
84
def service_state_path() -> Path:
    """Return the state file path keyed by the current runtime generation.

    The runtime state directory is created if it does not exist yet; the
    file name embeds the token from ``_current_generation_token()``.
    """
    state_dir = paths.runtime_state_dir()
    state_dir.mkdir(parents=True, exist_ok=True)
    generation = _current_generation_token()
    state_file = state_dir / f"runtime-service-{generation}.json"
    return state_file
89
+
90
+
64
91
  def service_log_path() -> Path:
65
92
  root = paths.logs_dir()
66
93
  root.mkdir(parents=True, exist_ok=True)
@@ -70,7 +97,8 @@ def service_log_path() -> Path:
70
97
def service_lock_path() -> Path:
    """Return the start-lock file path keyed by the current runtime generation.

    The file lives in the runtime state directory (created on demand) and is
    named ``runtime-service-<token>.lock``, where the token comes from
    ``_current_generation_token()``.
    """
    state_dir = paths.runtime_state_dir()
    state_dir.mkdir(parents=True, exist_ok=True)
    generation_token = _current_generation_token()
    return state_dir.joinpath(f"runtime-service-{generation_token}.lock")
74
102
 
75
103
 
76
104
  @contextmanager
@@ -128,10 +156,22 @@ def service_start_lock(*, timeout: float = 10.0):
128
156
def read_service_state() -> dict[str, Any]:
    """Load the resident's state for the current runtime generation.

    Returns the parsed JSON object from the generation-keyed state file, or
    ``{}`` when the file is absent, is not a JSON object, or anything fails.

    One-time soft migration: when no generation-keyed file exists yet, a
    legacy (pre-generation) state file is adopted only if
    ``state_matches_current_runtime`` accepts its contents. In that case the
    legacy file is moved to the generation-keyed path (best effort) and its
    contents are returned. A legacy file that does not match is left alone
    and reported as ``{}``.
    """
    try:
        state_file = service_state_path()
        if state_file.is_file():
            payload = json.loads(state_file.read_text(encoding="utf-8"))
            if isinstance(payload, dict):
                return payload
            return {}

        legacy_file = _legacy_service_state_path()
        if not legacy_file.is_file():
            return {}

        payload = json.loads(legacy_file.read_text(encoding="utf-8"))
        if not (isinstance(payload, dict) and state_matches_current_runtime(payload)):
            return {}

        # Adoption is best effort: a failed move still returns the data.
        try:
            legacy_file.replace(state_file)
        except Exception:
            pass
        return payload
    except Exception:
        return {}
137
177
 
@@ -445,6 +485,107 @@ def runtime_service_status() -> dict[str, Any]:
445
485
  }
446
486
 
447
487
 
488
+ # Phase 2.1/2.2 — resident obsolescence watch.
489
+ #
490
+ # With per-generation state files, residents no longer kill each other; the
491
+ # missing piece is cleanup: after a runtime update, the OLD resident must
492
+ # eventually exit, while the CURRENT one must stay warm forever (a hot Brain
493
+ # is what turns 10-48s conversation starts into fast ones). Rules:
494
+ # - a resident whose on-disk runtime generation still matches its own NEVER
495
+ # self-terminates, idle or not;
496
+ # - an OBSOLETE resident (disk generation changed under it) exits cleanly
497
+ # once it has had no established client connections for two consecutive
498
+ # checks (anti-flap), removing its state file on the way out;
499
+ # - if connections cannot be counted (no lsof/netstat), it stays alive —
500
+ # fail-safe towards living.
501
+
502
+ OBSOLESCENCE_CHECK_SECONDS = 300
503
+
504
+
505
def _count_established_connections(port: int) -> int | None:
    """Best-effort count of ESTABLISHED TCP connections involving ``port``.

    Runs ``netstat`` when ``os.name == "nt"`` and ``lsof`` otherwise, each
    with a 10 second timeout.

    Returns:
        The number of matching output rows, or ``None`` when the command
        could not be run (any exception), so callers can fail safe.
    """
    try:
        if os.name != "nt":
            listing = subprocess.run(
                ["lsof", "-nP", f"-iTCP:{port}", "-sTCP:ESTABLISHED"],
                capture_output=True, text=True, timeout=10,
            ).stdout
            # NOTE(review): every ESTABLISHED row is counted; for loopback
            # clients lsof may list both ends of a pair, so treat the result
            # as zero / non-zero rather than an exact client count — confirm.
            return sum(1 for row in listing.splitlines() if "ESTABLISHED" in row)

        listing = subprocess.run(
            ["netstat", "-ano", "-p", "tcp"],
            capture_output=True, text=True, timeout=10,
        ).stdout
        marker = f":{port} "
        matches = 0
        for row in listing.splitlines():
            if "ESTABLISHED" not in row:
                continue
            # Only the address columns (everything before the state) are
            # searched for the port marker.
            if marker in row.split("ESTABLISHED")[0]:
                matches += 1
        return matches
    except Exception:
        return None
532
+
533
+
534
+ def _resident_is_obsolete(boot_generation: str) -> bool:
535
+ try:
536
+ from runtime_versioning import compute_mcp_runtime_fingerprint, read_version_for_path, runtime_generation
537
+
538
+ root = current_server_path().parent
539
+ version = read_version_for_path(root) or read_version_for_path(root.parent)
540
+ fingerprint = compute_mcp_runtime_fingerprint(root, use_cache=False)
541
+ current = runtime_generation(version, fingerprint, str(root))
542
+ return bool(boot_generation) and bool(current) and current != boot_generation
543
+ except Exception:
544
+ return False # cannot tell -> assume still current (fail-safe)
545
+
546
+
547
def start_resident_obsolescence_watch(*, port: int, on_exit=None) -> None:
    """Spawn the daemon thread that retires obsolete residents gracefully.

    Every ``OBSOLESCENCE_CHECK_SECONDS`` the thread checks whether the
    on-disk runtime generation still matches the one this process booted
    with. Once the resident is obsolete AND two consecutive checks find zero
    established connections on ``port``, it removes its own state file, runs
    ``on_exit`` and terminates the process with ``os._exit(0)``.

    Args:
        port: TCP port the resident serves; used to count client connections.
        on_exit: Optional zero-argument callable invoked just before the
            process exits. Exceptions it raises are ignored.

    The resident stays alive whenever the generation still matches, the
    connection count is unknown (``None``) or non-zero, or any check raises.
    """
    import threading

    boot_generation = str(current_runtime_identity().get("runtime_generation") or "")

    # Resolve OUR state file now, while the runtime identity still describes
    # the generation this process booted with. Resolving it at exit time
    # could point at the successor generation's file if the identity follows
    # the disk, and the obsolete resident would then delete the new
    # resident's state.
    try:
        boot_state_path = service_state_path()
    except Exception:
        boot_state_path = None  # unknown -> never unlink a guessed path

    def _watch() -> None:
        strikes = 0
        while True:
            time.sleep(OBSOLESCENCE_CHECK_SECONDS)
            try:
                if not _resident_is_obsolete(boot_generation):
                    strikes = 0
                    continue
                connections = _count_established_connections(port)
                if connections is None or connections > 0:
                    strikes = 0
                    continue
                # Anti-flap: require two consecutive idle checks.
                strikes += 1
                if strikes < 2:
                    continue
                # A broken stderr must not stop an obsolete resident from
                # retiring, so the log line is best effort. os._exit() does
                # not flush stdio buffers, hence the explicit flush.
                try:
                    print(
                        f"[runtime-service] obsolete resident (gen {boot_generation[:24]}…) idle for two checks — exiting cleanly",
                        file=sys.stderr,
                        flush=True,
                    )
                except Exception:
                    pass
                if boot_state_path is not None:
                    try:
                        boot_state_path.unlink(missing_ok=True)
                    except Exception:
                        pass
                if callable(on_exit):
                    try:
                        on_exit()
                    except Exception:
                        pass
                os._exit(0)
            except Exception:
                strikes = 0  # the watch must never kill a healthy resident

    thread = threading.Thread(target=_watch, name="resident-obsolescence-watch", daemon=True)
    thread.start()
587
+
588
+
448
589
  def run_mcp_proxy_adapter(*, name: str, instructions: str, run_kwargs: dict[str, Any]) -> None:
449
590
  from fastmcp.server import create_proxy
450
591
 
@@ -57,6 +57,7 @@ if str(NEXO_CODE) not in sys.path:
57
57
  sys.path.insert(0, str(NEXO_CODE))
58
58
 
59
59
  from agent_runner import AutomationBackendUnavailableError, run_automation_prompt
60
+ from provider_circuit_breaker import ProviderTemporarilyUnavailableError
60
61
  from client_preferences import (
61
62
  resolve_automation_backend,
62
63
  )
@@ -1997,19 +1998,24 @@ def _localized_operator_escalation_email(
1997
1998
  exhausted_count: int,
1998
1999
  details: str,
1999
2000
  ) -> tuple[str, str]:
2001
+ # Phase 1.6 — subjects are signed by the AGENT (assistant_name, dynamic
2002
+ # per install), not by the product: the operator talks to their agent.
2000
2003
  if _uses_spanish(operator_language):
2001
- subject = f"[NEXO] Emails requiring manual attention ({exhausted_count})"
2004
+ # Phase 1.6 — this branch used to contain the ENGLISH text copied
2005
+ # verbatim (operator-reported 10-jun: escalation mails arrived in
2006
+ # English with language=es configured). Real Spanish now.
2007
+ subject = f"[{assistant_name}] Emails que necesitan tu atención ({exhausted_count})"
2002
2008
  body = (
2003
- f"Hello {operator_name},\n\n"
2004
- f"The following emails have already been attempted {MAX_EMAIL_ATTEMPTS} times "
2005
- f"without succeeding (the session dies before completion):\n\n{details}\n\n"
2006
- "I marked them as `needs_interactive`. "
2007
- f"Open {assistant_name} Desktop and ask about the affected email so it can be resolved manually.\n\n"
2009
+ f"Hola {operator_name},\n\n"
2010
+ f"Los siguientes emails ya se han intentado {MAX_EMAIL_ATTEMPTS} veces "
2011
+ f"sin conseguirlo (la sesión muere antes de terminar):\n\n{details}\n\n"
2012
+ "Los he marcado como `needs_interactive`. "
2013
+ f"Abre {assistant_name} Desktop y pregunta por el email afectado para resolverlo manualmente.\n\n"
2008
2014
  f"— {assistant_name}"
2009
2015
  )
2010
2016
  return subject, body
2011
2017
 
2012
- subject = f"[NEXO] Emails requiring manual attention ({exhausted_count})"
2018
+ subject = f"[{assistant_name}] Emails requiring manual attention ({exhausted_count})"
2013
2019
  body = (
2014
2020
  f"Hello {operator_name},\n\n"
2015
2021
  f"The following emails have already been attempted {MAX_EMAIL_ATTEMPTS} times "
@@ -2354,6 +2360,17 @@ def launch_nexo(config, debt_block="", target_emails=None):
2354
2360
  _email_checkpoint_delete(mid)
2355
2361
  return True
2356
2362
 
2363
+ except ProviderTemporarilyUnavailableError as e:
2364
+ # Phase 1.6 — the engine is alive but unusable (credits/rate/auth).
2365
+ # This attempt must NOT count against the email (the provider being
2366
+ # down is not this email's fault), no scary per-item escalation:
2367
+ # give the attempt back, notify the operator ONCE per opening (in
2368
+ # their language) and let the breaker's probe window decide when to
2369
+ # resume. The work stays queued exactly where it was.
2370
+ log.warning(f"Provider circuit breaker open ({e.backend}: {e.reason}) — queueing work, attempt returned")
2371
+ _decrement_attempts(target_message_ids)
2372
+ _notify_provider_breaker_open_once(e)
2373
+ return False
2357
2374
  except AutomationBackendUnavailableError as e:
2358
2375
  log.error(f"Automation backend unavailable: {e}")
2359
2376
  _persist_failure_checkpoints(error_msg=f"AutomationBackendUnavailable: {e}", last_text="")
@@ -2407,6 +2424,94 @@ def _increment_attempts(email_ids):
2407
2424
  log.warning(f"Failed to increment attempts: {e}")
2408
2425
 
2409
2426
 
2427
+ def _decrement_attempts(email_ids):
2428
+ """Fase 1.6 — give an attempt back when the launch was vetoed by the
2429
+ provider circuit breaker: the provider being down is not the email's
2430
+ fault and must not push it towards needs_interactive."""
2431
+ if not email_ids:
2432
+ return
2433
+ try:
2434
+ conn = sqlite3.connect(str(EMAIL_DB_PATH))
2435
+ for mid in email_ids:
2436
+ conn.execute(
2437
+ "UPDATE emails SET attempts = MAX(COALESCE(attempts, 1) - 1, 0) WHERE message_id = ?",
2438
+ (mid,),
2439
+ )
2440
+ conn.commit()
2441
+ conn.close()
2442
+ except Exception as e:
2443
+ log.warning(f"Failed to decrement attempts: {e}")
2444
+
2445
+
2446
def _notify_provider_breaker_open_once(error):
    """Phase 1.6 — ONE operator notice per breaker opening, in their language.

    Replaces the per-item English escalation storm the operator reported
    (10-jun): with credits exhausted, every queued email generated its own
    'needs manual attention' mail. Now: a single message explaining the pause
    and that work is queued and resumes automatically.

    Args:
        error: The breaker exception; ``backend``, ``reason`` and
            ``retry_after_ts`` are read from it.

    Nothing is sent when ``should_notify_operator`` declines or when no
    ``operator_email`` is configured. Every failure, including a non-zero
    exit of the send script, is logged as a warning and never raised.
    """
    try:
        from provider_circuit_breaker import should_notify_operator

        if not should_notify_operator(error.backend):
            return
        operator_name, assistant_name, operator_language = _get_operator_info()
        config = load_config()
        operator_email = config.get("operator_email", "")
        if not operator_email:
            log.warning("Breaker open but no operator_email configured — skipping notice")
            return

        # Local wall-clock time of the next probe, when the breaker knows it.
        retry_hint = ""
        if error.retry_after_ts:
            retry_hint = datetime.fromtimestamp(error.retry_after_ts).strftime("%H:%M")

        # Unknown reasons fall through verbatim in either language.
        reason_es = {
            "credits": "créditos agotados",
            "rate_limit": "límite de uso alcanzado",
            "auth": "sesión caducada (hay que volver a conectar)",
        }.get(error.reason, error.reason)
        reason_en = {
            "credits": "credits exhausted",
            "rate_limit": "rate limit reached",
            "auth": "session expired (needs re-login)",
        }.get(error.reason, error.reason)

        if _uses_spanish(operator_language):
            subject = f"[{assistant_name}] Motor {error.backend} en pausa ({reason_es})"
            body = (
                f"Hola {operator_name},\n\n"
                f"He pausado las automatizaciones que usan {error.backend} porque no está disponible: {reason_es}.\n\n"
                "El trabajo pendiente queda EN COLA (no se pierde nada) y se reanudará solo en cuanto el motor vuelva"
                + (f" (próxima comprobación ~{retry_hint})" if retry_hint else "")
                + ".\n\nNo recibirás un aviso por cada tarea: solo este, y otro cuando se reanude.\n\n"
                f"— {assistant_name}"
            )
        else:
            subject = f"[{assistant_name}] Engine {error.backend} paused ({reason_en})"
            body = (
                f"Hello {operator_name},\n\n"
                f"I paused the automations that use {error.backend} because it is unavailable: {reason_en}.\n\n"
                "Pending work stays QUEUED (nothing is lost) and resumes automatically once the engine is back"
                + (f" (next probe ~{retry_hint})" if retry_hint else "")
                + ".\n\nYou will not get one notice per task — just this one, and another when work resumes.\n\n"
                f"— {assistant_name}"
            )

        body_file = BASE_DIR / ".breaker-notice-body.txt"
        body_file.write_text(body, encoding="utf-8")
        send_script = get_send_reply_script_path(local_script_dir=_script_dir)
        result = subprocess.run(
            [
                sys.executable, str(send_script),
                "--to", f"{operator_name} <{operator_email}>",
                "--subject", subject,
                "--body-file", str(body_file),
            ],
            timeout=30,
            capture_output=True,
        )
        # Do not claim success when the send script itself reported failure.
        if result.returncode != 0:
            stderr_tail = (result.stderr or b"").decode("utf-8", errors="replace").strip()[-300:]
            log.warning(
                f"Failed to send breaker-open notice: send script exited {result.returncode}: {stderr_tail}"
            )
            return
        log.info(f"Breaker-open notice sent to operator ({error.backend}: {error.reason})")
    except Exception as e:
        log.warning(f"Failed to send breaker-open notice: {e}")
2513
+
2514
+
2410
2515
  def _mark_needs_interactive(email_ids):
2411
2516
  """Mark emails as needs_interactive after too many failed attempts."""
2412
2517
  if not email_ids:
package/src/server.py CHANGED
@@ -3093,4 +3093,14 @@ if __name__ == "__main__":
3093
3093
  "mode": "runtime-service",
3094
3094
  }
3095
3095
  )
3096
+ # Phase 2.1/2.2 — retire this resident gracefully if the on-disk
3097
+ # runtime is updated under it AND no clients remain connected.
3098
+ # The current-generation resident never self-terminates: a warm
3099
+ # Brain is what keeps conversation starts fast.
3100
+ from runtime_service import start_resident_obsolescence_watch
3101
+
3102
+ start_resident_obsolescence_watch(
3103
+ port=port,
3104
+ on_exit=lambda: (close_local_context_db(), close_db()),
3105
+ )
3096
3106
  mcp.run(**run_kwargs)
@@ -1973,13 +1973,47 @@ def _toolbox_summary(conn) -> str:
1973
1973
  return ""
1974
1974
 
1975
1975
 
1976
+ def _log_session_learning_aggregation_shadow(sid: str, *, blocked: bool, pending_count: int) -> None:
1977
+ """Phase 1.5 (shadow) — session-level learning aggregation telemetry.
1978
+
1979
+ The per-line gate above only sees corrections its detector flagged in the
1980
+ moment. The real close flow (here — NOT stop.py, which fires after every
1981
+ response with a 10s timeout) is where a session-WIDE aggregation belongs.
1982
+ Shadow first: record close-time compliance metrics to
1983
+ runtime/logs/learning-aggregation-shadow.ndjson so the active phase
1984
+ (full buffer analysis) can be sized with real data before it gates
1985
+ anything. Never raises, never blocks.
1986
+ """
1987
+ try:
1988
+ import json as _json
1989
+ import os as _os
1990
+ import time as _time
1991
+ from pathlib import Path as _Path
1992
+
1993
+ base = _Path(_os.environ.get("NEXO_HOME") or (_Path.home() / ".nexo"))
1994
+ path = base / "runtime" / "logs" / "learning-aggregation-shadow.ndjson"
1995
+ path.parent.mkdir(parents=True, exist_ok=True)
1996
+ with path.open("a", encoding="utf-8") as handle:
1997
+ handle.write(_json.dumps({
1998
+ "ts": _time.time(),
1999
+ "sid": sid,
2000
+ "close_blocked_by_pending_correction": blocked,
2001
+ "pending_corrections_at_close": pending_count,
2002
+ }, ensure_ascii=False) + "\n")
2003
+ except Exception:
2004
+ pass
2005
+
2006
+
1976
2007
  def handle_stop(sid: str) -> str:
1977
2008
  """Cleanly close a session, removing it from active sessions immediately."""
2009
+ pending_count = 0
1978
2010
  try:
1979
2011
  from db import list_session_correction_requirements
1980
2012
 
1981
2013
  pending = list_session_correction_requirements(session_id=sid, status="open", limit=3)
2014
+ pending_count = len(pending or [])
1982
2015
  if pending:
2016
+ _log_session_learning_aggregation_shadow(sid, blocked=True, pending_count=pending_count)
1983
2017
  return (
1984
2018
  "ERROR: session has user correction(s) without durable learning_add. "
1985
2019
  "Call nexo_learning_add for the correction before nexo_stop. "
@@ -1987,6 +2021,7 @@ def handle_stop(sid: str) -> str:
1987
2021
  )
1988
2022
  except Exception:
1989
2023
  pass
2024
+ _log_session_learning_aggregation_shadow(sid, blocked=False, pending_count=pending_count)
1990
2025
  _stop_keepalive(sid)
1991
2026
  complete_session(sid)
1992
2027
  return f"Session {sid} closed."