nexo-brain 7.30.33 → 7.31.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "nexo-brain",
3
- "version": "7.30.33",
3
+ "version": "7.31.1",
4
4
  "description": "Local cognitive runtime for Claude Code \u2014 persistent memory, overnight learning, doctor diagnostics, personal scripts, recovery-aware jobs, startup preflight, and optional dashboard/power helper.",
5
5
  "author": {
6
6
  "name": "NEXO Brain",
package/README.md CHANGED
@@ -18,7 +18,11 @@
18
18
 
19
19
  [Watch the overview video](https://nexo-brain.com/watch/) · [Watch on YouTube](https://www.youtube.com/watch?v=i2lkGhKyVqI) · [Open the infographic](https://nexo-brain.com/assets/nexo-brain-infographic-v5.png)
20
20
 
21
- Version `7.30.33` is the current packaged-runtime line. Patch release over v7.30.32 - personal agent/script status now keeps the newest real run between manual executions and cron history, so a successful manual agent run cannot be hidden behind an older scheduled failure.
21
+ Version `7.31.1` is the current packaged-runtime line. Patch release over v7.31.0 - headless automations pause and queue when the selected engine is unavailable (credits, rate limits, expired auth) and resume automatically with one operator notice in their language; protocol nudge shaping ships in shadow mode; and the client config push stops writing an invalid `mcp__*` permission rule to Claude Code settings.
22
+
23
+ Previously in `7.31.0`: minor release over v7.30.33 - the recommended Claude Code model moves from Opus 4.8 to Fable 5 with max reasoning (`claude-fable-5`) across all four main resonance tiers (the `muy_bajo` tier keeps Haiku for cheap internal classifiers and Codex stays on GPT-5.5), existing installs riding NEXO defaults auto-migrate on update while customized models are respected, and learning housekeeping no longer aborts when the embedding backend is missing.
24
+
25
+ Previously in `7.30.33`: patch release over v7.30.32 - personal agent/script status now keeps the newest real run between manual executions and cron history, so a successful manual agent run cannot be hidden behind an older scheduled failure.
22
26
 
23
27
  Previously in `7.30.32`: patch release over v7.30.31 - packaged update/doctor now repair npm/npx wrapper drift, archive stale personal script backups, validate observable automation health contracts, and block legacy Claude/Codex project memory writes.
24
28
 
package/bin/nexo-brain.js CHANGED
@@ -115,7 +115,7 @@ const PUBLIC_CONTRIBUTION_UPSTREAM = "wazionapps/nexo";
115
115
  const MODEL_DEFAULTS_PATH = path.join(__dirname, "..", "src", "model_defaults.json");
116
116
  function _loadModelDefaults() {
117
117
  const fallback = {
118
- claude_code: { model: "claude-opus-4-7[1m]", reasoning_effort: "max", display_name: "Opus 4.7 with 1M context" },
118
+ claude_code: { model: "claude-fable-5", reasoning_effort: "max", display_name: "Fable 5 with max reasoning" },
119
119
  codex: { model: "gpt-5.5", reasoning_effort: "xhigh", display_name: "GPT-5.5 with max reasoning" },
120
120
  };
121
121
  try {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "nexo-brain",
3
- "version": "7.30.33",
3
+ "version": "7.31.1",
4
4
  "mcpName": "io.github.wazionapps/nexo",
5
5
  "description": "NEXO Brain — Shared brain for AI agents. Persistent memory, semantic RAG, natural forgetting, metacognitive guard, trust scoring, 150+ MCP tools. Works with Claude Code, Codex, Claude Desktop & any MCP client. 100% local, free.",
6
6
  "homepage": "https://nexo-brain.com",
@@ -1205,6 +1205,14 @@ def run_automation_prompt(
1205
1205
  f"{selected_backend} automation backend selected but launcher is not installed; fallback blocked."
1206
1206
  )
1207
1207
 
1208
+ # Fase 1.6 — provider circuit breaker. "Installed" is not "available":
1209
+ # with credits exhausted / rate limited / auth expired, every headless
1210
+ # cron used to launch a session that died mid-flight, burned its retry
1211
+ # budget and escalated to the operator per-item. The breaker fails fast
1212
+ # with a queue-me signal instead; one probe per retry window re-tests.
1213
+ from provider_circuit_breaker import raise_if_unavailable
1214
+ raise_if_unavailable(selected_backend)
1215
+
1208
1216
  # Resonance map decides (model, effort) for every call. ``caller`` is
1209
1217
  # MANDATORY — every script that invokes the automation backend must be
1210
1218
  # registered in src/resonance_map.py so its reasoning budget is a
@@ -1414,6 +1422,7 @@ def run_automation_prompt(
1414
1422
  stderr = result.stderr or ""
1415
1423
  if not recorded:
1416
1424
  stderr = _append_stderr(stderr, record_error)
1425
+ _record_provider_breaker_outcome(selected_backend, result.returncode, final_stdout, stderr)
1417
1426
  return subprocess.CompletedProcess(
1418
1427
  cmd,
1419
1428
  result.returncode,
@@ -1490,6 +1499,7 @@ def run_automation_prompt(
1490
1499
  stderr = result.stderr or ""
1491
1500
  if not recorded:
1492
1501
  stderr = _append_stderr(stderr, record_error)
1502
+ _record_provider_breaker_outcome(selected_backend, result.returncode, final_stdout, stderr)
1493
1503
  return subprocess.CompletedProcess(
1494
1504
  cmd,
1495
1505
  result.returncode,
@@ -1500,6 +1510,22 @@ def run_automation_prompt(
1500
1510
  raise AutomationBackendUnavailableError(f"Unsupported automation backend: {selected_backend}")
1501
1511
 
1502
1512
 
1513
+ def _record_provider_breaker_outcome(backend: str, returncode: int | None, stdout: str, stderr: str) -> None:
1514
+ """Fase 1.6 — feed the circuit breaker after every headless session.
1515
+
1516
+ Success closes the breaker; classified failures (credits/rate-limit/auth)
1517
+ open it immediately so the NEXT cron fails fast and queues instead of
1518
+ launching another doomed session. Best-effort: breaker bookkeeping must
1519
+ never mask the session result.
1520
+ """
1521
+ try:
1522
+ from provider_circuit_breaker import classify_session_failure, record_session_outcome
1523
+ reason = classify_session_failure(returncode, stdout or "", stderr or "")
1524
+ record_session_outcome(backend, ok=(reason is None), reason=reason)
1525
+ except Exception:
1526
+ pass
1527
+
1528
+
1503
1529
  def probe_automation_backend(
1504
1530
  *,
1505
1531
  backend: str | None = None,
@@ -2058,7 +2058,7 @@ def _refresh_resonance_tiers_model_defaults(dest: Path = NEXO_HOME) -> list[str]
2058
2058
  dest / "personal" / "brain" / "resonance_tiers.json",
2059
2059
  dest / "brain" / "resonance_tiers.json",
2060
2060
  ]
2061
- old_prefixes = ("claude-opus-4-6", "claude-opus-4-7")
2061
+ old_prefixes = ("claude-opus-4-6", "claude-opus-4-7", "claude-opus-4-8")
2062
2062
 
2063
2063
  for target_path in target_paths:
2064
2064
  try:
@@ -2089,7 +2089,7 @@ def _refresh_resonance_tiers_model_defaults(dest: Path = NEXO_HOME) -> list[str]
2089
2089
  model = str(claude.get("model") or "").strip()
2090
2090
  tier_changed = False
2091
2091
  if model and model.startswith(old_prefixes):
2092
- claude["model"] = str(source_claude.get("model") or "claude-opus-4-8")
2092
+ claude["model"] = str(source_claude.get("model") or "claude-fable-5")
2093
2093
  tier_changed = True
2094
2094
  if tier_changed and not str(claude.get("effort") or "").strip() and source_claude.get("effort"):
2095
2095
  claude["effort"] = str(source_claude.get("effort"))
@@ -83,7 +83,7 @@ except Exception:
83
83
 
84
84
  def resolve_client_runtime_profile(client: str, preferences: dict | None = None) -> dict:
85
85
  defaults = {
86
- "claude_code": {"model": "claude-opus-4-7[1m]", "reasoning_effort": "max"},
86
+ "claude_code": {"model": "claude-fable-5", "reasoning_effort": "max"},
87
87
  "codex": {"model": "gpt-5.5", "reasoning_effort": "xhigh"},
88
88
  }
89
89
  return dict(defaults.get(client, {}))
@@ -1012,7 +1012,16 @@ def _load_toml_object(path: Path) -> dict:
1012
1012
  def _write_toml_object(path: Path, payload: dict) -> None:
1013
1013
  path.parent.mkdir(parents=True, exist_ok=True)
1014
1014
  lines = _emit_toml_table(payload)
1015
- path.write_text("\n".join(lines).rstrip() + "\n")
1015
+ content = "\n".join(lines).rstrip() + "\n"
1016
+ # v7.31.x (Fase 1) — write-if-changed: rewriting an identical config.toml
1017
+ # on every Desktop update churns mtime/content signatures and can
1018
+ # re-trigger Codex's hook/trust confirmation prompt for the operator.
1019
+ try:
1020
+ if path.is_file() and path.read_text() == content:
1021
+ return
1022
+ except Exception:
1023
+ pass # unreadable existing file -> fall through to a clean write
1024
+ path.write_text(content)
1016
1025
 
1017
1026
 
1018
1027
  def _sync_codex_managed_config(
@@ -1527,6 +1536,12 @@ def _claude_desktop_managed_metadata(server_config: dict, *, operator_name: str)
1527
1536
  # (followup-runner, email-monitor, deep-sleep, etc.) to work without
1528
1537
  # interactive approval prompts. Without this, Claude Code headless invocations
1529
1538
  # stall waiting for MCP tool approvals.
1539
+ #
1540
+ # v7.31.x (Fase 1) — "mcp__*" is NOT a valid Claude Code allow rule (allow
1541
+ # patterns must name a literal mcp__<server>__ scope; only deny/ask accept
1542
+ # bare wildcards). Claude Code skips it and shows a Settings Warning on every
1543
+ # launch. List the NEXO-managed servers explicitly instead; user-added
1544
+ # servers belong to the user's own config, not to this template.
1530
1545
  _NEXO_HEADLESS_ALLOWLIST = (
1531
1546
  "Bash",
1532
1547
  "Read",
@@ -1539,6 +1554,16 @@ _NEXO_HEADLESS_ALLOWLIST = (
1539
1554
  "NotebookEdit",
1540
1555
  "WebSearch",
1541
1556
  "WebFetch",
1557
+ "mcp__nexo__*",
1558
+ "mcp__nexo_chrome_control__*",
1559
+ "mcp__nexo_desktop_control__*",
1560
+ "mcp__nexo_power_control__*",
1561
+ )
1562
+
1563
+ # Entries previously pushed by this template that Claude Code rejects as
1564
+ # invalid. The sync REMOVES them so already-contaminated installs stop
1565
+ # showing the launch warning. Safe: Claude Code was skipping them anyway.
1566
+ _NEXO_INVALID_ALLOWLIST_ENTRIES = (
1542
1567
  "mcp__*",
1543
1568
  )
1544
1569
 
@@ -1558,6 +1583,12 @@ def _ensure_headless_permissions(payload: dict) -> None:
1558
1583
  allow_list = []
1559
1584
  permissions["allow"] = allow_list
1560
1585
 
1586
+ # v7.31.x (Fase 1) — migrate away invalid entries this template used to
1587
+ # push (Claude Code skips them and warns on every launch).
1588
+ for invalid in _NEXO_INVALID_ALLOWLIST_ENTRIES:
1589
+ while invalid in allow_list:
1590
+ allow_list.remove(invalid)
1591
+
1561
1592
  existing = {str(item) for item in allow_list if isinstance(item, str)}
1562
1593
  for entry in _NEXO_HEADLESS_ALLOWLIST:
1563
1594
  if entry not in existing:
@@ -42,6 +42,22 @@ PROTOCOL_SKIP_TOOLS = {
42
42
  "nexo_rules_check",
43
43
  }
44
44
  ACTION_TASK_TYPES = {"edit", "execute", "delegate"}
45
+
46
+ # Phase 1.5 (SPEC-FIABILIDAD-FASES-2026-06) — protocol nudge shaping.
47
+ # The "Non-trivial work without nexo_task_open" warning fired on EVERY
48
+ # non-trivial tool call from tool #1 (no threshold, no rate limit, no
49
+ # session-type awareness) — measurable as noise that gets ignored. Shaping:
50
+ # - threshold: only nudge after N consecutive non-trivial tools w/o task
51
+ # - cooldown: once nudged, stay quiet for a window
52
+ # - headless: runner sessions are covered by HeadlessEnforcer already
53
+ # (enforcement_engine.py, threshold 4/2 + cooldown) — skip the nudge
54
+ # Mode is SHADOW by default: visible behaviour is UNCHANGED; decisions are
55
+ # logged to runtime/logs/protocol-nudge-shadow.ndjson so the threshold can
56
+ # be calibrated with real data before flipping NEXO_PROTOCOL_NUDGE_MODE to
57
+ # "active". ("off" disables shaping bookkeeping entirely.)
58
def _protocol_nudge_env_int(name: str, default: int, minimum: int) -> int:
    """Read an integer tunable from the environment without ever raising.

    An unset, empty or non-integer value falls back to ``default``; the result
    is clamped to at least ``minimum``. A typo in an environment variable must
    not crash this module at import time.
    """
    raw = str(os.environ.get(name, "") or "").strip()
    try:
        value = int(raw) if raw else default
    except ValueError:
        value = default
    return max(minimum, value)


PROTOCOL_NUDGE_MODE = str(os.environ.get("NEXO_PROTOCOL_NUDGE_MODE", "shadow")).strip().lower()
# Number of consecutive non-trivial tool calls without a task before nudging (>= 1).
PROTOCOL_NUDGE_THRESHOLD = _protocol_nudge_env_int("NEXO_PROTOCOL_NUDGE_THRESHOLD", 6, 1)
# Quiet window in seconds after a nudge was emitted (>= 0).
PROTOCOL_NUDGE_COOLDOWN_S = _protocol_nudge_env_int("NEXO_PROTOCOL_NUDGE_COOLDOWN_S", 300, 0)
45
61
  NEXO_CODE_ROOT = Path(os.environ.get("NEXO_CODE", str(Path(__file__).resolve().parent))).expanduser().resolve()
46
62
  LIVE_REPO_ROOT = NEXO_CODE_ROOT.parent if NEXO_CODE_ROOT.name == "src" else NEXO_CODE_ROOT
47
63
  PUBLIC_REPO_DIRS = {
@@ -1198,6 +1214,110 @@ def _append_protocol_warning(warnings: list[dict], message: str) -> None:
1198
1214
  warnings.append({"message": clean})
1199
1215
 
1200
1216
 
1217
def _protocol_nudge_state_path() -> Path:
    """Return the per-session nudge state file under the NEXO home directory.

    The home is ``$NEXO_HOME`` when set and non-empty, otherwise ``~/.nexo``.
    """
    configured_home = os.environ.get("NEXO_HOME")
    root = Path(configured_home) if configured_home else Path.home() / ".nexo"
    return root.joinpath("runtime", "data", "protocol-nudge-state.json")
1220
+
1221
+
1222
def _protocol_nudge_shadow_log_path() -> Path:
    """Return the shadow-mode NDJSON log file under the NEXO home directory.

    The home is ``$NEXO_HOME`` when set and non-empty, otherwise ``~/.nexo``.
    """
    configured_home = os.environ.get("NEXO_HOME")
    root = Path(configured_home) if configured_home else Path.home() / ".nexo"
    return root.joinpath("runtime", "logs", "protocol-nudge-shadow.ndjson")
1225
+
1226
+
1227
def _shape_protocol_nudge(sid: str) -> dict:
    """Phase 1.5 — decide whether the no-task nudge SHOULD fire under shaping.

    Pure bookkeeping + decision; never raises (a broken or hand-edited state
    file must not break the hook). Every call counts as one more non-trivial
    tool call without an open task for ``sid`` and persists the new streak.

    Returns ``{would_emit, reason, streak}`` where ``reason`` is one of
    ``headless-covered-by-enforcer``, ``under-threshold``, ``cooldown`` or
    ``threshold-reached``.
    """
    import json as _json
    import math as _math
    import time as _time

    def _as_number(value) -> float:
        # Values come from a JSON file anyone may have edited or truncated:
        # anything that is not a finite number degrades to 0 instead of raising.
        try:
            number = float(value or 0)
        except (TypeError, ValueError, OverflowError):
            return 0.0
        return number if _math.isfinite(number) else 0.0

    headless = (
        str(os.environ.get("NEXO_AUTOMATION", "")).strip() == "1"
        or str(os.environ.get("NEXO_HEADLESS", "")).strip() == "1"
    )
    if headless:
        # Headless runs are skipped before any state is read or written.
        return {"would_emit": False, "reason": "headless-covered-by-enforcer", "streak": 0}

    state_path = _protocol_nudge_state_path()
    try:
        state = _json.loads(state_path.read_text(encoding="utf-8"))
    except Exception:
        state = {}
    if not isinstance(state, dict):
        state = {}

    now = _time.time()
    # Drop stale sessions (>48h) so the file cannot grow without bound. Entries
    # that are not dicts, or whose timestamp is unreadable, count as stale.
    state = {
        key: value for key, value in state.items()
        if isinstance(value, dict) and (now - _as_number(value.get("updated_at"))) < 48 * 3600
    }
    entry = state.get(sid) or {}
    streak = int(_as_number(entry.get("streak"))) + 1
    last_nudge_at = _as_number(entry.get("last_nudge_at"))
    entry.update({"streak": streak, "updated_at": now})

    if streak < PROTOCOL_NUDGE_THRESHOLD:
        decision = {"would_emit": False, "reason": "under-threshold", "streak": streak}
    elif last_nudge_at and (now - last_nudge_at) < PROTOCOL_NUDGE_COOLDOWN_S:
        decision = {"would_emit": False, "reason": "cooldown", "streak": streak}
    else:
        # Remember when we nudged so the cooldown window starts now.
        entry["last_nudge_at"] = now
        decision = {"would_emit": True, "reason": "threshold-reached", "streak": streak}

    state[sid] = entry
    try:
        # Write to a temp file and swap it in, so readers never see a partial file.
        state_path.parent.mkdir(parents=True, exist_ok=True)
        tmp = state_path.with_suffix(".json.tmp")
        tmp.write_text(_json.dumps(state, ensure_ascii=False) + "\n", encoding="utf-8")
        os.replace(tmp, state_path)
    except Exception:
        pass  # persistence is best-effort; the decision is still returned
    return decision
1280
+
1281
+
1282
def _reset_protocol_nudge_streak(sid: str) -> None:
    """A session with an open task is compliant — its streak restarts.

    No-op when shaping is ``off``, when ``sid`` is empty, when the session has
    no usable state entry, or when its streak is already zero. The last case
    matters because this runs on every non-trivial tool call of a compliant
    session: rewriting an unchanged state file each time is pure churn.
    Best-effort: any I/O or parse error is swallowed.
    """
    import json as _json

    if PROTOCOL_NUDGE_MODE == "off" or not sid:
        return
    state_path = _protocol_nudge_state_path()
    try:
        state = _json.loads(state_path.read_text(encoding="utf-8"))
        if not isinstance(state, dict):
            return
        entry = state.get(sid)
        if not isinstance(entry, dict) or not entry.get("streak"):
            return  # unknown session or nothing to reset: leave the file alone
        entry["streak"] = 0
        # Write to a temp file and swap it in, so readers never see a partial file.
        tmp = state_path.with_suffix(".json.tmp")
        tmp.write_text(_json.dumps(state, ensure_ascii=False) + "\n", encoding="utf-8")
        os.replace(tmp, state_path)
    except Exception:
        pass
1299
+
1300
+
1301
def _log_protocol_nudge_shadow(sid: str, decision: dict, emitted_today: bool) -> None:
    """Append one shaping decision to the shadow NDJSON log.

    Each record carries the timestamp, session id, current mode and threshold,
    the shaping ``decision`` and whether the legacy warning was emitted
    (``emitted_today``). Best-effort: any failure is swallowed.
    """
    import json as _json
    import time as _time

    try:
        log_path = _protocol_nudge_shadow_log_path()
        log_path.parent.mkdir(parents=True, exist_ok=True)
        with log_path.open("a", encoding="utf-8") as sink:
            record = {
                "ts": _time.time(),
                "sid": sid,
                "mode": PROTOCOL_NUDGE_MODE,
                "threshold": PROTOCOL_NUDGE_THRESHOLD,
                "decision": decision,
                "legacy_warning_emitted": emitted_today,
            }
            line = _json.dumps(record, ensure_ascii=False) + "\n"
            sink.write(line)
    except Exception:
        return
1319
+
1320
+
1201
1321
  def _collect_protocol_warnings(conn, *, sid: str, tool_name: str) -> list[dict]:
1202
1322
  short_name = _short_tool_name(tool_name)
1203
1323
  if short_name in PROTOCOL_SKIP_TOOLS or short_name not in NON_TRIVIAL_PROTOCOL_TOOLS:
@@ -1214,6 +1334,17 @@ def _collect_protocol_warnings(conn, *, sid: str, tool_name: str) -> list[dict]:
1214
1334
  task = _find_any_open_task(conn, sid)
1215
1335
  has_guard = _session_has_guard_check(conn, sid)
1216
1336
  if not task:
1337
+ # Phase 1.5 — shaping decision. In SHADOW mode (default) the visible
1338
+ # behaviour below is untouched and the decision is only logged so the
1339
+ # threshold can be calibrated; in ACTIVE mode the shaping governs
1340
+ # (headless skip, streak threshold, cooldown); "off" disables both.
1341
+ nudge = None
1342
+ if PROTOCOL_NUDGE_MODE in {"shadow", "active"}:
1343
+ nudge = _shape_protocol_nudge(sid)
1344
+ if PROTOCOL_NUDGE_MODE == "active" and nudge and not nudge["would_emit"]:
1345
+ _log_protocol_nudge_shadow(sid, nudge, emitted_today=False)
1346
+ return warnings
1347
+
1217
1348
  guard_note = (
1218
1349
  render_core_prompt("hook-protocol-warning-task-open-guard-note")
1219
1350
  if short_name in {"Read", "Bash", "Grep", "Glob"} and not has_guard
@@ -1230,8 +1361,12 @@ def _collect_protocol_warnings(conn, *, sid: str, tool_name: str) -> list[dict]:
1230
1361
  warnings,
1231
1362
  render_core_prompt("hook-protocol-warning-heartbeat-close-evidence"),
1232
1363
  )
1364
+ if PROTOCOL_NUDGE_MODE == "shadow" and nudge is not None:
1365
+ _log_protocol_nudge_shadow(sid, nudge, emitted_today=True)
1233
1366
  return warnings
1234
1367
 
1368
+ _reset_protocol_nudge_streak(sid)
1369
+
1235
1370
  task_id = str(task.get("task_id") or "").strip()
1236
1371
  if str(task.get("task_type") or "").strip() in ACTION_TASK_TYPES and not (task.get("opened_with_guard") or has_guard):
1237
1372
  _append_protocol_warning(
@@ -1,11 +1,11 @@
1
1
  {
2
2
  "schema_version": 1,
3
3
  "claude_code": {
4
- "model": "claude-opus-4-8",
4
+ "model": "claude-fable-5",
5
5
  "reasoning_effort": "max",
6
- "display_name": "Opus 4.8 with max reasoning",
7
- "recommendation_version": 3,
8
- "previous_defaults": ["claude-opus-4-7[1m]", "claude-opus-4-7", "claude-opus-4-6[1m]"]
6
+ "display_name": "Fable 5 with max reasoning",
7
+ "recommendation_version": 4,
8
+ "previous_defaults": ["claude-opus-4-8", "claude-opus-4-7[1m]", "claude-opus-4-7", "claude-opus-4-6[1m]"]
9
9
  },
10
10
  "codex": {
11
11
  "model": "gpt-5.5",
@@ -20,11 +20,11 @@ from typing import Any
20
20
  _FALLBACK: dict[str, Any] = {
21
21
  "schema_version": 1,
22
22
  "claude_code": {
23
- "model": "claude-opus-4-8",
23
+ "model": "claude-fable-5",
24
24
  "reasoning_effort": "max",
25
- "display_name": "Opus 4.8 with max reasoning",
26
- "recommendation_version": 3,
27
- "previous_defaults": ["claude-opus-4-7[1m]", "claude-opus-4-7", "claude-opus-4-6[1m]"],
25
+ "display_name": "Fable 5 with max reasoning",
26
+ "recommendation_version": 4,
27
+ "previous_defaults": ["claude-opus-4-8", "claude-opus-4-7[1m]", "claude-opus-4-7", "claude-opus-4-6[1m]"],
28
28
  },
29
29
  "codex": {
30
30
  "model": "gpt-5.5",
@@ -99,7 +99,7 @@ def looks_like_claude_model(model: str) -> bool:
99
99
  return str(model or "").strip().lower().startswith(_CLAUDE_MODEL_PREFIXES)
100
100
 
101
101
 
102
- _CLAUDE_DEFAULT_PREFIXES = ("claude-opus-4-6", "claude-opus-4-7")
102
+ _CLAUDE_DEFAULT_PREFIXES = ("claude-opus-4-6", "claude-opus-4-7", "claude-opus-4-8")
103
103
 
104
104
 
105
105
  def heal_runtime_profiles(profiles: dict) -> tuple[dict, list[str]]:
@@ -0,0 +1,230 @@
1
+ """Provider circuit breaker — Fase 1.6 (SPEC-FIABILIDAD-FASES-2026-06).
2
+
3
+ Incident (2026-06-10, operator report): when the selected engine (Claude or
4
+ Codex) is unavailable — credits exhausted, rate limited, auth expired — every
5
+ headless cron (email-monitor, deep-sleep, evolution, catch-up, followups…)
6
+ still launched a session that died mid-flight, burned its retry budget, then
7
+ escalated to the operator by email (in English, regardless of the configured
8
+ language). Work was lost or degraded to manual across the whole system.
9
+
10
+ This module gives the single launch path (agent_runner.run_automation_prompt)
11
+ a shared, persisted circuit breaker:
12
+
13
+ - ``check_provider_available(backend)`` — gate BEFORE launching.
14
+ - ``classify_session_failure(...)`` — map a dead session to a cause.
15
+ - ``record_session_outcome(backend, …)`` — close on success, open on
16
+ classified failures (credits/rate-limit/auth open immediately; generic
17
+ failures only after N consecutive).
18
+ - ``should_notify_operator(backend)`` — True exactly once per opening, so
19
+ the operator gets ONE notice instead of one per queued item.
20
+
21
+ State lives in ``$NEXO_HOME/runtime/data/provider-circuit-breaker.json`` so
22
+ every cron process shares the same view. Writes are atomic (tmp + replace).
23
+ The breaker FAILS OPEN on its own errors: a broken state file must never
24
+ block automations.
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ import json
30
+ import os
31
+ import re
32
+ import time
33
+ from pathlib import Path
34
+
35
# Failure classes that trip the breaker the first time they are seen:
# retrying cannot help until the underlying condition clears.
HARD_OPEN_REASONS = {"auth", "credits", "rate_limit"}

# Unclassified failures (network blips, crashes) must repeat this many times
# in a row before the breaker opens — one flaky session must not pause the fleet.
GENERIC_OPEN_THRESHOLD = 3

# Seconds the breaker stays open, per failure class, before a half-open probe
# is allowed again.
DEFAULT_RETRY_AFTER_S = {
    "credits": 1800,     # 30 min: credit top-ups/renewals are slow
    "rate_limit": 900,   # 15 min, unless the provider told us a reset time
    "auth": 3600,        # 60 min: needs operator action; probe hourly anyway
    "generic": 600,      # 10 min
}

# Ordered (reason, compiled regex) pairs; order is significant to any consumer
# that stops at the first match. Each regex is the case-insensitive
# alternation of the fragments listed for that reason.
_FAILURE_PATTERNS = tuple(
    (reason, re.compile("|".join(alternatives), re.IGNORECASE))
    for reason, alternatives in (
        ("credits", (
            r"credit balance is too low",
            r"insufficient[_ ]quota",
            r"exceeded your current quota",
            r"billing hard limit",
            r"out of credits",
            r"usage limit reached",
            r"plan limits",
        )),
        ("rate_limit", (
            r"rate[_ -]?limit",
            r"too many requests",
            r"\b429\b",
            r"overloaded[_ ]error",
            r"\b529\b",
            r"server overloaded",
            r"capacity constraints",
        )),
        ("auth", (
            r"authentication[_ ]error",
            r"\b401\b",
            r"unauthorized",
            r"oauth token (has )?expired",
            r"invalid api key",
            r"api key not (found|valid)",
            r"please run /login",
            r"token_revoked",
        )),
    )
)
65
+
66
+
67
def _state_path() -> Path:
    """Return the breaker state file under the NEXO home directory.

    The home is ``$NEXO_HOME`` when set and non-empty, otherwise ``~/.nexo``.
    """
    configured_home = os.environ.get("NEXO_HOME")
    root = Path(configured_home) if configured_home else Path.home() / ".nexo"
    return root.joinpath("runtime", "data", "provider-circuit-breaker.json")
70
+
71
+
72
def _now() -> float:
    """Return the current wall-clock time as seconds since the epoch."""
    current = time.time()
    return current
74
+
75
+
76
def _load_state() -> dict:
    """Read the persisted breaker state.

    Returns an empty dict when the file is missing, unreadable, not valid
    JSON, or does not contain a JSON object — a broken state file is treated
    as "no state".
    """
    try:
        parsed = json.loads(_state_path().read_text(encoding="utf-8"))
    except Exception:
        return {}
    if isinstance(parsed, dict):
        return parsed
    return {}
83
+
84
+
85
def _save_state(state: dict) -> None:
    """Persist the breaker state: write a sibling temp file, then replace.

    Best-effort: every error is swallowed, because the breaker must never
    break the caller.
    """
    try:
        target = _state_path()
        target.parent.mkdir(parents=True, exist_ok=True)
        scratch = target.with_suffix(".json.tmp")
        serialized = json.dumps(state, ensure_ascii=False, indent=2) + "\n"
        scratch.write_text(serialized, encoding="utf-8")
        os.replace(scratch, target)
    except Exception:
        return
94
+
95
+
96
def _entry(state: dict, backend: str) -> dict:
    """Return the mutable per-backend record inside ``state``.

    A missing or non-dict record is replaced in ``state`` by a fresh empty
    dict, so callers can always mutate the returned object in place.
    """
    current = state.get(backend)
    if isinstance(current, dict):
        return current
    fresh: dict = {}
    state[backend] = fresh
    return fresh
102
+
103
+
104
+ class ProviderTemporarilyUnavailableError(RuntimeError):
105
+ """Selected provider is up for maintenance by reality (credits/rate/auth).
106
+
107
+ Callers should QUEUE/DEFER their work without burning retry budgets; the
108
+ breaker re-probes automatically once ``retry_after`` passes.
109
+ """
110
+
111
+ def __init__(self, backend: str, reason: str, retry_after_ts: float | None):
112
+ self.backend = backend
113
+ self.reason = reason
114
+ self.retry_after_ts = retry_after_ts
115
+ wait = ""
116
+ if retry_after_ts:
117
+ wait = f"; next probe after {time.strftime('%H:%M', time.localtime(retry_after_ts))}"
118
+ super().__init__(
119
+ f"provider '{backend}' temporarily unavailable (reason: {reason}){wait}. "
120
+ "Work should be queued, not retried blindly."
121
+ )
122
+
123
+
124
+ def classify_session_failure(returncode: int | None, stdout: str = "", stderr: str = "") -> str | None:
125
+ """Map a finished/dead session to a failure class, or None if it looks fine.
126
+
127
+ Only classifies KNOWN unavailability shapes; an exit code != 0 with no
128
+ matching pattern returns "generic" so the threshold logic decides.
129
+ A zero return code returns None.
130
+ """
131
+ if returncode == 0:
132
+ return None
133
+ haystack = f"{stdout or ''}\n{stderr or ''}"
134
+ for reason, pattern in _FAILURE_PATTERNS:
135
+ if pattern.search(haystack):
136
+ return reason
137
+ return "generic"
138
+
139
+
140
def check_provider_available(backend: str) -> tuple[bool, dict]:
    """Gate to call BEFORE launching the provider.

    Returns (True, entry) when closed — or when open but past retry_after, in
    which case the caller's attempt IS the half-open probe (its outcome will
    close or re-open the breaker via record_session_outcome).
    Returns (False, entry) while open and inside the wait window.

    Fails open on malformed state: an open entry whose ``retry_after`` is
    missing, zero, negative, non-numeric or non-finite is treated as already
    elapsed. Otherwise the backend would stay blocked forever (no session can
    run to close the breaker) or the gate itself would raise into the launch
    path.
    """
    state = _load_state()
    entry = _entry(state, backend)
    if entry.get("state") != "open":
        return True, entry
    try:
        retry_after = float(entry.get("retry_after") or 0)
    except (TypeError, ValueError, OverflowError):
        retry_after = 0.0
    now = _now()
    # NaN fails both comparisons, so it also lands on the "unusable" side.
    usable_deadline = 0 < retry_after < float("inf")
    if not usable_deadline or now >= retry_after:
        entry["half_open_probe_at"] = now
        _save_state(state)
        return True, entry
    return False, entry
158
+
159
+
160
def raise_if_unavailable(backend: str) -> None:
    """Raise ``ProviderTemporarilyUnavailableError`` when the gate is closed.

    Returns silently when ``check_provider_available`` allows the launch. The
    raised error carries the stored reason (``"unknown"`` when absent) and the
    stored ``retry_after`` timestamp (``None`` when absent or zero).
    """
    available, entry = check_provider_available(backend)
    if not available:
        stored_reason = str(entry.get("reason") or "unknown")
        next_probe_ts = float(entry.get("retry_after") or 0) or None
        raise ProviderTemporarilyUnavailableError(backend, stored_reason, next_probe_ts)
169
+
170
+
171
+ def record_session_outcome(
172
+ backend: str,
173
+ *,
174
+ ok: bool,
175
+ reason: str | None = None,
176
+ retry_after_s: float | None = None,
177
+ ) -> dict:
178
+ """Update the breaker after a session finished (or died).
179
+
180
+ ``reason`` should come from classify_session_failure. ``retry_after_s``
181
+ lets callers honour a provider-reported reset time.
182
+ """
183
+ state = _load_state()
184
+ entry = _entry(state, backend)
185
+ if ok:
186
+ was_open = entry.get("state") == "open"
187
+ state[backend] = {
188
+ "state": "closed",
189
+ "consecutive_failures": 0,
190
+ "closed_at": _now(),
191
+ "recovered_from": entry.get("reason") if was_open else None,
192
+ }
193
+ _save_state(state)
194
+ return state[backend]
195
+
196
+ failure_reason = reason or "generic"
197
+ consecutive = int(entry.get("consecutive_failures") or 0) + 1
198
+ entry["consecutive_failures"] = consecutive
199
+ should_open = failure_reason in HARD_OPEN_REASONS or consecutive >= GENERIC_OPEN_THRESHOLD
200
+ if should_open:
201
+ wait = retry_after_s if retry_after_s else DEFAULT_RETRY_AFTER_S.get(failure_reason, DEFAULT_RETRY_AFTER_S["generic"])
202
+ already_open = entry.get("state") == "open"
203
+ entry.update({
204
+ "state": "open",
205
+ "reason": failure_reason,
206
+ "opened_at": entry.get("opened_at") if already_open else _now(),
207
+ "retry_after": _now() + float(wait),
208
+ })
209
+ if not already_open:
210
+ entry["operator_notified_at"] = None
211
+ _save_state(state)
212
+ return entry
213
+
214
+
215
def should_notify_operator(backend: str) -> bool:
    """True exactly once per opening — callers use it to send ONE notice.

    Returns False when the breaker is not open or a notice was already
    recorded for this opening; otherwise stamps ``operator_notified_at``,
    persists the state and returns True.
    """
    state = _load_state()
    entry = _entry(state, backend)
    needs_notice = entry.get("state") == "open" and not entry.get("operator_notified_at")
    if not needs_notice:
        return False
    entry["operator_notified_at"] = _now()
    _save_state(state)
    return True
226
+
227
+
228
def breaker_status() -> dict:
    """Read-only snapshot of the persisted breaker state for doctors/diagnostics."""
    snapshot = _load_state()
    return snapshot
@@ -1,19 +1,19 @@
1
1
  {
2
2
  "tiers": {
3
3
  "maximo": {
4
- "claude_code": { "model": "claude-opus-4-8", "effort": "max" },
4
+ "claude_code": { "model": "claude-fable-5", "effort": "max" },
5
5
  "codex": { "model": "gpt-5.5", "effort": "xhigh" }
6
6
  },
7
7
  "alto": {
8
- "claude_code": { "model": "claude-opus-4-8", "effort": "xhigh" },
8
+ "claude_code": { "model": "claude-fable-5", "effort": "xhigh" },
9
9
  "codex": { "model": "gpt-5.5", "effort": "high" }
10
10
  },
11
11
  "medio": {
12
- "claude_code": { "model": "claude-opus-4-8", "effort": "high" },
12
+ "claude_code": { "model": "claude-fable-5", "effort": "high" },
13
13
  "codex": { "model": "gpt-5.5", "effort": "medium" }
14
14
  },
15
15
  "bajo": {
16
- "claude_code": { "model": "claude-opus-4-8", "effort": "medium" },
16
+ "claude_code": { "model": "claude-fable-5", "effort": "medium" },
17
17
  "codex": { "model": "gpt-5.5", "effort": "low" }
18
18
  },
19
19
  "muy_bajo": {
@@ -57,6 +57,7 @@ if str(NEXO_CODE) not in sys.path:
57
57
  sys.path.insert(0, str(NEXO_CODE))
58
58
 
59
59
  from agent_runner import AutomationBackendUnavailableError, run_automation_prompt
60
+ from provider_circuit_breaker import ProviderTemporarilyUnavailableError
60
61
  from client_preferences import (
61
62
  resolve_automation_backend,
62
63
  )
@@ -1997,19 +1998,24 @@ def _localized_operator_escalation_email(
1997
1998
  exhausted_count: int,
1998
1999
  details: str,
1999
2000
  ) -> tuple[str, str]:
2001
+ # Phase 1.6 — subjects are signed by the AGENT (assistant_name, dynamic
2002
+ # per install), not by the product: the operator talks to their agent.
2000
2003
  if _uses_spanish(operator_language):
2001
- subject = f"[NEXO] Emails requiring manual attention ({exhausted_count})"
2004
+ # Phase 1.6 — this branch used to contain the ENGLISH text copied
2005
+ # verbatim (operator-reported 10-jun: escalation mails arrived in
2006
+ # English with language=es configured). Real Spanish now.
2007
+ subject = f"[{assistant_name}] Emails que necesitan tu atención ({exhausted_count})"
2002
2008
  body = (
2003
- f"Hello {operator_name},\n\n"
2004
- f"The following emails have already been attempted {MAX_EMAIL_ATTEMPTS} times "
2005
- f"without succeeding (the session dies before completion):\n\n{details}\n\n"
2006
- "I marked them as `needs_interactive`. "
2007
- f"Open {assistant_name} Desktop and ask about the affected email so it can be resolved manually.\n\n"
2009
+ f"Hola {operator_name},\n\n"
2010
+ f"Los siguientes emails ya se han intentado {MAX_EMAIL_ATTEMPTS} veces "
2011
+ f"sin conseguirlo (la sesión muere antes de terminar):\n\n{details}\n\n"
2012
+ "Los he marcado como `needs_interactive`. "
2013
+ f"Abre {assistant_name} Desktop y pregunta por el email afectado para resolverlo manualmente.\n\n"
2008
2014
  f"— {assistant_name}"
2009
2015
  )
2010
2016
  return subject, body
2011
2017
 
2012
- subject = f"[NEXO] Emails requiring manual attention ({exhausted_count})"
2018
+ subject = f"[{assistant_name}] Emails requiring manual attention ({exhausted_count})"
2013
2019
  body = (
2014
2020
  f"Hello {operator_name},\n\n"
2015
2021
  f"The following emails have already been attempted {MAX_EMAIL_ATTEMPTS} times "
@@ -2354,6 +2360,17 @@ def launch_nexo(config, debt_block="", target_emails=None):
2354
2360
  _email_checkpoint_delete(mid)
2355
2361
  return True
2356
2362
 
2363
+ except ProviderTemporarilyUnavailableError as e:
2364
+ # Phase 1.6 — the engine is alive but unusable (credits/rate/auth).
2365
+ # This attempt must NOT count against the email (the provider being
2366
+ # down is not this email's fault), no scary per-item escalation:
2367
+ # give the attempt back, notify the operator ONCE per opening (in
2368
+ # their language) and let the breaker's probe window decide when to
2369
+ # resume. The work stays queued exactly where it was.
2370
+ log.warning(f"Provider circuit breaker open ({e.backend}: {e.reason}) — queueing work, attempt returned")
2371
+ _decrement_attempts(target_message_ids)
2372
+ _notify_provider_breaker_open_once(e)
2373
+ return False
2357
2374
  except AutomationBackendUnavailableError as e:
2358
2375
  log.error(f"Automation backend unavailable: {e}")
2359
2376
  _persist_failure_checkpoints(error_msg=f"AutomationBackendUnavailable: {e}", last_text="")
@@ -2407,6 +2424,94 @@ def _increment_attempts(email_ids):
2407
2424
  log.warning(f"Failed to increment attempts: {e}")
2408
2425
 
2409
2426
 
2427
def _decrement_attempts(email_ids):
    """Give one attempt back to each email in ``email_ids``.

    Phase 1.6 — used when a launch was vetoed by the provider circuit
    breaker: the provider being down is not the email's fault and must not
    push it towards ``needs_interactive``.

    The counter is clamped at 0, and a NULL ``attempts`` value is treated as 1
    before decrementing (so it also ends at 0).

    Args:
        email_ids: iterable of ``message_id`` values; empty/None is a no-op.

    Best-effort: any database error is logged as a warning and swallowed.
    """
    if not email_ids:
        return
    try:
        conn = sqlite3.connect(str(EMAIL_DB_PATH))
        try:
            conn.executemany(
                "UPDATE emails SET attempts = MAX(COALESCE(attempts, 1) - 1, 0) WHERE message_id = ?",
                [(mid,) for mid in email_ids],
            )
            conn.commit()
        finally:
            # Always release the handle, even when the UPDATE or commit
            # fails; previously the connection leaked on the error path.
            conn.close()
    except Exception as e:
        log.warning(f"Failed to decrement attempts: {e}")
2444
+
2445
+
2446
def _notify_provider_breaker_open_once(error):
    """Email the operator a single notice that a provider breaker opened.

    Phase 1.6 — replaces the per-item English escalation storm the operator
    reported (10-jun): with credits exhausted, every queued email generated
    its own 'needs manual attention' mail. Now a single message explains the
    pause and that work is queued and resumes automatically.

    The notice is gated by ``should_notify_operator(error.backend)`` and is
    written in Spanish or English depending on the operator language.

    Args:
        error: the ``ProviderTemporarilyUnavailableError`` caught by the
            launcher; its ``backend``, ``reason`` and ``retry_after_ts``
            attributes are read.

    Best-effort: never raises; every failure is logged as a warning.
    """
    try:
        from provider_circuit_breaker import should_notify_operator

        if not should_notify_operator(error.backend):
            return
        operator_name, assistant_name, operator_language = _get_operator_info()
        config = load_config()
        operator_email = config.get("operator_email", "")
        if not operator_email:
            log.warning("Breaker open but no operator_email configured — skipping notice")
            return

        # Optional "next probe" hint, rendered as local wall-clock HH:MM.
        retry_hint = ""
        if error.retry_after_ts:
            retry_hint = datetime.fromtimestamp(error.retry_after_ts).strftime("%H:%M")

        # Unknown reason codes fall through verbatim in both languages.
        reason_es = {
            "credits": "créditos agotados",
            "rate_limit": "límite de uso alcanzado",
            "auth": "sesión caducada (hay que volver a conectar)",
        }.get(error.reason, error.reason)
        reason_en = {
            "credits": "credits exhausted",
            "rate_limit": "rate limit reached",
            "auth": "session expired (needs re-login)",
        }.get(error.reason, error.reason)

        if _uses_spanish(operator_language):
            subject = f"[{assistant_name}] Motor {error.backend} en pausa ({reason_es})"
            body = (
                f"Hola {operator_name},\n\n"
                # Word order fixed: was "porque está no disponible".
                f"He pausado las automatizaciones que usan {error.backend} porque no está disponible: {reason_es}.\n\n"
                "El trabajo pendiente queda EN COLA (no se pierde nada) y se reanudará solo en cuanto el motor vuelva"
                + (f" (próxima comprobación ~{retry_hint})" if retry_hint else "")
                + ".\n\nNo recibirás un aviso por cada tarea: solo este, y otro cuando se reanude.\n\n"
                f"— {assistant_name}"
            )
        else:
            subject = f"[{assistant_name}] Engine {error.backend} paused ({reason_en})"
            body = (
                f"Hello {operator_name},\n\n"
                f"I paused the automations that use {error.backend} because it is unavailable: {reason_en}.\n\n"
                "Pending work stays QUEUED (nothing is lost) and resumes automatically once the engine is back"
                + (f" (next probe ~{retry_hint})" if retry_hint else "")
                + ".\n\nYou will not get one notice per task — just this one, and another when work resumes.\n\n"
                f"— {assistant_name}"
            )

        body_file = BASE_DIR / ".breaker-notice-body.txt"
        body_file.write_text(body, encoding="utf-8")
        send_script = get_send_reply_script_path(local_script_dir=_script_dir)
        result = subprocess.run(
            [
                sys.executable, str(send_script),
                "--to", f"{operator_name} <{operator_email}>",
                "--subject", subject,
                "--body-file", str(body_file),
            ],
            timeout=30,
            capture_output=True,
        )
        # Do not claim success when the sender failed. NOTE(review): this
        # assumes the send script reports failure via a non-zero exit code.
        if result.returncode != 0:
            stderr_tail = (result.stderr or b"").decode("utf-8", errors="replace").strip()[-500:]
            log.warning(
                f"Failed to send breaker-open notice: send script exited "
                f"{result.returncode}: {stderr_tail}"
            )
            return
        log.info(f"Breaker-open notice sent to operator ({error.backend}: {error.reason})")
    except Exception as e:
        log.warning(f"Failed to send breaker-open notice: {e}")
2513
+
2514
+
2410
2515
  def _mark_needs_interactive(email_ids):
2411
2516
  """Mark emails as needs_interactive after too many failed attempts."""
2412
2517
  if not email_ids:
@@ -158,10 +158,21 @@ def detect_duplicates(conn):
158
158
  if len(learnings) < 2:
159
159
  return []
160
160
 
161
- model = build_fastembed_embedding("bge-base-embeddings")
162
- texts = [f"{l['title']}: {l['content'][:300]}" for l in learnings]
163
- embeddings = list(model.embed(texts))
164
- embeddings = np.array(embeddings)
161
+ # build_fastembed_embedding() lazily imports fastembed inside its body, so a
162
+ # missing backend raises ModuleNotFoundError HERE, not at the import guard
163
+ # above (which only resolves the helper symbol). Without this guard the whole
164
+ # housekeeping run crashes with exit 1 whenever there are >=2 learnings to
165
+ # compare and fastembed is absent. Degrade exactly like the "not available"
166
+ # branch instead of aborting decay/prioritization/archival.
167
+ try:
168
+ model = build_fastembed_embedding("bge-base-embeddings")
169
+ texts = [f"{l['title']}: {l['content'][:300]}" for l in learnings]
170
+ embeddings = list(model.embed(texts))
171
+ embeddings = np.array(embeddings)
172
+ except Exception as exc:
173
+ print(f"[{ts}] Dedup skipped: embedding backend unavailable "
174
+ f"({type(exc).__name__}: {exc})")
175
+ return []
165
176
 
166
177
  # Normalize
167
178
  norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
@@ -1973,13 +1973,47 @@ def _toolbox_summary(conn) -> str:
1973
1973
  return ""
1974
1974
 
1975
1975
 
1976
def _log_session_learning_aggregation_shadow(sid: str, *, blocked: bool, pending_count: int) -> None:
    """Append one close-time telemetry record for session ``sid`` (shadow mode).

    Phase 1.5 — the per-line correction gate only sees what its detector
    flagged in the moment; a session-wide aggregation belongs in the real
    close flow. Before that aggregation gates anything, this records
    close-time compliance metrics so the active phase can be sized with real
    data.

    One JSON object per line is appended to
    ``<NEXO_HOME or ~/.nexo>/runtime/logs/learning-aggregation-shadow.ndjson``
    with the keys ``ts``, ``sid``, ``close_blocked_by_pending_correction``
    and ``pending_corrections_at_close``.

    Args:
        sid: session identifier being closed.
        blocked: whether the close was refused because of open corrections.
        pending_count: number of open correction requirements seen at close.

    Never raises and never blocks the close: every error is swallowed.
    """
    try:
        import json
        import os
        import time
        from pathlib import Path

        configured_home = os.environ.get("NEXO_HOME")
        root = Path(configured_home) if configured_home else Path.home() / ".nexo"
        log_dir = root / "runtime" / "logs"
        log_dir.mkdir(parents=True, exist_ok=True)
        target = log_dir / "learning-aggregation-shadow.ndjson"
        with target.open("a", encoding="utf-8") as sink:
            record = {
                "ts": time.time(),
                "sid": sid,
                "close_blocked_by_pending_correction": blocked,
                "pending_corrections_at_close": pending_count,
            }
            sink.write(json.dumps(record, ensure_ascii=False) + "\n")
    except Exception:
        # Telemetry is strictly best-effort; it must never affect the close.
        pass
2005
+
2006
+
1976
2007
  def handle_stop(sid: str) -> str:
1977
2008
  """Cleanly close a session, removing it from active sessions immediately."""
2009
+ pending_count = 0
1978
2010
  try:
1979
2011
  from db import list_session_correction_requirements
1980
2012
 
1981
2013
  pending = list_session_correction_requirements(session_id=sid, status="open", limit=3)
2014
+ pending_count = len(pending or [])
1982
2015
  if pending:
2016
+ _log_session_learning_aggregation_shadow(sid, blocked=True, pending_count=pending_count)
1983
2017
  return (
1984
2018
  "ERROR: session has user correction(s) without durable learning_add. "
1985
2019
  "Call nexo_learning_add for the correction before nexo_stop. "
@@ -1987,6 +2021,7 @@ def handle_stop(sid: str) -> str:
1987
2021
  )
1988
2022
  except Exception:
1989
2023
  pass
2024
+ _log_session_learning_aggregation_shadow(sid, blocked=False, pending_count=pending_count)
1990
2025
  _stop_keepalive(sid)
1991
2026
  complete_session(sid)
1992
2027
  return f"Session {sid} closed."