npm - nexo-brain - Versions diffs - 7.31.0 → 7.31.1 - Mend

nexo-brain 7.31.0 → 7.31.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

package/.claude-plugin/plugin.json +1 -1
package/README.md +3 -1
package/package.json +1 -1
package/src/agent_runner.py +26 -0
package/src/client_sync.py +32 -1
package/src/hook_guardrails.py +135 -0
package/src/provider_circuit_breaker.py +230 -0
package/src/scripts/nexo-email-monitor.py +112 -7
package/src/tools_sessions.py +35 -0

package/.claude-plugin/plugin.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "nexo-brain",
-  "version": "7.31.0",
+  "version": "7.31.1",
   "description": "Local cognitive runtime for Claude Code \u2014 persistent memory, overnight learning, doctor diagnostics, personal scripts, recovery-aware jobs, startup preflight, and optional dashboard/power helper.",
   "author": {
     "name": "NEXO Brain",

package/README.md CHANGED Viewed

@@ -18,7 +18,9 @@
 [Watch the overview video](https://nexo-brain.com/watch/) · [Watch on YouTube](https://www.youtube.com/watch?v=i2lkGhKyVqI) · [Open the infographic](https://nexo-brain.com/assets/nexo-brain-infographic-v5.png)
-Version `7.31.0` is the current packaged-runtime line. Minor release over v7.30.33 - the recommended Claude Code model moves from Opus 4.8 to Fable 5 with max reasoning (`claude-fable-5`) across all four main resonance tiers (the `muy_bajo` tier keeps Haiku for cheap internal classifiers and Codex stays on GPT-5.5), existing installs riding NEXO defaults auto-migrate on update while customized models are respected, and learning housekeeping no longer aborts when the embedding backend is missing.
+Version `7.31.1` is the current packaged-runtime line. Patch release over v7.31.0 - headless automations pause and queue when the selected engine is unavailable (credits, rate limits, expired auth) and resume automatically with one operator notice in their language; protocol nudge shaping ships in shadow mode; and the client config push stops writing an invalid `mcp__*` permission rule to Claude Code settings.
+Previously in `7.31.0`: minor release over v7.30.33 - the recommended Claude Code model moves from Opus 4.8 to Fable 5 with max reasoning (`claude-fable-5`) across all four main resonance tiers (the `muy_bajo` tier keeps Haiku for cheap internal classifiers and Codex stays on GPT-5.5), existing installs riding NEXO defaults auto-migrate on update while customized models are respected, and learning housekeeping no longer aborts when the embedding backend is missing.
 Previously in `7.30.33`: patch release over v7.30.32 - personal agent/script status now keeps the newest real run between manual executions and cron history, so a successful manual agent run cannot be hidden behind an older scheduled failure.

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "nexo-brain",
-  "version": "7.31.0",
+  "version": "7.31.1",
   "mcpName": "io.github.wazionapps/nexo",
   "description": "NEXO Brain — Shared brain for AI agents. Persistent memory, semantic RAG, natural forgetting, metacognitive guard, trust scoring, 150+ MCP tools. Works with Claude Code, Codex, Claude Desktop & any MCP client. 100% local, free.",
   "homepage": "https://nexo-brain.com",

package/src/agent_runner.py CHANGED Viewed

@@ -1205,6 +1205,14 @@ def run_automation_prompt(
             f"{selected_backend} automation backend selected but launcher is not installed; fallback blocked."
         )
+    # Fase 1.6 — provider circuit breaker. "Installed" is not "available":
+    # with credits exhausted / rate limited / auth expired, every headless
+    # cron used to launch a session that died mid-flight, burned its retry
+    # budget and escalated to the operator per-item. The breaker fails fast
+    # with a queue-me signal instead; one probe per retry window re-tests.
+    from provider_circuit_breaker import raise_if_unavailable
+    raise_if_unavailable(selected_backend)
     # Resonance map decides (model, effort) for every call. ``caller`` is
     # MANDATORY — every script that invokes the automation backend must be
     # registered in src/resonance_map.py so its reasoning budget is a
@@ -1414,6 +1422,7 @@ def run_automation_prompt(
         stderr = result.stderr or ""
         if not recorded:
             stderr = _append_stderr(stderr, record_error)
+        _record_provider_breaker_outcome(selected_backend, result.returncode, final_stdout, stderr)
         return subprocess.CompletedProcess(
             cmd,
             result.returncode,
@@ -1490,6 +1499,7 @@ def run_automation_prompt(
             stderr = result.stderr or ""
             if not recorded:
                 stderr = _append_stderr(stderr, record_error)
+            _record_provider_breaker_outcome(selected_backend, result.returncode, final_stdout, stderr)
             return subprocess.CompletedProcess(
                 cmd,
                 result.returncode,
@@ -1500,6 +1510,22 @@ def run_automation_prompt(
     raise AutomationBackendUnavailableError(f"Unsupported automation backend: {selected_backend}")
def _record_provider_breaker_outcome(backend: str, returncode: int | None, stdout: str, stderr: str) -> None:
    """Report a finished headless session to the provider circuit breaker.

    The session's return code and output are classified with
    ``classify_session_failure``; a ``None`` classification is recorded as a
    success, anything else as a failure carrying that reason.

    Best-effort by design: every error (including a failed import of the
    breaker module) is swallowed so that breaker bookkeeping can never mask
    the result of the session itself.
    """
    try:
        import provider_circuit_breaker as breaker

        failure = breaker.classify_session_failure(returncode, stdout or "", stderr or "")
        breaker.record_session_outcome(backend, ok=failure is None, reason=failure)
    except Exception:
        # Deliberately silent — see docstring.
        return None
 def probe_automation_backend(
     *,
     backend: str | None = None,

package/src/client_sync.py CHANGED Viewed

@@ -1012,7 +1012,16 @@ def _load_toml_object(path: Path) -> dict:
 def _write_toml_object(path: Path, payload: dict) -> None:
     path.parent.mkdir(parents=True, exist_ok=True)
     lines = _emit_toml_table(payload)
-    path.write_text("\n".join(lines).rstrip() + "\n")
+    content = "\n".join(lines).rstrip() + "\n"
+    # v7.31.x (Fase 1) — write-if-changed: rewriting an identical config.toml
+    # on every Desktop update churns mtime/content signatures and can
+    # re-trigger Codex's hook/trust confirmation prompt for the operator.
+    try:
+        if path.is_file() and path.read_text() == content:
+            return
+    except Exception:
+        pass  # unreadable existing file -> fall through to a clean write
+    path.write_text(content)
 def _sync_codex_managed_config(
@@ -1527,6 +1536,12 @@ def _claude_desktop_managed_metadata(server_config: dict, *, operator_name: str)
 # (followup-runner, email-monitor, deep-sleep, etc.) to work without
 # interactive approval prompts. Without this, Claude Code headless invocations
 # stall waiting for MCP tool approvals.
+#
+# v7.31.x (Fase 1) — "mcp__*" is NOT a valid Claude Code allow rule (allow
+# patterns must name a literal mcp__<server>__ scope; only deny/ask accept
+# bare wildcards). Claude Code skips it and shows a Settings Warning on every
+# launch. List the NEXO-managed servers explicitly instead; user-added
+# servers belong to the user's own config, not to this template.
 _NEXO_HEADLESS_ALLOWLIST = (
     "Bash",
     "Read",
@@ -1539,6 +1554,16 @@ _NEXO_HEADLESS_ALLOWLIST = (
     "NotebookEdit",
     "WebSearch",
     "WebFetch",
+    "mcp__nexo__*",
+    "mcp__nexo_chrome_control__*",
+    "mcp__nexo_desktop_control__*",
+    "mcp__nexo_power_control__*",
+)
+# Entries previously pushed by this template that Claude Code rejects as
+# invalid. The sync REMOVES them so already-contaminated installs stop
+# showing the launch warning. Safe: Claude Code was skipping them anyway.
+_NEXO_INVALID_ALLOWLIST_ENTRIES = (
     "mcp__*",
 )
@@ -1558,6 +1583,12 @@ def _ensure_headless_permissions(payload: dict) -> None:
         allow_list = []
         permissions["allow"] = allow_list
+    # v7.31.x (Fase 1) — migrate away invalid entries this template used to
+    # push (Claude Code skips them and warns on every launch).
+    for invalid in _NEXO_INVALID_ALLOWLIST_ENTRIES:
+        while invalid in allow_list:
+            allow_list.remove(invalid)
     existing = {str(item) for item in allow_list if isinstance(item, str)}
     for entry in _NEXO_HEADLESS_ALLOWLIST:
         if entry not in existing:

package/src/hook_guardrails.py CHANGED Viewed

@@ -42,6 +42,22 @@ PROTOCOL_SKIP_TOOLS = {
     "nexo_rules_check",
 }
 ACTION_TASK_TYPES = {"edit", "execute", "delegate"}
# Phase 1.5 (SPEC-FIABILIDAD-FASES-2026-06) — protocol nudge shaping.
# The "Non-trivial work without nexo_task_open" warning fired on EVERY
# non-trivial tool call from tool #1 (no threshold, no rate limit, no
# session-type awareness) — measurable as noise that gets ignored. Shaping:
#   - threshold: only nudge after N consecutive non-trivial tools w/o task
#   - cooldown: once nudged, stay quiet for a window
#   - headless: runner sessions are covered by HeadlessEnforcer already
#     (enforcement_engine.py, threshold 4/2 + cooldown) — skip the nudge
# Mode is SHADOW by default: visible behaviour is UNCHANGED; decisions are
# logged to runtime/logs/protocol-nudge-shadow.ndjson so the threshold can
# be calibrated with real data before flipping NEXO_PROTOCOL_NUDGE_MODE to
# "active". ("off" disables shaping bookkeeping entirely.)
def _protocol_nudge_env_int(name: str, default: int, *, minimum: int) -> int:
    """Read an integer tuning knob from the environment without ever raising.

    An unset, empty or non-numeric value falls back to ``default``; the
    result is clamped to be at least ``minimum``. A typo in an environment
    variable must not make this module fail at import time.
    """
    raw = str(os.environ.get(name, "") or "").strip()
    if not raw:
        return max(minimum, default)
    try:
        return max(minimum, int(raw))
    except ValueError:
        return max(minimum, default)


PROTOCOL_NUDGE_MODE = str(os.environ.get("NEXO_PROTOCOL_NUDGE_MODE", "shadow")).strip().lower()
PROTOCOL_NUDGE_THRESHOLD = _protocol_nudge_env_int("NEXO_PROTOCOL_NUDGE_THRESHOLD", 6, minimum=1)
PROTOCOL_NUDGE_COOLDOWN_S = _protocol_nudge_env_int("NEXO_PROTOCOL_NUDGE_COOLDOWN_S", 300, minimum=0)
 NEXO_CODE_ROOT = Path(os.environ.get("NEXO_CODE", str(Path(__file__).resolve().parent))).expanduser().resolve()
 LIVE_REPO_ROOT = NEXO_CODE_ROOT.parent if NEXO_CODE_ROOT.name == "src" else NEXO_CODE_ROOT
 PUBLIC_REPO_DIRS = {
@@ -1198,6 +1214,110 @@ def _append_protocol_warning(warnings: list[dict], message: str) -> None:
     warnings.append({"message": clean})
def _protocol_nudge_state_path() -> Path:
    """Return the path of the protocol-nudge state file.

    The root is ``$NEXO_HOME`` when that variable is set and non-empty,
    otherwise ``~/.nexo``.
    """
    home = os.environ.get("NEXO_HOME")
    root = Path(home) if home else Path.home() / ".nexo"
    return root.joinpath("runtime", "data", "protocol-nudge-state.json")
def _protocol_nudge_shadow_log_path() -> Path:
    """Return the path of the protocol-nudge shadow log (NDJSON).

    The root is ``$NEXO_HOME`` when that variable is set and non-empty,
    otherwise ``~/.nexo``.
    """
    home = os.environ.get("NEXO_HOME")
    root = Path(home) if home else Path.home() / ".nexo"
    return root.joinpath("runtime", "logs", "protocol-nudge-shadow.ndjson")
def _shape_protocol_nudge(sid: str) -> dict:
    """Phase 1.5 — decide whether the no-task nudge SHOULD fire under shaping.

    Headless sessions (``NEXO_AUTOMATION=1`` or ``NEXO_HEADLESS=1``) return
    immediately without touching the state file. Otherwise the per-session
    state is loaded from ``_protocol_nudge_state_path()``, entries not updated
    within the last 48 hours are dropped, this session's streak is bumped and
    the threshold / cooldown rules are applied. The updated state is written
    back through a temp file + ``os.replace``; write failures are ignored.

    Never raises: an unreadable, non-dict or partially corrupt state file
    (e.g. a non-numeric ``updated_at`` or ``streak``) is treated as missing
    data instead of breaking the hook.

    Returns:
        ``{"would_emit": bool, "reason": str, "streak": int}``.
    """
    import json as _json
    import time as _time

    def _as_float(value) -> float:
        # Corrupt timestamps count as 0 (i.e. "very old" / "never").
        try:
            return float(value or 0)
        except (TypeError, ValueError):
            return 0.0

    def _as_int(value) -> int:
        # A corrupt streak restarts from 0 rather than crashing the hook.
        try:
            return int(value or 0)
        except (TypeError, ValueError, OverflowError):
            return 0

    headless = (
        str(os.environ.get("NEXO_AUTOMATION", "")).strip() == "1"
        or str(os.environ.get("NEXO_HEADLESS", "")).strip() == "1"
    )
    if headless:
        return {"would_emit": False, "reason": "headless-covered-by-enforcer", "streak": 0}

    state_path = _protocol_nudge_state_path()
    try:
        state = _json.loads(state_path.read_text(encoding="utf-8"))
    except Exception:
        state = {}
    if not isinstance(state, dict):
        state = {}

    now = _time.time()
    # Drop stale sessions (>48h) so the file cannot grow without bound.
    state = {
        key: value
        for key, value in state.items()
        if isinstance(value, dict) and (now - _as_float(value.get("updated_at"))) < 48 * 3600
    }

    entry = state.get(sid) or {}
    streak = _as_int(entry.get("streak")) + 1
    last_nudge_at = _as_float(entry.get("last_nudge_at"))
    entry.update({"streak": streak, "updated_at": now})

    if streak < PROTOCOL_NUDGE_THRESHOLD:
        decision = {"would_emit": False, "reason": "under-threshold", "streak": streak}
    elif last_nudge_at and (now - last_nudge_at) < PROTOCOL_NUDGE_COOLDOWN_S:
        decision = {"would_emit": False, "reason": "cooldown", "streak": streak}
    else:
        entry["last_nudge_at"] = now
        decision = {"would_emit": True, "reason": "threshold-reached", "streak": streak}
    state[sid] = entry

    try:
        state_path.parent.mkdir(parents=True, exist_ok=True)
        tmp = state_path.with_suffix(".json.tmp")
        tmp.write_text(_json.dumps(state, ensure_ascii=False) + "\n", encoding="utf-8")
        os.replace(tmp, state_path)
    except Exception:
        # Persisting the streak is best-effort; the decision is still returned.
        pass
    return decision
def _reset_protocol_nudge_streak(sid: str) -> None:
    """Set the stored nudge streak of ``sid`` back to zero.

    Called for sessions that have an open task. Does nothing when shaping
    mode is ``"off"``, when ``sid`` is empty, or when the state file has no
    entry for the session. Any error while reading or writing the state file
    is swallowed.
    """
    import json as _json

    if not (PROTOCOL_NUDGE_MODE != "off" and sid):
        return

    target = _protocol_nudge_state_path()
    try:
        sessions = _json.loads(target.read_text(encoding="utf-8"))
        if isinstance(sessions, dict) and sid in sessions:
            sessions[sid]["streak"] = 0
            # Write to a sibling temp file, then swap it into place.
            scratch = target.with_suffix(".json.tmp")
            scratch.write_text(_json.dumps(sessions, ensure_ascii=False) + "\n", encoding="utf-8")
            os.replace(scratch, target)
    except Exception:
        pass
def _log_protocol_nudge_shadow(sid: str, decision: dict, emitted_today: bool) -> None:
    """Append one shaping decision to the protocol-nudge shadow log.

    Each call adds a single JSON line holding the timestamp, session id,
    current mode and threshold, the shaping ``decision`` and whether the
    legacy warning was actually emitted. Failures are swallowed so logging
    can never break the hook.
    """
    import json as _json
    import time as _time

    try:
        log_file = _protocol_nudge_shadow_log_path()
        log_file.parent.mkdir(parents=True, exist_ok=True)
        with log_file.open("a", encoding="utf-8") as handle:
            record = {
                "ts": _time.time(),
                "sid": sid,
                "mode": PROTOCOL_NUDGE_MODE,
                "threshold": PROTOCOL_NUDGE_THRESHOLD,
                "decision": decision,
                "legacy_warning_emitted": emitted_today,
            }
            handle.write(_json.dumps(record, ensure_ascii=False) + "\n")
    except Exception:
        pass
 def _collect_protocol_warnings(conn, *, sid: str, tool_name: str) -> list[dict]:
     short_name = _short_tool_name(tool_name)
     if short_name in PROTOCOL_SKIP_TOOLS or short_name not in NON_TRIVIAL_PROTOCOL_TOOLS:
@@ -1214,6 +1334,17 @@ def _collect_protocol_warnings(conn, *, sid: str, tool_name: str) -> list[dict]:
     task = _find_any_open_task(conn, sid)
     has_guard = _session_has_guard_check(conn, sid)
     if not task:
+        # Phase 1.5 — shaping decision. In SHADOW mode (default) the visible
+        # behaviour below is untouched and the decision is only logged so the
+        # threshold can be calibrated; in ACTIVE mode the shaping governs
+        # (headless skip, streak threshold, cooldown); "off" disables both.
+        nudge = None
+        if PROTOCOL_NUDGE_MODE in {"shadow", "active"}:
+            nudge = _shape_protocol_nudge(sid)
+        if PROTOCOL_NUDGE_MODE == "active" and nudge and not nudge["would_emit"]:
+            _log_protocol_nudge_shadow(sid, nudge, emitted_today=False)
+            return warnings
         guard_note = (
             render_core_prompt("hook-protocol-warning-task-open-guard-note")
             if short_name in {"Read", "Bash", "Grep", "Glob"} and not has_guard
@@ -1230,8 +1361,12 @@ def _collect_protocol_warnings(conn, *, sid: str, tool_name: str) -> list[dict]:
             warnings,
             render_core_prompt("hook-protocol-warning-heartbeat-close-evidence"),
         )
+        if PROTOCOL_NUDGE_MODE == "shadow" and nudge is not None:
+            _log_protocol_nudge_shadow(sid, nudge, emitted_today=True)
         return warnings
+    _reset_protocol_nudge_streak(sid)
     task_id = str(task.get("task_id") or "").strip()
     if str(task.get("task_type") or "").strip() in ACTION_TASK_TYPES and not (task.get("opened_with_guard") or has_guard):
         _append_protocol_warning(

package/src/provider_circuit_breaker.py ADDED Viewed

@@ -0,0 +1,230 @@
+"""Provider circuit breaker — Fase 1.6 (SPEC-FIABILIDAD-FASES-2026-06).
+Incident (2026-06-10, operator report): when the selected engine (Claude or
+Codex) is unavailable — credits exhausted, rate limited, auth expired — every
+headless cron (email-monitor, deep-sleep, evolution, catch-up, followups…)
+still launched a session that died mid-flight, burned its retry budget, then
+escalated to the operator by email (in English, regardless of the configured
+language). Work was lost or degraded to manual across the whole system.
+This module gives the single launch path (agent_runner.run_automation_prompt)
+a shared, persisted circuit breaker:
+- ``check_provider_available(backend)``  — gate BEFORE launching.
+- ``classify_session_failure(...)``      — map a dead session to a cause.
+- ``record_session_outcome(backend, …)`` — close on success, open on
+  classified failures (credits/rate-limit/auth open immediately; generic
+  failures only after N consecutive).
+- ``should_notify_operator(backend)``    — True exactly once per opening, so
+  the operator gets ONE notice instead of one per queued item.
+State lives in ``$NEXO_HOME/runtime/data/provider-circuit-breaker.json`` so
+every cron process shares the same view. Writes are atomic (tmp + replace).
+The breaker FAILS OPEN on its own errors: a broken state file must never
+block automations.
+"""
+from __future__ import annotations
+import json
+import os
+import re
+import time
+from pathlib import Path
+# Failure classes that open the breaker on FIRST sight: retrying cannot help
+# until the underlying condition clears.
+HARD_OPEN_REASONS = {"credits", "rate_limit", "auth"}
+# Generic failures (network blips, crashes) need this many consecutive hits
+# before the breaker opens — one flaky session must not pause the fleet.
+GENERIC_OPEN_THRESHOLD = 3
+# How long the breaker stays open before allowing ONE half-open probe call.
+DEFAULT_RETRY_AFTER_S = {
+    "credits": 30 * 60,      # credit top-ups/renewals are slow; probe every 30m
+    "rate_limit": 15 * 60,   # unless the provider told us a reset time
+    "auth": 60 * 60,         # needs operator action; probe hourly anyway
+    "generic": 10 * 60,
+}
+_FAILURE_PATTERNS = (
+    ("credits", re.compile(
+        r"credit balance is too low|insufficient[_ ]quota|exceeded your current quota"
+        r"|billing hard limit|out of credits|usage limit reached|plan limits",
+        re.I)),
+    ("rate_limit", re.compile(
+        r"rate[_ -]?limit|too many requests|\b429\b|overloaded[_ ]error|\b529\b"
+        r"|server overloaded|capacity constraints",
+        re.I)),
+    ("auth", re.compile(
+        r"authentication[_ ]error|\b401\b|unauthorized|oauth token (has )?expired"
+        r"|invalid api key|api key not (found|valid)|please run /login|token_revoked",
+        re.I)),
+)
def _state_path() -> Path:
    """Return the path of the persisted circuit-breaker state file.

    The root is ``$NEXO_HOME`` when that variable is set and non-empty,
    otherwise ``~/.nexo``.
    """
    home = os.environ.get("NEXO_HOME")
    root = Path(home) if home else Path.home() / ".nexo"
    return root.joinpath("runtime", "data", "provider-circuit-breaker.json")
def _now() -> float:
    """Return the current time as a Unix timestamp in seconds."""
    current = time.time()
    return current
def _load_state() -> dict:
    """Load the breaker state from disk.

    Returns an empty dict when the file is missing, unreadable, not valid
    JSON, or does not contain a JSON object.
    """
    try:
        parsed = json.loads(_state_path().read_text(encoding="utf-8"))
    except Exception:
        return {}
    if isinstance(parsed, dict):
        return parsed
    return {}
def _save_state(state: dict) -> None:
    """Persist ``state`` as pretty-printed JSON.

    The payload is written to a sibling ``.json.tmp`` file and moved into
    place with ``os.replace``. Every error is swallowed: the breaker must
    never break the caller.
    """
    try:
        target = _state_path()
        target.parent.mkdir(parents=True, exist_ok=True)
        payload = json.dumps(state, ensure_ascii=False, indent=2) + "\n"
        scratch = target.with_suffix(".json.tmp")
        scratch.write_text(payload, encoding="utf-8")
        os.replace(scratch, target)
    except Exception:
        pass
def _entry(state: dict, backend: str) -> dict:
    """Return the mutable entry for ``backend`` inside ``state``.

    When the key is missing or holds something other than a dict, a fresh
    empty dict is stored under ``backend`` and returned.
    """
    existing = state.get(backend)
    if isinstance(existing, dict):
        return existing
    fresh: dict = {}
    state[backend] = fresh
    return fresh
class ProviderTemporarilyUnavailableError(RuntimeError):
    """The selected provider cannot be used right now (credits/rate/auth).

    Callers should QUEUE/DEFER their work without burning retry budgets; the
    breaker re-probes automatically once ``retry_after`` passes.

    Attributes:
        backend: name of the provider that was refused.
        reason: failure class recorded by the breaker.
        retry_after_ts: Unix timestamp of the next probe, or ``None``.
    """

    def __init__(self, backend: str, reason: str, retry_after_ts: float | None):
        self.backend = backend
        self.reason = reason
        self.retry_after_ts = retry_after_ts
        if retry_after_ts:
            # The hint is rendered in local time, hours and minutes only.
            clock = time.strftime("%H:%M", time.localtime(retry_after_ts))
            suffix = f"; next probe after {clock}"
        else:
            suffix = ""
        message = (
            f"provider '{backend}' temporarily unavailable (reason: {reason}){suffix}. "
            "Work should be queued, not retried blindly."
        )
        super().__init__(message)
def classify_session_failure(returncode: int | None, stdout: str = "", stderr: str = "") -> str | None:
    """Map a finished/dead session to a failure class.

    Returns ``None`` when ``returncode`` is 0. Otherwise stdout and stderr
    are searched together against ``_FAILURE_PATTERNS`` in order and the
    first matching class is returned; with no match the result is
    ``"generic"`` so the threshold logic decides.
    """
    if returncode == 0:
        return None
    combined = f"{stdout or ''}\n{stderr or ''}"
    return next(
        (label for label, matcher in _FAILURE_PATTERNS if matcher.search(combined)),
        "generic",
    )
def check_provider_available(backend: str) -> tuple[bool, dict]:
    """Gate to call BEFORE launching the provider.

    Returns ``(True, entry)`` when the breaker is not open — or when it is
    open but the retry deadline has passed, in which case the caller's
    attempt IS the half-open probe (its outcome is expected to be fed back
    through ``record_session_outcome``). The probe time is stored in
    ``half_open_probe_at``; this function does not consult that field, so
    every caller arriving after the deadline is let through.

    Returns ``(False, entry)`` only while the breaker is open AND its
    ``retry_after`` is a valid timestamp still in the future.

    Fails open on bad state: an open entry whose ``retry_after`` is missing,
    zero or not a number is treated as already expired, so a damaged state
    file can neither block automations forever nor raise into the launcher.
    """
    state = _load_state()
    entry = _entry(state, backend)
    if entry.get("state") != "open":
        return True, entry

    try:
        retry_after = float(entry.get("retry_after") or 0)
    except (TypeError, ValueError):
        retry_after = 0.0

    now = _now()
    # Block only on a deadline that is provably in the future; NaN, zero and
    # past values all fall through to the probe branch.
    if retry_after > now:
        return False, entry

    entry["half_open_probe_at"] = now
    _save_state(state)
    return True, entry
def raise_if_unavailable(backend: str) -> None:
    """Raise when the breaker currently refuses launches for ``backend``.

    Raises:
        ProviderTemporarilyUnavailableError: carrying the stored reason
            (``"unknown"`` when absent) and the stored ``retry_after``
            timestamp (``None`` when absent or zero).
    """
    available, entry = check_provider_available(backend)
    if not available:
        reason = str(entry.get("reason") or "unknown")
        retry_after_ts = float(entry.get("retry_after") or 0) or None
        raise ProviderTemporarilyUnavailableError(backend, reason, retry_after_ts)
def record_session_outcome(
    backend: str,
    *,
    ok: bool,
    reason: str | None = None,
    retry_after_s: float | None = None,
) -> dict:
    """Update the breaker after a session finished (or died).

    Args:
        backend: provider the session ran on.
        ok: True when the session succeeded.
        reason: failure class, normally from ``classify_session_failure``;
            a falsy value is treated as ``"generic"``.
        retry_after_s: optional wait in seconds (e.g. a provider-reported
            reset time); when falsy the per-reason default is used.

    Returns:
        The stored entry for ``backend`` after the update.

    Behaviour:
        * success replaces the entry with a closed one (failure counter 0);
        * a failure in ``HARD_OPEN_REASONS`` opens the breaker at once;
        * other failures open it after ``GENERIC_OPEN_THRESHOLD`` in a row;
        * a failure recorded while the breaker is open and its retry window
          has already expired (i.e. a failed half-open probe) re-arms the
          window whatever the failure class, so one probe per window is
          attempted instead of every caller launching again.
    """
    state = _load_state()
    entry = _entry(state, backend)
    already_open = entry.get("state") == "open"

    if ok:
        state[backend] = {
            "state": "closed",
            "consecutive_failures": 0,
            "closed_at": _now(),
            "recovered_from": entry.get("reason") if already_open else None,
        }
        _save_state(state)
        return state[backend]

    failure_reason = reason or "generic"

    # Tolerate a damaged counter: bookkeeping must not raise into the caller.
    try:
        previous_failures = int(entry.get("consecutive_failures") or 0)
    except (TypeError, ValueError, OverflowError):
        previous_failures = 0
    consecutive = previous_failures + 1
    entry["consecutive_failures"] = consecutive

    now = _now()
    try:
        stored_retry_after = float(entry.get("retry_after") or 0)
    except (TypeError, ValueError):
        stored_retry_after = 0.0
    # Open + expired window means this session was let through as a probe.
    # Failures arriving while the window is still running (sessions that were
    # already in flight) do not shorten or extend it on their own.
    probe_failed = already_open and not (stored_retry_after > now)

    should_open = (
        failure_reason in HARD_OPEN_REASONS
        or consecutive >= GENERIC_OPEN_THRESHOLD
        or probe_failed
    )
    if should_open:
        wait = retry_after_s if retry_after_s else DEFAULT_RETRY_AFTER_S.get(
            failure_reason, DEFAULT_RETRY_AFTER_S["generic"]
        )
        entry.update({
            "state": "open",
            "reason": failure_reason,
            "opened_at": entry.get("opened_at") if already_open else now,
            "retry_after": now + float(wait),
        })
        if not already_open:
            # A new opening re-enables the one-shot operator notice.
            entry["operator_notified_at"] = None
    _save_state(state)
    return entry
def should_notify_operator(backend: str) -> bool:
    """Tell the caller whether it should send the breaker-open notice.

    Returns True only when the breaker for ``backend`` is open and no
    ``operator_notified_at`` is stored yet; in that case the timestamp is
    stored before returning, so later calls for the same opening return
    False.
    """
    state = _load_state()
    entry = _entry(state, backend)
    pending = entry.get("state") == "open" and not entry.get("operator_notified_at")
    if not pending:
        return False
    entry["operator_notified_at"] = _now()
    _save_state(state)
    return True
def breaker_status() -> dict:
    """Return the persisted breaker state as loaded from disk (no writes)."""
    snapshot = _load_state()
    return snapshot

package/src/scripts/nexo-email-monitor.py CHANGED Viewed

@@ -57,6 +57,7 @@ if str(NEXO_CODE) not in sys.path:
     sys.path.insert(0, str(NEXO_CODE))
 from agent_runner import AutomationBackendUnavailableError, run_automation_prompt
+from provider_circuit_breaker import ProviderTemporarilyUnavailableError
 from client_preferences import (
     resolve_automation_backend,
 )
@@ -1997,19 +1998,24 @@ def _localized_operator_escalation_email(
     exhausted_count: int,
     details: str,
 ) -> tuple[str, str]:
+    # Phase 1.6 — subjects are signed by the AGENT (assistant_name, dynamic
+    # per install), not by the product: the operator talks to their agent.
     if _uses_spanish(operator_language):
-        subject = f"[NEXO] Emails requiring manual attention ({exhausted_count})"
+        # Phase 1.6 — this branch used to contain the ENGLISH text copied
+        # verbatim (operator-reported 10-jun: escalation mails arrived in
+        # English with language=es configured). Real Spanish now.
+        subject = f"[{assistant_name}] Emails que necesitan tu atención ({exhausted_count})"
         body = (
-            f"Hello {operator_name},\n\n"
-            f"The following emails have already been attempted {MAX_EMAIL_ATTEMPTS} times "
-            f"without succeeding (the session dies before completion):\n\n{details}\n\n"
-            "I marked them as `needs_interactive`. "
-            f"Open {assistant_name} Desktop and ask about the affected email so it can be resolved manually.\n\n"
+            f"Hola {operator_name},\n\n"
+            f"Los siguientes emails ya se han intentado {MAX_EMAIL_ATTEMPTS} veces "
+            f"sin conseguirlo (la sesión muere antes de terminar):\n\n{details}\n\n"
+            "Los he marcado como `needs_interactive`. "
+            f"Abre {assistant_name} Desktop y pregunta por el email afectado para resolverlo manualmente.\n\n"
             f"— {assistant_name}"
         )
         return subject, body
-    subject = f"[NEXO] Emails requiring manual attention ({exhausted_count})"
+    subject = f"[{assistant_name}] Emails requiring manual attention ({exhausted_count})"
     body = (
         f"Hello {operator_name},\n\n"
         f"The following emails have already been attempted {MAX_EMAIL_ATTEMPTS} times "
@@ -2354,6 +2360,17 @@ def launch_nexo(config, debt_block="", target_emails=None):
             _email_checkpoint_delete(mid)
         return True
+    except ProviderTemporarilyUnavailableError as e:
+        # Fase 1.6 — the engine is alive but unusable (credits/rate/auth).
+        # This attempt must NOT count against the email (the provider being
+        # down is not this email's fault), no scary per-item escalation:
+        # give the attempt back, notify the operator ONCE per opening (in
+        # their language) and let the breaker's probe window decide when to
+        # resume. The work stays queued exactly where it was.
+        log.warning(f"Provider circuit breaker open ({e.backend}: {e.reason}) — queueing work, attempt returned")
+        _decrement_attempts(target_message_ids)
+        _notify_provider_breaker_open_once(e)
+        return False
     except AutomationBackendUnavailableError as e:
         log.error(f"Automation backend unavailable: {e}")
         _persist_failure_checkpoints(error_msg=f"AutomationBackendUnavailable: {e}", last_text="")
@@ -2407,6 +2424,94 @@ def _increment_attempts(email_ids):
         log.warning(f"Failed to increment attempts: {e}")
def _decrement_attempts(email_ids):
    """Give one attempt back to each email in ``email_ids``.

    Used when the launch was vetoed by the provider circuit breaker: the
    provider being down is not the email's fault and must not push it
    towards needs_interactive. The counter never goes below 0.

    Best-effort: database errors are logged as warnings, not raised. The
    connection is always closed, including when an UPDATE or the commit
    fails.
    """
    if not email_ids:
        return
    conn = None
    try:
        conn = sqlite3.connect(str(EMAIL_DB_PATH))
        conn.executemany(
            "UPDATE emails SET attempts = MAX(COALESCE(attempts, 1) - 1, 0) WHERE message_id = ?",
            ((mid,) for mid in email_ids),
        )
        conn.commit()
    except Exception as e:
        log.warning(f"Failed to decrement attempts: {e}")
    finally:
        if conn is not None:
            conn.close()
def _notify_provider_breaker_open_once(error):
    """Send ONE operator notice per breaker opening, in their language.

    ``error`` is expected to expose ``backend``, ``reason`` and
    ``retry_after_ts`` (as ``ProviderTemporarilyUnavailableError`` does).
    The one-shot decision is delegated to
    ``provider_circuit_breaker.should_notify_operator``; when it returns
    False nothing is sent. The body is written to
    ``BASE_DIR / ".breaker-notice-body.txt"`` and handed to the send-reply
    script in a subprocess.

    Success is only logged when the send script exits with status 0; a
    non-zero exit is logged as a warning together with its stderr. Any
    exception is caught and logged — the notice must never break the
    monitor loop.
    """
    try:
        from provider_circuit_breaker import should_notify_operator
        if not should_notify_operator(error.backend):
            return
        operator_name, assistant_name, operator_language = _get_operator_info()
        config = load_config()
        operator_email = config.get("operator_email", "")
        if not operator_email:
            log.warning("Breaker open but no operator_email configured — skipping notice")
            return

        retry_hint = ""
        if error.retry_after_ts:
            retry_hint = datetime.fromtimestamp(error.retry_after_ts).strftime("%H:%M")

        # Unknown reasons are shown verbatim in both languages.
        reason_es = {
            "credits": "créditos agotados",
            "rate_limit": "límite de uso alcanzado",
            "auth": "sesión caducada (hay que volver a conectar)",
        }.get(error.reason, error.reason)
        reason_en = {
            "credits": "credits exhausted",
            "rate_limit": "rate limit reached",
            "auth": "session expired (needs re-login)",
        }.get(error.reason, error.reason)

        if _uses_spanish(operator_language):
            subject = f"[{assistant_name}] Motor {error.backend} en pausa ({reason_es})"
            body = (
                f"Hola {operator_name},\n\n"
                f"He pausado las automatizaciones que usan {error.backend} porque no está disponible: {reason_es}.\n\n"
                "El trabajo pendiente queda EN COLA (no se pierde nada) y se reanudará solo en cuanto el motor vuelva"
                + (f" (próxima comprobación ~{retry_hint})" if retry_hint else "")
                + ".\n\nNo recibirás un aviso por cada tarea: solo este, y otro cuando se reanude.\n\n"
                f"— {assistant_name}"
            )
        else:
            subject = f"[{assistant_name}] Engine {error.backend} paused ({reason_en})"
            body = (
                f"Hello {operator_name},\n\n"
                f"I paused the automations that use {error.backend} because it is unavailable: {reason_en}.\n\n"
                "Pending work stays QUEUED (nothing is lost) and resumes automatically once the engine is back"
                + (f" (next probe ~{retry_hint})" if retry_hint else "")
                + ".\n\nYou will not get one notice per task — just this one, and another when work resumes.\n\n"
                f"— {assistant_name}"
            )

        body_file = BASE_DIR / ".breaker-notice-body.txt"
        body_file.write_text(body, encoding="utf-8")
        send_script = get_send_reply_script_path(local_script_dir=_script_dir)
        result = subprocess.run(
            [
                sys.executable, str(send_script),
                "--to", f"{operator_name} <{operator_email}>",
                "--subject", subject,
                "--body-file", str(body_file),
            ],
            timeout=30,
            capture_output=True,
        )
        if result.returncode != 0:
            # capture_output without text mode yields bytes.
            detail = (result.stderr or b"").decode("utf-8", errors="replace").strip()
            log.warning(
                f"Breaker-open notice send failed (exit {result.returncode}): {detail[:500]}"
            )
            return
        log.info(f"Breaker-open notice sent to operator ({error.backend}: {error.reason})")
    except Exception as e:
        log.warning(f"Failed to send breaker-open notice: {e}")
 def _mark_needs_interactive(email_ids):
     """Mark emails as needs_interactive after too many failed attempts."""
     if not email_ids:

package/src/tools_sessions.py CHANGED Viewed

@@ -1973,13 +1973,47 @@ def _toolbox_summary(conn) -> str:
     return ""
def _log_session_learning_aggregation_shadow(sid: str, *, blocked: bool, pending_count: int) -> None:
    """Phase 1.5 (shadow) — record close-time learning telemetry.

    Appends one JSON line to
    ``<NEXO_HOME or ~/.nexo>/runtime/logs/learning-aggregation-shadow.ndjson``
    with the timestamp, the session id, whether the close was blocked by a
    pending correction, and how many corrections were pending.

    Never raises and never blocks the close flow: every error is swallowed.
    """
    try:
        import json as _json
        import os as _os
        import time as _time
        from pathlib import Path as _Path

        home = _os.environ.get("NEXO_HOME")
        root = _Path(home) if home else _Path.home() / ".nexo"
        log_file = root.joinpath("runtime", "logs", "learning-aggregation-shadow.ndjson")
        log_file.parent.mkdir(parents=True, exist_ok=True)
        with log_file.open("a", encoding="utf-8") as handle:
            record = {
                "ts": _time.time(),
                "sid": sid,
                "close_blocked_by_pending_correction": blocked,
                "pending_corrections_at_close": pending_count,
            }
            handle.write(_json.dumps(record, ensure_ascii=False) + "\n")
    except Exception:
        pass
 def handle_stop(sid: str) -> str:
     """Cleanly close a session, removing it from active sessions immediately."""
+    pending_count = 0
     try:
         from db import list_session_correction_requirements
         pending = list_session_correction_requirements(session_id=sid, status="open", limit=3)
+        pending_count = len(pending or [])
         if pending:
+            _log_session_learning_aggregation_shadow(sid, blocked=True, pending_count=pending_count)
             return (
                 "ERROR: session has user correction(s) without durable learning_add. "
                 "Call nexo_learning_add for the correction before nexo_stop. "
@@ -1987,6 +2021,7 @@ def handle_stop(sid: str) -> str:
             )
     except Exception:
         pass
+    _log_session_learning_aggregation_shadow(sid, blocked=False, pending_count=pending_count)
     _stop_keepalive(sid)
     complete_session(sid)
     return f"Session {sid} closed."