nexo-brain 7.9.30 → 7.9.33
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/README.md +7 -1
- package/package.json +1 -1
- package/src/call_model_raw.py +38 -4
- package/src/scripts/nexo-email-monitor.py +291 -2
package/.claude-plugin/plugin.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "nexo-brain",
|
|
3
|
-
"version": "7.9.30",
|
|
3
|
+
"version": "7.9.33",
|
|
4
4
|
"description": "Local cognitive runtime for Claude Code \u2014 persistent memory, overnight learning, doctor diagnostics, personal scripts, recovery-aware jobs, startup preflight, and optional dashboard/power helper.",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "NEXO Brain",
|
package/README.md
CHANGED
|
@@ -18,7 +18,13 @@
|
|
|
18
18
|
|
|
19
19
|
[Watch the overview video](https://nexo-brain.com/watch/) · [Watch on YouTube](https://www.youtube.com/watch?v=i2lkGhKyVqI) · [Open the infographic](https://nexo-brain.com/assets/nexo-brain-infographic-v5.png)
|
|
20
20
|
|
|
21
|
-
Version `7.9.30` […]
|
|
21
|
+
Version `7.9.33` is the current packaged-runtime line. Patch release over `7.9.32`: adds ``usedforsecurity=False`` to the SHA-1 call that derives a filesystem-safe checkpoint filename from the email's Message-ID, so Bandit's B324 audit no longer fails the publish workflow on a non-security usage. The ``v7.9.32`` git tag is preserved for traceability but no npm release ever shipped for it; ``nexo-brain@7.9.33`` is the first release that carries the 7.9.32 email-recovery checkpoints.
|
|
22
|
+
|
|
23
|
+
Previously in `7.9.32`: hardens the email monitor's recovery so emails that fall between Brain releases never end up in a permanent limbo. The periodic ``_recover_unreplied_processed`` sweep now looks back 7 days (was 24h), and every failed worker run persists a per-email checkpoint at ``~/.nexo/nexo-email/checkpoints/`` capturing files touched, last assistant narration, and error. Retry attempts inject that context into the next prompt so a long task (drafting a presentation, multi-step analysis) continues from where the previous attempt died instead of restarting from scratch. Stale checkpoints are pruned automatically after 7 days. 15 new unit tests cover the helpers.
|
|
24
|
+
|
|
25
|
+
Previously in `7.9.31`: fixes a wire-level bug where ``call_model_raw`` was sending ``stop_sequences=["\n", ".", " "]`` by default, which the current Anthropic Messages API rejects with HTTP 400 ``each stop sequence must contain non-whitespace``. The default is now ``None`` (no ``stop_sequences`` field sent) since ``max_tokens=3`` already caps the yes/no classifier output. A local guard rejects whitespace-only caller values up front so the error shows where the caller is, not as a remote 400. Also removes an internal design document that did not belong in the open-source distribution.
|
|
26
|
+
|
|
27
|
+
Previously in `7.9.30`: hotfix for a missing ``import sys`` in ``src/agent_runner.py`` that ruff F821 caught in CI and blocked the 7.9.29 publish workflow before any npm artifact shipped. ``nexo-brain@7.9.30`` is the first npm release that carries the 7.9.29 override-path hardening.
|
|
22
28
|
|
|
23
29
|
Previously in `7.9.29`: hardening pass on the optional LLM endpoint and auth provider override path. The bearer is now passed to the Anthropic SDK via `auth_token` so it lands in the standard `Authorization: Bearer` header (7.9.28 sent it as `X-Api-Key` and any compatible proxy rejected every request with 401). The Brain config directory is resolved on each call instead of cached at import, so LaunchAgent crons that export `NEXO_HOME` via a wrapper now reach the right `~/.nexo/config/`. The `Idempotency-Key` header accepts a caller-provided value so application-level retries reuse the same dedup key. Override mode is strict about its bearer source: if `auth_provider.json` is missing or the helper fails, the call raises `ClassifierUnavailableError` instead of falling back to the operator's real `ANTHROPIC_API_KEY`, which would otherwise leak to the custom proxy as a second header. A new end-to-end test suite drives the real SDK against a local `http.server` and asserts on captured wire headers and body, complementing the SDK-mock unit tests.
|
|
24
30
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "nexo-brain",
|
|
3
|
-
"version": "7.9.30",
|
|
3
|
+
"version": "7.9.33",
|
|
4
4
|
"mcpName": "io.github.wazionapps/nexo",
|
|
5
5
|
"description": "NEXO Brain — Shared brain for AI agents. Persistent memory, semantic RAG, natural forgetting, metacognitive guard, trust scoring, 150+ MCP tools. Works with Claude Code, Codex, Claude Desktop & any MCP client. 100% local, free.",
|
|
6
6
|
"homepage": "https://nexo-brain.com",
|
package/src/call_model_raw.py
CHANGED
|
@@ -469,9 +469,16 @@ def _call_anthropic_raw(
|
|
|
469
469
|
"model": wire_model,
|
|
470
470
|
"max_tokens": max_tokens,
|
|
471
471
|
"temperature": temperature,
|
|
472
|
-
"stop_sequences": stop_sequences,
|
|
473
472
|
"messages": [{"role": "user", "content": prompt}],
|
|
474
473
|
}
|
|
474
|
+
if stop_sequences:
|
|
475
|
+
# Anthropic API rejects whitespace-only stop sequences with
|
|
476
|
+
# 400 ``each stop sequence must contain non-whitespace``. The
|
|
477
|
+
# caller-validation in call_model_raw filters these out before
|
|
478
|
+
# we reach this point; the empty/None case is also covered by
|
|
479
|
+
# the truthy check above so we omit the field entirely instead
|
|
480
|
+
# of sending ``stop_sequences: null`` to the wire.
|
|
481
|
+
kwargs["stop_sequences"] = stop_sequences
|
|
475
482
|
if system:
|
|
476
483
|
kwargs["system"] = system
|
|
477
484
|
|
|
@@ -582,7 +589,20 @@ def call_model_raw(
|
|
|
582
589
|
"enforcer_classifier".
|
|
583
590
|
max_tokens — hard cap on output tokens. Default 3 (yes/no only).
|
|
584
591
|
temperature — sampling temperature. Default 0.0 (deterministic).
|
|
585
|
-
stop_sequences — early-stop strings. Default
|
|
592
|
+
stop_sequences — early-stop strings. Default ``None`` (no stop
|
|
593
|
+
sequence sent on the wire). Anthropic's API
|
|
594
|
+
rejects whitespace-only entries with
|
|
595
|
+
``each stop sequence must contain
|
|
596
|
+
non-whitespace`` (HTTP 400), so the previous
|
|
597
|
+
default of ``["\\n", ".", " "]`` made every
|
|
598
|
+
``enforcer_classifier`` request fail in
|
|
599
|
+
production. ``max_tokens=3`` already serves as
|
|
600
|
+
the hard cap for yes/no classification, so a
|
|
601
|
+
stop sequence is unnecessary by default.
|
|
602
|
+
Callers that want a deterministic stop can
|
|
603
|
+
pass e.g. ``["."]``; whitespace-only entries
|
|
604
|
+
are rejected locally with
|
|
605
|
+
``ClassifierUnavailableError``.
|
|
586
606
|
timeout — per-request timeout in seconds. Default 10.0.
|
|
587
607
|
system — optional system prompt. Default None (provider default).
|
|
588
608
|
idempotency_key — optional opaque token attached as
|
|
@@ -612,8 +632,22 @@ def call_model_raw(
|
|
|
612
632
|
Callers MUST catch this and fall back to a safer default. Fase 2 spec
|
|
613
633
|
0.20 is explicit: silence is not obedience. Never fail-open.
|
|
614
634
|
"""
|
|
615
|
-
if stop_sequences is None:
|
|
616
|
-
|
|
635
|
+
if stop_sequences is not None:
|
|
636
|
+
# Anthropic API: ``each stop sequence must contain
|
|
637
|
+
# non-whitespace`` (HTTP 400). Surface the configuration error
|
|
638
|
+
# locally instead of letting Anthropic 400 the request — and,
|
|
639
|
+
# in override mode, instead of letting the proxy translate that
|
|
640
|
+
# 400 into a misleading ``503 all_providers_down``.
|
|
641
|
+
invalid = [
|
|
642
|
+
repr(s) for s in stop_sequences
|
|
643
|
+
if not isinstance(s, str) or not s.strip()
|
|
644
|
+
]
|
|
645
|
+
if invalid:
|
|
646
|
+
raise ClassifierUnavailableError(
|
|
647
|
+
"stop_sequences contains whitespace-only or non-string "
|
|
648
|
+
f"entries: {', '.join(invalid)}; Anthropic API requires "
|
|
649
|
+
"every stop sequence to contain non-whitespace"
|
|
650
|
+
)
|
|
617
651
|
|
|
618
652
|
# Local imports to avoid circulars and keep agent_runner.py decoupled.
|
|
619
653
|
from client_preferences import ( # type: ignore
|
package/src/scripts/nexo-email-monitor.py
CHANGED
|
@@ -74,6 +74,7 @@ EMAIL_DB_PATH = BASE_DIR / "nexo-email.db"
|
|
|
74
74
|
LOCK_FILE = BASE_DIR / ".lock"
|
|
75
75
|
SESSIONS_FILE = BASE_DIR / ".active-sessions.json"
|
|
76
76
|
WORKER_JOBS_DIR = BASE_DIR / "worker-jobs"
|
|
77
|
+
CHECKPOINTS_DIR = BASE_DIR / "checkpoints"
|
|
77
78
|
LOG_FILE = BASE_DIR / "monitor.log"
|
|
78
79
|
ALERT_FILE = BASE_DIR / ".consecutive-failures"
|
|
79
80
|
EMPTY_BACKOFF_STATE_FILE = BASE_DIR / ".empty-inbox-backoff.json"
|
|
@@ -112,6 +113,7 @@ CREATE INDEX IF NOT EXISTS idx_ee_ts ON email_events(timestamp);
|
|
|
112
113
|
|
|
113
114
|
BASE_DIR.mkdir(parents=True, exist_ok=True)
|
|
114
115
|
WORKER_JOBS_DIR.mkdir(parents=True, exist_ok=True)
|
|
116
|
+
CHECKPOINTS_DIR.mkdir(parents=True, exist_ok=True)
|
|
115
117
|
|
|
116
118
|
# Rotating log: 5MB max, keep 3 backups
|
|
117
119
|
handler = RotatingFileHandler(str(LOG_FILE), maxBytes=5*1024*1024, backupCount=3)
|
|
@@ -121,6 +123,228 @@ log.setLevel(logging.INFO)
|
|
|
121
123
|
log.addHandler(handler)
|
|
122
124
|
|
|
123
125
|
|
|
126
|
+
# ----------------------------------------------------------------------
|
|
127
|
+
# Email checkpoint system
|
|
128
|
+
# ----------------------------------------------------------------------
|
|
129
|
+
# Each email Nexo processes can take a non-trivial amount of work (drafting
|
|
130
|
+
# code, building a presentation, multi-step analysis). When a worker dies
|
|
131
|
+
# mid-flight (Brain release, OOM, timeout, manual reboot) the next retry
|
|
132
|
+
# previously started from scratch — it had no memory of the partial work the
|
|
133
|
+
# previous attempt had already produced. For long replies that meant tokens
|
|
134
|
+
# wasted on re-discovery and, occasionally, half-written files left behind in
|
|
135
|
+
# the working directory with no narrative context.
|
|
136
|
+
#
|
|
137
|
+
# The checkpoint helpers below persist a small JSON record per email-thread
|
|
138
|
+
# at ``~/.nexo/nexo-email/checkpoints/<sha1(message_id)[:16]>.json`` capturing
|
|
139
|
+
# what the previous attempt did so the retry's prompt can include it. The
|
|
140
|
+
# checkpoint is best-effort: if reading or writing fails the worker keeps
|
|
141
|
+
# running, just without the recovery context.
|
|
142
|
+
|
|
143
|
+
import hashlib as _hashlib # alias to keep the public ``hashlib`` import explicit
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def _email_checkpoint_path(message_id: str) -> Path:
|
|
147
|
+
"""Stable, filesystem-safe path for a given Message-ID.
|
|
148
|
+
|
|
149
|
+
Message-IDs contain ``<``, ``>``, ``@`` and other characters that mix
|
|
150
|
+
badly with filesystems, so we hash them. 16 hex chars (~64 bits) is well
|
|
151
|
+
above the collision threshold for the few hundred emails Nexo handles
|
|
152
|
+
per operator, while keeping filenames short enough to skim in a directory
|
|
153
|
+
listing during a debug session.
|
|
154
|
+
"""
|
|
155
|
+
# ``usedforsecurity=False`` declares the hash is purely a filename
|
|
156
|
+
# disambiguator (Message-IDs contain ``<``, ``>``, ``@`` that the FS
|
|
157
|
+
# rejects), not a cryptographic primitive. Bandit B324 flags weak
|
|
158
|
+
# algorithms used for security; this annotation tells it this call
|
|
159
|
+
# is safe by intent.
|
|
160
|
+
digest = _hashlib.sha1( # noqa: S324 - non-security: filename hashing only
|
|
161
|
+
(message_id or "").encode("utf-8"),
|
|
162
|
+
usedforsecurity=False,
|
|
163
|
+
).hexdigest()[:16]
|
|
164
|
+
return CHECKPOINTS_DIR / f"{digest}.json"
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def _email_checkpoint_read(message_id: str) -> dict | None:
|
|
168
|
+
"""Return the checkpoint dict for ``message_id`` if one exists, else None.
|
|
169
|
+
|
|
170
|
+
Returns ``None`` (not raise) on any IO/parse failure so the worker can
|
|
171
|
+
treat "no recovery context" as a safe default.
|
|
172
|
+
"""
|
|
173
|
+
if not message_id:
|
|
174
|
+
return None
|
|
175
|
+
path = _email_checkpoint_path(message_id)
|
|
176
|
+
try:
|
|
177
|
+
if not path.is_file():
|
|
178
|
+
return None
|
|
179
|
+
return json.loads(path.read_text())
|
|
180
|
+
except (OSError, json.JSONDecodeError) as exc:
|
|
181
|
+
log.warning(f"Checkpoint read failed for {message_id}: {exc}")
|
|
182
|
+
return None
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def _email_checkpoint_write(
|
|
186
|
+
*,
|
|
187
|
+
message_id: str,
|
|
188
|
+
subject: str,
|
|
189
|
+
files_touched: list[str],
|
|
190
|
+
last_assistant_text: str,
|
|
191
|
+
last_error: str,
|
|
192
|
+
attempts: int,
|
|
193
|
+
) -> None:
|
|
194
|
+
"""Persist a checkpoint atomically (tmp + rename).
|
|
195
|
+
|
|
196
|
+
Best-effort: any failure is logged at warning level but never raised so
|
|
197
|
+
the worker keeps progressing.
|
|
198
|
+
"""
|
|
199
|
+
if not message_id:
|
|
200
|
+
return
|
|
201
|
+
path = _email_checkpoint_path(message_id)
|
|
202
|
+
existing = _email_checkpoint_read(message_id) or {}
|
|
203
|
+
now_iso = datetime.now().isoformat(timespec="seconds")
|
|
204
|
+
payload = {
|
|
205
|
+
"message_id": message_id,
|
|
206
|
+
"subject": str(subject or "")[:200],
|
|
207
|
+
"first_attempt_at": existing.get("first_attempt_at") or now_iso,
|
|
208
|
+
"last_attempt_at": now_iso,
|
|
209
|
+
"attempts": int(attempts or existing.get("attempts", 0) + 1),
|
|
210
|
+
"files_touched": sorted(set(
|
|
211
|
+
list(existing.get("files_touched") or []) + list(files_touched or [])
|
|
212
|
+
))[:50], # cap so a misbehaving run cannot blow up the checkpoint
|
|
213
|
+
"last_assistant_text": str(last_assistant_text or "")[:4000],
|
|
214
|
+
"last_error": str(last_error or "")[:500],
|
|
215
|
+
}
|
|
216
|
+
try:
|
|
217
|
+
tmp = path.with_suffix(path.suffix + ".tmp")
|
|
218
|
+
tmp.write_text(json.dumps(payload, ensure_ascii=False, indent=2))
|
|
219
|
+
tmp.replace(path)
|
|
220
|
+
except OSError as exc:
|
|
221
|
+
log.warning(f"Checkpoint write failed for {message_id}: {exc}")
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def _email_checkpoint_delete(message_id: str) -> None:
|
|
225
|
+
"""Remove the checkpoint when an email succeeds or is escalated."""
|
|
226
|
+
if not message_id:
|
|
227
|
+
return
|
|
228
|
+
try:
|
|
229
|
+
_email_checkpoint_path(message_id).unlink(missing_ok=True)
|
|
230
|
+
except OSError as exc:
|
|
231
|
+
log.warning(f"Checkpoint delete failed for {message_id}: {exc}")
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def _email_checkpoint_cleanup(*, max_age_days: int = 7) -> int:
|
|
235
|
+
"""Drop checkpoint files older than ``max_age_days``. Idempotent.
|
|
236
|
+
|
|
237
|
+
Returns the number of files removed. Called from ``main()`` once per
|
|
238
|
+
monitor tick; on a healthy Mac this is sub-millisecond because the
|
|
239
|
+
directory rarely holds more than a handful of entries.
|
|
240
|
+
"""
|
|
241
|
+
if not CHECKPOINTS_DIR.is_dir():
|
|
242
|
+
return 0
|
|
243
|
+
cutoff = time.time() - (max_age_days * 86400)
|
|
244
|
+
removed = 0
|
|
245
|
+
for path in CHECKPOINTS_DIR.glob("*.json"):
|
|
246
|
+
try:
|
|
247
|
+
if path.stat().st_mtime < cutoff:
|
|
248
|
+
path.unlink()
|
|
249
|
+
removed += 1
|
|
250
|
+
except OSError:
|
|
251
|
+
continue
|
|
252
|
+
if removed:
|
|
253
|
+
log.info(f"Checkpoint cleanup: removed {removed} stale file(s) older than {max_age_days}d")
|
|
254
|
+
return removed
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def _scan_files_modified_since(working_dir: str | os.PathLike, since_epoch: float, *, max_files: int = 50) -> list[str]:
|
|
258
|
+
"""Return absolute paths in ``working_dir`` whose mtime is newer than
|
|
259
|
+
``since_epoch``. Used after a worker run to capture the files Nexo
|
|
260
|
+
edited or created during the attempt, so a retry can decide whether to
|
|
261
|
+
pick up where it left off.
|
|
262
|
+
|
|
263
|
+
Skips hidden directories, NEXO runtime caches, and Git internals to
|
|
264
|
+
avoid drowning the checkpoint in noise. Caps at ``max_files`` entries
|
|
265
|
+
in case the caller passes a large repository as ``cwd``.
|
|
266
|
+
"""
|
|
267
|
+
root = Path(working_dir or "").expanduser()
|
|
268
|
+
if not root.is_dir():
|
|
269
|
+
return []
|
|
270
|
+
skip_dirs = {".git", ".venv", "node_modules", "__pycache__", ".nexo", "Library", "Documents"}
|
|
271
|
+
out: list[str] = []
|
|
272
|
+
try:
|
|
273
|
+
for child in root.rglob("*"):
|
|
274
|
+
try:
|
|
275
|
+
if any(part in skip_dirs for part in child.parts):
|
|
276
|
+
continue
|
|
277
|
+
if child.is_file() and child.stat().st_mtime > since_epoch:
|
|
278
|
+
out.append(str(child))
|
|
279
|
+
if len(out) >= max_files:
|
|
280
|
+
break
|
|
281
|
+
except OSError:
|
|
282
|
+
continue
|
|
283
|
+
except OSError:
|
|
284
|
+
return []
|
|
285
|
+
return out
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
def _build_previous_progress_block(message_ids: list[str]) -> str:
|
|
289
|
+
"""Build a human-readable section describing progress from prior attempts
|
|
290
|
+
on the given message_ids. Returns an empty string if no checkpoints
|
|
291
|
+
exist, so the prompt builder can append it unconditionally."""
|
|
292
|
+
blocks: list[str] = []
|
|
293
|
+
for mid in message_ids or []:
|
|
294
|
+
cp = _email_checkpoint_read(mid)
|
|
295
|
+
if not cp:
|
|
296
|
+
continue
|
|
297
|
+
subject = cp.get("subject") or "(no subject)"
|
|
298
|
+
attempts = cp.get("attempts") or 1
|
|
299
|
+
files = cp.get("files_touched") or []
|
|
300
|
+
last_text = (cp.get("last_assistant_text") or "").strip()
|
|
301
|
+
last_error = (cp.get("last_error") or "").strip()
|
|
302
|
+
section = [
|
|
303
|
+
f"### Previous attempt on email \"{subject}\"",
|
|
304
|
+
f"- Attempts so far: {attempts}",
|
|
305
|
+
]
|
|
306
|
+
if files:
|
|
307
|
+
section.append(f"- Files the previous attempt touched (may already contain partial work):")
|
|
308
|
+
for f in files[:20]:
|
|
309
|
+
section.append(f" - {f}")
|
|
310
|
+
if last_text:
|
|
311
|
+
section.append("- Last narration captured before the previous attempt died:")
|
|
312
|
+
section.append(" " + last_text.replace("\n", "\n ")[:1500])
|
|
313
|
+
if last_error:
|
|
314
|
+
section.append(f"- Last error: {last_error}")
|
|
315
|
+
section.append(
|
|
316
|
+
"- Decide: continue from where the previous attempt left off (preferred when the partial files are coherent), or start fresh (only if the previous progress is clearly wrong). Either way, do not duplicate work."
|
|
317
|
+
)
|
|
318
|
+
blocks.append("\n".join(section))
|
|
319
|
+
if not blocks:
|
|
320
|
+
return ""
|
|
321
|
+
return "\n\n## Previous attempt context (recovery checkpoint)\n\n" + "\n\n".join(blocks) + "\n"
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
def _extract_last_assistant_text_from_run(stdout: str) -> str:
|
|
325
|
+
"""Best-effort: pull the last assistant-visible text from Claude Code's
|
|
326
|
+
JSON output. Used to give the next attempt's prompt a hint of what the
|
|
327
|
+
dying attempt was thinking. Returns empty string if nothing parseable.
|
|
328
|
+
"""
|
|
329
|
+
raw = (stdout or "").strip()
|
|
330
|
+
if not raw or not raw.startswith("{"):
|
|
331
|
+
return raw[:1000]
|
|
332
|
+
try:
|
|
333
|
+
payload = json.loads(raw)
|
|
334
|
+
except json.JSONDecodeError:
|
|
335
|
+
return raw[:1000]
|
|
336
|
+
# Claude Code 1.x: ``{"result": "...text..."}`` is the canonical exit shape.
|
|
337
|
+
result = payload.get("result")
|
|
338
|
+
if isinstance(result, str) and result.strip():
|
|
339
|
+
return result.strip()[:4000]
|
|
340
|
+
if isinstance(result, dict):
|
|
341
|
+
# Some configs return structured result; collect strings.
|
|
342
|
+
flat = " ".join(str(v) for v in result.values() if isinstance(v, str))
|
|
343
|
+
if flat.strip():
|
|
344
|
+
return flat.strip()[:4000]
|
|
345
|
+
return raw[:1000]
|
|
346
|
+
|
|
347
|
+
|
|
124
348
|
def operator_routing_context() -> str:
|
|
125
349
|
if not ROUTING_RULES_FILE.exists():
|
|
126
350
|
return "No special routing rules."
|
|
@@ -1627,6 +1851,7 @@ def build_processing_prompt(
|
|
|
1627
1851
|
debt_block: str = "",
|
|
1628
1852
|
routing_rules: str = "",
|
|
1629
1853
|
recent_hot_context: str = "",
|
|
1854
|
+
previous_progress_block: str = "",
|
|
1630
1855
|
) -> str:
|
|
1631
1856
|
interactive_emails = list(needs_interactive or [])
|
|
1632
1857
|
target_items = list(target_emails or [])
|
|
@@ -1680,7 +1905,12 @@ def build_processing_prompt(
|
|
|
1680
1905
|
send_reply_script=send_reply_script,
|
|
1681
1906
|
trusted_domains_label=trusted_domains_label,
|
|
1682
1907
|
routing_rules=routing_rules or "No special routing rules.",
|
|
1683
|
-
extra_instructions_block=(
|
|
1908
|
+
extra_instructions_block=(
|
|
1909
|
+
(
|
|
1910
|
+
("\n" + extra_instructions_block.strip() + "\n") if extra_instructions_block.strip() else ""
|
|
1911
|
+
)
|
|
1912
|
+
+ (previous_progress_block or "")
|
|
1913
|
+
),
|
|
1684
1914
|
target_block=target_block,
|
|
1685
1915
|
interactive_block=interactive_block,
|
|
1686
1916
|
debt_block=(f"\n{debt_block.strip()}\n" if str(debt_block or "").strip() else ""),
|
|
@@ -1715,6 +1945,13 @@ def launch_nexo(config, debt_block="", target_emails=None):
|
|
|
1715
1945
|
|
|
1716
1946
|
routing_rules = operator_routing_context()
|
|
1717
1947
|
recent_hot_context = read_recent_hot_context(query="", hours=24, limit=10)
|
|
1948
|
+
target_message_ids = [str(e.get("message_id") or "") for e in (target_emails or []) if e.get("message_id")]
|
|
1949
|
+
previous_progress_block = _build_previous_progress_block(target_message_ids)
|
|
1950
|
+
if previous_progress_block:
|
|
1951
|
+
log.info(
|
|
1952
|
+
f"Resuming from checkpoint(s) for {len(target_message_ids)} email(s); "
|
|
1953
|
+
"previous attempt context attached to prompt."
|
|
1954
|
+
)
|
|
1718
1955
|
prompt = build_processing_prompt(
|
|
1719
1956
|
config=config,
|
|
1720
1957
|
operator_name=operator_name,
|
|
@@ -1734,7 +1971,10 @@ def launch_nexo(config, debt_block="", target_emails=None):
|
|
|
1734
1971
|
debt_block=debt_block,
|
|
1735
1972
|
routing_rules=routing_rules,
|
|
1736
1973
|
recent_hot_context=recent_hot_context,
|
|
1974
|
+
previous_progress_block=previous_progress_block,
|
|
1737
1975
|
)
|
|
1976
|
+
working_dir = config.get("working_dir", str(Path.home()))
|
|
1977
|
+
run_started_at = time.time()
|
|
1738
1978
|
|
|
1739
1979
|
env = os.environ.copy()
|
|
1740
1980
|
env["NEXO_HEADLESS"] = "1" # Skip stop hook post-mortem
|
|
@@ -1763,6 +2003,29 @@ def launch_nexo(config, debt_block="", target_emails=None):
|
|
|
1763
2003
|
requested_timeout = int(config.get("max_process_time", MAX_AUTOMATION_TIMEOUT_SECONDS) or MAX_AUTOMATION_TIMEOUT_SECONDS)
|
|
1764
2004
|
effective_timeout = max(60, min(requested_timeout, MAX_AUTOMATION_TIMEOUT_SECONDS))
|
|
1765
2005
|
|
|
2006
|
+
def _persist_failure_checkpoints(*, error_msg: str, last_text: str) -> None:
|
|
2007
|
+
"""Capture per-email checkpoint when the run did not complete OK so
|
|
2008
|
+
the next attempt's prompt carries the previous attempt's progress.
|
|
2009
|
+
Best-effort: never raises out of here."""
|
|
2010
|
+
if not target_message_ids:
|
|
2011
|
+
return
|
|
2012
|
+
try:
|
|
2013
|
+
files_touched = _scan_files_modified_since(working_dir, run_started_at)
|
|
2014
|
+
except Exception:
|
|
2015
|
+
files_touched = []
|
|
2016
|
+
for em in target_emails or []:
|
|
2017
|
+
mid = str(em.get("message_id") or "")
|
|
2018
|
+
if not mid:
|
|
2019
|
+
continue
|
|
2020
|
+
_email_checkpoint_write(
|
|
2021
|
+
message_id=mid,
|
|
2022
|
+
subject=str(em.get("subject") or ""),
|
|
2023
|
+
files_touched=files_touched,
|
|
2024
|
+
last_assistant_text=last_text,
|
|
2025
|
+
last_error=error_msg,
|
|
2026
|
+
attempts=int((em.get("attempts") or 0) + 1),
|
|
2027
|
+
)
|
|
2028
|
+
|
|
1766
2029
|
try:
|
|
1767
2030
|
result = run_automation_prompt(
|
|
1768
2031
|
prompt,
|
|
@@ -1781,17 +2044,33 @@ def launch_nexo(config, debt_block="", target_emails=None):
|
|
|
1781
2044
|
log.error(f"NEXO exit code {result.returncode}")
|
|
1782
2045
|
if result.stderr:
|
|
1783
2046
|
log.error(f"stderr: {result.stderr[:500]}")
|
|
2047
|
+
_persist_failure_checkpoints(
|
|
2048
|
+
error_msg=f"exit {result.returncode}: {(result.stderr or '')[:200]}",
|
|
2049
|
+
last_text=_extract_last_assistant_text_from_run(result.stdout or ""),
|
|
2050
|
+
)
|
|
1784
2051
|
return False
|
|
2052
|
+
# Success: drop checkpoints for the emails the worker just handled,
|
|
2053
|
+
# so the recovery context does not leak into a future, unrelated
|
|
2054
|
+
# attempt on the same Message-ID (rare, but possible after a
|
|
2055
|
+
# status reset by ``_recover_unreplied_processed``).
|
|
2056
|
+
for mid in target_message_ids:
|
|
2057
|
+
_email_checkpoint_delete(mid)
|
|
1785
2058
|
return True
|
|
1786
2059
|
|
|
1787
2060
|
except AutomationBackendUnavailableError as e:
|
|
1788
2061
|
log.error(f"Automation backend unavailable: {e}")
|
|
2062
|
+
_persist_failure_checkpoints(error_msg=f"AutomationBackendUnavailable: {e}", last_text="")
|
|
1789
2063
|
return False
|
|
1790
2064
|
except subprocess.TimeoutExpired:
|
|
1791
2065
|
log.error(f"Email automation exceeded {effective_timeout}s and was terminated")
|
|
2066
|
+
_persist_failure_checkpoints(
|
|
2067
|
+
error_msg=f"timeout after {effective_timeout}s",
|
|
2068
|
+
last_text="",
|
|
2069
|
+
)
|
|
1792
2070
|
return False
|
|
1793
2071
|
except Exception as e:
|
|
1794
2072
|
log.error(f"Launch error: {e}")
|
|
2073
|
+
_persist_failure_checkpoints(error_msg=f"unexpected: {e}", last_text="")
|
|
1795
2074
|
return False
|
|
1796
2075
|
def track_failure(success):
|
|
1797
2076
|
"""Track consecutive failures. Alert if 3+ in a row."""
|
|
@@ -1914,8 +2193,18 @@ def main():
|
|
|
1914
2193
|
|
|
1915
2194
|
reconcile_orphaned_seen(config, hours=24)
|
|
1916
2195
|
reconcile_terminal_unseen(config, hours=48)
|
|
1917
|
-
|
|
2196
|
+
# Recovery window widened from 24h to 7 days (168h): a single email can
|
|
2197
|
+
# fall between several Brain releases in a short window (4 releases in
|
|
2198
|
+
# one day on 2026-04-26). The 24h sweep let those drop into a permanent
|
|
2199
|
+
# limbo because the next sweep happened after the email was already
|
|
2200
|
+
# outside the lookback. 7 days is large enough to absorb a normal
|
|
2201
|
+
# release cadence while still small enough that very old "stuck"
|
|
2202
|
+
# emails are not retried indefinitely. Companion checkpoint system in
|
|
2203
|
+
# ``_email_checkpoint_*`` lets a retried email continue from the
|
|
2204
|
+
# previous attempt's progress instead of restarting from scratch.
|
|
2205
|
+
_recover_unreplied_processed(config, hours=168)
|
|
1918
2206
|
preregistered_count = preregister_pending_emails(config)
|
|
2207
|
+
_email_checkpoint_cleanup(max_age_days=7)
|
|
1919
2208
|
|
|
1920
2209
|
# --- Concurrency check ---
|
|
1921
2210
|
active_count = _active_session_count()
|