meshcode 2.10.13__tar.gz → 2.10.15__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {meshcode-2.10.13 → meshcode-2.10.15}/PKG-INFO +1 -1
- {meshcode-2.10.13 → meshcode-2.10.15}/meshcode/__init__.py +1 -1
- {meshcode-2.10.13 → meshcode-2.10.15}/meshcode/meshcode_mcp/backend.py +102 -26
- {meshcode-2.10.13 → meshcode-2.10.15}/meshcode/meshcode_mcp/realtime.py +52 -21
- {meshcode-2.10.13 → meshcode-2.10.15}/meshcode/meshcode_mcp/server.py +222 -72
- {meshcode-2.10.13 → meshcode-2.10.15}/meshcode.egg-info/PKG-INFO +1 -1
- {meshcode-2.10.13 → meshcode-2.10.15}/pyproject.toml +1 -1
- {meshcode-2.10.13 → meshcode-2.10.15}/README.md +0 -0
- {meshcode-2.10.13 → meshcode-2.10.15}/meshcode/ascii_art.py +0 -0
- {meshcode-2.10.13 → meshcode-2.10.15}/meshcode/cli.py +0 -0
- {meshcode-2.10.13 → meshcode-2.10.15}/meshcode/comms_v4.py +0 -0
- {meshcode-2.10.13 → meshcode-2.10.15}/meshcode/invites.py +0 -0
- {meshcode-2.10.13 → meshcode-2.10.15}/meshcode/launcher.py +0 -0
- {meshcode-2.10.13 → meshcode-2.10.15}/meshcode/launcher_install.py +0 -0
- {meshcode-2.10.13 → meshcode-2.10.15}/meshcode/meshcode_mcp/__init__.py +0 -0
- {meshcode-2.10.13 → meshcode-2.10.15}/meshcode/meshcode_mcp/__main__.py +0 -0
- {meshcode-2.10.13 → meshcode-2.10.15}/meshcode/meshcode_mcp/test_backend.py +0 -0
- {meshcode-2.10.13 → meshcode-2.10.15}/meshcode/meshcode_mcp/test_realtime.py +0 -0
- {meshcode-2.10.13 → meshcode-2.10.15}/meshcode/meshcode_mcp/test_server_wrapper.py +0 -0
- {meshcode-2.10.13 → meshcode-2.10.15}/meshcode/preferences.py +0 -0
- {meshcode-2.10.13 → meshcode-2.10.15}/meshcode/protocol_v2.py +0 -0
- {meshcode-2.10.13 → meshcode-2.10.15}/meshcode/run_agent.py +0 -0
- {meshcode-2.10.13 → meshcode-2.10.15}/meshcode/secrets.py +0 -0
- {meshcode-2.10.13 → meshcode-2.10.15}/meshcode/self_update.py +0 -0
- {meshcode-2.10.13 → meshcode-2.10.15}/meshcode/setup_clients.py +0 -0
- {meshcode-2.10.13 → meshcode-2.10.15}/meshcode.egg-info/SOURCES.txt +0 -0
- {meshcode-2.10.13 → meshcode-2.10.15}/meshcode.egg-info/dependency_links.txt +0 -0
- {meshcode-2.10.13 → meshcode-2.10.15}/meshcode.egg-info/entry_points.txt +0 -0
- {meshcode-2.10.13 → meshcode-2.10.15}/meshcode.egg-info/requires.txt +0 -0
- {meshcode-2.10.13 → meshcode-2.10.15}/meshcode.egg-info/top_level.txt +0 -0
- {meshcode-2.10.13 → meshcode-2.10.15}/setup.cfg +0 -0
- {meshcode-2.10.13 → meshcode-2.10.15}/tests/test_status_enum_coverage.py +0 -0
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
"""MeshCode — Real-time communication between AI agents."""
|
|
2
|
-
__version__ = "2.10.13"
|
|
2
|
+
__version__ = "2.10.15"
|
|
@@ -5,6 +5,8 @@ Zero deps beyond stdlib (urllib).
|
|
|
5
5
|
"""
|
|
6
6
|
import json
|
|
7
7
|
import os
|
|
8
|
+
import time as _time
|
|
9
|
+
import threading as _threading
|
|
8
10
|
from datetime import datetime
|
|
9
11
|
from pathlib import Path
|
|
10
12
|
from typing import Any, Dict, List, Optional
|
|
@@ -12,6 +14,54 @@ from urllib.error import HTTPError, URLError
|
|
|
12
14
|
from urllib.parse import quote
|
|
13
15
|
from urllib.request import Request, urlopen
|
|
14
16
|
|
|
17
|
+
|
|
18
|
+
# ── Circuit Breaker ──────────────────────────────────────────────
|
|
19
|
+
# Protects against cascading failures when Supabase is down.
|
|
20
|
+
# States: CLOSED (normal) → OPEN (reject fast) → HALF_OPEN (probe)
|
|
21
|
+
class _CircuitBreaker:
|
|
22
|
+
CLOSED = "closed"
|
|
23
|
+
OPEN = "open"
|
|
24
|
+
HALF_OPEN = "half_open"
|
|
25
|
+
|
|
26
|
+
def __init__(self, failure_threshold: int = 5, recovery_timeout: float = 30.0):
|
|
27
|
+
self.failure_threshold = failure_threshold
|
|
28
|
+
self.recovery_timeout = recovery_timeout
|
|
29
|
+
self.state = self.CLOSED
|
|
30
|
+
self.failure_count = 0
|
|
31
|
+
self.last_failure_time = 0.0
|
|
32
|
+
self._lock = _threading.Lock()
|
|
33
|
+
|
|
34
|
+
def can_execute(self) -> bool:
|
|
35
|
+
with self._lock:
|
|
36
|
+
if self.state == self.CLOSED:
|
|
37
|
+
return True
|
|
38
|
+
if self.state == self.OPEN:
|
|
39
|
+
if _time.monotonic() - self.last_failure_time >= self.recovery_timeout:
|
|
40
|
+
self.state = self.HALF_OPEN
|
|
41
|
+
return True
|
|
42
|
+
return False
|
|
43
|
+
# HALF_OPEN: allow one probe
|
|
44
|
+
return True
|
|
45
|
+
|
|
46
|
+
def record_success(self) -> None:
|
|
47
|
+
with self._lock:
|
|
48
|
+
self.failure_count = 0
|
|
49
|
+
self.state = self.CLOSED
|
|
50
|
+
|
|
51
|
+
def record_failure(self) -> None:
|
|
52
|
+
with self._lock:
|
|
53
|
+
self.failure_count += 1
|
|
54
|
+
self.last_failure_time = _time.monotonic()
|
|
55
|
+
if self.failure_count >= self.failure_threshold:
|
|
56
|
+
self.state = self.OPEN
|
|
57
|
+
|
|
58
|
+
@property
|
|
59
|
+
def is_open(self) -> bool:
|
|
60
|
+
return self.state == self.OPEN
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
_circuit = _CircuitBreaker(failure_threshold=5, recovery_timeout=30.0)
|
|
64
|
+
|
|
15
65
|
# Bake in production defaults — RLS-protected publishable key, safe to ship.
|
|
16
66
|
_DEFAULT_SUPABASE_URL = "https://gjinagyyjttyxnaoavnz.supabase.co"
|
|
17
67
|
_DEFAULT_SUPABASE_KEY = "sb_publishable_qwN9PO1L7jUXhhbhhVk2CQ_z1FXG2Qf"
|
|
@@ -66,22 +116,31 @@ def _headers(*, prefer: Optional[str] = None, content_profile: bool = True) -> D
|
|
|
66
116
|
|
|
67
117
|
|
|
68
118
|
def _request(method: str, path: str, *, data: Any = None, prefer: Optional[str] = None) -> Any:
|
|
119
|
+
if not _circuit.can_execute():
|
|
120
|
+
return {"_error": "circuit breaker open — Supabase temporarily unavailable", "_code": 503}
|
|
69
121
|
url = f"{SUPABASE_URL}/rest/v1/{path}"
|
|
70
122
|
body = json.dumps(data).encode("utf-8") if data else None
|
|
71
123
|
req = Request(url, data=body, method=method, headers=_headers(prefer=prefer))
|
|
72
124
|
try:
|
|
73
125
|
with urlopen(req, timeout=10) as resp:
|
|
74
126
|
raw = resp.read().decode("utf-8")
|
|
127
|
+
_circuit.record_success()
|
|
75
128
|
return json.loads(raw) if raw.strip() else None
|
|
76
129
|
except HTTPError as e:
|
|
77
130
|
err = e.read().decode("utf-8", errors="replace")
|
|
131
|
+
# 4xx = client error (not a backend failure), don't trip breaker
|
|
132
|
+
if 400 <= e.code < 500:
|
|
133
|
+
_circuit.record_success()
|
|
134
|
+
else:
|
|
135
|
+
_circuit.record_failure()
|
|
78
136
|
try:
|
|
79
137
|
err_obj = json.loads(err)
|
|
80
138
|
return {"_error": err_obj.get("message", err[:200]), "_code": e.code}
|
|
81
139
|
except Exception:
|
|
82
140
|
return {"_error": err[:200], "_code": e.code}
|
|
83
|
-
except URLError as e:
|
|
84
|
-
|
|
141
|
+
except (URLError, OSError, TimeoutError) as e:
|
|
142
|
+
_circuit.record_failure()
|
|
143
|
+
return {"_error": str(getattr(e, 'reason', e)), "_code": 0}
|
|
85
144
|
|
|
86
145
|
|
|
87
146
|
def sb_select(table: str, filters: str = "", order: Optional[str] = None, limit: Optional[int] = None) -> List[Dict]:
|
|
@@ -136,32 +195,49 @@ def enable_recording(api_key: str, project_id: str, agent_name: str, session_id:
|
|
|
136
195
|
_recording_session_id = session_id
|
|
137
196
|
|
|
138
197
|
|
|
139
|
-
def sb_rpc(fn_name: str, params: Dict) -> Any:
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
err = e.read().decode("utf-8", errors="replace")
|
|
198
|
+
def sb_rpc(fn_name: str, params: Dict, *, _max_retries: int = 3) -> Any:
|
|
199
|
+
import random as _random
|
|
200
|
+
if not _circuit.can_execute():
|
|
201
|
+
return {"_error": "circuit breaker open — Supabase temporarily unavailable", "_circuit": "open"}
|
|
202
|
+
last_err = None
|
|
203
|
+
for attempt in range(_max_retries):
|
|
204
|
+
url = f"{SUPABASE_URL}/rest/v1/rpc/{fn_name}"
|
|
205
|
+
body = json.dumps(params).encode("utf-8")
|
|
206
|
+
req = Request(url, data=body, method="POST", headers=_headers(content_profile=False))
|
|
149
207
|
try:
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
208
|
+
with urlopen(req, timeout=10) as resp:
|
|
209
|
+
raw = resp.read().decode("utf-8")
|
|
210
|
+
result = json.loads(raw) if raw.strip() else None
|
|
211
|
+
_circuit.record_success()
|
|
212
|
+
# Auto-record tool calls to session events (hot-reloadable)
|
|
213
|
+
if _recording_enabled and fn_name not in _SKIP_RECORDING:
|
|
214
|
+
_bg_record("tool_call", {"rpc": fn_name})
|
|
215
|
+
return result
|
|
216
|
+
except HTTPError as e:
|
|
217
|
+
err = e.read().decode("utf-8", errors="replace")
|
|
218
|
+
# 4xx errors are not transient — don't retry, don't trip breaker
|
|
219
|
+
if 400 <= e.code < 500:
|
|
220
|
+
_circuit.record_success()
|
|
221
|
+
try:
|
|
222
|
+
result = {"_error": json.loads(err).get("message", err[:200])}
|
|
223
|
+
except Exception:
|
|
224
|
+
result = {"_error": err[:200]}
|
|
225
|
+
if _recording_enabled and fn_name not in _SKIP_RECORDING:
|
|
226
|
+
_bg_record("error", {"rpc": fn_name, "error": str(err)[:200]})
|
|
227
|
+
return result
|
|
228
|
+
_circuit.record_failure()
|
|
229
|
+
last_err = err
|
|
230
|
+
except (URLError, OSError, TimeoutError) as e:
|
|
231
|
+
_circuit.record_failure()
|
|
232
|
+
last_err = str(getattr(e, 'reason', e))
|
|
233
|
+
# Retry with jitter for transient errors (5xx, network)
|
|
234
|
+
if attempt < _max_retries - 1:
|
|
235
|
+
delay = (2 ** attempt) + _random.uniform(0, 1)
|
|
236
|
+
_time.sleep(delay)
|
|
237
|
+
# All retries exhausted
|
|
161
238
|
if _recording_enabled and fn_name not in _SKIP_RECORDING:
|
|
162
|
-
_bg_record("
|
|
163
|
-
|
|
164
|
-
return result
|
|
239
|
+
_bg_record("error", {"rpc": fn_name, "error": str(last_err)[:200], "retries_exhausted": True})
|
|
240
|
+
return {"_error": str(last_err)[:200] if last_err else "request failed after retries"}
|
|
165
241
|
|
|
166
242
|
|
|
167
243
|
def _bg_record(event_type: str, payload: dict):
|
|
@@ -53,8 +53,9 @@ class RealtimeListener:
|
|
|
53
53
|
self.notify_callback = notify_callback
|
|
54
54
|
self.service_role_key = service_role_key
|
|
55
55
|
|
|
56
|
-
# Last
|
|
57
|
-
self.queue: Deque[Dict] = deque(maxlen=
|
|
56
|
+
# Last 500 unread messages — drained by meshcode_check tool
|
|
57
|
+
self.queue: Deque[Dict] = deque(maxlen=500)
|
|
58
|
+
self._overflow_warned = False
|
|
58
59
|
self._task: Optional[asyncio.Task] = None
|
|
59
60
|
# asyncio.Event() in Py3.10+ no longer requires a running loop, but
|
|
60
61
|
# on older Python or certain Windows event-loop policies it can
|
|
@@ -99,22 +100,39 @@ class RealtimeListener:
|
|
|
99
100
|
pass
|
|
100
101
|
|
|
101
102
|
async def _run(self) -> None:
|
|
102
|
-
"""Outer loop: reconnect with exponential backoff on disconnect."""
|
|
103
|
+
"""Outer loop: reconnect with exponential backoff on disconnect.
|
|
104
|
+
|
|
105
|
+
NEVER gives up — keeps retrying with capped backoff (max 60s).
|
|
106
|
+
The MCP server must stay alive regardless of Realtime health.
|
|
107
|
+
"""
|
|
103
108
|
backoff = 1
|
|
109
|
+
consecutive_failures = 0
|
|
104
110
|
while not self._stop.is_set():
|
|
105
111
|
try:
|
|
106
112
|
await self._connect_and_listen()
|
|
107
113
|
backoff = 1 # reset on clean disconnect
|
|
114
|
+
consecutive_failures = 0
|
|
108
115
|
except asyncio.CancelledError:
|
|
109
116
|
return
|
|
110
117
|
except Exception as e:
|
|
111
|
-
|
|
118
|
+
consecutive_failures += 1
|
|
119
|
+
if consecutive_failures % 10 == 0:
|
|
120
|
+
log.error(
|
|
121
|
+
f"Realtime: {consecutive_failures} consecutive failures — "
|
|
122
|
+
f"still retrying (backoff={backoff}s). Last error: {e}"
|
|
123
|
+
)
|
|
124
|
+
else:
|
|
125
|
+
log.warning(
|
|
126
|
+
f"Realtime connection error ({consecutive_failures}): "
|
|
127
|
+
f"{e}; reconnecting in {backoff}s"
|
|
128
|
+
)
|
|
129
|
+
self._connected = False
|
|
112
130
|
try:
|
|
113
131
|
await asyncio.wait_for(self._stop.wait(), timeout=backoff)
|
|
114
132
|
return # stop signaled
|
|
115
133
|
except asyncio.TimeoutError:
|
|
116
134
|
pass
|
|
117
|
-
backoff = min(backoff * 2,
|
|
135
|
+
backoff = min(backoff * 2, 60)
|
|
118
136
|
|
|
119
137
|
async def _connect_and_listen(self) -> None:
|
|
120
138
|
"""Single connection lifecycle: connect, subscribe, listen."""
|
|
@@ -155,23 +173,33 @@ class RealtimeListener:
|
|
|
155
173
|
await ws.send(json.dumps(join_msg))
|
|
156
174
|
|
|
157
175
|
# Wait for phx_reply to confirm subscription was accepted.
|
|
176
|
+
# Retry up to 3 times with backoff — transient Supabase latency
|
|
177
|
+
# can cause one agent to time out while siblings succeed.
|
|
158
178
|
self._subscription_ok = False
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
179
|
+
for _sub_attempt in range(3):
|
|
180
|
+
try:
|
|
181
|
+
reply_raw = await asyncio.wait_for(ws.recv(), timeout=15.0)
|
|
182
|
+
reply = json.loads(reply_raw)
|
|
183
|
+
reply_status = (reply.get("payload") or {}).get("status")
|
|
184
|
+
if reply_status == "ok":
|
|
185
|
+
self._subscription_ok = True
|
|
186
|
+
log.info(f"Realtime subscription OK for {self.agent_name} on {topic}")
|
|
187
|
+
break
|
|
188
|
+
else:
|
|
189
|
+
log.warning(
|
|
190
|
+
f"Realtime subscription attempt {_sub_attempt+1} failed for {self.agent_name}: "
|
|
191
|
+
f"status={reply_status} payload={reply.get('payload')}"
|
|
192
|
+
)
|
|
193
|
+
except asyncio.TimeoutError:
|
|
194
|
+
log.warning(f"Realtime subscription attempt {_sub_attempt+1} TIMEOUT for {self.agent_name}")
|
|
195
|
+
except Exception as e:
|
|
196
|
+
log.warning(f"Realtime subscription attempt {_sub_attempt+1} error: {e}")
|
|
197
|
+
# Retry: re-send join message after brief backoff
|
|
198
|
+
if _sub_attempt < 2:
|
|
199
|
+
await asyncio.sleep(2 ** _sub_attempt)
|
|
200
|
+
await ws.send(json.dumps(join_msg))
|
|
201
|
+
if not self._subscription_ok:
|
|
202
|
+
log.error(f"Realtime subscription FAILED after 3 attempts for {self.agent_name}")
|
|
175
203
|
|
|
176
204
|
# Heartbeat task to keep the connection alive
|
|
177
205
|
heartbeat_task = asyncio.create_task(self._heartbeat(ws))
|
|
@@ -228,6 +256,9 @@ class RealtimeListener:
|
|
|
228
256
|
"id": record.get("id"),
|
|
229
257
|
"parent_id": record.get("parent_msg_id"),
|
|
230
258
|
}
|
|
259
|
+
if len(self.queue) >= 400 and not self._overflow_warned:
|
|
260
|
+
log.warning(f"Message queue at {len(self.queue)}/500 — risk of dropping messages")
|
|
261
|
+
self._overflow_warned = True
|
|
231
262
|
self.queue.append(enriched)
|
|
232
263
|
# Wake any meshcode_wait blocked on this event.
|
|
233
264
|
try:
|
|
@@ -8,11 +8,14 @@ Run with:
|
|
|
8
8
|
MESHCODE_PROJECT=my-app MESHCODE_AGENT=backend python -m meshcode_mcp serve
|
|
9
9
|
"""
|
|
10
10
|
import asyncio
|
|
11
|
+
import atexit
|
|
11
12
|
import json
|
|
12
13
|
import logging
|
|
13
14
|
import os
|
|
15
|
+
import signal
|
|
14
16
|
import sys
|
|
15
17
|
import hashlib as _hashlib
|
|
18
|
+
import traceback as _traceback
|
|
16
19
|
from collections import deque
|
|
17
20
|
from contextlib import asynccontextmanager
|
|
18
21
|
from typing import Any, Dict, List, Optional, Union
|
|
@@ -64,16 +67,21 @@ def _agent_color(name: str) -> str:
|
|
|
64
67
|
|
|
65
68
|
|
|
66
69
|
def _mc_log(msg: str, level: str = "info") -> None:
|
|
67
|
-
"""Colored [meshcode-mcp] log line. Uses agent color if available."""
|
|
70
|
+
"""Colored [meshcode-mcp] log line. Uses agent color if available.
|
|
71
|
+
|
|
72
|
+
CRITICAL: Must write to stderr, NEVER stdout. MCP protocol uses stdout
|
|
73
|
+
for JSON-RPC — any non-JSON output to stdout corrupts the stream and
|
|
74
|
+
causes Claude Code to kill the connection.
|
|
75
|
+
"""
|
|
68
76
|
agent = os.environ.get("MESHCODE_AGENT", "")
|
|
69
77
|
c = _agent_color(agent) if agent else "\033[36m"
|
|
70
78
|
prefix = f"{c}{_ANSI_BOLD}[meshcode-mcp]{_ANSI_RESET}"
|
|
71
79
|
if level == "error":
|
|
72
|
-
print(f"{prefix} \033[91mERROR:{_ANSI_RESET} {msg}",
|
|
80
|
+
print(f"{prefix} \033[91mERROR:{_ANSI_RESET} {msg}", file=sys.stderr)
|
|
73
81
|
elif level == "warn":
|
|
74
|
-
print(f"{prefix} \033[33mWARNING:{_ANSI_RESET} {msg}",
|
|
82
|
+
print(f"{prefix} \033[33mWARNING:{_ANSI_RESET} {msg}", file=sys.stderr)
|
|
75
83
|
else:
|
|
76
|
-
print(f"{prefix} {c}{msg}{_ANSI_RESET}",
|
|
84
|
+
print(f"{prefix} {c}{msg}{_ANSI_RESET}", file=sys.stderr)
|
|
77
85
|
|
|
78
86
|
|
|
79
87
|
# ============================================================
|
|
@@ -370,34 +378,44 @@ def _get_api_key() -> str:
|
|
|
370
378
|
_API_KEY_CACHE = kc_val
|
|
371
379
|
return kc_val
|
|
372
380
|
except Exception as e:
|
|
373
|
-
_mc_log(f" keychain lookup failed for profile '{profile}': {e}",
|
|
381
|
+
_mc_log(f" keychain lookup failed for profile '{profile}': {e}", file=sys.stderr)
|
|
374
382
|
_API_KEY_CACHE = ""
|
|
375
383
|
return ""
|
|
376
384
|
|
|
377
385
|
|
|
378
|
-
# Resolve project_id at startup. Try in order:
|
|
386
|
+
# Resolve project_id at startup with retry+backoff. Try in order:
|
|
379
387
|
# 1. MESHCODE_PROJECT_ID env var (baked by `meshcode setup`, fastest)
|
|
380
388
|
# 2. mc_resolve_project RPC with the user's api_key (security definer, bypasses RLS)
|
|
381
389
|
# 3. Direct SELECT via get_project_id (only works if RLS is open / user is admin)
|
|
382
390
|
_PROJECT_ID: Optional[str] = os.environ.get("MESHCODE_PROJECT_ID") or None
|
|
383
391
|
if not _PROJECT_ID:
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
392
|
+
_BOOT_MAX_RETRIES = 3
|
|
393
|
+
_BOOT_BACKOFF = [2, 5, 10] # seconds between retries
|
|
394
|
+
for _boot_attempt in range(_BOOT_MAX_RETRIES):
|
|
395
|
+
_api_key = _get_api_key()
|
|
396
|
+
if _api_key:
|
|
397
|
+
try:
|
|
398
|
+
_r = be.sb_rpc("mc_resolve_project", {
|
|
399
|
+
"p_api_key": _api_key,
|
|
400
|
+
"p_project_name": PROJECT_NAME,
|
|
401
|
+
})
|
|
402
|
+
if isinstance(_r, dict) and _r.get("project_id"):
|
|
403
|
+
_PROJECT_ID = _r["project_id"]
|
|
404
|
+
break
|
|
405
|
+
elif isinstance(_r, dict) and _r.get("error"):
|
|
406
|
+
_mc_log(f" mc_resolve_project: {_r['error']}", file=sys.stderr)
|
|
407
|
+
except Exception as _e:
|
|
408
|
+
_mc_log(f" mc_resolve_project failed: {_e}", file=sys.stderr)
|
|
409
|
+
if not _PROJECT_ID:
|
|
410
|
+
_PROJECT_ID = be.get_project_id(PROJECT_NAME)
|
|
411
|
+
if _PROJECT_ID:
|
|
412
|
+
break
|
|
413
|
+
if _boot_attempt < _BOOT_MAX_RETRIES - 1:
|
|
414
|
+
_wait = _BOOT_BACKOFF[_boot_attempt]
|
|
415
|
+
_mc_log(f" project resolution failed (attempt {_boot_attempt+1}/{_BOOT_MAX_RETRIES}), retrying in {_wait}s...", file=sys.stderr)
|
|
416
|
+
_time.sleep(_wait)
|
|
399
417
|
if not _PROJECT_ID:
|
|
400
|
-
_mc_log(f"project '{PROJECT_NAME}' not found (check MESHCODE_KEYCHAIN_PROFILE / MESHCODE_API_KEY)", "error")
|
|
418
|
+
_mc_log(f"project '{PROJECT_NAME}' not found after {_BOOT_MAX_RETRIES} attempts (check MESHCODE_KEYCHAIN_PROFILE / MESHCODE_API_KEY)", "error")
|
|
401
419
|
sys.exit(2)
|
|
402
420
|
|
|
403
421
|
# Resolve project plan for adaptive features (heartbeat interval, etc.)
|
|
@@ -411,7 +429,7 @@ except Exception:
|
|
|
411
429
|
|
|
412
430
|
_register_result = be.register_agent(PROJECT_NAME, AGENT_NAME, AGENT_ROLE or "MCP-connected agent", api_key=_get_api_key())
|
|
413
431
|
if isinstance(_register_result, dict) and _register_result.get("error"):
|
|
414
|
-
_mc_log(f" register failed: {_register_result['error']}",
|
|
432
|
+
_mc_log(f" register failed: {_register_result['error']}", file=sys.stderr)
|
|
415
433
|
|
|
416
434
|
# ── Fetch profile color from dashboard (single source of truth) ──
|
|
417
435
|
try:
|
|
@@ -445,7 +463,7 @@ def _flip_status(status: str, task: str = "") -> bool:
|
|
|
445
463
|
return False
|
|
446
464
|
|
|
447
465
|
if not _flip_status("idle", ""):
|
|
448
|
-
_mc_log(f" could not flip status to idle",
|
|
466
|
+
_mc_log(f" could not flip status to idle", file=sys.stderr)
|
|
449
467
|
|
|
450
468
|
|
|
451
469
|
# ============================================================
|
|
@@ -492,16 +510,17 @@ def _schedule_flip(status: str, task: str = "") -> None:
|
|
|
492
510
|
|
|
493
511
|
|
|
494
512
|
def _set_state(state: str, tool: str = "") -> None:
|
|
495
|
-
"""Update the state machine and broadcast to dashboard."""
|
|
513
|
+
"""Update the state machine and broadcast to dashboard. Thread-safe."""
|
|
496
514
|
global _current_state, _current_tool, _last_tool_at, _working_timer
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
_working_timer
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
515
|
+
with _flip_lock:
|
|
516
|
+
# Cancel any pending working→online timer
|
|
517
|
+
if _working_timer is not None:
|
|
518
|
+
_working_timer.cancel()
|
|
519
|
+
_working_timer = None
|
|
520
|
+
_current_state = state
|
|
521
|
+
_current_tool = tool
|
|
522
|
+
if state == "working":
|
|
523
|
+
_last_tool_at = _time.time()
|
|
505
524
|
_schedule_flip(state, tool)
|
|
506
525
|
|
|
507
526
|
|
|
@@ -552,13 +571,14 @@ def with_working_status(func):
|
|
|
552
571
|
except Exception as e:
|
|
553
572
|
if not skip:
|
|
554
573
|
_auto_learn_error(name, e, list(kwargs.keys()))
|
|
555
|
-
raise
|
|
574
|
+
# NEVER re-raise — return structured error instead of crashing
|
|
575
|
+
import traceback as _tb
|
|
576
|
+
_log_crash_to_db("tool_exception", f"{name}: {type(e).__name__}: {e}\n{_tb.format_exc()[-500:]}")
|
|
577
|
+
return {"error": f"tool {name} failed: {type(e).__name__}: {e}", "_recovered": True}
|
|
556
578
|
finally:
|
|
557
579
|
if not skip:
|
|
558
580
|
global _last_tool_at
|
|
559
581
|
_last_tool_at = _time.time()
|
|
560
|
-
# Don't flip to online here — CPU-based detection in heartbeat
|
|
561
|
-
# will handle the transition when LLM stops generating
|
|
562
582
|
return awrapper
|
|
563
583
|
else:
|
|
564
584
|
@_functools.wraps(func)
|
|
@@ -575,13 +595,14 @@ def with_working_status(func):
|
|
|
575
595
|
except Exception as e:
|
|
576
596
|
if not skip:
|
|
577
597
|
_auto_learn_error(name, e, list(kwargs.keys()))
|
|
578
|
-
raise
|
|
598
|
+
# NEVER re-raise — return structured error instead of crashing
|
|
599
|
+
import traceback as _tb
|
|
600
|
+
_log_crash_to_db("tool_exception", f"{name}: {type(e).__name__}: {e}\n{_tb.format_exc()[-500:]}")
|
|
601
|
+
return {"error": f"tool {name} failed: {type(e).__name__}: {e}", "_recovered": True}
|
|
579
602
|
finally:
|
|
580
603
|
if not skip:
|
|
581
604
|
global _last_tool_at
|
|
582
605
|
_last_tool_at = _time.time()
|
|
583
|
-
# Don't flip to online here — CPU-based detection in heartbeat
|
|
584
|
-
# will handle the transition when LLM stops generating
|
|
585
606
|
return swrapper
|
|
586
607
|
|
|
587
608
|
|
|
@@ -607,7 +628,7 @@ def _acquire_lease() -> bool:
|
|
|
607
628
|
})
|
|
608
629
|
except Exception as e:
|
|
609
630
|
# Non-fatal: RPC might not exist on older servers.
|
|
610
|
-
_mc_log(f"stale-lease pre-clean skipped: {e}",
|
|
631
|
+
_mc_log(f"stale-lease pre-clean skipped: {e}", file=sys.stderr)
|
|
611
632
|
for attempt in range(3):
|
|
612
633
|
try:
|
|
613
634
|
r = be.sb_rpc("mc_acquire_agent_lease", {
|
|
@@ -659,14 +680,14 @@ def _acquire_lease() -> bool:
|
|
|
659
680
|
_mc_log(f"Could not start — agent '{AGENT_NAME}' is running in another window.", "error")
|
|
660
681
|
_mc_log("Close the other window first, or use a different agent name.", "error")
|
|
661
682
|
return False
|
|
662
|
-
_mc_log(f"lease attempt {attempt+1}: {r.get('error')}",
|
|
683
|
+
_mc_log(f"lease attempt {attempt+1}: {r.get('error')}", file=sys.stderr)
|
|
663
684
|
else:
|
|
664
685
|
return True
|
|
665
686
|
except Exception as e:
|
|
666
|
-
_mc_log(f"lease attempt {attempt+1} failed: {e}",
|
|
687
|
+
_mc_log(f"lease attempt {attempt+1} failed: {e}", file=sys.stderr)
|
|
667
688
|
if attempt < 2:
|
|
668
689
|
_time.sleep(2)
|
|
669
|
-
_mc_log(f" lease failed after 3 attempts — proceeding anyway",
|
|
690
|
+
_mc_log(f" lease failed after 3 attempts — proceeding anyway", file=sys.stderr)
|
|
670
691
|
return True
|
|
671
692
|
|
|
672
693
|
if not _acquire_lease():
|
|
@@ -684,7 +705,7 @@ def _boot_diagnostic() -> None:
|
|
|
684
705
|
be.sb_select("mc_projects", f"id=eq.{_PROJECT_ID}", limit=1)
|
|
685
706
|
checks_passed += 1
|
|
686
707
|
except Exception as e:
|
|
687
|
-
print(f"[meshcode] BOOT CHECK FAILED: Supabase API unreachable ({e}). Fix: check network/VPN.",
|
|
708
|
+
print(f"[meshcode] BOOT CHECK FAILED: Supabase API unreachable ({e}). Fix: check network/VPN.", file=sys.stderr)
|
|
688
709
|
|
|
689
710
|
# Check 2: Lease valid
|
|
690
711
|
try:
|
|
@@ -694,11 +715,11 @@ def _boot_diagnostic() -> None:
|
|
|
694
715
|
if agent.get("instance_id") == _INSTANCE_ID:
|
|
695
716
|
checks_passed += 1
|
|
696
717
|
else:
|
|
697
|
-
print(f"[meshcode] BOOT CHECK FAILED: Lease mismatch — expected {_INSTANCE_ID}, got {agent.get('instance_id')}. Fix: restart agent.",
|
|
718
|
+
print(f"[meshcode] BOOT CHECK FAILED: Lease mismatch — expected {_INSTANCE_ID}, got {agent.get('instance_id')}. Fix: restart agent.", file=sys.stderr)
|
|
698
719
|
else:
|
|
699
|
-
print(f"[meshcode] BOOT CHECK FAILED: Agent '{AGENT_NAME}' not found in project. Fix: register agent first.",
|
|
720
|
+
print(f"[meshcode] BOOT CHECK FAILED: Agent '{AGENT_NAME}' not found in project. Fix: register agent first.", file=sys.stderr)
|
|
700
721
|
except Exception as e:
|
|
701
|
-
print(f"[meshcode] BOOT CHECK FAILED: Could not verify lease ({e}).",
|
|
722
|
+
print(f"[meshcode] BOOT CHECK FAILED: Could not verify lease ({e}).", file=sys.stderr)
|
|
702
723
|
|
|
703
724
|
# Check 3: Heartbeat recent
|
|
704
725
|
try:
|
|
@@ -707,7 +728,7 @@ def _boot_diagnostic() -> None:
|
|
|
707
728
|
if hb:
|
|
708
729
|
checks_passed += 1
|
|
709
730
|
else:
|
|
710
|
-
print(f"[meshcode] BOOT CHECK WARNING: No heartbeat recorded yet.",
|
|
731
|
+
print(f"[meshcode] BOOT CHECK WARNING: No heartbeat recorded yet.", file=sys.stderr)
|
|
711
732
|
else:
|
|
712
733
|
checks_passed += 1 # skip if no agent data
|
|
713
734
|
except Exception:
|
|
@@ -724,9 +745,9 @@ def _boot_diagnostic() -> None:
|
|
|
724
745
|
checks_passed += 1 # non-critical
|
|
725
746
|
|
|
726
747
|
if checks_passed == checks_total:
|
|
727
|
-
print(f"[meshcode] All boot checks passed ({checks_passed}/{checks_total}).",
|
|
748
|
+
print(f"[meshcode] All boot checks passed ({checks_passed}/{checks_total}).", file=sys.stderr)
|
|
728
749
|
else:
|
|
729
|
-
print(f"[meshcode] Boot checks: {checks_passed}/{checks_total} passed. Agent starting anyway.",
|
|
750
|
+
print(f"[meshcode] Boot checks: {checks_passed}/{checks_total} passed. Agent starting anyway.", file=sys.stderr)
|
|
730
751
|
|
|
731
752
|
|
|
732
753
|
_boot_diagnostic()
|
|
@@ -747,6 +768,53 @@ def _release_lease() -> None:
|
|
|
747
768
|
pass
|
|
748
769
|
|
|
749
770
|
|
|
771
|
+
# ── Crash logging + graceful shutdown ──────────────────────────
|
|
772
|
+
_SHUTDOWN_LOGGED = False
|
|
773
|
+
|
|
774
|
+
|
|
775
|
+
def _log_crash_to_db(reason: str = "unknown", error_detail: str = "") -> None:
|
|
776
|
+
"""Best-effort crash log to mc_agent_crash_logs table. Non-fatal if table doesn't exist."""
|
|
777
|
+
global _SHUTDOWN_LOGGED
|
|
778
|
+
if _SHUTDOWN_LOGGED:
|
|
779
|
+
return
|
|
780
|
+
_SHUTDOWN_LOGGED = True
|
|
781
|
+
try:
|
|
782
|
+
be.sb_rpc("mc_log_error", {
|
|
783
|
+
"p_api_key": _get_api_key(),
|
|
784
|
+
"p_project_id": _PROJECT_ID,
|
|
785
|
+
"p_agent_name": AGENT_NAME,
|
|
786
|
+
"p_error_type": reason,
|
|
787
|
+
"p_error_detail": error_detail[:2000],
|
|
788
|
+
"p_instance_id": _INSTANCE_ID,
|
|
789
|
+
})
|
|
790
|
+
except Exception:
|
|
791
|
+
# Table may not exist yet — fall back to status update
|
|
792
|
+
try:
|
|
793
|
+
be.set_status(_PROJECT_ID, AGENT_NAME, "offline",
|
|
794
|
+
f"crashed: {reason[:100]}", api_key=_get_api_key())
|
|
795
|
+
except Exception:
|
|
796
|
+
pass
|
|
797
|
+
_mc_log(f" crash logged: {reason}", file=sys.stderr)
|
|
798
|
+
|
|
799
|
+
|
|
800
|
+
def _on_exit() -> None:
|
|
801
|
+
"""atexit handler — release lease and log shutdown."""
|
|
802
|
+
_log_crash_to_db("process_exit", "atexit handler fired")
|
|
803
|
+
_release_lease()
|
|
804
|
+
|
|
805
|
+
|
|
806
|
+
def _on_signal(signum, frame) -> None:
|
|
807
|
+
"""Signal handler for SIGTERM/SIGINT — clean shutdown."""
|
|
808
|
+
sig_name = signal.Signals(signum).name if hasattr(signal, 'Signals') else str(signum)
|
|
809
|
+
_log_crash_to_db("signal", f"Received {sig_name}")
|
|
810
|
+
_release_lease()
|
|
811
|
+
sys.exit(128 + signum)
|
|
812
|
+
|
|
813
|
+
|
|
814
|
+
atexit.register(_on_exit)
|
|
815
|
+
signal.signal(signal.SIGTERM, _on_signal)
|
|
816
|
+
|
|
817
|
+
|
|
750
818
|
# ============================================================
|
|
751
819
|
# Agent identity from Supabase profile (for system instructions)
|
|
752
820
|
# ============================================================
|
|
@@ -1037,25 +1105,28 @@ def _heartbeat_thread_fn():
|
|
|
1037
1105
|
try:
|
|
1038
1106
|
be.sb_rpc("mc_heartbeat", {"p_project_id": _PROJECT_ID, "p_agent_name": AGENT_NAME, "p_version": _SDK_VERSION})
|
|
1039
1107
|
|
|
1040
|
-
# CPU-based status detection
|
|
1108
|
+
# CPU-based status detection — read shared state under lock
|
|
1041
1109
|
parent_cpu = _get_parent_cpu()
|
|
1042
|
-
|
|
1110
|
+
with _flip_lock:
|
|
1111
|
+
cur_state = _current_state
|
|
1112
|
+
in_wait = _IN_WAIT
|
|
1113
|
+
idle_secs = _time.time() - _last_tool_at
|
|
1043
1114
|
|
|
1044
|
-
if
|
|
1115
|
+
if in_wait:
|
|
1045
1116
|
# Actually in meshcode_wait right now — listening for messages
|
|
1046
|
-
if
|
|
1117
|
+
if cur_state != "waiting":
|
|
1047
1118
|
_set_state("waiting", "listening for messages")
|
|
1048
1119
|
elif parent_cpu > 3.0:
|
|
1049
1120
|
# LLM is actively generating tokens or streaming
|
|
1050
|
-
if
|
|
1121
|
+
if cur_state != "working":
|
|
1051
1122
|
_set_state("working", "generating response")
|
|
1052
|
-
elif
|
|
1123
|
+
elif cur_state == "working":
|
|
1053
1124
|
# LLM just stopped — transition to online (not sleeping)
|
|
1054
1125
|
_set_state("online", "")
|
|
1055
|
-
elif
|
|
1126
|
+
elif cur_state == "online" and idle_secs > 30:
|
|
1056
1127
|
# Brief idle — show as idle, not sleeping yet
|
|
1057
1128
|
_set_state("idle", "idle")
|
|
1058
|
-
elif
|
|
1129
|
+
elif cur_state == "idle" and idle_secs > 300 and parent_cpu < 2.0 and not _STAY_AWAKE:
|
|
1059
1130
|
# Extended idle + no CPU activity → sleeping (5 min, not 90s)
|
|
1060
1131
|
_set_state("sleeping", "sleeping")
|
|
1061
1132
|
|
|
@@ -1105,6 +1176,18 @@ async def lifespan(_app):
|
|
|
1105
1176
|
)
|
|
1106
1177
|
await _REALTIME.start()
|
|
1107
1178
|
|
|
1179
|
+
# Wait up to 5s for realtime to report connected before proceeding.
|
|
1180
|
+
# Without this, the lifespan yields before the WS is ready, and Claude
|
|
1181
|
+
# Code's handshake can time out on slower network paths — one agent
|
|
1182
|
+
# fails while siblings on the same box succeed.
|
|
1183
|
+
for _rt_check in range(10):
|
|
1184
|
+
if getattr(_REALTIME, '_connected', False):
|
|
1185
|
+
log.info(f"Realtime connected for {AGENT_NAME}")
|
|
1186
|
+
break
|
|
1187
|
+
await asyncio.sleep(0.5)
|
|
1188
|
+
else:
|
|
1189
|
+
log.warning(f"Realtime not connected after 5s for {AGENT_NAME} — continuing with polling fallback")
|
|
1190
|
+
|
|
1108
1191
|
# IMMEDIATE: send first heartbeat + set online status BEFORE any tool calls.
|
|
1109
1192
|
# Without this, the agent appears offline for up to 30s after boot.
|
|
1110
1193
|
for _attempt in range(3):
|
|
@@ -1152,16 +1235,19 @@ async def lifespan(_app):
|
|
|
1152
1235
|
})
|
|
1153
1236
|
except Exception:
|
|
1154
1237
|
pass # Never block shutdown
|
|
1155
|
-
log.info("lifespan shutdown — stopping heartbeat + realtime
|
|
1156
|
-
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
# Flip to offline + release lease so the dashboard reflects reality
|
|
1160
|
-
# within seconds (not waiting for the 30s cron to notice).
|
|
1238
|
+
log.info("lifespan shutdown — releasing lease + stopping heartbeat + realtime")
|
|
1239
|
+
# Release lease FIRST — before stopping heartbeat thread.
|
|
1240
|
+
# If heartbeat join times out, the lease is already released so
|
|
1241
|
+
# the agent won't block reconnection.
|
|
1161
1242
|
try:
|
|
1162
1243
|
_release_lease()
|
|
1163
1244
|
except Exception as _e:
|
|
1164
1245
|
log.warning(f"could not release lease: {_e}")
|
|
1246
|
+
_heartbeat_stop.set()
|
|
1247
|
+
hb_thread.join(timeout=5)
|
|
1248
|
+
if hb_thread.is_alive():
|
|
1249
|
+
log.warning("heartbeat thread did not stop within 5s — lease already released")
|
|
1250
|
+
await _REALTIME.stop()
|
|
1165
1251
|
|
|
1166
1252
|
|
|
1167
1253
|
# ============================================================
|
|
@@ -1368,9 +1454,9 @@ try:
|
|
|
1368
1454
|
elif isinstance(_ls_val, str):
|
|
1369
1455
|
_LAST_SEEN_TS = _ls_val
|
|
1370
1456
|
if _LAST_SEEN_TS:
|
|
1371
|
-
print(f"[meshcode] Restored last_seen={_LAST_SEEN_TS} from mesh memory.",
|
|
1457
|
+
print(f"[meshcode] Restored last_seen={_LAST_SEEN_TS} from mesh memory.", file=sys.stderr)
|
|
1372
1458
|
except Exception as _e:
|
|
1373
|
-
print(f"[meshcode] Could not restore last_seen: {_e}",
|
|
1459
|
+
print(f"[meshcode] Could not restore last_seen: {_e}", file=sys.stderr)
|
|
1374
1460
|
|
|
1375
1461
|
|
|
1376
1462
|
def _get_pending_tasks_summary() -> Optional[List[Dict[str, str]]]:
|
|
@@ -2166,6 +2252,7 @@ def meshcode_scratchpad_set(key: str, value: Any) -> Dict[str, Any]:
|
|
|
2166
2252
|
json_value = value if isinstance(value, (dict, list)) else {"_raw": value}
|
|
2167
2253
|
return be.sb_rpc("mc_scratchpad_set", {
|
|
2168
2254
|
"p_api_key": api_key,
|
|
2255
|
+
"p_project_id": _PROJECT_ID,
|
|
2169
2256
|
"p_key": key,
|
|
2170
2257
|
"p_value": json_value,
|
|
2171
2258
|
"p_tier": "reference",
|
|
@@ -2182,9 +2269,9 @@ def meshcode_scratchpad_get(key: Optional[str] = None) -> Dict[str, Any]:
|
|
|
2182
2269
|
"""
|
|
2183
2270
|
api_key = _get_api_key()
|
|
2184
2271
|
if key:
|
|
2185
|
-
return be.sb_rpc("mc_scratchpad_get", {"p_api_key": api_key, "p_key": key})
|
|
2272
|
+
return be.sb_rpc("mc_scratchpad_get", {"p_api_key": api_key, "p_project_id": _PROJECT_ID, "p_key": key})
|
|
2186
2273
|
else:
|
|
2187
|
-
return be.sb_rpc("mc_scratchpad_list", {"p_api_key": api_key})
|
|
2274
|
+
return be.sb_rpc("mc_scratchpad_list", {"p_api_key": api_key, "p_project_id": _PROJECT_ID})
|
|
2188
2275
|
|
|
2189
2276
|
|
|
2190
2277
|
# ----------------- OBSIDIAN SYNC HELPER -----------------
|
|
@@ -2332,6 +2419,7 @@ def meshcode_forget(key: str) -> Dict[str, Any]:
|
|
|
2332
2419
|
"p_api_key": api_key,
|
|
2333
2420
|
"p_agent_name": AGENT_NAME,
|
|
2334
2421
|
"p_key": key,
|
|
2422
|
+
"p_project_name": PROJECT_NAME,
|
|
2335
2423
|
})
|
|
2336
2424
|
|
|
2337
2425
|
|
|
@@ -2357,6 +2445,50 @@ def meshcode_recall_search(query: str) -> Dict[str, Any]:
|
|
|
2357
2445
|
})
|
|
2358
2446
|
|
|
2359
2447
|
|
|
2448
|
+
# ----------------- HEALTH CHECK -----------------
|
|
2449
|
+
|
|
2450
|
+
@mcp.tool()
|
|
2451
|
+
def meshcode_health() -> Dict[str, Any]:
|
|
2452
|
+
"""Check MCP server health: DB connectivity, Realtime status, circuit breaker state, uptime."""
|
|
2453
|
+
import time as _t
|
|
2454
|
+
health: Dict[str, Any] = {
|
|
2455
|
+
"agent": AGENT_NAME,
|
|
2456
|
+
"project": PROJECT_NAME,
|
|
2457
|
+
"instance_id": _INSTANCE_ID,
|
|
2458
|
+
"sdk_version": _SDK_VERSION,
|
|
2459
|
+
}
|
|
2460
|
+
|
|
2461
|
+
# DB latency check
|
|
2462
|
+
_start = _t.monotonic()
|
|
2463
|
+
try:
|
|
2464
|
+
r = be.sb_select("mc_projects", f"id=eq.{_PROJECT_ID}", limit=1)
|
|
2465
|
+
health["db_latency_ms"] = round((_t.monotonic() - _start) * 1000, 1)
|
|
2466
|
+
health["db_status"] = "ok" if r else "empty"
|
|
2467
|
+
except Exception as e:
|
|
2468
|
+
health["db_latency_ms"] = round((_t.monotonic() - _start) * 1000, 1)
|
|
2469
|
+
health["db_status"] = f"error: {e}"
|
|
2470
|
+
|
|
2471
|
+
# Circuit breaker state
|
|
2472
|
+
health["circuit_breaker"] = {
|
|
2473
|
+
"state": be._circuit.state,
|
|
2474
|
+
"failure_count": be._circuit.failure_count,
|
|
2475
|
+
"threshold": be._circuit.failure_threshold,
|
|
2476
|
+
}
|
|
2477
|
+
|
|
2478
|
+
# Realtime status
|
|
2479
|
+
health["realtime_connected"] = getattr(_rt_state, 'connected', False) if '_rt_state' in dir() else "unknown"
|
|
2480
|
+
|
|
2481
|
+
# Process uptime
|
|
2482
|
+
try:
|
|
2483
|
+
import psutil
|
|
2484
|
+
proc = psutil.Process()
|
|
2485
|
+
health["uptime_seconds"] = round(_t.time() - proc.create_time(), 0)
|
|
2486
|
+
except Exception:
|
|
2487
|
+
health["uptime_seconds"] = "unknown (psutil not available)"
|
|
2488
|
+
|
|
2489
|
+
return health
|
|
2490
|
+
|
|
2491
|
+
|
|
2360
2492
|
# ----------------- RESOURCES -----------------
|
|
2361
2493
|
|
|
2362
2494
|
@mcp.tool()
|
|
@@ -2484,7 +2616,7 @@ def _auto_update() -> None:
|
|
|
2484
2616
|
return
|
|
2485
2617
|
|
|
2486
2618
|
# 3. Install the new version (blocking, 60s timeout)
|
|
2487
|
-
print(f"[meshcode] Updating {current} → {latest}...",
|
|
2619
|
+
print(f"[meshcode] Updating {current} → {latest}...", file=sys.stderr)
|
|
2488
2620
|
try:
|
|
2489
2621
|
result = subprocess.run(
|
|
2490
2622
|
[sys.executable, "-m", "pip", "install", "--upgrade",
|
|
@@ -2502,7 +2634,7 @@ def _auto_update() -> None:
|
|
|
2502
2634
|
return
|
|
2503
2635
|
|
|
2504
2636
|
# 4. Re-exec to load the new code
|
|
2505
|
-
print(f"[meshcode] Updated to {latest}, restarting...",
|
|
2637
|
+
print(f"[meshcode] Updated to {latest}, restarting...", file=sys.stderr)
|
|
2506
2638
|
os.environ["MESHCODE_UPDATED"] = "1"
|
|
2507
2639
|
try:
|
|
2508
2640
|
os.execv(sys.executable, [sys.executable] + sys.argv)
|
|
@@ -2511,10 +2643,28 @@ def _auto_update() -> None:
|
|
|
2511
2643
|
|
|
2512
2644
|
|
|
2513
2645
|
def run_server():
|
|
2514
|
-
"""Start the MCP server on stdio (default for Claude Code).
|
|
2646
|
+
"""Start the MCP server on stdio (default for Claude Code).
|
|
2647
|
+
|
|
2648
|
+
Wraps mcp.run() with crash recovery: if the event loop dies for any
|
|
2649
|
+
reason, log the crash, release the lease, and exit cleanly instead of
|
|
2650
|
+
leaving the agent in a zombie state.
|
|
2651
|
+
"""
|
|
2515
2652
|
_auto_update()
|
|
2516
2653
|
print(
|
|
2517
2654
|
f"[meshcode-mcp] Starting server for {AGENT_NAME}@{PROJECT_NAME}",
|
|
2518
2655
|
file=sys.stderr,
|
|
2519
2656
|
)
|
|
2520
|
-
|
|
2657
|
+
try:
|
|
2658
|
+
mcp.run()
|
|
2659
|
+
except KeyboardInterrupt:
|
|
2660
|
+
_log_crash_to_db("keyboard_interrupt", "User stopped the agent")
|
|
2661
|
+
except SystemExit as e:
|
|
2662
|
+
_log_crash_to_db("system_exit", f"exit code: {e.code}")
|
|
2663
|
+
raise # re-raise so the process exits with the correct code
|
|
2664
|
+
except Exception as e:
|
|
2665
|
+
import traceback as _tb
|
|
2666
|
+
tb_str = _tb.format_exc()
|
|
2667
|
+
_log_crash_to_db("unhandled_exception", f"{type(e).__name__}: {e}\n{tb_str}")
|
|
2668
|
+
print(f"[meshcode-mcp] FATAL: {e}", file=sys.stderr)
|
|
2669
|
+
print(f"[meshcode-mcp] Stack trace logged to mc_agent_crash_logs", file=sys.stderr)
|
|
2670
|
+
sys.exit(1)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|