meshcode 2.10.13__tar.gz → 2.10.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {meshcode-2.10.13 → meshcode-2.10.14}/PKG-INFO +1 -1
- {meshcode-2.10.13 → meshcode-2.10.14}/meshcode/__init__.py +1 -1
- {meshcode-2.10.13 → meshcode-2.10.14}/meshcode/meshcode_mcp/backend.py +102 -26
- {meshcode-2.10.13 → meshcode-2.10.14}/meshcode/meshcode_mcp/realtime.py +52 -21
- {meshcode-2.10.13 → meshcode-2.10.14}/meshcode/meshcode_mcp/server.py +195 -50
- {meshcode-2.10.13 → meshcode-2.10.14}/meshcode.egg-info/PKG-INFO +1 -1
- {meshcode-2.10.13 → meshcode-2.10.14}/pyproject.toml +1 -1
- {meshcode-2.10.13 → meshcode-2.10.14}/README.md +0 -0
- {meshcode-2.10.13 → meshcode-2.10.14}/meshcode/ascii_art.py +0 -0
- {meshcode-2.10.13 → meshcode-2.10.14}/meshcode/cli.py +0 -0
- {meshcode-2.10.13 → meshcode-2.10.14}/meshcode/comms_v4.py +0 -0
- {meshcode-2.10.13 → meshcode-2.10.14}/meshcode/invites.py +0 -0
- {meshcode-2.10.13 → meshcode-2.10.14}/meshcode/launcher.py +0 -0
- {meshcode-2.10.13 → meshcode-2.10.14}/meshcode/launcher_install.py +0 -0
- {meshcode-2.10.13 → meshcode-2.10.14}/meshcode/meshcode_mcp/__init__.py +0 -0
- {meshcode-2.10.13 → meshcode-2.10.14}/meshcode/meshcode_mcp/__main__.py +0 -0
- {meshcode-2.10.13 → meshcode-2.10.14}/meshcode/meshcode_mcp/test_backend.py +0 -0
- {meshcode-2.10.13 → meshcode-2.10.14}/meshcode/meshcode_mcp/test_realtime.py +0 -0
- {meshcode-2.10.13 → meshcode-2.10.14}/meshcode/meshcode_mcp/test_server_wrapper.py +0 -0
- {meshcode-2.10.13 → meshcode-2.10.14}/meshcode/preferences.py +0 -0
- {meshcode-2.10.13 → meshcode-2.10.14}/meshcode/protocol_v2.py +0 -0
- {meshcode-2.10.13 → meshcode-2.10.14}/meshcode/run_agent.py +0 -0
- {meshcode-2.10.13 → meshcode-2.10.14}/meshcode/secrets.py +0 -0
- {meshcode-2.10.13 → meshcode-2.10.14}/meshcode/self_update.py +0 -0
- {meshcode-2.10.13 → meshcode-2.10.14}/meshcode/setup_clients.py +0 -0
- {meshcode-2.10.13 → meshcode-2.10.14}/meshcode.egg-info/SOURCES.txt +0 -0
- {meshcode-2.10.13 → meshcode-2.10.14}/meshcode.egg-info/dependency_links.txt +0 -0
- {meshcode-2.10.13 → meshcode-2.10.14}/meshcode.egg-info/entry_points.txt +0 -0
- {meshcode-2.10.13 → meshcode-2.10.14}/meshcode.egg-info/requires.txt +0 -0
- {meshcode-2.10.13 → meshcode-2.10.14}/meshcode.egg-info/top_level.txt +0 -0
- {meshcode-2.10.13 → meshcode-2.10.14}/setup.cfg +0 -0
- {meshcode-2.10.13 → meshcode-2.10.14}/tests/test_status_enum_coverage.py +0 -0
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
"""MeshCode — Real-time communication between AI agents."""
|
|
2
|
-
__version__ = "2.10.13"
|
|
2
|
+
__version__ = "2.10.14"
|
|
@@ -5,6 +5,8 @@ Zero deps beyond stdlib (urllib).
|
|
|
5
5
|
"""
|
|
6
6
|
import json
|
|
7
7
|
import os
|
|
8
|
+
import time as _time
|
|
9
|
+
import threading as _threading
|
|
8
10
|
from datetime import datetime
|
|
9
11
|
from pathlib import Path
|
|
10
12
|
from typing import Any, Dict, List, Optional
|
|
@@ -12,6 +14,54 @@ from urllib.error import HTTPError, URLError
|
|
|
12
14
|
from urllib.parse import quote
|
|
13
15
|
from urllib.request import Request, urlopen
|
|
14
16
|
|
|
17
|
+
|
|
18
|
+
# ── Circuit Breaker ──────────────────────────────────────────────
|
|
19
|
+
# Protects against cascading failures when Supabase is down.
|
|
20
|
+
# States: CLOSED (normal) → OPEN (reject fast) → HALF_OPEN (probe)
|
|
21
|
+
class _CircuitBreaker:
|
|
22
|
+
CLOSED = "closed"
|
|
23
|
+
OPEN = "open"
|
|
24
|
+
HALF_OPEN = "half_open"
|
|
25
|
+
|
|
26
|
+
def __init__(self, failure_threshold: int = 5, recovery_timeout: float = 30.0):
|
|
27
|
+
self.failure_threshold = failure_threshold
|
|
28
|
+
self.recovery_timeout = recovery_timeout
|
|
29
|
+
self.state = self.CLOSED
|
|
30
|
+
self.failure_count = 0
|
|
31
|
+
self.last_failure_time = 0.0
|
|
32
|
+
self._lock = _threading.Lock()
|
|
33
|
+
|
|
34
|
+
def can_execute(self) -> bool:
|
|
35
|
+
with self._lock:
|
|
36
|
+
if self.state == self.CLOSED:
|
|
37
|
+
return True
|
|
38
|
+
if self.state == self.OPEN:
|
|
39
|
+
if _time.monotonic() - self.last_failure_time >= self.recovery_timeout:
|
|
40
|
+
self.state = self.HALF_OPEN
|
|
41
|
+
return True
|
|
42
|
+
return False
|
|
43
|
+
# HALF_OPEN: allow one probe
|
|
44
|
+
return True
|
|
45
|
+
|
|
46
|
+
def record_success(self) -> None:
|
|
47
|
+
with self._lock:
|
|
48
|
+
self.failure_count = 0
|
|
49
|
+
self.state = self.CLOSED
|
|
50
|
+
|
|
51
|
+
def record_failure(self) -> None:
|
|
52
|
+
with self._lock:
|
|
53
|
+
self.failure_count += 1
|
|
54
|
+
self.last_failure_time = _time.monotonic()
|
|
55
|
+
if self.failure_count >= self.failure_threshold:
|
|
56
|
+
self.state = self.OPEN
|
|
57
|
+
|
|
58
|
+
@property
|
|
59
|
+
def is_open(self) -> bool:
|
|
60
|
+
return self.state == self.OPEN
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
_circuit = _CircuitBreaker(failure_threshold=5, recovery_timeout=30.0)
|
|
64
|
+
|
|
15
65
|
# Bake in production defaults — RLS-protected publishable key, safe to ship.
|
|
16
66
|
_DEFAULT_SUPABASE_URL = "https://gjinagyyjttyxnaoavnz.supabase.co"
|
|
17
67
|
_DEFAULT_SUPABASE_KEY = "sb_publishable_qwN9PO1L7jUXhhbhhVk2CQ_z1FXG2Qf"
|
|
@@ -66,22 +116,31 @@ def _headers(*, prefer: Optional[str] = None, content_profile: bool = True) -> D
|
|
|
66
116
|
|
|
67
117
|
|
|
68
118
|
def _request(method: str, path: str, *, data: Any = None, prefer: Optional[str] = None) -> Any:
|
|
119
|
+
if not _circuit.can_execute():
|
|
120
|
+
return {"_error": "circuit breaker open — Supabase temporarily unavailable", "_code": 503}
|
|
69
121
|
url = f"{SUPABASE_URL}/rest/v1/{path}"
|
|
70
122
|
body = json.dumps(data).encode("utf-8") if data else None
|
|
71
123
|
req = Request(url, data=body, method=method, headers=_headers(prefer=prefer))
|
|
72
124
|
try:
|
|
73
125
|
with urlopen(req, timeout=10) as resp:
|
|
74
126
|
raw = resp.read().decode("utf-8")
|
|
127
|
+
_circuit.record_success()
|
|
75
128
|
return json.loads(raw) if raw.strip() else None
|
|
76
129
|
except HTTPError as e:
|
|
77
130
|
err = e.read().decode("utf-8", errors="replace")
|
|
131
|
+
# 4xx = client error (not a backend failure), don't trip breaker
|
|
132
|
+
if 400 <= e.code < 500:
|
|
133
|
+
_circuit.record_success()
|
|
134
|
+
else:
|
|
135
|
+
_circuit.record_failure()
|
|
78
136
|
try:
|
|
79
137
|
err_obj = json.loads(err)
|
|
80
138
|
return {"_error": err_obj.get("message", err[:200]), "_code": e.code}
|
|
81
139
|
except Exception:
|
|
82
140
|
return {"_error": err[:200], "_code": e.code}
|
|
83
|
-
except URLError as e:
|
|
84
|
-
|
|
141
|
+
except (URLError, OSError, TimeoutError) as e:
|
|
142
|
+
_circuit.record_failure()
|
|
143
|
+
return {"_error": str(getattr(e, 'reason', e)), "_code": 0}
|
|
85
144
|
|
|
86
145
|
|
|
87
146
|
def sb_select(table: str, filters: str = "", order: Optional[str] = None, limit: Optional[int] = None) -> List[Dict]:
|
|
@@ -136,32 +195,49 @@ def enable_recording(api_key: str, project_id: str, agent_name: str, session_id:
|
|
|
136
195
|
_recording_session_id = session_id
|
|
137
196
|
|
|
138
197
|
|
|
139
|
-
def sb_rpc(fn_name: str, params: Dict) -> Any:
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
err = e.read().decode("utf-8", errors="replace")
|
|
198
|
+
def sb_rpc(fn_name: str, params: Dict, *, _max_retries: int = 3) -> Any:
|
|
199
|
+
import random as _random
|
|
200
|
+
if not _circuit.can_execute():
|
|
201
|
+
return {"_error": "circuit breaker open — Supabase temporarily unavailable", "_circuit": "open"}
|
|
202
|
+
last_err = None
|
|
203
|
+
for attempt in range(_max_retries):
|
|
204
|
+
url = f"{SUPABASE_URL}/rest/v1/rpc/{fn_name}"
|
|
205
|
+
body = json.dumps(params).encode("utf-8")
|
|
206
|
+
req = Request(url, data=body, method="POST", headers=_headers(content_profile=False))
|
|
149
207
|
try:
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
208
|
+
with urlopen(req, timeout=10) as resp:
|
|
209
|
+
raw = resp.read().decode("utf-8")
|
|
210
|
+
result = json.loads(raw) if raw.strip() else None
|
|
211
|
+
_circuit.record_success()
|
|
212
|
+
# Auto-record tool calls to session events (hot-reloadable)
|
|
213
|
+
if _recording_enabled and fn_name not in _SKIP_RECORDING:
|
|
214
|
+
_bg_record("tool_call", {"rpc": fn_name})
|
|
215
|
+
return result
|
|
216
|
+
except HTTPError as e:
|
|
217
|
+
err = e.read().decode("utf-8", errors="replace")
|
|
218
|
+
# 4xx errors are not transient — don't retry, don't trip breaker
|
|
219
|
+
if 400 <= e.code < 500:
|
|
220
|
+
_circuit.record_success()
|
|
221
|
+
try:
|
|
222
|
+
result = {"_error": json.loads(err).get("message", err[:200])}
|
|
223
|
+
except Exception:
|
|
224
|
+
result = {"_error": err[:200]}
|
|
225
|
+
if _recording_enabled and fn_name not in _SKIP_RECORDING:
|
|
226
|
+
_bg_record("error", {"rpc": fn_name, "error": str(err)[:200]})
|
|
227
|
+
return result
|
|
228
|
+
_circuit.record_failure()
|
|
229
|
+
last_err = err
|
|
230
|
+
except (URLError, OSError, TimeoutError) as e:
|
|
231
|
+
_circuit.record_failure()
|
|
232
|
+
last_err = str(getattr(e, 'reason', e))
|
|
233
|
+
# Retry with jitter for transient errors (5xx, network)
|
|
234
|
+
if attempt < _max_retries - 1:
|
|
235
|
+
delay = (2 ** attempt) + _random.uniform(0, 1)
|
|
236
|
+
_time.sleep(delay)
|
|
237
|
+
# All retries exhausted
|
|
161
238
|
if _recording_enabled and fn_name not in _SKIP_RECORDING:
|
|
162
|
-
_bg_record("
|
|
163
|
-
|
|
164
|
-
return result
|
|
239
|
+
_bg_record("error", {"rpc": fn_name, "error": str(last_err)[:200], "retries_exhausted": True})
|
|
240
|
+
return {"_error": str(last_err)[:200] if last_err else "request failed after retries"}
|
|
165
241
|
|
|
166
242
|
|
|
167
243
|
def _bg_record(event_type: str, payload: dict):
|
|
@@ -53,8 +53,9 @@ class RealtimeListener:
|
|
|
53
53
|
self.notify_callback = notify_callback
|
|
54
54
|
self.service_role_key = service_role_key
|
|
55
55
|
|
|
56
|
-
# Last
|
|
57
|
-
self.queue: Deque[Dict] = deque(maxlen=
|
|
56
|
+
# Last 500 unread messages — drained by meshcode_check tool
|
|
57
|
+
self.queue: Deque[Dict] = deque(maxlen=500)
|
|
58
|
+
self._overflow_warned = False
|
|
58
59
|
self._task: Optional[asyncio.Task] = None
|
|
59
60
|
# asyncio.Event() in Py3.10+ no longer requires a running loop, but
|
|
60
61
|
# on older Python or certain Windows event-loop policies it can
|
|
@@ -99,22 +100,39 @@ class RealtimeListener:
|
|
|
99
100
|
pass
|
|
100
101
|
|
|
101
102
|
async def _run(self) -> None:
|
|
102
|
-
"""Outer loop: reconnect with exponential backoff on disconnect."""
|
|
103
|
+
"""Outer loop: reconnect with exponential backoff on disconnect.
|
|
104
|
+
|
|
105
|
+
NEVER gives up — keeps retrying with capped backoff (max 60s).
|
|
106
|
+
The MCP server must stay alive regardless of Realtime health.
|
|
107
|
+
"""
|
|
103
108
|
backoff = 1
|
|
109
|
+
consecutive_failures = 0
|
|
104
110
|
while not self._stop.is_set():
|
|
105
111
|
try:
|
|
106
112
|
await self._connect_and_listen()
|
|
107
113
|
backoff = 1 # reset on clean disconnect
|
|
114
|
+
consecutive_failures = 0
|
|
108
115
|
except asyncio.CancelledError:
|
|
109
116
|
return
|
|
110
117
|
except Exception as e:
|
|
111
|
-
|
|
118
|
+
consecutive_failures += 1
|
|
119
|
+
if consecutive_failures % 10 == 0:
|
|
120
|
+
log.error(
|
|
121
|
+
f"Realtime: {consecutive_failures} consecutive failures — "
|
|
122
|
+
f"still retrying (backoff={backoff}s). Last error: {e}"
|
|
123
|
+
)
|
|
124
|
+
else:
|
|
125
|
+
log.warning(
|
|
126
|
+
f"Realtime connection error ({consecutive_failures}): "
|
|
127
|
+
f"{e}; reconnecting in {backoff}s"
|
|
128
|
+
)
|
|
129
|
+
self._connected = False
|
|
112
130
|
try:
|
|
113
131
|
await asyncio.wait_for(self._stop.wait(), timeout=backoff)
|
|
114
132
|
return # stop signaled
|
|
115
133
|
except asyncio.TimeoutError:
|
|
116
134
|
pass
|
|
117
|
-
backoff = min(backoff * 2,
|
|
135
|
+
backoff = min(backoff * 2, 60)
|
|
118
136
|
|
|
119
137
|
async def _connect_and_listen(self) -> None:
|
|
120
138
|
"""Single connection lifecycle: connect, subscribe, listen."""
|
|
@@ -155,23 +173,33 @@ class RealtimeListener:
|
|
|
155
173
|
await ws.send(json.dumps(join_msg))
|
|
156
174
|
|
|
157
175
|
# Wait for phx_reply to confirm subscription was accepted.
|
|
176
|
+
# Retry up to 3 times with backoff — transient Supabase latency
|
|
177
|
+
# can cause one agent to time out while siblings succeed.
|
|
158
178
|
self._subscription_ok = False
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
179
|
+
for _sub_attempt in range(3):
|
|
180
|
+
try:
|
|
181
|
+
reply_raw = await asyncio.wait_for(ws.recv(), timeout=15.0)
|
|
182
|
+
reply = json.loads(reply_raw)
|
|
183
|
+
reply_status = (reply.get("payload") or {}).get("status")
|
|
184
|
+
if reply_status == "ok":
|
|
185
|
+
self._subscription_ok = True
|
|
186
|
+
log.info(f"Realtime subscription OK for {self.agent_name} on {topic}")
|
|
187
|
+
break
|
|
188
|
+
else:
|
|
189
|
+
log.warning(
|
|
190
|
+
f"Realtime subscription attempt {_sub_attempt+1} failed for {self.agent_name}: "
|
|
191
|
+
f"status={reply_status} payload={reply.get('payload')}"
|
|
192
|
+
)
|
|
193
|
+
except asyncio.TimeoutError:
|
|
194
|
+
log.warning(f"Realtime subscription attempt {_sub_attempt+1} TIMEOUT for {self.agent_name}")
|
|
195
|
+
except Exception as e:
|
|
196
|
+
log.warning(f"Realtime subscription attempt {_sub_attempt+1} error: {e}")
|
|
197
|
+
# Retry: re-send join message after brief backoff
|
|
198
|
+
if _sub_attempt < 2:
|
|
199
|
+
await asyncio.sleep(2 ** _sub_attempt)
|
|
200
|
+
await ws.send(json.dumps(join_msg))
|
|
201
|
+
if not self._subscription_ok:
|
|
202
|
+
log.error(f"Realtime subscription FAILED after 3 attempts for {self.agent_name}")
|
|
175
203
|
|
|
176
204
|
# Heartbeat task to keep the connection alive
|
|
177
205
|
heartbeat_task = asyncio.create_task(self._heartbeat(ws))
|
|
@@ -228,6 +256,9 @@ class RealtimeListener:
|
|
|
228
256
|
"id": record.get("id"),
|
|
229
257
|
"parent_id": record.get("parent_msg_id"),
|
|
230
258
|
}
|
|
259
|
+
if len(self.queue) >= 400 and not self._overflow_warned:
|
|
260
|
+
log.warning(f"Message queue at {len(self.queue)}/500 — risk of dropping messages")
|
|
261
|
+
self._overflow_warned = True
|
|
231
262
|
self.queue.append(enriched)
|
|
232
263
|
# Wake any meshcode_wait blocked on this event.
|
|
233
264
|
try:
|
|
@@ -8,11 +8,14 @@ Run with:
|
|
|
8
8
|
MESHCODE_PROJECT=my-app MESHCODE_AGENT=backend python -m meshcode_mcp serve
|
|
9
9
|
"""
|
|
10
10
|
import asyncio
|
|
11
|
+
import atexit
|
|
11
12
|
import json
|
|
12
13
|
import logging
|
|
13
14
|
import os
|
|
15
|
+
import signal
|
|
14
16
|
import sys
|
|
15
17
|
import hashlib as _hashlib
|
|
18
|
+
import traceback as _traceback
|
|
16
19
|
from collections import deque
|
|
17
20
|
from contextlib import asynccontextmanager
|
|
18
21
|
from typing import Any, Dict, List, Optional, Union
|
|
@@ -375,29 +378,39 @@ def _get_api_key() -> str:
|
|
|
375
378
|
return ""
|
|
376
379
|
|
|
377
380
|
|
|
378
|
-
# Resolve project_id at startup. Try in order:
|
|
381
|
+
# Resolve project_id at startup with retry+backoff. Try in order:
|
|
379
382
|
# 1. MESHCODE_PROJECT_ID env var (baked by `meshcode setup`, fastest)
|
|
380
383
|
# 2. mc_resolve_project RPC with the user's api_key (security definer, bypasses RLS)
|
|
381
384
|
# 3. Direct SELECT via get_project_id (only works if RLS is open / user is admin)
|
|
382
385
|
_PROJECT_ID: Optional[str] = os.environ.get("MESHCODE_PROJECT_ID") or None
|
|
383
386
|
if not _PROJECT_ID:
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
387
|
+
_BOOT_MAX_RETRIES = 3
|
|
388
|
+
_BOOT_BACKOFF = [2, 5, 10] # seconds between retries
|
|
389
|
+
for _boot_attempt in range(_BOOT_MAX_RETRIES):
|
|
390
|
+
_api_key = _get_api_key()
|
|
391
|
+
if _api_key:
|
|
392
|
+
try:
|
|
393
|
+
_r = be.sb_rpc("mc_resolve_project", {
|
|
394
|
+
"p_api_key": _api_key,
|
|
395
|
+
"p_project_name": PROJECT_NAME,
|
|
396
|
+
})
|
|
397
|
+
if isinstance(_r, dict) and _r.get("project_id"):
|
|
398
|
+
_PROJECT_ID = _r["project_id"]
|
|
399
|
+
break
|
|
400
|
+
elif isinstance(_r, dict) and _r.get("error"):
|
|
401
|
+
_mc_log(f" mc_resolve_project: {_r['error']}", "warn")
|
|
402
|
+
except Exception as _e:
|
|
403
|
+
_mc_log(f" mc_resolve_project failed: {_e}", "warn")
|
|
404
|
+
if not _PROJECT_ID:
|
|
405
|
+
_PROJECT_ID = be.get_project_id(PROJECT_NAME)
|
|
406
|
+
if _PROJECT_ID:
|
|
407
|
+
break
|
|
408
|
+
if _boot_attempt < _BOOT_MAX_RETRIES - 1:
|
|
409
|
+
_wait = _BOOT_BACKOFF[_boot_attempt]
|
|
410
|
+
_mc_log(f" project resolution failed (attempt {_boot_attempt+1}/{_BOOT_MAX_RETRIES}), retrying in {_wait}s...", "warn")
|
|
411
|
+
_time.sleep(_wait)
|
|
399
412
|
if not _PROJECT_ID:
|
|
400
|
-
_mc_log(f"project '{PROJECT_NAME}' not found (check MESHCODE_KEYCHAIN_PROFILE / MESHCODE_API_KEY)", "error")
|
|
413
|
+
_mc_log(f"project '{PROJECT_NAME}' not found after {_BOOT_MAX_RETRIES} attempts (check MESHCODE_KEYCHAIN_PROFILE / MESHCODE_API_KEY)", "error")
|
|
401
414
|
sys.exit(2)
|
|
402
415
|
|
|
403
416
|
# Resolve project plan for adaptive features (heartbeat interval, etc.)
|
|
@@ -492,16 +505,17 @@ def _schedule_flip(status: str, task: str = "") -> None:
|
|
|
492
505
|
|
|
493
506
|
|
|
494
507
|
def _set_state(state: str, tool: str = "") -> None:
|
|
495
|
-
"""Update the state machine and broadcast to dashboard."""
|
|
508
|
+
"""Update the state machine and broadcast to dashboard. Thread-safe."""
|
|
496
509
|
global _current_state, _current_tool, _last_tool_at, _working_timer
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
_working_timer
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
510
|
+
with _flip_lock:
|
|
511
|
+
# Cancel any pending working→online timer
|
|
512
|
+
if _working_timer is not None:
|
|
513
|
+
_working_timer.cancel()
|
|
514
|
+
_working_timer = None
|
|
515
|
+
_current_state = state
|
|
516
|
+
_current_tool = tool
|
|
517
|
+
if state == "working":
|
|
518
|
+
_last_tool_at = _time.time()
|
|
505
519
|
_schedule_flip(state, tool)
|
|
506
520
|
|
|
507
521
|
|
|
@@ -552,13 +566,14 @@ def with_working_status(func):
|
|
|
552
566
|
except Exception as e:
|
|
553
567
|
if not skip:
|
|
554
568
|
_auto_learn_error(name, e, list(kwargs.keys()))
|
|
555
|
-
raise
|
|
569
|
+
# NEVER re-raise — return structured error instead of crashing
|
|
570
|
+
import traceback as _tb
|
|
571
|
+
_log_crash_to_db("tool_exception", f"{name}: {type(e).__name__}: {e}\n{_tb.format_exc()[-500:]}")
|
|
572
|
+
return {"error": f"tool {name} failed: {type(e).__name__}: {e}", "_recovered": True}
|
|
556
573
|
finally:
|
|
557
574
|
if not skip:
|
|
558
575
|
global _last_tool_at
|
|
559
576
|
_last_tool_at = _time.time()
|
|
560
|
-
# Don't flip to online here — CPU-based detection in heartbeat
|
|
561
|
-
# will handle the transition when LLM stops generating
|
|
562
577
|
return awrapper
|
|
563
578
|
else:
|
|
564
579
|
@_functools.wraps(func)
|
|
@@ -575,13 +590,14 @@ def with_working_status(func):
|
|
|
575
590
|
except Exception as e:
|
|
576
591
|
if not skip:
|
|
577
592
|
_auto_learn_error(name, e, list(kwargs.keys()))
|
|
578
|
-
raise
|
|
593
|
+
# NEVER re-raise — return structured error instead of crashing
|
|
594
|
+
import traceback as _tb
|
|
595
|
+
_log_crash_to_db("tool_exception", f"{name}: {type(e).__name__}: {e}\n{_tb.format_exc()[-500:]}")
|
|
596
|
+
return {"error": f"tool {name} failed: {type(e).__name__}: {e}", "_recovered": True}
|
|
579
597
|
finally:
|
|
580
598
|
if not skip:
|
|
581
599
|
global _last_tool_at
|
|
582
600
|
_last_tool_at = _time.time()
|
|
583
|
-
# Don't flip to online here — CPU-based detection in heartbeat
|
|
584
|
-
# will handle the transition when LLM stops generating
|
|
585
601
|
return swrapper
|
|
586
602
|
|
|
587
603
|
|
|
@@ -747,6 +763,53 @@ def _release_lease() -> None:
|
|
|
747
763
|
pass
|
|
748
764
|
|
|
749
765
|
|
|
766
|
+
# ── Crash logging + graceful shutdown ──────────────────────────
|
|
767
|
+
_SHUTDOWN_LOGGED = False
|
|
768
|
+
|
|
769
|
+
|
|
770
|
+
def _log_crash_to_db(reason: str = "unknown", error_detail: str = "") -> None:
|
|
771
|
+
"""Best-effort crash log to mc_agent_crash_logs table. Non-fatal if table doesn't exist."""
|
|
772
|
+
global _SHUTDOWN_LOGGED
|
|
773
|
+
if _SHUTDOWN_LOGGED:
|
|
774
|
+
return
|
|
775
|
+
_SHUTDOWN_LOGGED = True
|
|
776
|
+
try:
|
|
777
|
+
be.sb_rpc("mc_log_error", {
|
|
778
|
+
"p_api_key": _get_api_key(),
|
|
779
|
+
"p_project_id": _PROJECT_ID,
|
|
780
|
+
"p_agent_name": AGENT_NAME,
|
|
781
|
+
"p_error_type": reason,
|
|
782
|
+
"p_error_detail": error_detail[:2000],
|
|
783
|
+
"p_instance_id": _INSTANCE_ID,
|
|
784
|
+
})
|
|
785
|
+
except Exception:
|
|
786
|
+
# Table may not exist yet — fall back to status update
|
|
787
|
+
try:
|
|
788
|
+
be.set_status(_PROJECT_ID, AGENT_NAME, "offline",
|
|
789
|
+
f"crashed: {reason[:100]}", api_key=_get_api_key())
|
|
790
|
+
except Exception:
|
|
791
|
+
pass
|
|
792
|
+
_mc_log(f" crash logged: {reason}", "warn")
|
|
793
|
+
|
|
794
|
+
|
|
795
|
+
def _on_exit() -> None:
|
|
796
|
+
"""atexit handler — release lease and log shutdown."""
|
|
797
|
+
_log_crash_to_db("process_exit", "atexit handler fired")
|
|
798
|
+
_release_lease()
|
|
799
|
+
|
|
800
|
+
|
|
801
|
+
def _on_signal(signum, frame) -> None:
|
|
802
|
+
"""Signal handler for SIGTERM/SIGINT — clean shutdown."""
|
|
803
|
+
sig_name = signal.Signals(signum).name if hasattr(signal, 'Signals') else str(signum)
|
|
804
|
+
_log_crash_to_db("signal", f"Received {sig_name}")
|
|
805
|
+
_release_lease()
|
|
806
|
+
sys.exit(128 + signum)
|
|
807
|
+
|
|
808
|
+
|
|
809
|
+
atexit.register(_on_exit)
|
|
810
|
+
signal.signal(signal.SIGTERM, _on_signal)
|
|
811
|
+
|
|
812
|
+
|
|
750
813
|
# ============================================================
|
|
751
814
|
# Agent identity from Supabase profile (for system instructions)
|
|
752
815
|
# ============================================================
|
|
@@ -1037,25 +1100,28 @@ def _heartbeat_thread_fn():
|
|
|
1037
1100
|
try:
|
|
1038
1101
|
be.sb_rpc("mc_heartbeat", {"p_project_id": _PROJECT_ID, "p_agent_name": AGENT_NAME, "p_version": _SDK_VERSION})
|
|
1039
1102
|
|
|
1040
|
-
# CPU-based status detection
|
|
1103
|
+
# CPU-based status detection — read shared state under lock
|
|
1041
1104
|
parent_cpu = _get_parent_cpu()
|
|
1042
|
-
|
|
1105
|
+
with _flip_lock:
|
|
1106
|
+
cur_state = _current_state
|
|
1107
|
+
in_wait = _IN_WAIT
|
|
1108
|
+
idle_secs = _time.time() - _last_tool_at
|
|
1043
1109
|
|
|
1044
|
-
if
|
|
1110
|
+
if in_wait:
|
|
1045
1111
|
# Actually in meshcode_wait right now — listening for messages
|
|
1046
|
-
if
|
|
1112
|
+
if cur_state != "waiting":
|
|
1047
1113
|
_set_state("waiting", "listening for messages")
|
|
1048
1114
|
elif parent_cpu > 3.0:
|
|
1049
1115
|
# LLM is actively generating tokens or streaming
|
|
1050
|
-
if
|
|
1116
|
+
if cur_state != "working":
|
|
1051
1117
|
_set_state("working", "generating response")
|
|
1052
|
-
elif
|
|
1118
|
+
elif cur_state == "working":
|
|
1053
1119
|
# LLM just stopped — transition to online (not sleeping)
|
|
1054
1120
|
_set_state("online", "")
|
|
1055
|
-
elif
|
|
1121
|
+
elif cur_state == "online" and idle_secs > 30:
|
|
1056
1122
|
# Brief idle — show as idle, not sleeping yet
|
|
1057
1123
|
_set_state("idle", "idle")
|
|
1058
|
-
elif
|
|
1124
|
+
elif cur_state == "idle" and idle_secs > 300 and parent_cpu < 2.0 and not _STAY_AWAKE:
|
|
1059
1125
|
# Extended idle + no CPU activity → sleeping (5 min, not 90s)
|
|
1060
1126
|
_set_state("sleeping", "sleeping")
|
|
1061
1127
|
|
|
@@ -1105,6 +1171,18 @@ async def lifespan(_app):
|
|
|
1105
1171
|
)
|
|
1106
1172
|
await _REALTIME.start()
|
|
1107
1173
|
|
|
1174
|
+
# Wait up to 5s for realtime to report connected before proceeding.
|
|
1175
|
+
# Without this, the lifespan yields before the WS is ready, and Claude
|
|
1176
|
+
# Code's handshake can time out on slower network paths — one agent
|
|
1177
|
+
# fails while siblings on the same box succeed.
|
|
1178
|
+
for _rt_check in range(10):
|
|
1179
|
+
if getattr(_REALTIME, '_connected', False):
|
|
1180
|
+
log.info(f"Realtime connected for {AGENT_NAME}")
|
|
1181
|
+
break
|
|
1182
|
+
await asyncio.sleep(0.5)
|
|
1183
|
+
else:
|
|
1184
|
+
log.warning(f"Realtime not connected after 5s for {AGENT_NAME} — continuing with polling fallback")
|
|
1185
|
+
|
|
1108
1186
|
# IMMEDIATE: send first heartbeat + set online status BEFORE any tool calls.
|
|
1109
1187
|
# Without this, the agent appears offline for up to 30s after boot.
|
|
1110
1188
|
for _attempt in range(3):
|
|
@@ -1152,16 +1230,19 @@ async def lifespan(_app):
|
|
|
1152
1230
|
})
|
|
1153
1231
|
except Exception:
|
|
1154
1232
|
pass # Never block shutdown
|
|
1155
|
-
log.info("lifespan shutdown — stopping heartbeat + realtime
|
|
1156
|
-
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
# Flip to offline + release lease so the dashboard reflects reality
|
|
1160
|
-
# within seconds (not waiting for the 30s cron to notice).
|
|
1233
|
+
log.info("lifespan shutdown — releasing lease + stopping heartbeat + realtime")
|
|
1234
|
+
# Release lease FIRST — before stopping heartbeat thread.
|
|
1235
|
+
# If heartbeat join times out, the lease is already released so
|
|
1236
|
+
# the agent won't block reconnection.
|
|
1161
1237
|
try:
|
|
1162
1238
|
_release_lease()
|
|
1163
1239
|
except Exception as _e:
|
|
1164
1240
|
log.warning(f"could not release lease: {_e}")
|
|
1241
|
+
_heartbeat_stop.set()
|
|
1242
|
+
hb_thread.join(timeout=5)
|
|
1243
|
+
if hb_thread.is_alive():
|
|
1244
|
+
log.warning("heartbeat thread did not stop within 5s — lease already released")
|
|
1245
|
+
await _REALTIME.stop()
|
|
1165
1246
|
|
|
1166
1247
|
|
|
1167
1248
|
# ============================================================
|
|
@@ -2166,6 +2247,7 @@ def meshcode_scratchpad_set(key: str, value: Any) -> Dict[str, Any]:
|
|
|
2166
2247
|
json_value = value if isinstance(value, (dict, list)) else {"_raw": value}
|
|
2167
2248
|
return be.sb_rpc("mc_scratchpad_set", {
|
|
2168
2249
|
"p_api_key": api_key,
|
|
2250
|
+
"p_project_id": _PROJECT_ID,
|
|
2169
2251
|
"p_key": key,
|
|
2170
2252
|
"p_value": json_value,
|
|
2171
2253
|
"p_tier": "reference",
|
|
@@ -2182,9 +2264,9 @@ def meshcode_scratchpad_get(key: Optional[str] = None) -> Dict[str, Any]:
|
|
|
2182
2264
|
"""
|
|
2183
2265
|
api_key = _get_api_key()
|
|
2184
2266
|
if key:
|
|
2185
|
-
return be.sb_rpc("mc_scratchpad_get", {"p_api_key": api_key, "p_key": key})
|
|
2267
|
+
return be.sb_rpc("mc_scratchpad_get", {"p_api_key": api_key, "p_project_id": _PROJECT_ID, "p_key": key})
|
|
2186
2268
|
else:
|
|
2187
|
-
return be.sb_rpc("mc_scratchpad_list", {"p_api_key": api_key})
|
|
2269
|
+
return be.sb_rpc("mc_scratchpad_list", {"p_api_key": api_key, "p_project_id": _PROJECT_ID})
|
|
2188
2270
|
|
|
2189
2271
|
|
|
2190
2272
|
# ----------------- OBSIDIAN SYNC HELPER -----------------
|
|
@@ -2332,6 +2414,7 @@ def meshcode_forget(key: str) -> Dict[str, Any]:
|
|
|
2332
2414
|
"p_api_key": api_key,
|
|
2333
2415
|
"p_agent_name": AGENT_NAME,
|
|
2334
2416
|
"p_key": key,
|
|
2417
|
+
"p_project_name": PROJECT_NAME,
|
|
2335
2418
|
})
|
|
2336
2419
|
|
|
2337
2420
|
|
|
@@ -2357,6 +2440,50 @@ def meshcode_recall_search(query: str) -> Dict[str, Any]:
|
|
|
2357
2440
|
})
|
|
2358
2441
|
|
|
2359
2442
|
|
|
2443
|
+
# ----------------- HEALTH CHECK -----------------
|
|
2444
|
+
|
|
2445
|
+
@mcp.tool()
|
|
2446
|
+
def meshcode_health() -> Dict[str, Any]:
|
|
2447
|
+
"""Check MCP server health: DB connectivity, Realtime status, circuit breaker state, uptime."""
|
|
2448
|
+
import time as _t
|
|
2449
|
+
health: Dict[str, Any] = {
|
|
2450
|
+
"agent": AGENT_NAME,
|
|
2451
|
+
"project": PROJECT_NAME,
|
|
2452
|
+
"instance_id": _INSTANCE_ID,
|
|
2453
|
+
"sdk_version": _SDK_VERSION,
|
|
2454
|
+
}
|
|
2455
|
+
|
|
2456
|
+
# DB latency check
|
|
2457
|
+
_start = _t.monotonic()
|
|
2458
|
+
try:
|
|
2459
|
+
r = be.sb_select("mc_projects", f"id=eq.{_PROJECT_ID}", limit=1)
|
|
2460
|
+
health["db_latency_ms"] = round((_t.monotonic() - _start) * 1000, 1)
|
|
2461
|
+
health["db_status"] = "ok" if r else "empty"
|
|
2462
|
+
except Exception as e:
|
|
2463
|
+
health["db_latency_ms"] = round((_t.monotonic() - _start) * 1000, 1)
|
|
2464
|
+
health["db_status"] = f"error: {e}"
|
|
2465
|
+
|
|
2466
|
+
# Circuit breaker state
|
|
2467
|
+
health["circuit_breaker"] = {
|
|
2468
|
+
"state": be._circuit.state,
|
|
2469
|
+
"failure_count": be._circuit.failure_count,
|
|
2470
|
+
"threshold": be._circuit.failure_threshold,
|
|
2471
|
+
}
|
|
2472
|
+
|
|
2473
|
+
# Realtime status
|
|
2474
|
+
health["realtime_connected"] = getattr(_rt_state, 'connected', False) if '_rt_state' in dir() else "unknown"
|
|
2475
|
+
|
|
2476
|
+
# Process uptime
|
|
2477
|
+
try:
|
|
2478
|
+
import psutil
|
|
2479
|
+
proc = psutil.Process()
|
|
2480
|
+
health["uptime_seconds"] = round(_t.time() - proc.create_time(), 0)
|
|
2481
|
+
except Exception:
|
|
2482
|
+
health["uptime_seconds"] = "unknown (psutil not available)"
|
|
2483
|
+
|
|
2484
|
+
return health
|
|
2485
|
+
|
|
2486
|
+
|
|
2360
2487
|
# ----------------- RESOURCES -----------------
|
|
2361
2488
|
|
|
2362
2489
|
@mcp.tool()
|
|
@@ -2511,10 +2638,28 @@ def _auto_update() -> None:
|
|
|
2511
2638
|
|
|
2512
2639
|
|
|
2513
2640
|
def run_server():
|
|
2514
|
-
"""Start the MCP server on stdio (default for Claude Code)."""
|
|
2641
|
+
"""Start the MCP server on stdio (default for Claude Code).
|
|
2642
|
+
|
|
2643
|
+
Wraps mcp.run() with crash recovery: if the event loop dies for any
|
|
2644
|
+
reason, log the crash, release the lease, and exit cleanly instead of
|
|
2645
|
+
leaving the agent in a zombie state.
|
|
2646
|
+
"""
|
|
2515
2647
|
_auto_update()
|
|
2516
2648
|
print(
|
|
2517
2649
|
f"[meshcode-mcp] Starting server for {AGENT_NAME}@{PROJECT_NAME}",
|
|
2518
2650
|
file=sys.stderr,
|
|
2519
2651
|
)
|
|
2520
|
-
|
|
2652
|
+
try:
|
|
2653
|
+
mcp.run()
|
|
2654
|
+
except KeyboardInterrupt:
|
|
2655
|
+
_log_crash_to_db("keyboard_interrupt", "User stopped the agent")
|
|
2656
|
+
except SystemExit as e:
|
|
2657
|
+
_log_crash_to_db("system_exit", f"exit code: {e.code}")
|
|
2658
|
+
raise # re-raise so the process exits with the correct code
|
|
2659
|
+
except Exception as e:
|
|
2660
|
+
import traceback as _tb
|
|
2661
|
+
tb_str = _tb.format_exc()
|
|
2662
|
+
_log_crash_to_db("unhandled_exception", f"{type(e).__name__}: {e}\n{tb_str}")
|
|
2663
|
+
print(f"[meshcode-mcp] FATAL: {e}", file=sys.stderr)
|
|
2664
|
+
print(f"[meshcode-mcp] Stack trace logged to mc_agent_crash_logs", file=sys.stderr)
|
|
2665
|
+
sys.exit(1)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|