meshcode 2.10.13__tar.gz → 2.10.15__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. {meshcode-2.10.13 → meshcode-2.10.15}/PKG-INFO +1 -1
  2. {meshcode-2.10.13 → meshcode-2.10.15}/meshcode/__init__.py +1 -1
  3. {meshcode-2.10.13 → meshcode-2.10.15}/meshcode/meshcode_mcp/backend.py +102 -26
  4. {meshcode-2.10.13 → meshcode-2.10.15}/meshcode/meshcode_mcp/realtime.py +52 -21
  5. {meshcode-2.10.13 → meshcode-2.10.15}/meshcode/meshcode_mcp/server.py +222 -72
  6. {meshcode-2.10.13 → meshcode-2.10.15}/meshcode.egg-info/PKG-INFO +1 -1
  7. {meshcode-2.10.13 → meshcode-2.10.15}/pyproject.toml +1 -1
  8. {meshcode-2.10.13 → meshcode-2.10.15}/README.md +0 -0
  9. {meshcode-2.10.13 → meshcode-2.10.15}/meshcode/ascii_art.py +0 -0
  10. {meshcode-2.10.13 → meshcode-2.10.15}/meshcode/cli.py +0 -0
  11. {meshcode-2.10.13 → meshcode-2.10.15}/meshcode/comms_v4.py +0 -0
  12. {meshcode-2.10.13 → meshcode-2.10.15}/meshcode/invites.py +0 -0
  13. {meshcode-2.10.13 → meshcode-2.10.15}/meshcode/launcher.py +0 -0
  14. {meshcode-2.10.13 → meshcode-2.10.15}/meshcode/launcher_install.py +0 -0
  15. {meshcode-2.10.13 → meshcode-2.10.15}/meshcode/meshcode_mcp/__init__.py +0 -0
  16. {meshcode-2.10.13 → meshcode-2.10.15}/meshcode/meshcode_mcp/__main__.py +0 -0
  17. {meshcode-2.10.13 → meshcode-2.10.15}/meshcode/meshcode_mcp/test_backend.py +0 -0
  18. {meshcode-2.10.13 → meshcode-2.10.15}/meshcode/meshcode_mcp/test_realtime.py +0 -0
  19. {meshcode-2.10.13 → meshcode-2.10.15}/meshcode/meshcode_mcp/test_server_wrapper.py +0 -0
  20. {meshcode-2.10.13 → meshcode-2.10.15}/meshcode/preferences.py +0 -0
  21. {meshcode-2.10.13 → meshcode-2.10.15}/meshcode/protocol_v2.py +0 -0
  22. {meshcode-2.10.13 → meshcode-2.10.15}/meshcode/run_agent.py +0 -0
  23. {meshcode-2.10.13 → meshcode-2.10.15}/meshcode/secrets.py +0 -0
  24. {meshcode-2.10.13 → meshcode-2.10.15}/meshcode/self_update.py +0 -0
  25. {meshcode-2.10.13 → meshcode-2.10.15}/meshcode/setup_clients.py +0 -0
  26. {meshcode-2.10.13 → meshcode-2.10.15}/meshcode.egg-info/SOURCES.txt +0 -0
  27. {meshcode-2.10.13 → meshcode-2.10.15}/meshcode.egg-info/dependency_links.txt +0 -0
  28. {meshcode-2.10.13 → meshcode-2.10.15}/meshcode.egg-info/entry_points.txt +0 -0
  29. {meshcode-2.10.13 → meshcode-2.10.15}/meshcode.egg-info/requires.txt +0 -0
  30. {meshcode-2.10.13 → meshcode-2.10.15}/meshcode.egg-info/top_level.txt +0 -0
  31. {meshcode-2.10.13 → meshcode-2.10.15}/setup.cfg +0 -0
  32. {meshcode-2.10.13 → meshcode-2.10.15}/tests/test_status_enum_coverage.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: meshcode
3
- Version: 2.10.13
3
+ Version: 2.10.15
4
4
  Summary: Real-time communication between AI agents — Supabase-backed CLI
5
5
  Author-email: MeshCode <hello@meshcode.io>
6
6
  License: MIT
@@ -1,2 +1,2 @@
1
1
  """MeshCode — Real-time communication between AI agents."""
2
- __version__ = "2.10.13"
2
+ __version__ = "2.10.15"
@@ -5,6 +5,8 @@ Zero deps beyond stdlib (urllib).
5
5
  """
6
6
  import json
7
7
  import os
8
+ import time as _time
9
+ import threading as _threading
8
10
  from datetime import datetime
9
11
  from pathlib import Path
10
12
  from typing import Any, Dict, List, Optional
@@ -12,6 +14,54 @@ from urllib.error import HTTPError, URLError
12
14
  from urllib.parse import quote
13
15
  from urllib.request import Request, urlopen
14
16
 
17
+
18
+ # ── Circuit Breaker ──────────────────────────────────────────────
19
+ # Protects against cascading failures when Supabase is down.
20
+ # States: CLOSED (normal) → OPEN (reject fast) → HALF_OPEN (probe)
21
class _CircuitBreaker:
    """Three-state circuit breaker guarding calls to the backend.

    States:
        CLOSED:    calls are allowed; failures are counted.
        OPEN:      calls are rejected until ``recovery_timeout`` seconds have
                   elapsed since the last recorded failure.
        HALF_OPEN: a single probe call is allowed; its outcome either closes
                   the breaker (success) or re-opens it (failure).

    All state transitions happen under an internal lock so the breaker can be
    shared between threads.
    """

    CLOSED = "closed"
    OPEN = "open"
    HALF_OPEN = "half_open"

    def __init__(self, failure_threshold: int = 5, recovery_timeout: float = 30.0):
        """Create a closed breaker.

        Args:
            failure_threshold: number of recorded failures that opens the breaker.
            recovery_timeout: seconds to stay OPEN before allowing a probe.
        """
        self.failure_threshold = failure_threshold
        self.recovery_timeout = recovery_timeout
        self.state = self.CLOSED
        self.failure_count = 0
        self.last_failure_time = 0.0
        # Monotonic timestamp of the probe currently in flight (HALF_OPEN only).
        self._probe_started_at: Optional[float] = None
        self._lock = _threading.Lock()

    def can_execute(self) -> bool:
        """Return True if the caller may issue a request right now."""
        with self._lock:
            if self.state == self.CLOSED:
                return True
            now = _time.monotonic()
            if self.state == self.OPEN:
                if now - self.last_failure_time < self.recovery_timeout:
                    return False
                # Cool-down elapsed: this caller becomes the probe.
                self.state = self.HALF_OPEN
                self._probe_started_at = now
                return True
            # HALF_OPEN: only one probe at a time. If the previous probe never
            # reported back (e.g. its caller raised before recording a result),
            # hand out a new probe after another recovery_timeout so the
            # breaker cannot get stuck rejecting forever.
            probe_started = self._probe_started_at
            if probe_started is not None and now - probe_started < self.recovery_timeout:
                return False
            self._probe_started_at = now
            return True

    def record_success(self) -> None:
        """Record a successful call: reset the failure count and close the breaker."""
        with self._lock:
            self.failure_count = 0
            self.state = self.CLOSED
            self._probe_started_at = None

    def record_failure(self) -> None:
        """Record a failed call; open the breaker on a failed probe or at the threshold."""
        with self._lock:
            self.failure_count += 1
            self.last_failure_time = _time.monotonic()
            if self.state == self.HALF_OPEN or self.failure_count >= self.failure_threshold:
                self.state = self.OPEN
            self._probe_started_at = None

    @property
    def is_open(self) -> bool:
        """True while the breaker is in the OPEN (rejecting) state."""
        with self._lock:
            return self.state == self.OPEN
61
+
62
+
63
+ _circuit = _CircuitBreaker(failure_threshold=5, recovery_timeout=30.0)
64
+
15
65
  # Bake in production defaults — RLS-protected publishable key, safe to ship.
16
66
  _DEFAULT_SUPABASE_URL = "https://gjinagyyjttyxnaoavnz.supabase.co"
17
67
  _DEFAULT_SUPABASE_KEY = "sb_publishable_qwN9PO1L7jUXhhbhhVk2CQ_z1FXG2Qf"
@@ -66,22 +116,31 @@ def _headers(*, prefer: Optional[str] = None, content_profile: bool = True) -> D
66
116
 
67
117
 
68
118
def _request(method: str, path: str, *, data: Any = None, prefer: Optional[str] = None) -> Any:
    """Issue one REST call against ``{SUPABASE_URL}/rest/v1/{path}``.

    Returns the decoded JSON body (``None`` for an empty body) on success.
    On failure returns a dict with ``_error`` and ``_code`` keys instead of
    raising: the HTTP status for HTTP errors, ``0`` for network errors, and
    ``503`` when the circuit breaker rejects the call up front.

    Breaker bookkeeping: 2xx and 4xx responses count as success (the backend
    answered); other HTTP errors and network failures count as failures.
    """
    if not _circuit.can_execute():
        return {"_error": "circuit breaker open — Supabase temporarily unavailable", "_code": 503}

    # A falsy payload (None, {}, [], 0, "") is sent without a request body.
    payload = json.dumps(data).encode("utf-8") if data else None
    request = Request(
        f"{SUPABASE_URL}/rest/v1/{path}",
        data=payload,
        method=method,
        headers=_headers(prefer=prefer),
    )

    try:
        with urlopen(request, timeout=10) as response:
            text = response.read().decode("utf-8")
            _circuit.record_success()
            return json.loads(text) if text.strip() else None
    except HTTPError as http_err:
        detail = http_err.read().decode("utf-8", errors="replace")
        # 4xx = client error (not a backend failure), don't trip breaker
        if 400 <= http_err.code < 500:
            _circuit.record_success()
        else:
            _circuit.record_failure()
        # Prefer the server's JSON "message"; fall back to the raw text.
        try:
            message = json.loads(detail).get("message", detail[:200])
        except Exception:
            message = detail[:200]
        return {"_error": message, "_code": http_err.code}
    except (URLError, OSError, TimeoutError) as net_err:
        _circuit.record_failure()
        return {"_error": str(getattr(net_err, 'reason', net_err)), "_code": 0}
85
144
 
86
145
 
87
146
  def sb_select(table: str, filters: str = "", order: Optional[str] = None, limit: Optional[int] = None) -> List[Dict]:
@@ -136,32 +195,49 @@ def enable_recording(api_key: str, project_id: str, agent_name: str, session_id:
136
195
  _recording_session_id = session_id
137
196
 
138
197
 
139
def sb_rpc(fn_name: str, params: Dict, *, _max_retries: int = 3) -> Any:
    """Call the RPC ``fn_name`` with retries, jitter and circuit-breaker protection.

    Args:
        fn_name: name of the RPC function (appended to ``/rest/v1/rpc/``).
        params: JSON-serialisable arguments sent as the POST body.
        _max_retries: maximum number of attempts for transient failures.

    Returns:
        The decoded JSON result (``None`` for an empty body), or a dict with
        an ``_error`` key on failure. When the breaker rejects the call the
        dict also carries ``"_circuit": "open"``.

    4xx responses are returned immediately (no retry, breaker untouched).
    5xx responses and network errors are retried with exponential backoff
    plus jitter, but only while the circuit breaker is still closed: once
    it opens, further attempts would only add latency.

    NOTE(review): retrying a POST after a timeout can execute a
    non-idempotent RPC twice if the first attempt actually reached the
    server; confirm the RPCs routed through here tolerate that.
    """
    import random as _random

    if not _circuit.can_execute():
        return {"_error": "circuit breaker open — Supabase temporarily unavailable", "_circuit": "open"}

    # Request-invariant pieces are built once, not per attempt.
    url = f"{SUPABASE_URL}/rest/v1/rpc/{fn_name}"
    body = json.dumps(params).encode("utf-8")
    should_record = _recording_enabled and fn_name not in _SKIP_RECORDING

    last_err = None
    for attempt in range(_max_retries):
        req = Request(url, data=body, method="POST", headers=_headers(content_profile=False))
        try:
            with urlopen(req, timeout=10) as resp:
                raw = resp.read().decode("utf-8")
            result = json.loads(raw) if raw.strip() else None
            _circuit.record_success()
            # Auto-record tool calls to session events (hot-reloadable)
            if should_record:
                _bg_record("tool_call", {"rpc": fn_name})
            return result
        except HTTPError as e:
            err = e.read().decode("utf-8", errors="replace")
            # 4xx errors are not transient: don't retry, don't trip breaker
            if 400 <= e.code < 500:
                _circuit.record_success()
                try:
                    result = {"_error": json.loads(err).get("message", err[:200])}
                except Exception:
                    result = {"_error": err[:200]}
                if should_record:
                    _bg_record("error", {"rpc": fn_name, "error": str(err)[:200]})
                return result
            _circuit.record_failure()
            last_err = err
        except (URLError, OSError, TimeoutError) as e:
            _circuit.record_failure()
            last_err = str(getattr(e, 'reason', e))

        # Transient failure (5xx / network). Stop early when this was the
        # last attempt or when the breaker has just opened.
        if attempt >= _max_retries - 1 or _circuit.is_open:
            break
        _time.sleep((2 ** attempt) + _random.uniform(0, 1))

    # All retries exhausted (or breaker opened mid-way)
    if should_record:
        _bg_record("error", {"rpc": fn_name, "error": str(last_err)[:200], "retries_exhausted": True})
    return {"_error": str(last_err)[:200] if last_err else "request failed after retries"}
165
241
 
166
242
 
167
243
  def _bg_record(event_type: str, payload: dict):
@@ -53,8 +53,9 @@ class RealtimeListener:
53
53
  self.notify_callback = notify_callback
54
54
  self.service_role_key = service_role_key
55
55
 
56
- # Last 100 unread messages — drained by meshcode_check tool
57
- self.queue: Deque[Dict] = deque(maxlen=100)
56
+ # Last 500 unread messages — drained by meshcode_check tool
57
+ self.queue: Deque[Dict] = deque(maxlen=500)
58
+ self._overflow_warned = False
58
59
  self._task: Optional[asyncio.Task] = None
59
60
  # asyncio.Event() in Py3.10+ no longer requires a running loop, but
60
61
  # on older Python or certain Windows event-loop policies it can
@@ -99,22 +100,39 @@ class RealtimeListener:
99
100
  pass
100
101
 
101
102
  async def _run(self) -> None:
102
- """Outer loop: reconnect with exponential backoff on disconnect."""
103
+ """Outer loop: reconnect with exponential backoff on disconnect.
104
+
105
+ NEVER gives up — keeps retrying with capped backoff (max 60s).
106
+ The MCP server must stay alive regardless of Realtime health.
107
+ """
103
108
  backoff = 1
109
+ consecutive_failures = 0
104
110
  while not self._stop.is_set():
105
111
  try:
106
112
  await self._connect_and_listen()
107
113
  backoff = 1 # reset on clean disconnect
114
+ consecutive_failures = 0
108
115
  except asyncio.CancelledError:
109
116
  return
110
117
  except Exception as e:
111
- log.warning(f"Realtime connection error: {e}; reconnecting in {backoff}s")
118
+ consecutive_failures += 1
119
+ if consecutive_failures % 10 == 0:
120
+ log.error(
121
+ f"Realtime: {consecutive_failures} consecutive failures — "
122
+ f"still retrying (backoff={backoff}s). Last error: {e}"
123
+ )
124
+ else:
125
+ log.warning(
126
+ f"Realtime connection error ({consecutive_failures}): "
127
+ f"{e}; reconnecting in {backoff}s"
128
+ )
129
+ self._connected = False
112
130
  try:
113
131
  await asyncio.wait_for(self._stop.wait(), timeout=backoff)
114
132
  return # stop signaled
115
133
  except asyncio.TimeoutError:
116
134
  pass
117
- backoff = min(backoff * 2, 30)
135
+ backoff = min(backoff * 2, 60)
118
136
 
119
137
  async def _connect_and_listen(self) -> None:
120
138
  """Single connection lifecycle: connect, subscribe, listen."""
@@ -155,23 +173,33 @@ class RealtimeListener:
155
173
  await ws.send(json.dumps(join_msg))
156
174
 
157
175
  # Wait for phx_reply to confirm subscription was accepted.
176
+ # Retry up to 3 times with backoff — transient Supabase latency
177
+ # can cause one agent to time out while siblings succeed.
158
178
  self._subscription_ok = False
159
- try:
160
- reply_raw = await asyncio.wait_for(ws.recv(), timeout=10.0)
161
- reply = json.loads(reply_raw)
162
- reply_status = (reply.get("payload") or {}).get("status")
163
- if reply_status == "ok":
164
- self._subscription_ok = True
165
- log.info(f"Realtime subscription OK for {self.agent_name} on {topic}")
166
- else:
167
- log.error(
168
- f"Realtime subscription FAILED for {self.agent_name}: "
169
- f"status={reply_status} payload={reply.get('payload')}"
170
- )
171
- except asyncio.TimeoutError:
172
- log.error(f"Realtime subscription TIMEOUT — no phx_reply in 10s for {self.agent_name}")
173
- except Exception as e:
174
- log.error(f"Realtime subscription error reading phx_reply: {e}")
179
+ for _sub_attempt in range(3):
180
+ try:
181
+ reply_raw = await asyncio.wait_for(ws.recv(), timeout=15.0)
182
+ reply = json.loads(reply_raw)
183
+ reply_status = (reply.get("payload") or {}).get("status")
184
+ if reply_status == "ok":
185
+ self._subscription_ok = True
186
+ log.info(f"Realtime subscription OK for {self.agent_name} on {topic}")
187
+ break
188
+ else:
189
+ log.warning(
190
+ f"Realtime subscription attempt {_sub_attempt+1} failed for {self.agent_name}: "
191
+ f"status={reply_status} payload={reply.get('payload')}"
192
+ )
193
+ except asyncio.TimeoutError:
194
+ log.warning(f"Realtime subscription attempt {_sub_attempt+1} TIMEOUT for {self.agent_name}")
195
+ except Exception as e:
196
+ log.warning(f"Realtime subscription attempt {_sub_attempt+1} error: {e}")
197
+ # Retry: re-send join message after brief backoff
198
+ if _sub_attempt < 2:
199
+ await asyncio.sleep(2 ** _sub_attempt)
200
+ await ws.send(json.dumps(join_msg))
201
+ if not self._subscription_ok:
202
+ log.error(f"Realtime subscription FAILED after 3 attempts for {self.agent_name}")
175
203
 
176
204
  # Heartbeat task to keep the connection alive
177
205
  heartbeat_task = asyncio.create_task(self._heartbeat(ws))
@@ -228,6 +256,9 @@ class RealtimeListener:
228
256
  "id": record.get("id"),
229
257
  "parent_id": record.get("parent_msg_id"),
230
258
  }
259
+ if len(self.queue) >= 400 and not self._overflow_warned:
260
+ log.warning(f"Message queue at {len(self.queue)}/500 — risk of dropping messages")
261
+ self._overflow_warned = True
231
262
  self.queue.append(enriched)
232
263
  # Wake any meshcode_wait blocked on this event.
233
264
  try:
@@ -8,11 +8,14 @@ Run with:
8
8
  MESHCODE_PROJECT=my-app MESHCODE_AGENT=backend python -m meshcode_mcp serve
9
9
  """
10
10
  import asyncio
11
+ import atexit
11
12
  import json
12
13
  import logging
13
14
  import os
15
+ import signal
14
16
  import sys
15
17
  import hashlib as _hashlib
18
+ import traceback as _traceback
16
19
  from collections import deque
17
20
  from contextlib import asynccontextmanager
18
21
  from typing import Any, Dict, List, Optional, Union
@@ -64,16 +67,21 @@ def _agent_color(name: str) -> str:
64
67
 
65
68
 
66
69
def _mc_log(msg: str, level: str = "info", *, file=None) -> None:
    """Write one colored ``[meshcode-mcp]`` log line. Uses agent color if available.

    CRITICAL: Must write to stderr, NEVER stdout. MCP protocol uses stdout
    for JSON-RPC, so any non-JSON output to stdout corrupts the stream and
    causes the client to kill the connection.

    Args:
        msg: text to log.
        level: ``"error"`` or ``"warn"`` add a colored tag; anything else is
            printed as a plain info line in the agent color.
        file: optional target stream. Accepted so that callers written as
            ``_mc_log(msg, file=sys.stderr)`` work instead of raising
            ``TypeError``. ``None`` and ``sys.stdout`` both fall back to
            ``sys.stderr``.
    """
    agent = os.environ.get("MESHCODE_AGENT", "")
    c = _agent_color(agent) if agent else "\033[36m"
    prefix = f"{c}{_ANSI_BOLD}[meshcode-mcp]{_ANSI_RESET}"
    if level == "error":
        line = f"{prefix} \033[91mERROR:{_ANSI_RESET} {msg}"
    elif level == "warn":
        line = f"{prefix} \033[33mWARNING:{_ANSI_RESET} {msg}"
    else:
        line = f"{prefix} {c}{msg}{_ANSI_RESET}"
    # stdout is reserved for JSON-RPC; never let a caller route logs there.
    stream = file
    if stream is None or stream is sys.stdout:
        stream = sys.stderr
    print(line, file=stream)
77
85
 
78
86
 
79
87
  # ============================================================
@@ -370,34 +378,44 @@ def _get_api_key() -> str:
370
378
  _API_KEY_CACHE = kc_val
371
379
  return kc_val
372
380
  except Exception as e:
373
- _mc_log(f" keychain lookup failed for profile '{profile}': {e}", "warn")
381
+ _mc_log(f" keychain lookup failed for profile '{profile}': {e}", file=sys.stderr)
374
382
  _API_KEY_CACHE = ""
375
383
  return ""
376
384
 
377
385
 
378
# Resolve project_id at startup with retry+backoff. Try in order:
#   1. MESHCODE_PROJECT_ID env var (baked by `meshcode setup`, fastest)
#   2. mc_resolve_project RPC with the user's api_key (security definer, bypasses RLS)
#   3. Direct SELECT via get_project_id (only works if RLS is open / user is admin)
_BOOT_MAX_RETRIES = 3
_BOOT_BACKOFF = [2, 5, 10]  # seconds between retries
_PROJECT_ID: Optional[str] = os.environ.get("MESHCODE_PROJECT_ID") or None
if not _PROJECT_ID:
    for _boot_attempt in range(_BOOT_MAX_RETRIES):
        _api_key = _get_api_key()
        if _api_key:
            try:
                _r = be.sb_rpc("mc_resolve_project", {
                    "p_api_key": _api_key,
                    "p_project_name": PROJECT_NAME,
                })
                if isinstance(_r, dict) and _r.get("project_id"):
                    _PROJECT_ID = _r["project_id"]
                    break
                # The RPC payload reports failures under "error"; transport
                # failures from sb_rpc come back under "_error".
                _resolve_err = (_r.get("error") or _r.get("_error")) if isinstance(_r, dict) else None
                if _resolve_err:
                    # _mc_log takes (msg, level); it has no `file` parameter.
                    _mc_log(f" mc_resolve_project: {_resolve_err}", "warn")
            except Exception as _e:
                _mc_log(f" mc_resolve_project failed: {_e}", "warn")
        if not _PROJECT_ID:
            _PROJECT_ID = be.get_project_id(PROJECT_NAME)
        if _PROJECT_ID:
            break
        if _boot_attempt < _BOOT_MAX_RETRIES - 1:
            _wait = _BOOT_BACKOFF[_boot_attempt]
            _mc_log(f" project resolution failed (attempt {_boot_attempt+1}/{_BOOT_MAX_RETRIES}), retrying in {_wait}s...", "warn")
            _time.sleep(_wait)
if not _PROJECT_ID:
    _mc_log(f"project '{PROJECT_NAME}' not found after {_BOOT_MAX_RETRIES} attempts (check MESHCODE_KEYCHAIN_PROFILE / MESHCODE_API_KEY)", "error")
    sys.exit(2)
402
420
 
403
421
  # Resolve project plan for adaptive features (heartbeat interval, etc.)
@@ -411,7 +429,7 @@ except Exception:
411
429
 
412
430
  _register_result = be.register_agent(PROJECT_NAME, AGENT_NAME, AGENT_ROLE or "MCP-connected agent", api_key=_get_api_key())
413
431
  if isinstance(_register_result, dict) and _register_result.get("error"):
414
- _mc_log(f" register failed: {_register_result['error']}", "warn")
432
+ _mc_log(f" register failed: {_register_result['error']}", file=sys.stderr)
415
433
 
416
434
  # ── Fetch profile color from dashboard (single source of truth) ──
417
435
  try:
@@ -445,7 +463,7 @@ def _flip_status(status: str, task: str = "") -> bool:
445
463
  return False
446
464
 
447
465
  if not _flip_status("idle", ""):
448
- _mc_log(f" could not flip status to idle", "warn")
466
+ _mc_log(f" could not flip status to idle", file=sys.stderr)
449
467
 
450
468
 
451
469
  # ============================================================
@@ -492,16 +510,17 @@ def _schedule_flip(status: str, task: str = "") -> None:
492
510
 
493
511
 
494
512
def _set_state(state: str, tool: str = "") -> None:
    """Record the new agent state and hand it to ``_schedule_flip``.

    The module-level state (``_current_state``, ``_current_tool``,
    ``_last_tool_at``, ``_working_timer``) is only mutated while holding
    ``_flip_lock``; the flip itself is scheduled after the lock is released.
    """
    global _current_state, _current_tool, _last_tool_at, _working_timer
    with _flip_lock:
        # Drop any pending working→online timer before switching state.
        pending_timer = _working_timer
        if pending_timer is not None:
            pending_timer.cancel()
            _working_timer = None
        _current_state, _current_tool = state, tool
        if state == "working":
            _last_tool_at = _time.time()
    _schedule_flip(state, tool)
506
525
 
507
526
 
@@ -552,13 +571,14 @@ def with_working_status(func):
552
571
  except Exception as e:
553
572
  if not skip:
554
573
  _auto_learn_error(name, e, list(kwargs.keys()))
555
- raise
574
+ # NEVER re-raise — return structured error instead of crashing
575
+ import traceback as _tb
576
+ _log_crash_to_db("tool_exception", f"{name}: {type(e).__name__}: {e}\n{_tb.format_exc()[-500:]}")
577
+ return {"error": f"tool {name} failed: {type(e).__name__}: {e}", "_recovered": True}
556
578
  finally:
557
579
  if not skip:
558
580
  global _last_tool_at
559
581
  _last_tool_at = _time.time()
560
- # Don't flip to online here — CPU-based detection in heartbeat
561
- # will handle the transition when LLM stops generating
562
582
  return awrapper
563
583
  else:
564
584
  @_functools.wraps(func)
@@ -575,13 +595,14 @@ def with_working_status(func):
575
595
  except Exception as e:
576
596
  if not skip:
577
597
  _auto_learn_error(name, e, list(kwargs.keys()))
578
- raise
598
+ # NEVER re-raise — return structured error instead of crashing
599
+ import traceback as _tb
600
+ _log_crash_to_db("tool_exception", f"{name}: {type(e).__name__}: {e}\n{_tb.format_exc()[-500:]}")
601
+ return {"error": f"tool {name} failed: {type(e).__name__}: {e}", "_recovered": True}
579
602
  finally:
580
603
  if not skip:
581
604
  global _last_tool_at
582
605
  _last_tool_at = _time.time()
583
- # Don't flip to online here — CPU-based detection in heartbeat
584
- # will handle the transition when LLM stops generating
585
606
  return swrapper
586
607
 
587
608
 
@@ -607,7 +628,7 @@ def _acquire_lease() -> bool:
607
628
  })
608
629
  except Exception as e:
609
630
  # Non-fatal: RPC might not exist on older servers.
610
- _mc_log(f"stale-lease pre-clean skipped: {e}", "warn")
631
+ _mc_log(f"stale-lease pre-clean skipped: {e}", file=sys.stderr)
611
632
  for attempt in range(3):
612
633
  try:
613
634
  r = be.sb_rpc("mc_acquire_agent_lease", {
@@ -659,14 +680,14 @@ def _acquire_lease() -> bool:
659
680
  _mc_log(f"Could not start — agent '{AGENT_NAME}' is running in another window.", "error")
660
681
  _mc_log("Close the other window first, or use a different agent name.", "error")
661
682
  return False
662
- _mc_log(f"lease attempt {attempt+1}: {r.get('error')}", "warn")
683
+ _mc_log(f"lease attempt {attempt+1}: {r.get('error')}", file=sys.stderr)
663
684
  else:
664
685
  return True
665
686
  except Exception as e:
666
- _mc_log(f"lease attempt {attempt+1} failed: {e}", "warn")
687
+ _mc_log(f"lease attempt {attempt+1} failed: {e}", file=sys.stderr)
667
688
  if attempt < 2:
668
689
  _time.sleep(2)
669
- _mc_log(f" lease failed after 3 attempts — proceeding anyway", "warn")
690
+ _mc_log(f" lease failed after 3 attempts — proceeding anyway", file=sys.stderr)
670
691
  return True
671
692
 
672
693
  if not _acquire_lease():
@@ -684,7 +705,7 @@ def _boot_diagnostic() -> None:
684
705
  be.sb_select("mc_projects", f"id=eq.{_PROJECT_ID}", limit=1)
685
706
  checks_passed += 1
686
707
  except Exception as e:
687
- print(f"[meshcode] BOOT CHECK FAILED: Supabase API unreachable ({e}). Fix: check network/VPN.", "warn")
708
+ print(f"[meshcode] BOOT CHECK FAILED: Supabase API unreachable ({e}). Fix: check network/VPN.", file=sys.stderr)
688
709
 
689
710
  # Check 2: Lease valid
690
711
  try:
@@ -694,11 +715,11 @@ def _boot_diagnostic() -> None:
694
715
  if agent.get("instance_id") == _INSTANCE_ID:
695
716
  checks_passed += 1
696
717
  else:
697
- print(f"[meshcode] BOOT CHECK FAILED: Lease mismatch — expected {_INSTANCE_ID}, got {agent.get('instance_id')}. Fix: restart agent.", "warn")
718
+ print(f"[meshcode] BOOT CHECK FAILED: Lease mismatch — expected {_INSTANCE_ID}, got {agent.get('instance_id')}. Fix: restart agent.", file=sys.stderr)
698
719
  else:
699
- print(f"[meshcode] BOOT CHECK FAILED: Agent '{AGENT_NAME}' not found in project. Fix: register agent first.", "warn")
720
+ print(f"[meshcode] BOOT CHECK FAILED: Agent '{AGENT_NAME}' not found in project. Fix: register agent first.", file=sys.stderr)
700
721
  except Exception as e:
701
- print(f"[meshcode] BOOT CHECK FAILED: Could not verify lease ({e}).", "warn")
722
+ print(f"[meshcode] BOOT CHECK FAILED: Could not verify lease ({e}).", file=sys.stderr)
702
723
 
703
724
  # Check 3: Heartbeat recent
704
725
  try:
@@ -707,7 +728,7 @@ def _boot_diagnostic() -> None:
707
728
  if hb:
708
729
  checks_passed += 1
709
730
  else:
710
- print(f"[meshcode] BOOT CHECK WARNING: No heartbeat recorded yet.", "warn")
731
+ print(f"[meshcode] BOOT CHECK WARNING: No heartbeat recorded yet.", file=sys.stderr)
711
732
  else:
712
733
  checks_passed += 1 # skip if no agent data
713
734
  except Exception:
@@ -724,9 +745,9 @@ def _boot_diagnostic() -> None:
724
745
  checks_passed += 1 # non-critical
725
746
 
726
747
  if checks_passed == checks_total:
727
- print(f"[meshcode] All boot checks passed ({checks_passed}/{checks_total}).", "warn")
748
+ print(f"[meshcode] All boot checks passed ({checks_passed}/{checks_total}).", file=sys.stderr)
728
749
  else:
729
- print(f"[meshcode] Boot checks: {checks_passed}/{checks_total} passed. Agent starting anyway.", "warn")
750
+ print(f"[meshcode] Boot checks: {checks_passed}/{checks_total} passed. Agent starting anyway.", file=sys.stderr)
730
751
 
731
752
 
732
753
  _boot_diagnostic()
@@ -747,6 +768,53 @@ def _release_lease() -> None:
747
768
  pass
748
769
 
749
770
 
771
# ── Crash logging + graceful shutdown ──────────────────────────
# Set once the final (shutdown) event has been logged, so the signal handler
# and the atexit hook do not both report the same exit.
_SHUTDOWN_LOGGED = False


def _log_crash_to_db(reason: str = "unknown", error_detail: str = "") -> None:
    """Best-effort error/crash log via the ``mc_log_error`` RPC. Never raises.

    Args:
        reason: short error type. ``"tool_exception"`` marks a recovered tool
            failure and is logged every time; any other reason is treated as
            a shutdown event and logged at most once per process.
        error_detail: free-form detail, truncated to 2000 characters.

    If a shutdown event cannot be logged (the RPC raises or returns an
    ``_error`` dict), fall back to flipping the agent status to offline.
    Recovered tool exceptions never flip the status, because the agent is
    still running.
    """
    global _SHUTDOWN_LOGGED
    recovered = reason == "tool_exception"
    if not recovered:
        if _SHUTDOWN_LOGGED:
            return
        _SHUTDOWN_LOGGED = True

    logged = False
    try:
        result = be.sb_rpc("mc_log_error", {
            "p_api_key": _get_api_key(),
            "p_project_id": _PROJECT_ID,
            "p_agent_name": AGENT_NAME,
            "p_error_type": reason,
            "p_error_detail": error_detail[:2000],
            "p_instance_id": _INSTANCE_ID,
        })
        # sb_rpc reports transport/server failures as {"_error": ...}.
        logged = not (isinstance(result, dict) and result.get("_error"))
    except Exception:
        logged = False

    if not logged and not recovered:
        # RPC/table may not exist yet, so fall back to a status update.
        try:
            be.set_status(_PROJECT_ID, AGENT_NAME, "offline",
                          f"crashed: {reason[:100]}", api_key=_get_api_key())
        except Exception:
            pass

    # Local log line; must not turn a best-effort logger into a crash.
    try:
        _mc_log(f" crash logged: {reason}", "warn")
    except Exception:
        pass
798
+
799
+
800
def _on_exit() -> None:
    """atexit handler: log the shutdown, then release the lease.

    The two steps are isolated from each other so that a failure while
    logging can never prevent the lease from being released. Nothing is
    raised from here; problems are reported on stderr only.
    """
    try:
        _log_crash_to_db("process_exit", "atexit handler fired")
    except Exception as exc:
        print(f"[meshcode] shutdown logging failed: {exc}", file=sys.stderr)
    try:
        _release_lease()
    except Exception as exc:
        print(f"[meshcode] lease release failed at exit: {exc}", file=sys.stderr)
804
+
805
+
806
def _on_signal(signum, frame) -> None:
    """Signal handler: log the signal, release the lease, then exit.

    Exits with status ``128 + signum``. The exit is guaranteed even if
    logging or lease release fails, so a termination request is never
    swallowed.
    """
    try:
        sig_name = signal.Signals(signum).name
    except (AttributeError, ValueError):
        # Unknown signal number (or a platform without signal.Signals).
        sig_name = str(signum)
    try:
        _log_crash_to_db("signal", f"Received {sig_name}")
    except Exception as exc:
        print(f"[meshcode] signal logging failed: {exc}", file=sys.stderr)
    try:
        _release_lease()
    except Exception as exc:
        print(f"[meshcode] lease release failed on {sig_name}: {exc}", file=sys.stderr)
    finally:
        sys.exit(128 + signum)
812
+
813
+
814
+ atexit.register(_on_exit)
815
+ signal.signal(signal.SIGTERM, _on_signal)
816
+
817
+
750
818
  # ============================================================
751
819
  # Agent identity from Supabase profile (for system instructions)
752
820
  # ============================================================
@@ -1037,25 +1105,28 @@ def _heartbeat_thread_fn():
1037
1105
  try:
1038
1106
  be.sb_rpc("mc_heartbeat", {"p_project_id": _PROJECT_ID, "p_agent_name": AGENT_NAME, "p_version": _SDK_VERSION})
1039
1107
 
1040
- # CPU-based status detection
1108
+ # CPU-based status detection — read shared state under lock
1041
1109
  parent_cpu = _get_parent_cpu()
1042
- idle_secs = _time.time() - _last_tool_at
1110
+ with _flip_lock:
1111
+ cur_state = _current_state
1112
+ in_wait = _IN_WAIT
1113
+ idle_secs = _time.time() - _last_tool_at
1043
1114
 
1044
- if _IN_WAIT:
1115
+ if in_wait:
1045
1116
  # Actually in meshcode_wait right now — listening for messages
1046
- if _current_state != "waiting":
1117
+ if cur_state != "waiting":
1047
1118
  _set_state("waiting", "listening for messages")
1048
1119
  elif parent_cpu > 3.0:
1049
1120
  # LLM is actively generating tokens or streaming
1050
- if _current_state != "working":
1121
+ if cur_state != "working":
1051
1122
  _set_state("working", "generating response")
1052
- elif _current_state == "working":
1123
+ elif cur_state == "working":
1053
1124
  # LLM just stopped — transition to online (not sleeping)
1054
1125
  _set_state("online", "")
1055
- elif _current_state == "online" and idle_secs > 30:
1126
+ elif cur_state == "online" and idle_secs > 30:
1056
1127
  # Brief idle — show as idle, not sleeping yet
1057
1128
  _set_state("idle", "idle")
1058
- elif _current_state == "idle" and idle_secs > 300 and parent_cpu < 2.0 and not _STAY_AWAKE:
1129
+ elif cur_state == "idle" and idle_secs > 300 and parent_cpu < 2.0 and not _STAY_AWAKE:
1059
1130
  # Extended idle + no CPU activity → sleeping (5 min, not 90s)
1060
1131
  _set_state("sleeping", "sleeping")
1061
1132
 
@@ -1105,6 +1176,18 @@ async def lifespan(_app):
1105
1176
  )
1106
1177
  await _REALTIME.start()
1107
1178
 
1179
+ # Wait up to 5s for realtime to report connected before proceeding.
1180
+ # Without this, the lifespan yields before the WS is ready, and Claude
1181
+ # Code's handshake can time out on slower network paths — one agent
1182
+ # fails while siblings on the same box succeed.
1183
+ for _rt_check in range(10):
1184
+ if getattr(_REALTIME, '_connected', False):
1185
+ log.info(f"Realtime connected for {AGENT_NAME}")
1186
+ break
1187
+ await asyncio.sleep(0.5)
1188
+ else:
1189
+ log.warning(f"Realtime not connected after 5s for {AGENT_NAME} — continuing with polling fallback")
1190
+
1108
1191
  # IMMEDIATE: send first heartbeat + set online status BEFORE any tool calls.
1109
1192
  # Without this, the agent appears offline for up to 30s after boot.
1110
1193
  for _attempt in range(3):
@@ -1152,16 +1235,19 @@ async def lifespan(_app):
1152
1235
  })
1153
1236
  except Exception:
1154
1237
  pass # Never block shutdown
1155
- log.info("lifespan shutdown — stopping heartbeat + realtime + releasing lease")
1156
- _heartbeat_stop.set()
1157
- hb_thread.join(timeout=5)
1158
- await _REALTIME.stop()
1159
- # Flip to offline + release lease so the dashboard reflects reality
1160
- # within seconds (not waiting for the 30s cron to notice).
1238
+ log.info("lifespan shutdown — releasing lease + stopping heartbeat + realtime")
1239
+ # Release lease FIRST — before stopping heartbeat thread.
1240
+ # If heartbeat join times out, the lease is already released so
1241
+ # the agent won't block reconnection.
1161
1242
  try:
1162
1243
  _release_lease()
1163
1244
  except Exception as _e:
1164
1245
  log.warning(f"could not release lease: {_e}")
1246
+ _heartbeat_stop.set()
1247
+ hb_thread.join(timeout=5)
1248
+ if hb_thread.is_alive():
1249
+ log.warning("heartbeat thread did not stop within 5s — lease already released")
1250
+ await _REALTIME.stop()
1165
1251
 
1166
1252
 
1167
1253
  # ============================================================
@@ -1368,9 +1454,9 @@ try:
1368
1454
  elif isinstance(_ls_val, str):
1369
1455
  _LAST_SEEN_TS = _ls_val
1370
1456
  if _LAST_SEEN_TS:
1371
- print(f"[meshcode] Restored last_seen={_LAST_SEEN_TS} from mesh memory.", "warn")
1457
+ print(f"[meshcode] Restored last_seen={_LAST_SEEN_TS} from mesh memory.", file=sys.stderr)
1372
1458
  except Exception as _e:
1373
- print(f"[meshcode] Could not restore last_seen: {_e}", "warn")
1459
+ print(f"[meshcode] Could not restore last_seen: {_e}", file=sys.stderr)
1374
1460
 
1375
1461
 
1376
1462
  def _get_pending_tasks_summary() -> Optional[List[Dict[str, str]]]:
@@ -2166,6 +2252,7 @@ def meshcode_scratchpad_set(key: str, value: Any) -> Dict[str, Any]:
2166
2252
  json_value = value if isinstance(value, (dict, list)) else {"_raw": value}
2167
2253
  return be.sb_rpc("mc_scratchpad_set", {
2168
2254
  "p_api_key": api_key,
2255
+ "p_project_id": _PROJECT_ID,
2169
2256
  "p_key": key,
2170
2257
  "p_value": json_value,
2171
2258
  "p_tier": "reference",
@@ -2182,9 +2269,9 @@ def meshcode_scratchpad_get(key: Optional[str] = None) -> Dict[str, Any]:
2182
2269
  """
2183
2270
  api_key = _get_api_key()
2184
2271
  if key:
2185
- return be.sb_rpc("mc_scratchpad_get", {"p_api_key": api_key, "p_key": key})
2272
+ return be.sb_rpc("mc_scratchpad_get", {"p_api_key": api_key, "p_project_id": _PROJECT_ID, "p_key": key})
2186
2273
  else:
2187
- return be.sb_rpc("mc_scratchpad_list", {"p_api_key": api_key})
2274
+ return be.sb_rpc("mc_scratchpad_list", {"p_api_key": api_key, "p_project_id": _PROJECT_ID})
2188
2275
 
2189
2276
 
2190
2277
  # ----------------- OBSIDIAN SYNC HELPER -----------------
@@ -2332,6 +2419,7 @@ def meshcode_forget(key: str) -> Dict[str, Any]:
2332
2419
  "p_api_key": api_key,
2333
2420
  "p_agent_name": AGENT_NAME,
2334
2421
  "p_key": key,
2422
+ "p_project_name": PROJECT_NAME,
2335
2423
  })
2336
2424
 
2337
2425
 
@@ -2357,6 +2445,50 @@ def meshcode_recall_search(query: str) -> Dict[str, Any]:
2357
2445
  })
2358
2446
 
2359
2447
 
2448
+ # ----------------- HEALTH CHECK -----------------
2449
+
2450
@mcp.tool()
def meshcode_health() -> Dict[str, Any]:
    """Check MCP server health: DB connectivity, Realtime status, circuit breaker state, uptime."""
    # The docstring above is kept as-is on purpose: tool docstrings are
    # presumably surfaced to MCP clients as the tool description — confirm
    # before rewording it.
    #
    # Returns a dict with the identity fields below plus "db_latency_ms",
    # "db_status", "circuit_breaker", "realtime_connected" and
    # "uptime_seconds". Probe failures are reported inside the dict rather
    # than raised, so the health tool itself stays callable when a
    # dependency is down.
    import time as _t

    health: Dict[str, Any] = {
        "agent": AGENT_NAME,
        "project": PROJECT_NAME,
        "instance_id": _INSTANCE_ID,
        "sdk_version": _SDK_VERSION,
    }

    # DB latency check: time one single-row select, whether it succeeds or not.
    started = _t.monotonic()
    try:
        rows = be.sb_select("mc_projects", f"id=eq.{_PROJECT_ID}", limit=1)
    except Exception as e:
        db_status = f"error: {e}"
    else:
        db_status = "ok" if rows else "empty"
    health["db_latency_ms"] = round((_t.monotonic() - started) * 1000, 1)
    health["db_status"] = db_status

    # Circuit breaker state. Read defensively so a missing attribute degrades
    # to "unknown" instead of crashing the health check.
    circuit = getattr(be, "_circuit", None)
    health["circuit_breaker"] = {
        "state": getattr(circuit, "state", "unknown"),
        "failure_count": getattr(circuit, "failure_count", "unknown"),
        "threshold": getattr(circuit, "failure_threshold", "unknown"),
    }

    # Realtime status. A bare dir() inside a function lists only *local*
    # names, so the previous "'_rt_state' in dir()" test could never succeed
    # and this field was always "unknown". Resolve through module globals
    # instead, falling back to the _REALTIME client's _connected flag (the
    # same flag the lifespan startup polls).
    module_globals = globals()
    rt_state = module_globals.get("_rt_state")
    realtime = module_globals.get("_REALTIME")
    if rt_state is not None:
        health["realtime_connected"] = getattr(rt_state, "connected", False)
    elif realtime is not None:
        health["realtime_connected"] = getattr(realtime, "_connected", False)
    else:
        health["realtime_connected"] = "unknown"

    # Process uptime: wall-clock now minus the process creation time.
    try:
        import psutil
        proc = psutil.Process()
        health["uptime_seconds"] = round(_t.time() - proc.create_time(), 0)
    except Exception:
        health["uptime_seconds"] = "unknown (psutil not available)"

    return health
2490
+
2491
+
2360
2492
  # ----------------- RESOURCES -----------------
2361
2493
 
2362
2494
  @mcp.tool()
@@ -2484,7 +2616,7 @@ def _auto_update() -> None:
2484
2616
  return
2485
2617
 
2486
2618
  # 3. Install the new version (blocking, 60s timeout)
2487
- print(f"[meshcode] Updating {current} → {latest}...", "warn")
2619
+ print(f"[meshcode] Updating {current} → {latest}...", file=sys.stderr)
2488
2620
  try:
2489
2621
  result = subprocess.run(
2490
2622
  [sys.executable, "-m", "pip", "install", "--upgrade",
@@ -2502,7 +2634,7 @@ def _auto_update() -> None:
2502
2634
  return
2503
2635
 
2504
2636
  # 4. Re-exec to load the new code
2505
- print(f"[meshcode] Updated to {latest}, restarting...", "warn")
2637
+ print(f"[meshcode] Updated to {latest}, restarting...", file=sys.stderr)
2506
2638
  os.environ["MESHCODE_UPDATED"] = "1"
2507
2639
  try:
2508
2640
  os.execv(sys.executable, [sys.executable] + sys.argv)
@@ -2511,10 +2643,28 @@ def _auto_update() -> None:
2511
2643
 
2512
2644
 
2513
2645
  def run_server():
2514
- """Start the MCP server on stdio (default for Claude Code)."""
2646
+ """Start the MCP server on stdio (default for Claude Code).
2647
+
2648
+ Wraps mcp.run() with crash recovery: if the event loop dies for any
2649
+ reason, log the crash, release the lease, and exit cleanly instead of
2650
+ leaving the agent in a zombie state.
2651
+ """
2515
2652
  _auto_update()
2516
2653
  print(
2517
2654
  f"[meshcode-mcp] Starting server for {AGENT_NAME}@{PROJECT_NAME}",
2518
2655
  file=sys.stderr,
2519
2656
  )
2520
- mcp.run()
2657
+ try:
2658
+ mcp.run()
2659
+ except KeyboardInterrupt:
2660
+ _log_crash_to_db("keyboard_interrupt", "User stopped the agent")
2661
+ except SystemExit as e:
2662
+ _log_crash_to_db("system_exit", f"exit code: {e.code}")
2663
+ raise # re-raise so the process exits with the correct code
2664
+ except Exception as e:
2665
+ import traceback as _tb
2666
+ tb_str = _tb.format_exc()
2667
+ _log_crash_to_db("unhandled_exception", f"{type(e).__name__}: {e}\n{tb_str}")
2668
+ print(f"[meshcode-mcp] FATAL: {e}", file=sys.stderr)
2669
+ print(f"[meshcode-mcp] Stack trace logged to mc_agent_crash_logs", file=sys.stderr)
2670
+ sys.exit(1)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: meshcode
3
- Version: 2.10.13
3
+ Version: 2.10.15
4
4
  Summary: Real-time communication between AI agents — Supabase-backed CLI
5
5
  Author-email: MeshCode <hello@meshcode.io>
6
6
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "meshcode"
7
- version = "2.10.13"
7
+ version = "2.10.15"
8
8
  description = "Real-time communication between AI agents — Supabase-backed CLI"
9
9
  readme = "README.md"
10
10
  license = {text = "MIT"}
File without changes
File without changes
File without changes