meshcode 2.10.13__tar.gz → 2.10.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. {meshcode-2.10.13 → meshcode-2.10.14}/PKG-INFO +1 -1
  2. {meshcode-2.10.13 → meshcode-2.10.14}/meshcode/__init__.py +1 -1
  3. {meshcode-2.10.13 → meshcode-2.10.14}/meshcode/meshcode_mcp/backend.py +102 -26
  4. {meshcode-2.10.13 → meshcode-2.10.14}/meshcode/meshcode_mcp/realtime.py +52 -21
  5. {meshcode-2.10.13 → meshcode-2.10.14}/meshcode/meshcode_mcp/server.py +195 -50
  6. {meshcode-2.10.13 → meshcode-2.10.14}/meshcode.egg-info/PKG-INFO +1 -1
  7. {meshcode-2.10.13 → meshcode-2.10.14}/pyproject.toml +1 -1
  8. {meshcode-2.10.13 → meshcode-2.10.14}/README.md +0 -0
  9. {meshcode-2.10.13 → meshcode-2.10.14}/meshcode/ascii_art.py +0 -0
  10. {meshcode-2.10.13 → meshcode-2.10.14}/meshcode/cli.py +0 -0
  11. {meshcode-2.10.13 → meshcode-2.10.14}/meshcode/comms_v4.py +0 -0
  12. {meshcode-2.10.13 → meshcode-2.10.14}/meshcode/invites.py +0 -0
  13. {meshcode-2.10.13 → meshcode-2.10.14}/meshcode/launcher.py +0 -0
  14. {meshcode-2.10.13 → meshcode-2.10.14}/meshcode/launcher_install.py +0 -0
  15. {meshcode-2.10.13 → meshcode-2.10.14}/meshcode/meshcode_mcp/__init__.py +0 -0
  16. {meshcode-2.10.13 → meshcode-2.10.14}/meshcode/meshcode_mcp/__main__.py +0 -0
  17. {meshcode-2.10.13 → meshcode-2.10.14}/meshcode/meshcode_mcp/test_backend.py +0 -0
  18. {meshcode-2.10.13 → meshcode-2.10.14}/meshcode/meshcode_mcp/test_realtime.py +0 -0
  19. {meshcode-2.10.13 → meshcode-2.10.14}/meshcode/meshcode_mcp/test_server_wrapper.py +0 -0
  20. {meshcode-2.10.13 → meshcode-2.10.14}/meshcode/preferences.py +0 -0
  21. {meshcode-2.10.13 → meshcode-2.10.14}/meshcode/protocol_v2.py +0 -0
  22. {meshcode-2.10.13 → meshcode-2.10.14}/meshcode/run_agent.py +0 -0
  23. {meshcode-2.10.13 → meshcode-2.10.14}/meshcode/secrets.py +0 -0
  24. {meshcode-2.10.13 → meshcode-2.10.14}/meshcode/self_update.py +0 -0
  25. {meshcode-2.10.13 → meshcode-2.10.14}/meshcode/setup_clients.py +0 -0
  26. {meshcode-2.10.13 → meshcode-2.10.14}/meshcode.egg-info/SOURCES.txt +0 -0
  27. {meshcode-2.10.13 → meshcode-2.10.14}/meshcode.egg-info/dependency_links.txt +0 -0
  28. {meshcode-2.10.13 → meshcode-2.10.14}/meshcode.egg-info/entry_points.txt +0 -0
  29. {meshcode-2.10.13 → meshcode-2.10.14}/meshcode.egg-info/requires.txt +0 -0
  30. {meshcode-2.10.13 → meshcode-2.10.14}/meshcode.egg-info/top_level.txt +0 -0
  31. {meshcode-2.10.13 → meshcode-2.10.14}/setup.cfg +0 -0
  32. {meshcode-2.10.13 → meshcode-2.10.14}/tests/test_status_enum_coverage.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: meshcode
3
- Version: 2.10.13
3
+ Version: 2.10.14
4
4
  Summary: Real-time communication between AI agents — Supabase-backed CLI
5
5
  Author-email: MeshCode <hello@meshcode.io>
6
6
  License: MIT
@@ -1,2 +1,2 @@
1
1
  """MeshCode — Real-time communication between AI agents."""
2
- __version__ = "2.10.13"
2
+ __version__ = "2.10.14"
@@ -5,6 +5,8 @@ Zero deps beyond stdlib (urllib).
5
5
  """
6
6
  import json
7
7
  import os
8
+ import time as _time
9
+ import threading as _threading
8
10
  from datetime import datetime
9
11
  from pathlib import Path
10
12
  from typing import Any, Dict, List, Optional
@@ -12,6 +14,54 @@ from urllib.error import HTTPError, URLError
12
14
  from urllib.parse import quote
13
15
  from urllib.request import Request, urlopen
14
16
 
17
+
18
+ # ── Circuit Breaker ──────────────────────────────────────────────
19
+ # Protects against cascading failures when Supabase is down.
20
+ # States: CLOSED (normal) → OPEN (reject fast) → HALF_OPEN (probe)
21
class _CircuitBreaker:
    """Thread-safe circuit breaker guarding calls to the backend.

    States:
        CLOSED    -- normal operation; every call is allowed.
        OPEN      -- ``failure_threshold`` failures were recorded without an
                     intervening success; calls are rejected until
                     ``recovery_timeout`` seconds have elapsed since the last
                     recorded failure.
        HALF_OPEN -- the timeout elapsed; exactly one probe call is let
                     through. A recorded success closes the breaker, a
                     recorded failure re-opens it.

    Args:
        failure_threshold: Number of recorded failures that opens the breaker.
        recovery_timeout: Seconds to stay OPEN before allowing a probe.
    """

    CLOSED = "closed"
    OPEN = "open"
    HALF_OPEN = "half_open"

    def __init__(self, failure_threshold: int = 5, recovery_timeout: float = 30.0):
        self.failure_threshold = failure_threshold
        self.recovery_timeout = recovery_timeout
        self.state = self.CLOSED
        self.failure_count = 0
        self.last_failure_time = 0.0
        # Monotonic timestamp of the probe currently in flight, or None.
        self._probe_started_at = None
        self._lock = _threading.Lock()

    def can_execute(self) -> bool:
        """Return True if the caller may attempt a request right now."""
        with self._lock:
            if self.state == self.CLOSED:
                return True
            now = _time.monotonic()
            if self.state == self.OPEN:
                if now - self.last_failure_time < self.recovery_timeout:
                    return False
                self.state = self.HALF_OPEN
                self._probe_started_at = now
                return True
            # HALF_OPEN: only one probe at a time. If the probe's outcome was
            # never recorded (its caller died), hand out a fresh probe after
            # another recovery_timeout so the breaker cannot get stuck.
            probe_pending = (
                self._probe_started_at is not None
                and now - self._probe_started_at < self.recovery_timeout
            )
            if probe_pending:
                return False
            self._probe_started_at = now
            return True

    def record_success(self) -> None:
        """Reset the failure count and close the breaker."""
        with self._lock:
            self.failure_count = 0
            self._probe_started_at = None
            self.state = self.CLOSED

    def record_failure(self) -> None:
        """Count a failure; open the breaker at the threshold or on a failed probe."""
        with self._lock:
            self.failure_count += 1
            self.last_failure_time = _time.monotonic()
            self._probe_started_at = None
            if self.state == self.HALF_OPEN or self.failure_count >= self.failure_threshold:
                self.state = self.OPEN

    @property
    def is_open(self) -> bool:
        """True while the breaker is in the OPEN (rejecting) state."""
        with self._lock:
            return self.state == self.OPEN


_circuit = _CircuitBreaker(failure_threshold=5, recovery_timeout=30.0)
64
+
15
65
  # Bake in production defaults — RLS-protected publishable key, safe to ship.
16
66
  _DEFAULT_SUPABASE_URL = "https://gjinagyyjttyxnaoavnz.supabase.co"
17
67
  _DEFAULT_SUPABASE_KEY = "sb_publishable_qwN9PO1L7jUXhhbhhVk2CQ_z1FXG2Qf"
@@ -66,22 +116,31 @@ def _headers(*, prefer: Optional[str] = None, content_profile: bool = True) -> D
66
116
 
67
117
 
68
118
def _request(method: str, path: str, *, data: Any = None, prefer: Optional[str] = None) -> Any:
    """Send one REST request to ``{SUPABASE_URL}/rest/v1/{path}``.

    Args:
        method: HTTP verb passed to ``urllib.request.Request``.
        path: Path (and query string) appended after ``/rest/v1/``.
        data: JSON-serialisable payload; a falsy value sends no body.
        prefer: Forwarded to ``_headers(prefer=...)``.

    Returns:
        The decoded JSON response, ``None`` for a blank response body, or a
        dict with ``_error`` and ``_code`` keys when the call fails or the
        circuit breaker rejects it.
    """
    if not _circuit.can_execute():
        return {"_error": "circuit breaker open — Supabase temporarily unavailable", "_code": 503}

    payload = None
    if data:
        payload = json.dumps(data).encode("utf-8")
    request = Request(
        f"{SUPABASE_URL}/rest/v1/{path}",
        data=payload,
        method=method,
        headers=_headers(prefer=prefer),
    )

    try:
        with urlopen(request, timeout=10) as response:
            text = response.read().decode("utf-8")
            _circuit.record_success()
            if not text.strip():
                return None
            return json.loads(text)
    except HTTPError as http_err:
        detail = http_err.read().decode("utf-8", errors="replace")
        status = http_err.code
        # 4xx means the backend answered; only other statuses count as failures.
        if 400 <= status < 500:
            _circuit.record_success()
        else:
            _circuit.record_failure()
        try:
            message = json.loads(detail).get("message", detail[:200])
        except Exception:
            # Body was not a JSON object; fall back to the raw text.
            message = detail[:200]
        return {"_error": message, "_code": status}
    except (URLError, OSError, TimeoutError) as net_err:
        _circuit.record_failure()
        return {"_error": str(getattr(net_err, 'reason', net_err)), "_code": 0}
85
144
 
86
145
 
87
146
  def sb_select(table: str, filters: str = "", order: Optional[str] = None, limit: Optional[int] = None) -> List[Dict]:
@@ -136,32 +195,49 @@ def enable_recording(api_key: str, project_id: str, agent_name: str, session_id:
136
195
  _recording_session_id = session_id
137
196
 
138
197
 
139
def sb_rpc(fn_name: str, params: Dict, *, _max_retries: int = 3) -> Any:
    """Call the RPC ``fn_name`` via POST, retrying transient failures.

    Args:
        fn_name: RPC function name appended to ``/rest/v1/rpc/``.
        params: JSON-serialisable arguments sent as the request body.
        _max_retries: Maximum number of attempts; values below 1 are treated
            as 1 so the request is always sent at least once.

    Returns:
        The decoded JSON result, ``None`` for a blank response body, or a dict
        with an ``_error`` key on failure. When the circuit breaker rejects
        the call up front the dict also carries ``"_circuit": "open"``.

    4xx responses are returned immediately without retrying. Other HTTP errors
    and network errors are retried with exponential backoff plus jitter, and
    retrying stops early once the circuit breaker has opened.
    """
    import random as _random

    if not _circuit.can_execute():
        return {"_error": "circuit breaker open — Supabase temporarily unavailable", "_circuit": "open"}

    def _record(event_type: str, payload: dict) -> None:
        # Session recording is optional and skipped for excluded RPCs.
        if _recording_enabled and fn_name not in _SKIP_RECORDING:
            _bg_record(event_type, payload)

    # Loop-invariant pieces of the request.
    url = f"{SUPABASE_URL}/rest/v1/rpc/{fn_name}"
    body = json.dumps(params).encode("utf-8")
    attempts = max(1, _max_retries)
    last_err = None

    for attempt in range(attempts):
        req = Request(url, data=body, method="POST", headers=_headers(content_profile=False))
        try:
            with urlopen(req, timeout=10) as resp:
                raw = resp.read().decode("utf-8")
                result = json.loads(raw) if raw.strip() else None
            _circuit.record_success()
            # Auto-record tool calls to session events (hot-reloadable)
            _record("tool_call", {"rpc": fn_name})
            return result
        except HTTPError as e:
            err = e.read().decode("utf-8", errors="replace")
            # 4xx errors are not transient: don't retry, don't trip the breaker.
            if 400 <= e.code < 500:
                _circuit.record_success()
                try:
                    result = {"_error": json.loads(err).get("message", err[:200])}
                except Exception:
                    result = {"_error": err[:200]}
                _record("error", {"rpc": fn_name, "error": str(err)[:200]})
                return result
            _circuit.record_failure()
            last_err = err
        except (URLError, OSError, TimeoutError) as e:
            _circuit.record_failure()
            last_err = str(getattr(e, 'reason', e))

        # Stop when attempts are used up, or when the breaker has opened:
        # hammering an unavailable backend defeats the breaker's purpose.
        if attempt >= attempts - 1 or _circuit.is_open:
            break
        # Retry with jitter for transient errors (5xx, network)
        _time.sleep((2 ** attempt) + _random.uniform(0, 1))

    # No attempt succeeded.
    _record("error", {"rpc": fn_name, "error": str(last_err)[:200], "retries_exhausted": True})
    return {"_error": str(last_err)[:200] if last_err else "request failed after retries"}
165
241
 
166
242
 
167
243
  def _bg_record(event_type: str, payload: dict):
@@ -53,8 +53,9 @@ class RealtimeListener:
53
53
  self.notify_callback = notify_callback
54
54
  self.service_role_key = service_role_key
55
55
 
56
- # Last 100 unread messages — drained by meshcode_check tool
57
- self.queue: Deque[Dict] = deque(maxlen=100)
56
+ # Last 500 unread messages — drained by meshcode_check tool
57
+ self.queue: Deque[Dict] = deque(maxlen=500)
58
+ self._overflow_warned = False
58
59
  self._task: Optional[asyncio.Task] = None
59
60
  # asyncio.Event() in Py3.10+ no longer requires a running loop, but
60
61
  # on older Python or certain Windows event-loop policies it can
@@ -99,22 +100,39 @@ class RealtimeListener:
99
100
  pass
100
101
 
101
102
  async def _run(self) -> None:
102
- """Outer loop: reconnect with exponential backoff on disconnect."""
103
+ """Outer loop: reconnect with exponential backoff on disconnect.
104
+
105
+ NEVER gives up — keeps retrying with capped backoff (max 60s).
106
+ The MCP server must stay alive regardless of Realtime health.
107
+ """
103
108
  backoff = 1
109
+ consecutive_failures = 0
104
110
  while not self._stop.is_set():
105
111
  try:
106
112
  await self._connect_and_listen()
107
113
  backoff = 1 # reset on clean disconnect
114
+ consecutive_failures = 0
108
115
  except asyncio.CancelledError:
109
116
  return
110
117
  except Exception as e:
111
- log.warning(f"Realtime connection error: {e}; reconnecting in {backoff}s")
118
+ consecutive_failures += 1
119
+ if consecutive_failures % 10 == 0:
120
+ log.error(
121
+ f"Realtime: {consecutive_failures} consecutive failures — "
122
+ f"still retrying (backoff={backoff}s). Last error: {e}"
123
+ )
124
+ else:
125
+ log.warning(
126
+ f"Realtime connection error ({consecutive_failures}): "
127
+ f"{e}; reconnecting in {backoff}s"
128
+ )
129
+ self._connected = False
112
130
  try:
113
131
  await asyncio.wait_for(self._stop.wait(), timeout=backoff)
114
132
  return # stop signaled
115
133
  except asyncio.TimeoutError:
116
134
  pass
117
- backoff = min(backoff * 2, 30)
135
+ backoff = min(backoff * 2, 60)
118
136
 
119
137
  async def _connect_and_listen(self) -> None:
120
138
  """Single connection lifecycle: connect, subscribe, listen."""
@@ -155,23 +173,33 @@ class RealtimeListener:
155
173
  await ws.send(json.dumps(join_msg))
156
174
 
157
175
  # Wait for phx_reply to confirm subscription was accepted.
176
+ # Retry up to 3 times with backoff — transient Supabase latency
177
+ # can cause one agent to time out while siblings succeed.
158
178
  self._subscription_ok = False
159
- try:
160
- reply_raw = await asyncio.wait_for(ws.recv(), timeout=10.0)
161
- reply = json.loads(reply_raw)
162
- reply_status = (reply.get("payload") or {}).get("status")
163
- if reply_status == "ok":
164
- self._subscription_ok = True
165
- log.info(f"Realtime subscription OK for {self.agent_name} on {topic}")
166
- else:
167
- log.error(
168
- f"Realtime subscription FAILED for {self.agent_name}: "
169
- f"status={reply_status} payload={reply.get('payload')}"
170
- )
171
- except asyncio.TimeoutError:
172
- log.error(f"Realtime subscription TIMEOUT — no phx_reply in 10s for {self.agent_name}")
173
- except Exception as e:
174
- log.error(f"Realtime subscription error reading phx_reply: {e}")
179
+ for _sub_attempt in range(3):
180
+ try:
181
+ reply_raw = await asyncio.wait_for(ws.recv(), timeout=15.0)
182
+ reply = json.loads(reply_raw)
183
+ reply_status = (reply.get("payload") or {}).get("status")
184
+ if reply_status == "ok":
185
+ self._subscription_ok = True
186
+ log.info(f"Realtime subscription OK for {self.agent_name} on {topic}")
187
+ break
188
+ else:
189
+ log.warning(
190
+ f"Realtime subscription attempt {_sub_attempt+1} failed for {self.agent_name}: "
191
+ f"status={reply_status} payload={reply.get('payload')}"
192
+ )
193
+ except asyncio.TimeoutError:
194
+ log.warning(f"Realtime subscription attempt {_sub_attempt+1} TIMEOUT for {self.agent_name}")
195
+ except Exception as e:
196
+ log.warning(f"Realtime subscription attempt {_sub_attempt+1} error: {e}")
197
+ # Retry: re-send join message after brief backoff
198
+ if _sub_attempt < 2:
199
+ await asyncio.sleep(2 ** _sub_attempt)
200
+ await ws.send(json.dumps(join_msg))
201
+ if not self._subscription_ok:
202
+ log.error(f"Realtime subscription FAILED after 3 attempts for {self.agent_name}")
175
203
 
176
204
  # Heartbeat task to keep the connection alive
177
205
  heartbeat_task = asyncio.create_task(self._heartbeat(ws))
@@ -228,6 +256,9 @@ class RealtimeListener:
228
256
  "id": record.get("id"),
229
257
  "parent_id": record.get("parent_msg_id"),
230
258
  }
259
+ if len(self.queue) >= 400 and not self._overflow_warned:
260
+ log.warning(f"Message queue at {len(self.queue)}/500 — risk of dropping messages")
261
+ self._overflow_warned = True
231
262
  self.queue.append(enriched)
232
263
  # Wake any meshcode_wait blocked on this event.
233
264
  try:
@@ -8,11 +8,14 @@ Run with:
8
8
  MESHCODE_PROJECT=my-app MESHCODE_AGENT=backend python -m meshcode_mcp serve
9
9
  """
10
10
  import asyncio
11
+ import atexit
11
12
  import json
12
13
  import logging
13
14
  import os
15
+ import signal
14
16
  import sys
15
17
  import hashlib as _hashlib
18
+ import traceback as _traceback
16
19
  from collections import deque
17
20
  from contextlib import asynccontextmanager
18
21
  from typing import Any, Dict, List, Optional, Union
@@ -375,29 +378,39 @@ def _get_api_key() -> str:
375
378
  return ""
376
379
 
377
380
 
378
- # Resolve project_id at startup. Try in order:
381
+ # Resolve project_id at startup with retry+backoff. Try in order:
379
382
  # 1. MESHCODE_PROJECT_ID env var (baked by `meshcode setup`, fastest)
380
383
  # 2. mc_resolve_project RPC with the user's api_key (security definer, bypasses RLS)
381
384
  # 3. Direct SELECT via get_project_id (only works if RLS is open / user is admin)
382
385
  _PROJECT_ID: Optional[str] = os.environ.get("MESHCODE_PROJECT_ID") or None
383
386
  if not _PROJECT_ID:
384
- _api_key = _get_api_key()
385
- if _api_key:
386
- try:
387
- _r = be.sb_rpc("mc_resolve_project", {
388
- "p_api_key": _api_key,
389
- "p_project_name": PROJECT_NAME,
390
- })
391
- if isinstance(_r, dict) and _r.get("project_id"):
392
- _PROJECT_ID = _r["project_id"]
393
- elif isinstance(_r, dict) and _r.get("error"):
394
- _mc_log(f" mc_resolve_project: {_r['error']}", "warn")
395
- except Exception as _e:
396
- _mc_log(f" mc_resolve_project failed: {_e}", "warn")
397
- if not _PROJECT_ID:
398
- _PROJECT_ID = be.get_project_id(PROJECT_NAME)
387
+ _BOOT_MAX_RETRIES = 3
388
+ _BOOT_BACKOFF = [2, 5, 10] # seconds between retries
389
+ for _boot_attempt in range(_BOOT_MAX_RETRIES):
390
+ _api_key = _get_api_key()
391
+ if _api_key:
392
+ try:
393
+ _r = be.sb_rpc("mc_resolve_project", {
394
+ "p_api_key": _api_key,
395
+ "p_project_name": PROJECT_NAME,
396
+ })
397
+ if isinstance(_r, dict) and _r.get("project_id"):
398
+ _PROJECT_ID = _r["project_id"]
399
+ break
400
+ elif isinstance(_r, dict) and _r.get("error"):
401
+ _mc_log(f" mc_resolve_project: {_r['error']}", "warn")
402
+ except Exception as _e:
403
+ _mc_log(f" mc_resolve_project failed: {_e}", "warn")
404
+ if not _PROJECT_ID:
405
+ _PROJECT_ID = be.get_project_id(PROJECT_NAME)
406
+ if _PROJECT_ID:
407
+ break
408
+ if _boot_attempt < _BOOT_MAX_RETRIES - 1:
409
+ _wait = _BOOT_BACKOFF[_boot_attempt]
410
+ _mc_log(f" project resolution failed (attempt {_boot_attempt+1}/{_BOOT_MAX_RETRIES}), retrying in {_wait}s...", "warn")
411
+ _time.sleep(_wait)
399
412
  if not _PROJECT_ID:
400
- _mc_log(f"project '{PROJECT_NAME}' not found (check MESHCODE_KEYCHAIN_PROFILE / MESHCODE_API_KEY)", "error")
413
+ _mc_log(f"project '{PROJECT_NAME}' not found after {_BOOT_MAX_RETRIES} attempts (check MESHCODE_KEYCHAIN_PROFILE / MESHCODE_API_KEY)", "error")
401
414
  sys.exit(2)
402
415
 
403
416
  # Resolve project plan for adaptive features (heartbeat interval, etc.)
@@ -492,16 +505,17 @@ def _schedule_flip(status: str, task: str = "") -> None:
492
505
 
493
506
 
494
507
def _set_state(state: str, tool: str = "") -> None:
    """Record the new agent state and hand it to ``_schedule_flip``.

    The module-level state variables are updated while holding ``_flip_lock``;
    the ``_schedule_flip`` call happens after the lock is released.

    Args:
        state: New state name; ``"working"`` also refreshes ``_last_tool_at``.
        tool: Tool/task label stored alongside the state.
    """
    global _current_state, _current_tool, _last_tool_at, _working_timer
    entering_work = state == "working"
    with _flip_lock:
        # Drop any pending working→online timer before switching state.
        pending_timer = _working_timer
        if pending_timer is not None:
            pending_timer.cancel()
            _working_timer = None
        _current_state, _current_tool = state, tool
        if entering_work:
            _last_tool_at = _time.time()
    _schedule_flip(state, tool)
506
520
 
507
521
 
@@ -552,13 +566,14 @@ def with_working_status(func):
552
566
  except Exception as e:
553
567
  if not skip:
554
568
  _auto_learn_error(name, e, list(kwargs.keys()))
555
- raise
569
+ # NEVER re-raise — return structured error instead of crashing
570
+ import traceback as _tb
571
+ _log_crash_to_db("tool_exception", f"{name}: {type(e).__name__}: {e}\n{_tb.format_exc()[-500:]}")
572
+ return {"error": f"tool {name} failed: {type(e).__name__}: {e}", "_recovered": True}
556
573
  finally:
557
574
  if not skip:
558
575
  global _last_tool_at
559
576
  _last_tool_at = _time.time()
560
- # Don't flip to online here — CPU-based detection in heartbeat
561
- # will handle the transition when LLM stops generating
562
577
  return awrapper
563
578
  else:
564
579
  @_functools.wraps(func)
@@ -575,13 +590,14 @@ def with_working_status(func):
575
590
  except Exception as e:
576
591
  if not skip:
577
592
  _auto_learn_error(name, e, list(kwargs.keys()))
578
- raise
593
+ # NEVER re-raise — return structured error instead of crashing
594
+ import traceback as _tb
595
+ _log_crash_to_db("tool_exception", f"{name}: {type(e).__name__}: {e}\n{_tb.format_exc()[-500:]}")
596
+ return {"error": f"tool {name} failed: {type(e).__name__}: {e}", "_recovered": True}
579
597
  finally:
580
598
  if not skip:
581
599
  global _last_tool_at
582
600
  _last_tool_at = _time.time()
583
- # Don't flip to online here — CPU-based detection in heartbeat
584
- # will handle the transition when LLM stops generating
585
601
  return swrapper
586
602
 
587
603
 
@@ -747,6 +763,53 @@ def _release_lease() -> None:
747
763
  pass
748
764
 
749
765
 
766
+ # ── Crash logging + graceful shutdown ──────────────────────────
767
_SHUTDOWN_LOGGED = False

# Reasons that report a recovered, non-fatal error. They must not consume the
# one-shot shutdown latch: otherwise the first tool exception would silence
# every later crash/exit report for the rest of the process lifetime.
_NON_TERMINAL_CRASH_REASONS = frozenset({"tool_exception"})


def _log_crash_to_db(reason: str = "unknown", error_detail: str = "") -> None:
    """Best-effort error report through the ``mc_log_error`` RPC.

    Terminal reasons (anything not in ``_NON_TERMINAL_CRASH_REASONS``) are
    reported at most once per process. If such a report cannot be delivered,
    the agent status is set to ``offline`` as a fallback. Non-terminal reasons
    are always reported and never flip the status.

    Args:
        reason: Short error category, sent as ``p_error_type``.
        error_detail: Free-form detail, truncated to 2000 characters.
    """
    global _SHUTDOWN_LOGGED
    terminal = reason not in _NON_TERMINAL_CRASH_REASONS
    if terminal:
        if _SHUTDOWN_LOGGED:
            return
        _SHUTDOWN_LOGGED = True

    delivered = False
    try:
        result = be.sb_rpc("mc_log_error", {
            "p_api_key": _get_api_key(),
            "p_project_id": _PROJECT_ID,
            "p_agent_name": AGENT_NAME,
            "p_error_type": reason,
            "p_error_detail": error_detail[:2000],
            "p_instance_id": _INSTANCE_ID,
        })
        # sb_rpc reports failures as a dict carrying "_error" instead of
        # raising, so the result has to be inspected as well.
        delivered = not (isinstance(result, dict) and "_error" in result)
    except Exception:
        delivered = False

    if terminal and not delivered:
        # RPC may not exist yet — fall back to a status update.
        try:
            be.set_status(_PROJECT_ID, AGENT_NAME, "offline",
                          f"crashed: {reason[:100]}", api_key=_get_api_key())
        except Exception:
            # Deliberately ignored: crash reporting must never raise.
            pass
    _mc_log(f" crash logged: {reason}", "warn")


def _on_exit() -> None:
    """atexit handler — log the shutdown and release the lease."""
    _log_crash_to_db("process_exit", "atexit handler fired")
    _release_lease()


def _on_signal(signum, frame) -> None:
    """Signal handler (registered for SIGTERM below) — clean shutdown.

    Logs the signal, releases the lease, then exits with ``128 + signum``.
    """
    try:
        sig_name = signal.Signals(signum).name if hasattr(signal, 'Signals') else str(signum)
    except ValueError:
        # Number without a named Signals member.
        sig_name = str(signum)
    _log_crash_to_db("signal", f"Received {sig_name}")
    _release_lease()
    sys.exit(128 + signum)


atexit.register(_on_exit)
try:
    signal.signal(signal.SIGTERM, _on_signal)
except ValueError:
    # signal.signal() raises ValueError when called outside the main thread;
    # the atexit hook above still covers normal interpreter shutdown.
    _mc_log(" SIGTERM handler not installed (not on main thread)", "warn")
811
+
812
+
750
813
  # ============================================================
751
814
  # Agent identity from Supabase profile (for system instructions)
752
815
  # ============================================================
@@ -1037,25 +1100,28 @@ def _heartbeat_thread_fn():
1037
1100
  try:
1038
1101
  be.sb_rpc("mc_heartbeat", {"p_project_id": _PROJECT_ID, "p_agent_name": AGENT_NAME, "p_version": _SDK_VERSION})
1039
1102
 
1040
- # CPU-based status detection
1103
+ # CPU-based status detection — read shared state under lock
1041
1104
  parent_cpu = _get_parent_cpu()
1042
- idle_secs = _time.time() - _last_tool_at
1105
+ with _flip_lock:
1106
+ cur_state = _current_state
1107
+ in_wait = _IN_WAIT
1108
+ idle_secs = _time.time() - _last_tool_at
1043
1109
 
1044
- if _IN_WAIT:
1110
+ if in_wait:
1045
1111
  # Actually in meshcode_wait right now — listening for messages
1046
- if _current_state != "waiting":
1112
+ if cur_state != "waiting":
1047
1113
  _set_state("waiting", "listening for messages")
1048
1114
  elif parent_cpu > 3.0:
1049
1115
  # LLM is actively generating tokens or streaming
1050
- if _current_state != "working":
1116
+ if cur_state != "working":
1051
1117
  _set_state("working", "generating response")
1052
- elif _current_state == "working":
1118
+ elif cur_state == "working":
1053
1119
  # LLM just stopped — transition to online (not sleeping)
1054
1120
  _set_state("online", "")
1055
- elif _current_state == "online" and idle_secs > 30:
1121
+ elif cur_state == "online" and idle_secs > 30:
1056
1122
  # Brief idle — show as idle, not sleeping yet
1057
1123
  _set_state("idle", "idle")
1058
- elif _current_state == "idle" and idle_secs > 300 and parent_cpu < 2.0 and not _STAY_AWAKE:
1124
+ elif cur_state == "idle" and idle_secs > 300 and parent_cpu < 2.0 and not _STAY_AWAKE:
1059
1125
  # Extended idle + no CPU activity → sleeping (5 min, not 90s)
1060
1126
  _set_state("sleeping", "sleeping")
1061
1127
 
@@ -1105,6 +1171,18 @@ async def lifespan(_app):
1105
1171
  )
1106
1172
  await _REALTIME.start()
1107
1173
 
1174
+ # Wait up to 5s for realtime to report connected before proceeding.
1175
+ # Without this, the lifespan yields before the WS is ready, and Claude
1176
+ # Code's handshake can time out on slower network paths — one agent
1177
+ # fails while siblings on the same box succeed.
1178
+ for _rt_check in range(10):
1179
+ if getattr(_REALTIME, '_connected', False):
1180
+ log.info(f"Realtime connected for {AGENT_NAME}")
1181
+ break
1182
+ await asyncio.sleep(0.5)
1183
+ else:
1184
+ log.warning(f"Realtime not connected after 5s for {AGENT_NAME} — continuing with polling fallback")
1185
+
1108
1186
  # IMMEDIATE: send first heartbeat + set online status BEFORE any tool calls.
1109
1187
  # Without this, the agent appears offline for up to 30s after boot.
1110
1188
  for _attempt in range(3):
@@ -1152,16 +1230,19 @@ async def lifespan(_app):
1152
1230
  })
1153
1231
  except Exception:
1154
1232
  pass # Never block shutdown
1155
- log.info("lifespan shutdown — stopping heartbeat + realtime + releasing lease")
1156
- _heartbeat_stop.set()
1157
- hb_thread.join(timeout=5)
1158
- await _REALTIME.stop()
1159
- # Flip to offline + release lease so the dashboard reflects reality
1160
- # within seconds (not waiting for the 30s cron to notice).
1233
+ log.info("lifespan shutdown — releasing lease + stopping heartbeat + realtime")
1234
+ # Release lease FIRST — before stopping heartbeat thread.
1235
+ # If heartbeat join times out, the lease is already released so
1236
+ # the agent won't block reconnection.
1161
1237
  try:
1162
1238
  _release_lease()
1163
1239
  except Exception as _e:
1164
1240
  log.warning(f"could not release lease: {_e}")
1241
+ _heartbeat_stop.set()
1242
+ hb_thread.join(timeout=5)
1243
+ if hb_thread.is_alive():
1244
+ log.warning("heartbeat thread did not stop within 5s — lease already released")
1245
+ await _REALTIME.stop()
1165
1246
 
1166
1247
 
1167
1248
  # ============================================================
@@ -2166,6 +2247,7 @@ def meshcode_scratchpad_set(key: str, value: Any) -> Dict[str, Any]:
2166
2247
  json_value = value if isinstance(value, (dict, list)) else {"_raw": value}
2167
2248
  return be.sb_rpc("mc_scratchpad_set", {
2168
2249
  "p_api_key": api_key,
2250
+ "p_project_id": _PROJECT_ID,
2169
2251
  "p_key": key,
2170
2252
  "p_value": json_value,
2171
2253
  "p_tier": "reference",
@@ -2182,9 +2264,9 @@ def meshcode_scratchpad_get(key: Optional[str] = None) -> Dict[str, Any]:
2182
2264
  """
2183
2265
  api_key = _get_api_key()
2184
2266
  if key:
2185
- return be.sb_rpc("mc_scratchpad_get", {"p_api_key": api_key, "p_key": key})
2267
+ return be.sb_rpc("mc_scratchpad_get", {"p_api_key": api_key, "p_project_id": _PROJECT_ID, "p_key": key})
2186
2268
  else:
2187
- return be.sb_rpc("mc_scratchpad_list", {"p_api_key": api_key})
2269
+ return be.sb_rpc("mc_scratchpad_list", {"p_api_key": api_key, "p_project_id": _PROJECT_ID})
2188
2270
 
2189
2271
 
2190
2272
  # ----------------- OBSIDIAN SYNC HELPER -----------------
@@ -2332,6 +2414,7 @@ def meshcode_forget(key: str) -> Dict[str, Any]:
2332
2414
  "p_api_key": api_key,
2333
2415
  "p_agent_name": AGENT_NAME,
2334
2416
  "p_key": key,
2417
+ "p_project_name": PROJECT_NAME,
2335
2418
  })
2336
2419
 
2337
2420
 
@@ -2357,6 +2440,50 @@ def meshcode_recall_search(query: str) -> Dict[str, Any]:
2357
2440
  })
2358
2441
 
2359
2442
 
2443
+ # ----------------- HEALTH CHECK -----------------
2444
+
2445
@mcp.tool()
def meshcode_health() -> Dict[str, Any]:
    """Check MCP server health: DB connectivity, Realtime status, circuit breaker state, uptime."""
    # The docstring above is left unchanged on purpose.
    # NOTE(review): it is presumably surfaced as the tool description — confirm.
    import time as _t

    health: Dict[str, Any] = {
        "agent": AGENT_NAME,
        "project": PROJECT_NAME,
        "instance_id": _INSTANCE_ID,
        "sdk_version": _SDK_VERSION,
    }

    # DB latency check
    _start = _t.monotonic()
    try:
        r = be.sb_select("mc_projects", f"id=eq.{_PROJECT_ID}", limit=1)
        health["db_latency_ms"] = round((_t.monotonic() - _start) * 1000, 1)
        if isinstance(r, dict) and "_error" in r:
            # Guard in case the backend's error dict is passed through
            # unchanged — it would otherwise be truthy and read as "ok".
            health["db_status"] = f"error: {r['_error']}"
        else:
            health["db_status"] = "ok" if r else "empty"
    except Exception as e:
        health["db_latency_ms"] = round((_t.monotonic() - _start) * 1000, 1)
        health["db_status"] = f"error: {e}"

    # Circuit breaker state
    health["circuit_breaker"] = {
        "state": be._circuit.state,
        "failure_count": be._circuit.failure_count,
        "threshold": be._circuit.failure_threshold,
    }

    # Realtime status. A bare dir() inside a function lists only local names,
    # so a module-level listener has to be looked up through globals().
    _listener = globals().get("_REALTIME")
    if _listener is None:
        health["realtime_connected"] = "unknown"
    else:
        health["realtime_connected"] = bool(getattr(_listener, "_connected", False))

    # Process uptime
    try:
        import psutil
        proc = psutil.Process()
        health["uptime_seconds"] = round(_t.time() - proc.create_time(), 0)
    except Exception:
        health["uptime_seconds"] = "unknown (psutil not available)"

    return health
2485
+
2486
+
2360
2487
  # ----------------- RESOURCES -----------------
2361
2488
 
2362
2489
  @mcp.tool()
@@ -2511,10 +2638,28 @@ def _auto_update() -> None:
2511
2638
 
2512
2639
 
2513
2640
def run_server():
    """Start the MCP server on stdio (default for Claude Code).

    Wraps ``mcp.run()`` so that whatever ends it is reported through
    ``_log_crash_to_db``:

    * ``KeyboardInterrupt`` is logged and the function returns normally.
    * ``SystemExit`` is logged and re-raised, keeping the exit code.
    * Any other exception is logged with its traceback, reported on stderr,
      and turned into ``sys.exit(1)``.
    """
    _auto_update()
    print(
        f"[meshcode-mcp] Starting server for {AGENT_NAME}@{PROJECT_NAME}",
        file=sys.stderr,
    )
    try:
        mcp.run()
    except KeyboardInterrupt:
        _log_crash_to_db("keyboard_interrupt", "User stopped the agent")
    except SystemExit as e:
        _log_crash_to_db("system_exit", f"exit code: {e.code}")
        raise  # re-raise so the process exits with the correct code
    except Exception as e:
        import traceback as _tb
        tb_str = _tb.format_exc()
        # The crash logger truncates its detail from the front, so send the
        # tail of the traceback: the innermost frames identify the failure.
        _log_crash_to_db("unhandled_exception", f"{type(e).__name__}: {e}\n{tb_str[-1500:]}")
        print(f"[meshcode-mcp] FATAL: {e}", file=sys.stderr)
        print("[meshcode-mcp] Stack trace logged to mc_agent_crash_logs", file=sys.stderr)
        sys.exit(1)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: meshcode
3
- Version: 2.10.13
3
+ Version: 2.10.14
4
4
  Summary: Real-time communication between AI agents — Supabase-backed CLI
5
5
  Author-email: MeshCode <hello@meshcode.io>
6
6
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "meshcode"
7
- version = "2.10.13"
7
+ version = "2.10.14"
8
8
  description = "Real-time communication between AI agents — Supabase-backed CLI"
9
9
  readme = "README.md"
10
10
  license = {text = "MIT"}
File without changes
File without changes
File without changes