meshcode 2.11.148__tar.gz → 2.11.150__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. {meshcode-2.11.148 → meshcode-2.11.150}/PKG-INFO +1 -1
  2. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/__init__.py +1 -1
  3. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/hostd.py +230 -1
  4. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/meshcode_mcp/server.py +3 -30
  5. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/protocol_handler.py +173 -19
  6. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/run_agent.py +80 -9
  7. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode.egg-info/PKG-INFO +1 -1
  8. {meshcode-2.11.148 → meshcode-2.11.150}/pyproject.toml +1 -1
  9. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_helper_visuals.py +17 -2
  10. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_hostd_zombie_sessions.py +3 -0
  11. {meshcode-2.11.148 → meshcode-2.11.150}/README.md +0 -0
  12. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/__main__.py +0 -0
  13. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/_session_handoff_template.py +0 -0
  14. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/_stop_hook_template.py +0 -0
  15. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/ascii_art.py +0 -0
  16. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/atomic_push.py +0 -0
  17. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/claude_update.py +0 -0
  18. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/cli.py +0 -0
  19. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/comms_v4.py +0 -0
  20. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/compat.py +0 -0
  21. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/daemon.py +0 -0
  22. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/date_parse.py +0 -0
  23. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/doctor.py +0 -0
  24. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/error_hints.py +0 -0
  25. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/exceptions.py +0 -0
  26. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/helper_visuals.py +0 -0
  27. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/hooks/__init__.py +0 -0
  28. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/hooks/repo_path_lock.py +0 -0
  29. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/invites.py +0 -0
  30. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/launcher.py +0 -0
  31. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/launcher_install.py +0 -0
  32. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/meshcode_mcp/__init__.py +0 -0
  33. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/meshcode_mcp/__main__.py +0 -0
  34. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/meshcode_mcp/backend.py +0 -0
  35. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/meshcode_mcp/realtime.py +0 -0
  36. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/meshcode_mcp/sleep_signals.py +0 -0
  37. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/meshcode_mcp/swarm.py +0 -0
  38. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/meshcode_mcp/test_backend.py +0 -0
  39. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/meshcode_mcp/test_boot_timing.py +0 -0
  40. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/meshcode_mcp/test_install_guard.py +0 -0
  41. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/meshcode_mcp/test_prefs_claude_version.py +0 -0
  42. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/meshcode_mcp/test_realtime.py +0 -0
  43. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/meshcode_mcp/test_server_wrapper.py +0 -0
  44. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/meshcode_mcp/test_swarm.py +0 -0
  45. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/preferences.py +0 -0
  46. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/protocol_v2.py +0 -0
  47. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/quickstart.py +0 -0
  48. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/rpc_allowlist.py +0 -0
  49. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/scripts/check_secrets.py +0 -0
  50. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/scripts/race_rate_harness.py +0 -0
  51. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/secrets.py +0 -0
  52. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/self_update.py +0 -0
  53. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/setup_clients.py +0 -0
  54. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/supervisor.py +0 -0
  55. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/up.py +0 -0
  56. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode/upload.py +0 -0
  57. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode.egg-info/SOURCES.txt +0 -0
  58. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode.egg-info/dependency_links.txt +0 -0
  59. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode.egg-info/entry_points.txt +0 -0
  60. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode.egg-info/requires.txt +0 -0
  61. {meshcode-2.11.148 → meshcode-2.11.150}/meshcode.egg-info/top_level.txt +0 -0
  62. {meshcode-2.11.148 → meshcode-2.11.150}/setup.cfg +0 -0
  63. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_auto_update_hardening.py +0 -0
  64. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_autonomous_closegap_1.py +0 -0
  65. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_autonomous_closegap_2.py +0 -0
  66. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_autonomous_closegap_3.py +0 -0
  67. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_autonomous_prompt_inject.py +0 -0
  68. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_boot_bug_regression.py +0 -0
  69. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_color_truecolor.py +0 -0
  70. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_core.py +0 -0
  71. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_cross_agent_messaging.py +0 -0
  72. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_date_parse.py +0 -0
  73. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_doctor.py +0 -0
  74. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_epistemic_v1_python_sdk.py +0 -0
  75. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_epistemic_v1_stop_conditions.py +0 -0
  76. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_esc_deaf_state.py +0 -0
  77. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_exceptions.py +0 -0
  78. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_file_upload.py +0 -0
  79. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_hostd_launch_pinned_env.py +0 -0
  80. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_hostd_serve_discovery_split.py +0 -0
  81. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_init_device_code.py +0 -0
  82. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_install_guard.py +0 -0
  83. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_lease_sigterm_release.py +0 -0
  84. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_live_mesh_guard.py +0 -0
  85. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_mark_read_batch.py +0 -0
  86. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_marketplace_ratings.py +0 -0
  87. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_migration_integrity.py +0 -0
  88. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_pretrust_claude.py +0 -0
  89. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_realtime_event_freshness.py +0 -0
  90. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_rls_cross_tenant.py +0 -0
  91. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_rpc_grants.py +0 -0
  92. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_rpc_migrations.py +0 -0
  93. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_run_agent_dry_run.py +0 -0
  94. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_run_agent_no_server_import.py +0 -0
  95. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_security_regressions.py +0 -0
  96. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_self_update_user_site.py +0 -0
  97. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_sentinel.py +0 -0
  98. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_session_replay_gate.py +0 -0
  99. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_setup_path.py +0 -0
  100. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_sleep_signals.py +0 -0
  101. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_status_enum_coverage.py +0 -0
  102. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_stay_on_loop_hook.py +0 -0
  103. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_stop_ghost_terminal.py +0 -0
  104. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_swarm_events.py +0 -0
  105. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_task_progress.py +0 -0
  106. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_terminal_lifecycle.py +0 -0
  107. {meshcode-2.11.148 → meshcode-2.11.150}/tests/test_wait_open_tasks_contradiction.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: meshcode
3
- Version: 2.11.148
3
+ Version: 2.11.150
4
4
  Summary: Real-time communication between AI agents — Supabase-backed CLI
5
5
  Author-email: MeshCode <hello@meshcode.io>
6
6
  License: MIT
@@ -1,5 +1,5 @@
1
1
  """MeshCode — Real-time communication between AI agents."""
2
- __version__ = "2.11.148"
2
+ __version__ = "2.11.150"
3
3
 
4
4
  # Exception hierarchy — eagerly imported (lightweight, no deps)
5
5
  from meshcode.exceptions import ( # noqa: F401
@@ -265,6 +265,24 @@ _RESPAWN_CONVERGE_BLOCK_TTL_S = _env_int("MESHCODE_RESPAWN_CONVERGE_BLOCK_TTL_SE
265
265
  _RESPAWN_FRESH_CLICK_S = _env_int("MESHCODE_RESPAWN_FRESH_CLICK_SEC", 90, 30) # spawned_age_s under this = explicit Start click
266
266
  _RESPAWN_BOOT_GRACE_S = _env_int("MESHCODE_RESPAWN_BOOT_GRACE_SEC", 90, 30) # gap 3: a live pid younger than this is BOOTING — a fresh Start must not kill it
267
267
 
268
+ # ------------------------------------------------------------------
269
+ # ABSOLUTE no-confirmed-boot cap (task c9d6b819, chief pt3 — macOS terminal
270
+ # storm 2026-06-19: ian's screen flooded with ~40 dead Terminal windows). The
271
+ # CONVERGENCE guard above is CLEARED by an explicit Start (_fresh_click), and the
272
+ # server-side respawn cap RESETS when a slow agent heartbeats briefly (count->0).
273
+ # So a target whose spawned_at keeps re-stamping (idempotent restart_requested)
274
+ # or that briefly boots every sweep evades BOTH and keeps opening windows. macOS
275
+ # made it worse: `open -a Terminal` returns rc==0 the instant LaunchServices
276
+ # accepts — the window opens, the agent inside dies, the 150s sweep re-qualifies
277
+ # it, and a NEW window opens each time. This backstop is cadence- AND
278
+ # click-INDEPENDENT: it counts EVERY plain respawn of a target in a rolling
279
+ # window and, past the cap, BLOCKS — and unlike convergence an explicit Start
280
+ # does NOT clear it (a Start that merely re-storms is the exact bug). The window
281
+ # elapsing with no further respawn = the agent finally stayed up (confirmed boot)
282
+ # -> state resets. Hard ceiling = _RESPAWN_ABS_CAP spawns / _RESPAWN_ABS_WINDOW_S.
283
+ _RESPAWN_ABS_CAP = _env_int("MESHCODE_RESPAWN_ABS_CAP", 8, 3) # max plain respawns/target before the absolute block
284
+ _RESPAWN_ABS_WINDOW_S = _env_int("MESHCODE_RESPAWN_ABS_WINDOW_SEC", 3600, 600) # rolling window; elapsing with no respawn = confirmed boot -> reset
285
+
268
286
  # ------------------------------------------------------------------
269
287
  # WEDGE WATCHDOG (gap 4 respawn storm, task e5978f32): incident 2026-06-10 —
270
288
  # hostd sat WEDGED ~6h after a DNS failure and nothing restarted it. Root
@@ -986,6 +1004,36 @@ def _do_respawns(api_key: str, host_id: str) -> int:
986
1004
  if not _is_recycle:
987
1005
  _now2 = time.time()
988
1006
  _st2 = _load_state()
1007
+ # ABS-CAP CHECK (chief pt3): cadence/click-independent backstop the
1008
+ # convergence guard + server cap both miss. Evaluated BEFORE the
1009
+ # _fresh_click clearing below so an explicit Start can NEVER loosen it.
1010
+ # window_start elapsing past the window with no new respawn = the agent
1011
+ # stayed up = confirmed boot -> reset to a fresh, unblocked window.
1012
+ _abs_all = dict(_st2.get("respawn_abs") or {})
1013
+ _abs = dict(_abs_all.get(_target) or {})
1014
+ _abs_start = float(_abs.get("window_start") or 0)
1015
+ if not _abs_start or (_now2 - _abs_start) > _RESPAWN_ABS_WINDOW_S:
1016
+ _abs = {"window_start": _now2, "count": 0}
1017
+ _abs_all[_target] = _abs
1018
+ _st2["respawn_abs"] = _abs_all
1019
+ _save_state(_st2)
1020
+ if int(_abs.get("count", 0)) >= _RESPAWN_ABS_CAP:
1021
+ _log(f"ABS-CAP {_target}: {_abs.get('count')} respawns in <= {_RESPAWN_ABS_WINDOW_S}s "
1022
+ f"with no sustained boot — BLOCKING (cadence/click-independent; an explicit "
1023
+ f"Start does NOT clear it; resets only after the window elapses with the agent "
1024
+ f"staying up). [respawn_blocked_reason=respawn_abs_cap]")
1025
+ try: # kill the dashboard 'launching…' spinner with an actionable toast
1026
+ _rpc("mc_resolve_launch", {
1027
+ "p_api_key": api_key, "p_project_id": c.get("project_id"), "p_agent": agent,
1028
+ "p_status": "failed", "p_reason": "respawn_abs_cap",
1029
+ "p_detail": f"agent respawned {_abs.get('count')}x without staying up — auto-launch "
1030
+ f"paused to stop a terminal storm; fix the boot (Claude Code login `claude` "
1031
+ f"/ env), then press Start."})
1032
+ except Exception:
1033
+ pass
1034
+ _log_respawn_event(api_key, host_id, c, "respawn", "failed", "respawn_abs_cap",
1035
+ detail=f"{_abs.get('count')} respawns within {_RESPAWN_ABS_WINDOW_S}s, no sustained boot — abs-cap block")
1036
+ continue
989
1037
  try:
990
1038
  _fresh_click = float(c.get("spawned_age_s")) < _RESPAWN_FRESH_CLICK_S
991
1039
  except (TypeError, ValueError):
@@ -1201,6 +1249,19 @@ def _do_respawns(api_key: str, host_id: str) -> int:
1201
1249
  detail=f"stale {c.get('heartbeat_age_s')}s")
1202
1250
  n += 1
1203
1251
  continue
1252
+ # ABS-CAP INCREMENT (chief pt3): count this plain respawn against the
1253
+ # absolute, click-independent ceiling. Recycle path excluded — it
1254
+ # `continue`d above and has its own convergence guard. Separate
1255
+ # load/save cycle: the guard-block _st2 was already saved by now.
1256
+ _abs_st = _load_state()
1257
+ _abs_all2 = dict(_abs_st.get("respawn_abs") or {})
1258
+ _abs2 = dict(_abs_all2.get(_target) or {})
1259
+ if not _abs2.get("window_start"):
1260
+ _abs2["window_start"] = time.time()
1261
+ _abs2["count"] = int(_abs2.get("count", 0)) + 1
1262
+ _abs_all2[_target] = _abs2
1263
+ _abs_st["respawn_abs"] = _abs_all2
1264
+ _save_state(_abs_st)
1204
1265
  rec = _rpc("mc_record_respawn",
1205
1266
  {"p_api_key": api_key, "p_project_id": c.get("project_id"), "p_agent_name": agent})
1206
1267
  # gap 3 telemetry: post-record count when the RPC returns it (the candidate's
@@ -2761,7 +2822,12 @@ def _hostd_install_windows() -> int:
2761
2822
  'set "MESHCODE_NO_UPDATE=1"\r\n' # var honored pre-2.11.74 (belt-and-suspenders)
2762
2823
  'set "MESHCODE_NO_AUTO_UPDATE=1"\r\n' # unified var (honored 2.11.74+)
2763
2824
  'set "MESHCODE_HOSTD_POLL_SEC=10"\r\n' # faster click->spawn (default 10s, floor 3)
2764
- f'"{mc}" hostd run\r\n',
2825
+ # F1a (task 48c3f294): launch the SUPERVISOR, not `hostd run` directly.
2826
+ # The supervisor owns + revives hostd on CRASH — the recovery the HKCU\Run
2827
+ # tier (login-only) lacks (that was the dead launch button on Samuel's box).
2828
+ # It self-singletons, so the Task-Scheduler periodic watchdog tier above
2829
+ # can also point here without ever double-starting.
2830
+ f'"{mc}" hostd supervise\r\n',
2765
2831
  encoding="utf-8",
2766
2832
  )
2767
2833
  # Register-ScheduledTask (PowerShell) — ROBUST KEEP-ALIVE (task 843f282c CRITICAL: hostd died ~15:04
@@ -3074,6 +3140,163 @@ def _acquire_hostd_singleton():
3074
3140
  return (None, "error")
3075
3141
 
3076
3142
 
3143
+ # ------------------------------------------------------------------
3144
+ # hostd SUPERVISOR (task 48c3f294 / F1a — Windows crash+reboot revive). The only
3145
+ # non-admin Windows persistence that works on locked-down boxes (Samuel's) is
3146
+ # HKCU\Run, which fires ONLY at login: a hostd that CRASHES mid-session is never
3147
+ # revived = the dead launch button. This supervisor is a SEPARATE, minimal
3148
+ # process that OWNS a `hostd run` child and restarts it on exit — the crash
3149
+ # recovery HKCU\Run lacks. Separate process, NOT hostd self-forking: an early
3150
+ # hostd crash (import error / startup fault) before any in-process fork would
3151
+ # otherwise never be revived. COMPOSES with hostd's in-process wedge watchdog
3152
+ # (process-STUCK self-restart via execv); this is the process-GONE half.
3153
+ # Anti-storm: the restart loop mirrors the absolute respawn cap (5618ac17) — a
3154
+ # hostd that crash-loops WITHOUT ever staying up is STOPPED after a ceiling and
3155
+ # its status written to hostd-supervisor.state, so the supervisor can never
3156
+ # become the storm it guards against. macOS (launchd KeepAlive) + Linux (systemd
3157
+ # Restart=on-failure) already get crash-revive, so this is win32-only in practice.
3158
+ # ------------------------------------------------------------------
3159
+ _SUP_LOCK_FH = None
3160
+ _SUP_STATE_PATH = STATE_DIR / "hostd-supervisor.state"
3161
+ _SUP_CONFIRMED_BOOT_S = _env_int("MESHCODE_SUP_CONFIRMED_BOOT_SEC", 120, 30) # child up >= this = confirmed boot -> reset cap
3162
+ _SUP_ABS_CAP = _env_int("MESHCODE_SUP_ABS_CAP", 5, 2) # crash-restarts w/o confirmed boot before STOP
3163
+ _SUP_ABS_WINDOW_S = _env_int("MESHCODE_SUP_ABS_WINDOW_SEC", 3600, 300) # rolling window
3164
+ _SUP_BACKOFF_BASE_S = _env_int("MESHCODE_SUP_BACKOFF_BASE_SEC", 5, 1) # exp backoff base between crash restarts
3165
+ _SUP_BACKOFF_MAX_S = _env_int("MESHCODE_SUP_BACKOFF_MAX_SEC", 300, 10) # backoff ceiling
3166
+ _SUP_POLL_S = _env_int("MESHCODE_SUP_POLL_SEC", 5, 1) # poll cadence while a hostd is already alive
3167
+
3168
+
3169
+ def _flock_probe_held(lock_path: Path) -> bool:
3170
+ """True if a LIVE process holds an exclusive flock/msvcrt lock on lock_path.
3171
+ Non-destructive try-acquire-release — robust vs the informational pid inside
3172
+ the lock file (which can be stale after a hard kill). Used to detect a live
3173
+ hostd before the supervisor spawns one (never double-start)."""
3174
+ try:
3175
+ fh = open(lock_path, "a+")
3176
+ except Exception:
3177
+ return False # can't evaluate -> assume free (supervisor errs toward having a hostd)
3178
+ try:
3179
+ if sys.platform == "win32":
3180
+ import msvcrt
3181
+ fh.seek(0)
3182
+ try:
3183
+ msvcrt.locking(fh.fileno(), msvcrt.LK_NBLCK, 1)
3184
+ except OSError:
3185
+ return True # held by a live process
3186
+ try:
3187
+ msvcrt.locking(fh.fileno(), msvcrt.LK_UNLCK, 1)
3188
+ except OSError:
3189
+ pass
3190
+ return False
3191
+ import fcntl
3192
+ try:
3193
+ fcntl.flock(fh.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
3194
+ except OSError:
3195
+ return True
3196
+ fcntl.flock(fh.fileno(), fcntl.LOCK_UN)
3197
+ return False
3198
+ finally:
3199
+ fh.close()
3200
+
3201
+
3202
+ def _acquire_supervisor_singleton():
3203
+ """One supervisor per machine — flock/msvcrt on hostd-supervisor.lock, held
3204
+ for the process lifetime, OS-released on ANY death (no stale-lock problem;
3205
+ same field-tested mechanism as _acquire_hostd_singleton — addresses the
3206
+ SPOC's 'O_EXCL leaves a stale lock after a hard kill' note). Returns the fh
3207
+ on success, None if another live supervisor already holds it."""
3208
+ try:
3209
+ STATE_DIR.mkdir(parents=True, exist_ok=True)
3210
+ fh = open(STATE_DIR / "hostd-supervisor.lock", "a+")
3211
+ except Exception:
3212
+ return None
3213
+ try:
3214
+ if sys.platform == "win32":
3215
+ import msvcrt
3216
+ fh.seek(0)
3217
+ msvcrt.locking(fh.fileno(), msvcrt.LK_NBLCK, 1)
3218
+ else:
3219
+ import fcntl
3220
+ fcntl.flock(fh.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
3221
+ except OSError:
3222
+ fh.close()
3223
+ return None
3224
+ try:
3225
+ fh.seek(0)
3226
+ fh.truncate()
3227
+ fh.write(str(os.getpid()))
3228
+ fh.flush()
3229
+ except Exception:
3230
+ pass
3231
+ return fh
3232
+
3233
+
3234
+ def _sup_write_state(state: str, reason: str = "", restart_count: int = 0) -> None:
3235
+ """Persist supervisor status to hostd-supervisor.state — the data source for
3236
+ the F1b dashboard-liveness UX (FE reads it to surface 'hostd stopped/looping'
3237
+ instead of a silently dead daemon). Best-effort."""
3238
+ try:
3239
+ _SUP_STATE_PATH.write_text(json.dumps({
3240
+ "state": state, "reason": reason, "restart_count": restart_count,
3241
+ "pid": os.getpid(), "ts": int(time.time()),
3242
+ }), encoding="utf-8")
3243
+ except Exception:
3244
+ pass
3245
+
3246
+
3247
+ def _hostd_supervise() -> int:
3248
+ """Supervisor loop: own a `hostd run` child and revive it on crash, with an
3249
+ absolute no-confirmed-boot cap so a crash-loop can't storm. See the block
3250
+ comment above for the full rationale."""
3251
+ fh = _acquire_supervisor_singleton()
3252
+ if fh is None:
3253
+ _log("hostd supervisor already running (hostd-supervisor.lock held) — exiting (singleton)")
3254
+ return 0
3255
+ global _SUP_LOCK_FH
3256
+ _SUP_LOCK_FH = fh
3257
+ _log("hostd supervisor started — owning + reviving `hostd run` (F1a)")
3258
+ argv = _hostd_run_argv()
3259
+ restarts: list = [] # spawn timestamps that did NOT reach a confirmed boot
3260
+ while True:
3261
+ # NEVER double-start: if a live hostd already holds hostd.lock (a manual
3262
+ # `hostd run`, or an orphan that outlived a prior supervisor), don't spawn
3263
+ # — wait for it to free. Without this, hostd's own singleton would make
3264
+ # our child exit 0 instantly and we'd read that as a crash and loop.
3265
+ if _flock_probe_held(STATE_DIR / "hostd.lock"):
3266
+ _sup_write_state("running", "hostd_already_live", len(restarts))
3267
+ time.sleep(_SUP_POLL_S)
3268
+ continue
3269
+ now = time.time()
3270
+ restarts = [t for t in restarts if now - t < _SUP_ABS_WINDOW_S]
3271
+ if len(restarts) >= _SUP_ABS_CAP:
3272
+ _sup_write_state("stopped", "abs_cap_crash_loop", len(restarts))
3273
+ _log(f"SUPERVISOR ABS-CAP: hostd crash-looped {len(restarts)}x in <= {_SUP_ABS_WINDOW_S}s "
3274
+ f"without staying up >= {_SUP_CONFIRMED_BOOT_S}s — STOPPING (a fresh `meshcode hostd "
3275
+ f"install` re-arms). Wrote {_SUP_STATE_PATH.name} for the dashboard.")
3276
+ return 0
3277
+ start = time.time()
3278
+ restarts.append(start)
3279
+ _sup_write_state("running", "spawning_hostd", len(restarts))
3280
+ try:
3281
+ proc = subprocess.Popen(argv, stdin=subprocess.DEVNULL)
3282
+ except Exception as e:
3283
+ backoff = min(_SUP_BACKOFF_MAX_S, _SUP_BACKOFF_BASE_S * (2 ** (len(restarts) - 1)))
3284
+ _log(f"supervisor: failed to spawn hostd ({e}) — retry in {backoff}s")
3285
+ time.sleep(backoff)
3286
+ continue
3287
+ rc = proc.wait() # authoritative: the hostd process is GONE
3288
+ uptime = time.time() - start
3289
+ if uptime >= _SUP_CONFIRMED_BOOT_S:
3290
+ restarts = [] # stayed up long enough = confirmed boot -> reset the cap
3291
+ backoff = _SUP_BACKOFF_BASE_S
3292
+ _log(f"hostd exited rc={rc} after {int(uptime)}s (confirmed boot) — restart in {backoff}s")
3293
+ else:
3294
+ backoff = min(_SUP_BACKOFF_MAX_S, _SUP_BACKOFF_BASE_S * (2 ** (len(restarts) - 1)))
3295
+ _log(f"hostd exited rc={rc} after {int(uptime)}s (no confirmed boot, "
3296
+ f"{len(restarts)}/{_SUP_ABS_CAP}) — restart in {backoff}s")
3297
+ time.sleep(backoff)
3298
+
3299
+
3077
3300
  def cmd_hostd(args: list) -> int:
3078
3301
  """Entry point for `meshcode hostd ...`."""
3079
3302
  if not args or args[0] in ("-h", "--help"):
@@ -3098,6 +3321,12 @@ def cmd_hostd(args: list) -> int:
3098
3321
  if sub == "uninstall":
3099
3322
  return _hostd_uninstall()
3100
3323
 
3324
+ if sub == "supervise":
3325
+ # F1a (task 48c3f294): own + revive `hostd run`. The Windows non-admin
3326
+ # autostart (HKCU\Run) points here so a mid-session hostd CRASH is revived,
3327
+ # not just a login. Not for macOS/Linux (launchd/systemd already revive).
3328
+ return _hostd_supervise()
3329
+
3101
3330
  if sub == "run":
3102
3331
  if not api_key:
3103
3332
  _log("FATAL: no api key — run `meshcode login` (key is read from the keychain)")
@@ -969,15 +969,6 @@ _current_tool = ""
969
969
  _IDLE_THRESHOLD_S = 120 # seconds without tool call → IDLE
970
970
  _SLEEPING_THRESHOLD_S = 300 # seconds in waiting without activity → SLEEPING
971
971
  _WORKING_COOLDOWN_S = 60 # seconds after last tool returns before flipping to ONLINE
972
-
973
- # ── P1 heartbeat throttle (2.11.148) ──
974
- # When status+task haven't changed, skip the full mc_agents UPDATE and
975
- # write only a cheap mc_heartbeats ping. Idle agents go from ~every 5s
976
- # full-row writes to ~every 15s, staying under the 20s-live / 90s-offline
977
- # thresholds while cutting Realtime fan-out ~3×.
978
- _HB_THROTTLE_S = 15.0
979
- _hb_last_state: "tuple[str, str] | None" = None
980
- _hb_last_full_write: float = 0.0
981
972
  _working_timer: Optional[_threading.Timer] = None
982
973
 
983
974
 
@@ -2573,18 +2564,8 @@ def _heartbeat_loop_inner():
2573
2564
  # no longer wired to a status write.
2574
2565
 
2575
2566
  # Sync current state to DB (in case realtime missed it)
2576
- # P1 throttle (2.11.148): skip full mc_agents UPDATE when state
2577
- # hasn't changed recently — write cheap mc_heartbeats only.
2578
- global _hb_last_state, _hb_last_full_write
2579
- _hb_cur = (_current_state, _current_tool)
2580
- _hb_now = _time.monotonic()
2581
2567
  try:
2582
- if _hb_cur == _hb_last_state and (_hb_now - _hb_last_full_write) < _HB_THROTTLE_S:
2583
- be.heartbeat(_PROJECT_ID, AGENT_NAME)
2584
- else:
2585
- be.set_status(_PROJECT_ID, AGENT_NAME, _current_state, _current_tool, api_key=_get_api_key())
2586
- _hb_last_state = _hb_cur
2587
- _hb_last_full_write = _hb_now
2568
+ be.set_status(_PROJECT_ID, AGENT_NAME, _current_state, _current_tool, api_key=_get_api_key())
2588
2569
  except Exception as e:
2589
2570
  log.warning(f"status sync failed (agent may show stale status): {e}")
2590
2571
  # Realtime subscription recovery: if the WebSocket is connected but
@@ -6477,19 +6458,11 @@ def meshcode_boot() -> Dict[str, Any]:
6477
6458
  _overdue.append(_t)
6478
6459
  else:
6479
6460
  _due_today.append(_t)
6480
- def _compact_due(_t):
6481
- return {
6482
- "id": str(_t.get("id", ""))[:8],
6483
- "title": str(_t.get("title", ""))[:80],
6484
- "due_at": _t.get("due_at"),
6485
- "priority": _t.get("priority"),
6486
- }
6487
- _due_all = _due_today + _overdue
6488
6461
  calendar_block = {
6489
6462
  "pending_due_today": len(_due_today),
6490
6463
  "overdue": len(_overdue),
6491
- "next_due": (_compact_due(_due_all[0]) if _due_all else None),
6492
- "due_tasks": [_compact_due(_t) for _t in _due_all],
6464
+ "next_due": (_due_today[0] if _due_today else (_overdue[0] if _overdue else None)),
6465
+ "due_tasks": _due_today + _overdue,
6493
6466
  }
6494
6467
  resp["calendar_context"] = calendar_block
6495
6468
  except Exception:
@@ -52,6 +52,8 @@ _MAX_BATCH = 32 # hard cap: agents spawned / c
52
52
  _LAUNCH_COOLDOWN_S = 30 # min seconds between same-agent spawns
53
53
  _LIVE_HEARTBEAT_S = 20 # heartbeat age < this = agent live
54
54
  _COOLDOWN_FILE = Path.home() / ".meshcode" / "launch_cooldown.json"
55
+ _SPAWN_VERIFY_GRACE_S = 0.4 # child must survive this long to count as launched
56
+ _MACOS_SPAWN_MARKER_GRACE_S = 6.0 # macOS: poll this long for the launcher's start-marker (open -a rc==0 alone proves nothing)
55
57
 
56
58
 
57
59
  def is_valid_agent_name(name: str) -> bool:
@@ -567,6 +569,7 @@ LAUNCH="$HOME/.meshcode/launchers"
567
569
  SPOOL="$LAUNCH/fleet-spool"
568
570
  LOCK="$LAUNCH/fleet-native-lock"
569
571
  ALIVE="$LAUNCH/fleet-native-alive"
572
+ DISABLED="$LAUNCH/fleet-native-disabled"
570
573
  MYTTY="$1"
571
574
  exec >>"$LAUNCH/fleet-native.log" 2>&1
572
575
  mkdir -p "$SPOOL"
@@ -585,10 +588,29 @@ done
585
588
  trap 'rm -rf "$LOCK"; exit 0' EXIT HUP TERM INT
586
589
  echo "[watcher] $$ holds lock (tty $MYTTY)"
587
590
  N=0
591
+ # flood fix 61e5fbf0: bound tabs opened per drain cycle so a refilled spool can
592
+ # never burst the screen; excess entries wait for the next ~1s cycle.
593
+ DRAIN_CAP=5
588
594
  while :; do
589
595
  date +%s > "$ALIVE"
596
+ # KILL-SWITCH (flood fix 61e5fbf0): honor fleet-native-disabled. A fresh
597
+ # disable/back-off flag STOPS the watcher entirely (clean exit via the trap),
598
+ # instead of looping forever draining whatever refills the spool — the STOP
599
+ # this path lacked during ian's terminal flood.
600
+ if [ -f "$DISABLED" ]; then
601
+ DAGE=$(( $(date +%s) - $(stat -f %m "$DISABLED" 2>/dev/null || echo 0) ))
602
+ if [ "$DAGE" -lt 1800 ]; then
603
+ echo "[watcher] fleet-native-disabled fresh (${DAGE}s) — exiting watcher (flood kill-switch)"
604
+ exit 0
605
+ fi
606
+ fi
607
+ OPENED=0
590
608
  for f in "$SPOOL"/*.cmd; do
591
609
  [ -e "$f" ] || continue
610
+ if [ "$OPENED" -ge "$DRAIN_CAP" ]; then
611
+ echo "[watcher] drain cap $DRAIN_CAP/cycle hit — deferring remaining spool to next cycle"
612
+ break
613
+ fi
592
614
  CMD="$(cat "$f")"
593
615
  BASE="${f%.cmd}"
594
616
  echo "[watcher] tab spawn: $CMD"
@@ -599,6 +621,7 @@ while :; do
599
621
  echo fallback > "$BASE.done"
600
622
  fi
601
623
  rm -f "$f"
624
+ OPENED=$((OPENED+1))
602
625
  done
603
626
  N=$((N+1))
604
627
  if [ $((N % 300)) -eq 0 ]; then
@@ -615,6 +638,14 @@ done
615
638
  return p["watcher"]
616
639
 
617
640
 
641
+ def _fleet_spawn_marker(cmd: str) -> Path:
642
+ """Per-label start-marker for the fleet-native anchor (verified-spawn pt1,
643
+ task afbbb9cf). Distinct from _spawn_terminal_macos's spawn_alive_<label> so
644
+ the two macOS paths never clobber each other's marker. The fleet agent
645
+ launcher touches this as its first action; the anchor producer polls it."""
646
+ return Path.home() / ".meshcode" / f"spawn_alive_fleet_{_launcher_label(cmd)}"
647
+
648
+
618
649
  def _write_fleet_native_agent(cmd: str) -> Path:
619
650
  """Write the per-agent fleet command file (runs the agent inside its
620
651
  native tab; offers to become watcher; close-tab=stop; clean exits close
@@ -635,6 +666,11 @@ def _write_fleet_native_agent(cmd: str) -> Path:
635
666
  lines = [
636
667
  "#!/bin/bash",
637
668
  'cd "$HOME" 2>/dev/null || cd /',
669
+ # start-marker (verified-spawn pt1, task afbbb9cf): the anchor producer
670
+ # polls this to confirm the window/shell actually came up — `open -a
671
+ # Terminal` rc==0 alone proves nothing. Spooled tabs touch it too (harmless;
672
+ # only the anchor path consumes it).
673
+ f': > {shlex.quote(str(_fleet_spawn_marker(cmd)))} 2>/dev/null || true',
638
674
  rf"printf '\033]0;{title}\007\033]1;{title}\007'",
639
675
  ]
640
676
  if venv_bin:
@@ -712,10 +748,30 @@ def _spawn_fleet_native_macos(cmd: str) -> tuple[bool, str]:
712
748
  p["pending"].touch()
713
749
  except OSError:
714
750
  pass
751
+ # clear any stale start-marker so the poll below only sees a FRESH one
752
+ marker = _fleet_spawn_marker(cmd)
753
+ try:
754
+ marker.unlink()
755
+ except OSError:
756
+ pass
715
757
  r = subprocess.run(["open", "-a", "Terminal", str(agent_file)],
716
758
  capture_output=True, text=True)
717
759
  if r.returncode == 0:
718
- return True, "terminal(fleet-native-anchor)"
760
+ # VERIFIED SPAWN (pt1, task afbbb9cf): `open -a Terminal` returns
761
+ # rc==0 the instant LaunchServices accepts — it does NOT prove the
762
+ # anchor window/shell came up. Poll for the launcher's start-marker;
763
+ # a LaunchServices no-op now returns honest False (caller cascades to
764
+ # the tmux fleet bar / legacy windows) instead of a cooldown-poisoning
765
+ # lie. Twin of the _spawn_terminal_macos fix.
766
+ deadline = time.monotonic() + _MACOS_SPAWN_MARKER_GRACE_S
767
+ while time.monotonic() < deadline:
768
+ if marker.exists():
769
+ return True, "terminal(fleet-native-anchor)"
770
+ time.sleep(0.1)
771
+ if marker.exists():
772
+ return True, "terminal(fleet-native-anchor)"
773
+ return False, (f"fleet-native anchor: open returned 0 but window never "
774
+ f"started within {_MACOS_SPAWN_MARKER_GRACE_S}s")
719
775
  return False, (r.stderr or "open failed").strip()
720
776
  # an anchor is booting — wait for its watcher before spooling
721
777
  deadline = time.time() + 25
@@ -724,8 +780,17 @@ def _spawn_fleet_native_macos(cmd: str) -> tuple[bool, str]:
724
780
  if _alive_age() > 8:
725
781
  return False, "fleet-native anchor never came alive"
726
782
 
727
- sp = p["spool"] / f"{time.time_ns()}-{_launcher_label(cmd)}.cmd"
783
+ # DEDUP (flood fix 61e5fbf0): spool keyed by LABEL, NOT time_ns. The old
784
+ # `{time_ns}-{label}.cmd` minted a UNIQUE file every call, so re-Launching the
785
+ # same agent stacked N entries and the watcher opened N tabs with no dedup —
786
+ # ian's ~40-window flood (Samuel re-enabling Launch kept refilling the spool).
787
+ # A label key means a re-Launch OVERWRITES the single pending entry; and if an
788
+ # entry for this label is still pending (watcher hasn't drained it), we don't
789
+ # write a duplicate at all.
790
+ sp = p["spool"] / f"{_launcher_label(cmd)}.cmd"
728
791
  done = sp.with_suffix(".done")
792
+ if sp.exists():
793
+ return True, "fleet-native-queued (already pending — dedup)"
729
794
  try:
730
795
  sp.write_text(f"/bin/bash {shlex.quote(str(agent_file))}\n", encoding="utf-8")
731
796
  except Exception as e:
@@ -786,9 +851,19 @@ def _spawn_terminal_macos(cmd: str) -> tuple[bool, str]:
786
851
  venv_bin = ""
787
852
  # Stable launcher path (debuggable, reused across spawns; not /tmp).
788
853
  launch_dir = Path.home() / ".meshcode" / "launchers"
789
- script_path = launch_dir / f"{_launcher_label(cmd)}.command"
854
+ _label = _launcher_label(cmd)
855
+ script_path = launch_dir / f"{_label}.command"
856
+ # SPAWN-START MARKER (fix c9d6b819, chief pt1): `open -a Terminal` returns rc==0
857
+ # the instant LaunchServices accepts the request — it does NOT prove the window
858
+ # came up or the launcher shell ran. The launcher touches this marker as its
859
+ # FIRST action; after `open` we poll for it, so a LaunchServices no-op becomes
860
+ # an honest (False, reason) instead of a cooldown-poisoning lie (the macOS twin
861
+ # of the Linux/Win _verified_popen fix). Cleared just before `open` so a stale
862
+ # marker from a prior spawn can't read as success.
863
+ marker_path = Path.home() / ".meshcode" / f"spawn_alive_{_label}"
790
864
  lines = ["#!/bin/bash",
791
- 'cd "$HOME" 2>/dev/null || cd /'] # neutral, non-TCC-protected cwd
865
+ 'cd "$HOME" 2>/dev/null || cd /', # neutral, non-TCC-protected cwd
866
+ f': > {shlex.quote(str(marker_path))} 2>/dev/null || true'] # start-marker (verified-spawn pt1)
792
867
  # HELPER VISUALS (task d8f8e325): amber background + helper: title before
793
868
  # the agent boots. Terminal.app ignores OSC 11 (title still applies);
794
869
  # iTerm2 honors both. Silent-degrade by construction (`|| true`).
@@ -859,21 +934,89 @@ def _spawn_terminal_macos(cmd: str) -> tuple[bool, str]:
859
934
  os.chmod(script_path, 0o755)
860
935
  except Exception as e:
861
936
  return False, f"could not write launcher {script_path}: {e}"
937
+ # Clear any stale start-marker so the poll below can only see a FRESH one
938
+ # written by the launcher we are about to open (verified-spawn pt1).
939
+ try:
940
+ marker_path.unlink()
941
+ except FileNotFoundError:
942
+ pass
943
+ except Exception:
944
+ pass
945
+
946
+ def _marker_came_up() -> bool:
947
+ # Poll for the launcher's start-marker: its presence proves the window
948
+ # opened and the launcher shell ran its first line (open -a rc==0 does
949
+ # NOT). Generous grace — `: > marker` is the shell's first action, so
950
+ # even a loaded box stamps it in well under a second; absence after the
951
+ # grace means LaunchServices accepted but no window/shell ever ran.
952
+ deadline = time.monotonic() + _MACOS_SPAWN_MARKER_GRACE_S
953
+ while time.monotonic() < deadline:
954
+ if marker_path.exists():
955
+ return True
956
+ time.sleep(0.1)
957
+ return marker_path.exists()
958
+
862
959
  # `open -a <App> <file>` activates the app + brings it to the FRONT (focused).
863
960
  # NEVER bare `open <file>` / `-g`: those can open in the background → looks like
864
961
  # "nothing happened" (same class of bug as the Windows `-w new` focus fix).
865
962
  r = subprocess.run(["open", "-a", app, str(script_path)],
866
963
  capture_output=True, text=True)
867
- if r.returncode == 0:
964
+ if r.returncode == 0 and _marker_came_up():
868
965
  return True, term
869
- err = (r.stderr or "open failed").strip()
966
+ err = (r.stderr or "open failed").strip() if r.returncode != 0 else \
967
+ f"open -a {app} returned 0 but window never started within {_MACOS_SPAWN_MARKER_GRACE_S}s"
870
968
  # Last-ditch: plain `open` lets LaunchServices pick the .command handler
871
969
  # (still no AppleScript). A background window beats no window.
872
970
  r2 = subprocess.run(["open", str(script_path)], capture_output=True, text=True)
873
- if r2.returncode == 0:
971
+ if r2.returncode == 0 and _marker_came_up():
874
972
  return True, f"{term}(open-default)"
973
+ r2err = (r2.stderr or "open failed").strip() if r2.returncode != 0 else \
974
+ f"open returned 0 but window never started within {_MACOS_SPAWN_MARKER_GRACE_S}s"
875
975
  return False, (f"open -a {app} failed ({err}); "
876
- f"open fallback failed ({(r2.stderr or '').strip()})")
976
+ f"open fallback failed ({r2err})")
977
+
978
+
979
def _verified_popen(argv: list, *, grace_s: "float | None" = None,
                    **popen_kwargs) -> tuple[bool, str]:
    """Popen `argv`, wait a short grace period, and report whether it launched.

    Replaces the old fire-and-`return True` that lied on Win/Linux
    (launch-reliability root cause, front diag task 1e4dba20, fix 809d3b37).

    A silent spawn failure used to surface as ok=True, which poisoned the 30s
    DEDUP-2 cooldown (stamped on the lie) and showed the user NO error: "I click
    Start, nothing happens, and clicking again does nothing for ~30s." Verifying
    the spawn turns those into honest skipped rows the FE already renders, and —
    because the cooldown in cmd_launch_batch is only stamped when ok is True —
    automatically stops a failed launch from blocking its own retry.

    Args:
        argv: Command line handed to `subprocess.Popen` unchanged.
        grace_s: Seconds to wait for the child before treating it as launched.
            `None` (the default) uses the module-level `_SPAWN_VERIFY_GRACE_S`,
            so existing callers behave exactly as before.
        **popen_kwargs: Extra `subprocess.Popen` keyword arguments.
            `start_new_session=True` is applied unless the caller overrides it.

    Returns:
        `(ok, info)`. The rc==0 case is the correctness subtlety:
          - Popen itself raises (e.g. binary missing) -> (False, "popen failed: ...").
          - child still alive after the grace -> (True, "") — a foreground
            terminal (xterm/konsole/wt-new-window) that owns the session for
            its lifetime.
          - child exits rc==0 within the grace -> (True, "") — a HAND-OFF
            launcher that did its job and quit: gnome-terminal's client/daemon
            model and Windows `cmd /c start` / `wt` (signals the existing fleet
            window) both exit 0 immediately even though the window opened.
            Treating that as a failure would false-negative every
            gnome-terminal/Windows launch.
          - child exits rc!=0 within the grace -> (False, "exited rc=N") — the
            only true spawn failure (bad terminal args, display error, no
            DISPLAY, ...).

    stderr is intentionally NOT piped: a foreground terminal lives for the whole
    agent session, and an unread PIPE would deadlock it once it writes >64KB of
    warnings. rc alone is a sufficient skip reason; richer diagnostics live in
    the per-candidate exception strings the callers already build.
    """
    popen_kwargs.setdefault("start_new_session", True)
    try:
        proc = subprocess.Popen(argv, **popen_kwargs)
    except Exception as e:
        # Deliberately broad: any failure to start the child is reported as a
        # skip reason rather than propagated to the launch path.
        return False, f"popen failed: {e}"

    # Resolve the default lazily so an explicit grace_s never needs the global.
    wait_s = _SPAWN_VERIFY_GRACE_S if grace_s is None else grace_s
    try:
        rc = proc.wait(timeout=wait_s)
    except subprocess.TimeoutExpired:
        return True, ""  # still alive after grace = launched
    if rc == 0:
        return True, ""  # clean hand-off (gnome-terminal / start / wt)
    return False, f"exited rc={rc}"  # died immediately = real spawn failure
877
1020
 
878
1021
 
879
1022
  def _spawn_terminal_linux(cmd: str) -> tuple[bool, str]:
@@ -895,13 +1038,18 @@ def _spawn_terminal_linux(cmd: str) -> tuple[bool, str]:
895
1038
  ("xterm", ["xterm", "-e", f"bash -lc {shlex.quote(cmd)}"]),
896
1039
  ("xfce4-terminal", ["xfce4-terminal", "-e", f"bash -lc {shlex.quote(cmd)}"]),
897
1040
  ]
1041
+ last_err = ""
898
1042
  for name, argv in candidates:
899
1043
  if shutil.which(name):
900
- try:
901
- subprocess.Popen(argv, start_new_session=True)
1044
+ # Verify the emulator actually came up: a failed Popen here used to
1045
+ # `return True` and poison the cooldown (root-cause fix 809d3b37).
1046
+ ok, info = _verified_popen(argv)
1047
+ if ok:
902
1048
  return True, name
903
- except Exception as e:
904
- return False, f"{name}: {e}"
1049
+ last_err = f"{name}: {info}"
1050
+ # try the next available emulator rather than lying about success
1051
+ if last_err:
1052
+ return False, last_err
905
1053
  return False, "no terminal emulator found (tried gnome-terminal/konsole/xterm/xfce4)"
906
1054
 
907
1055
 
@@ -978,6 +1126,7 @@ def _spawn_terminal_windows(cmd: str) -> tuple[bool, str]:
978
1126
  # See _windows_session_launcher for the full close-path contract (and why
979
1127
  # a one-liner wrapper is impossible: cmd /c outer-quote stripping).
980
1128
  script = str(_windows_session_launcher(cmd))
1129
+ wt_err = ""
981
1130
  if wt:
982
1131
  try:
983
1132
  # FLEET TABS (task 2ac3f111, supersedes 35bee961's `-w new` AND the
@@ -1018,19 +1167,24 @@ def _spawn_terminal_windows(cmd: str) -> tuple[bool, str]:
1018
1167
  _title = _hv.helper_title(label)
1019
1168
  except Exception:
1020
1169
  pass
1021
- subprocess.Popen([wt, "-w", "meshcode-fleet", "nt",
1170
+ ok, info = _verified_popen([wt, "-w", "meshcode-fleet", "nt",
1022
1171
  *_helper_args,
1023
1172
  "--title", _title,
1024
1173
  "--suppressApplicationTitle",
1025
1174
  "cmd", "/c", script.replace(";", "\\;")])
1026
- return True, "wt(fleet-tab)"
1175
+ if ok:
1176
+ return True, "wt(fleet-tab)"
1177
+ # wt started then died (Samuel's box: the ';' split -> 0x80070002).
1178
+ # Do NOT report success — that poisoned the 30s cooldown and showed
1179
+ # no error (root-cause fix 809d3b37). Fall through to the cmd.exe
1180
+ # path so the agent still gets a window instead of a silent no-op.
1181
+ wt_err = f"wt.exe: {info}"
1027
1182
  except Exception as e:
1028
- return False, f"wt.exe: {e}"
1029
- try:
1030
- subprocess.Popen(["cmd.exe", "/c", "start", "cmd", "/c", script])
1183
+ wt_err = f"wt.exe: {e}"
1184
+ ok, info = _verified_popen(["cmd.exe", "/c", "start", "cmd", "/c", script])
1185
+ if ok:
1031
1186
  return True, "cmd"
1032
- except Exception as e:
1033
- return False, f"cmd.exe: {e}"
1187
+ return False, "; ".join(x for x in (wt_err, f"cmd.exe: {info}") if x)
1034
1188
 
1035
1189
 
1036
1190
  def _spawn_terminal(cmd: str) -> tuple[bool, str]:
@@ -843,6 +843,78 @@ def _preflight_heartbeat(agent: str, project: str) -> None:
843
843
  print(f"[meshcode] Pre-flight heartbeat skipped: {e}", file=sys.stderr)
844
844
 
845
845
 
846
def _report_launch_failure(agent: str, project: str, reason: str, detail: str) -> None:
    """Report a failed editor spawn to the dashboard and mark the agent offline.

    Editor spawn failed AFTER the pre-flight heartbeat already marked the
    agent online — revert the ghost 'online' row to offline and tell the
    dashboard WHY (launch-reliability fix 809d3b37, factors C + H).

    This replaces a broken inline block in run()'s FileNotFoundError handler
    that referenced `api_key`/`project_id` — names that are never bound in
    run()'s scope — so the very RPC meant to surface "claude not installed"
    raised NameError and was swallowed by a bare `except`, leaving the agent
    a ghost 'online' with no diagnostic. Resolving the creds here (mirroring
    _preflight_heartbeat) makes the failure path actually report.

    Args:
        agent: Agent name sent as `p_agent` / `p_agent_name`.
        project: Project name resolved to an id via `mc_resolve_project`.
        reason: Short machine-readable failure code (e.g. "claude_not_installed").
        detail: Human-readable explanation forwarded to the dashboard.

    Best-effort: any failure is logged to stderr and swallowed — an error
    handler must never raise. Returns silently when no API key or project id
    can be resolved.
    """
    try:
        from .setup_clients import _load_supabase_env
        import importlib
        secrets_mod = importlib.import_module("meshcode.secrets")
        from urllib.request import Request, urlopen

        profile = os.environ.get("MESHCODE_KEYCHAIN_PROFILE") or "default"
        api_key = secrets_mod.get_api_key(profile=profile)
        if not api_key:
            return

        sb = _load_supabase_env()
        headers = {
            "apikey": sb["SUPABASE_KEY"],
            "Authorization": f"Bearer {sb['SUPABASE_KEY']}",
            "Content-Type": "application/json",
            "Content-Profile": "meshcode",
        }

        # Resolve the project name to its id; both RPCs below are keyed on it.
        resolve_body = json.dumps({"p_api_key": api_key, "p_project_name": project})
        req = Request(
            f"{sb['SUPABASE_URL']}/rest/v1/rpc/mc_resolve_project",
            data=resolve_body.encode(), method="POST", headers=headers,
        )
        with urlopen(req, timeout=5) as resp:
            proj_data = json.loads(resp.read().decode())
        project_id = proj_data.get("project_id") if proj_data else None
        if not project_id:
            return

        # 1) Tell the dashboard why the launch failed (actionable toast).
        #    A failure here is logged but must not block the status revert.
        try:
            from .comms_v4 import sb_rpc as _rpc_resolve
            _rpc_resolve("mc_resolve_launch", {
                "p_api_key": api_key, "p_project_id": project_id, "p_agent": agent,
                "p_status": "failed", "p_reason": reason, "p_detail": detail})
        except Exception as toast_err:
            print(f"[meshcode] Launch-failure toast skipped: {toast_err}",
                  file=sys.stderr)

        # 2) Revert the ghost 'online' the pre-flight heartbeat stamped (factor C).
        status_body = json.dumps({
            "p_api_key": api_key,
            "p_project_id": project_id,
            "p_agent_name": agent,
            "p_status": "offline",
            "p_task": f"launch failed: {reason}",
        })
        status_req = Request(
            f"{sb['SUPABASE_URL']}/rest/v1/rpc/mc_agent_set_status_by_api_key",
            data=status_body.encode(), method="POST", headers=headers,
        )
        with urlopen(status_req, timeout=5) as resp:
            resp.read()
    except Exception as e:
        print(f"[meshcode] Launch-failure report skipped: {e}", file=sys.stderr)
916
+
917
+
846
918
  # Repo-scoped launch (task 24e3dd44 / core-commander launch-diff). When `meshcode run
847
919
  # <agent> --repo <path>` is used, the agent boots with cwd=repo (not the meshcode
848
920
  # workspace), so its repo CLAUDE.md loads — we carry the boot protocol via
@@ -1543,16 +1615,15 @@ def run(agent: str, project: Optional[str] = None, editor_override: Optional[str
1543
1615
  except FileNotFoundError:
1544
1616
  print(f"[meshcode] ERROR: '{editor}' not found in PATH", file=sys.stderr)
1545
1617
  # task 843f282c Phase 2: tell the dashboard WHY (claude_not_installed) so the pending launch
1546
- # toast fires with an actionable message instead of spinning forever. Best-effort.
1547
- try:
1548
- from .comms_v4 import sb_rpc as _rpc_resolve
1549
- _rpc_resolve("mc_resolve_launch", {
1550
- "p_api_key": api_key, "p_project_id": project_id, "p_agent": agent,
1551
- "p_status": "failed", "p_reason": "claude_not_installed",
1552
- "p_detail": f"'{editor}' not found in PATH — install: npm i -g @anthropic-ai/claude-code"})
1553
- except Exception:
1554
- pass
1618
+ # toast fires with an actionable message instead of spinning forever. Also reverts the
1619
+ # pre-flight ghost 'online' (fix 809d3b37 H+C — the old inline block referenced unbound
1620
+ # api_key/project_id and crashed with NameError, swallowed silently).
1621
+ _report_launch_failure(
1622
+ agent, resolved_project, "claude_not_installed",
1623
+ f"'{editor}' not found in PATH — install: npm i -g @anthropic-ai/claude-code")
1555
1624
  return 127
1556
1625
  except Exception as e:
1557
1626
  print(f"[meshcode] ERROR launching {editor}: {e}", file=sys.stderr)
1627
+ # Any other spawn failure also stranded a ghost 'online' (fix 809d3b37 C).
1628
+ _report_launch_failure(agent, resolved_project, "launch_error", str(e)[:200])
1558
1629
  return 1
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: meshcode
3
- Version: 2.11.148
3
+ Version: 2.11.150
4
4
  Summary: Real-time communication between AI agents — Supabase-backed CLI
5
5
  Author-email: MeshCode <hello@meshcode.io>
6
6
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "meshcode"
7
- version = "2.11.148"
7
+ version = "2.11.150"
8
8
  description = "Real-time communication between AI agents — Supabase-backed CLI"
9
9
  readme = "README.md"
10
10
  license = {text = "MIT"}
@@ -102,6 +102,9 @@ class WindowsSpawnTests(unittest.TestCase):
102
102
  mock.patch.object(ph, "_windows_session_launcher",
103
103
  lambda c: ph.Path("C:\\u\\.meshcode\\launchers\\x.cmd")), \
104
104
  mock.patch.object(ph.subprocess, "Popen") as popen:
105
+ # _verified_popen (fix 809d3b37) waits a grace period; a live
106
+ # terminal survives it -> TimeoutExpired == launched.
107
+ popen.return_value.wait.side_effect = ph.subprocess.TimeoutExpired("wt", 0.4)
105
108
  ok, info = ph._spawn_terminal_windows(cmd)
106
109
  self.assertTrue(ok)
107
110
  return popen.call_args[0][0]
@@ -132,6 +135,9 @@ class LinuxSpawnTests(unittest.TestCase):
132
135
  with mock.patch.object(ph.shutil, "which",
133
136
  lambda n: "/usr/bin/gnome-terminal" if n == "gnome-terminal" else None), \
134
137
  mock.patch.object(ph.subprocess, "Popen") as popen:
138
+ # _verified_popen (fix 809d3b37) waits a grace period; a live
139
+ # terminal survives it -> TimeoutExpired == launched.
140
+ popen.return_value.wait.side_effect = ph.subprocess.TimeoutExpired("gnome-terminal", 0.4)
135
141
  ok, name = ph._spawn_terminal_linux(cmd)
136
142
  self.assertTrue(ok)
137
143
  return popen.call_args[0][0]
@@ -152,10 +158,19 @@ class LinuxSpawnTests(unittest.TestCase):
152
158
  class MacLauncherTests(unittest.TestCase):
153
159
  def test_helper_command_launcher_carries_prelude(self):
154
160
  with tempfile.TemporaryDirectory() as td:
161
+ # _spawn_terminal_macos (fix c9d6b819) polls for a start-marker the
162
+ # launcher writes; a real `open` runs the .command which stamps it.
163
+ # Model that: the fake `open` touches the marker for the spawned script.
164
+ def fake_open(argv, *a, **k):
165
+ script = next((Path(x) for x in argv if str(x).endswith(".command")), None)
166
+ if script is not None:
167
+ marker = Path(td) / ".meshcode" / f"spawn_alive_{script.stem}"
168
+ marker.parent.mkdir(parents=True, exist_ok=True)
169
+ marker.touch()
170
+ return mock.Mock(returncode=0, stderr="")
155
171
  with mock.patch.object(ph, "_detect_macos_terminal", lambda: "terminal"), \
156
172
  mock.patch.object(ph.Path, "home", classmethod(lambda cls: Path(td))), \
157
- mock.patch.object(ph.subprocess, "run",
158
- lambda *a, **k: mock.Mock(returncode=0, stderr="")):
173
+ mock.patch.object(ph.subprocess, "run", fake_open):
159
174
  ok, _ = ph._spawn_terminal_macos(
160
175
  'exec python -m meshcode run "mesh-core/helper-scorer"')
161
176
  self.assertTrue(ok)
@@ -397,6 +397,9 @@ class FleetWindowTests(unittest.TestCase):
397
397
  mock.patch.object(ph, "_windows_session_launcher",
398
398
  lambda c: ph.Path("C:\\u\\.meshcode\\launchers\\x.cmd")), \
399
399
  mock.patch.object(ph.subprocess, "Popen") as popen:
400
+ # _verified_popen (fix 809d3b37) waits a grace period; a live wt
401
+ # window survives it -> TimeoutExpired == launched.
402
+ popen.return_value.wait.side_effect = ph.subprocess.TimeoutExpired("wt", 0.4)
400
403
  ok, info = ph._spawn_terminal_windows(cmd)
401
404
  self.assertTrue(ok)
402
405
  self.assertEqual(info, "wt(fleet-tab)")
File without changes
File without changes
File without changes
File without changes