specmem-hardwicksoftware 3.5.99 → 3.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2034,6 +2034,10 @@ class FrankensteinEmbeddings:
2034
2034
  # THREAD SAFETY: Lock for model loading to prevent race conditions
2035
2035
  self._model_lock = threading.Lock()
2036
2036
 
2037
+ # Health status flag: reflects whether model is loaded and functional
2038
+ # Set to False on load failure, True on successful load + health check
2039
+ self._model_healthy = True
2040
+
2037
2041
  # ═══════════════════════════════════════════════════════════════════
2038
2042
  # OPT-6: LAZY LOADING - Don't load model until first request
2039
2043
  # ═══════════════════════════════════════════════════════════════════
@@ -2142,43 +2146,79 @@ class FrankensteinEmbeddings:
2142
2146
 
2143
2147
  Uses double-checked locking pattern to avoid lock contention when
2144
2148
  model is already loaded.
2149
+
2150
+ Retries with exponential backoff on failure (Issue #17 fix).
2151
+ Configurable via:
2152
+ - SPECMEM_MODEL_RELOAD_RETRIES (default 3)
2153
+ - SPECMEM_MODEL_RELOAD_DELAY_MS (default 1000) - base delay in ms
2154
+
2155
+ Raises RuntimeError if all retries fail, ensuring callers get an
2156
+ explicit error instead of silent failure.
2145
2157
  """
2146
- # Fast path: model already loaded (no lock needed)
2147
- if self.model is not None:
2158
+ # Fast path: model already loaded and healthy (no lock needed)
2159
+ if self.model is not None and getattr(self, '_model_healthy', True):
2148
2160
  return
2149
2161
 
2162
+ max_retries = int(os.environ.get('SPECMEM_MODEL_RELOAD_RETRIES', '3'))
2163
+ base_delay_ms = int(os.environ.get('SPECMEM_MODEL_RELOAD_DELAY_MS', '1000'))
2164
+
2150
2165
  # Slow path: need to load model (with lock)
2151
2166
  with self._model_lock:
2152
2167
  # Double-check inside lock (another thread may have loaded it)
2153
- if self.model is not None:
2168
+ if self.model is not None and getattr(self, '_model_healthy', True):
2154
2169
  return
2155
2170
 
2156
- print(f"🔄 Lazy-loading model: {self.base_model} ({_BEST_ONNX_FILE})", file=sys.stderr)
2157
- start = time.time()
2158
- try:
2159
- # NOTE: backend='onnx' is REQUIRED for model_kwargs file_name to work
2160
- self.model = SentenceTransformer(
2161
- self.base_model,
2162
- device='cpu',
2163
- backend='onnx',
2164
- cache_folder=str(self.cache_dir),
2165
- model_kwargs={"file_name": _BEST_ONNX_FILE}
2166
- )
2167
- load_time = (time.time() - start) * 1000
2168
- print(f"✅ Model loaded in {load_time:.0f}ms - ready to embed!", file=sys.stderr)
2171
+ last_error = None
2172
+ for attempt in range(1, max_retries + 1):
2173
+ print(f"[MODEL-RELOAD] Loading model: {self.base_model} ({_BEST_ONNX_FILE}) (attempt {attempt}/{max_retries})", file=sys.stderr)
2174
+ start = time.time()
2175
+ try:
2176
+ # NOTE: backend='onnx' is REQUIRED for model_kwargs file_name to work
2177
+ self.model = SentenceTransformer(
2178
+ self.base_model,
2179
+ device='cpu',
2180
+ backend='onnx',
2181
+ cache_folder=str(self.cache_dir),
2182
+ model_kwargs={"file_name": _BEST_ONNX_FILE}
2183
+ )
2184
+ load_time = (time.time() - start) * 1000
2169
2185
 
2170
- # Update native dims if we didn't know them
2171
- actual_dims = self.model.get_sentence_embedding_dimension()
2172
- if self.dim_config.native_dims != actual_dims:
2173
- print(f" Native dims updated: {self.dim_config.native_dims} -> {actual_dims}", file=sys.stderr)
2174
- self.dim_config.native_dims = actual_dims
2186
+ # Verify the model actually works by doing a test encode
2187
+ test_embedding = self.model.encode("health check", show_progress_bar=False)
2188
+ if test_embedding is None or len(test_embedding) == 0:
2189
+ raise RuntimeError("Model loaded but produced empty embedding on health check")
2175
2190
 
2176
- except Exception as e:
2177
- print(f" Model loading failed: {e}", file=sys.stderr)
2178
- raise
2191
+ self._model_healthy = True
2192
+ print(f"[MODEL-RELOAD] Model loaded and verified in {load_time:.0f}ms (attempt {attempt}) - ready to embed!", file=sys.stderr)
2179
2193
 
2180
- # Update last request time so idle monitor resets
2181
- self.last_request_time = time.time()
2194
+ # Update native dims if we didn't know them
2195
+ actual_dims = self.model.get_sentence_embedding_dimension()
2196
+ if self.dim_config.native_dims != actual_dims:
2197
+ print(f" Native dims updated: {self.dim_config.native_dims} -> {actual_dims}", file=sys.stderr)
2198
+ self.dim_config.native_dims = actual_dims
2199
+
2200
+ # Update last request time so idle monitor resets
2201
+ self.last_request_time = time.time()
2202
+ return # Success
2203
+
2204
+ except Exception as e:
2205
+ last_error = e
2206
+ self.model = None
2207
+ self._model_healthy = False
2208
+ print(f"[MODEL-RELOAD] Attempt {attempt}/{max_retries} failed: {e}", file=sys.stderr)
2209
+
2210
+ if attempt < max_retries:
2211
+ # Exponential backoff: base_delay * 2^(attempt-1)
2212
+ # e.g., with 1000ms base: 1s, 2s, 4s
2213
+ delay_seconds = (base_delay_ms / 1000.0) * (2 ** (attempt - 1))
2214
+ print(f"[MODEL-RELOAD] Retrying in {delay_seconds:.1f}s...", file=sys.stderr)
2215
+ time.sleep(delay_seconds)
2216
+
2217
+ # All retries exhausted
2218
+ self._model_healthy = False
2219
+ error_msg = f"Model reload failed after {max_retries} attempts. Last error: {last_error}"
2220
+ print(f"[MODEL-RELOAD] FATAL: {error_msg}", file=sys.stderr)
2221
+ raise RuntimeError(error_msg)
2182
2222
 
2183
2223
  def _query_database_dimension(self) -> int:
2184
2224
  """
@@ -2503,7 +2543,8 @@ class FrankensteinEmbeddings:
2503
2543
  'ram_usage_mb': round(self.ram_guard.get_ram_usage_mb(), 1),
2504
2544
  'ram_limit_mb': self.ram_guard.MAX_RAM_MB,
2505
2545
  'throttling_enabled': self.enable_throttling,
2506
- 'model_loaded': self.model is not None
2546
+ 'model_loaded': self.model is not None,
2547
+ 'model_healthy': getattr(self, '_model_healthy', True)
2507
2548
  }
2508
2549
 
2509
2550
  # Add low-resource optimization stats
@@ -2564,11 +2605,17 @@ class EmbeddingServer:
2564
2605
  self.shutdown_requested = False
2565
2606
 
2566
2607
  # KYS (Keep Yourself Safe) watchdog - two-way health check
2567
- # If MCP server doesn't send "kys" heartbeat within 90 seconds, we suicide
2608
+ # If MCP server doesn't send "kys" heartbeat within timeout, take action
2568
2609
  # This prevents orphan embedding servers when MCP crashes
2569
- # Grace period increased to handle startup delays and heavy operations
2610
+ # Timeout and mode are configurable via environment variables
2570
2611
  self.last_kys_time = time.time()
2571
- self.kys_timeout = 90 # 25 seconds from MCP + 65 second grace period
2612
+ self.kys_timeout = int(os.environ.get('SPECMEM_KYS_TIMEOUT_SECONDS', '600'))
2613
+ # KYS mode: "kill" = process exit (old behavior), "unload" = release model but keep socket,
2614
+ # "standby" = keep everything loaded and just idle
2615
+ self.kys_mode = os.environ.get('SPECMEM_KYS_MODE', 'unload').lower()
2616
+ if self.kys_mode not in ('kill', 'unload', 'standby'):
2617
+ print(f"[KYS] Invalid SPECMEM_KYS_MODE '{self.kys_mode}', defaulting to 'unload'", file=sys.stderr)
2618
+ self.kys_mode = 'unload'
2572
2619
 
2573
2620
  # QQMS v2 - enhanced queue with FIFO + ACK (takes precedence if provided)
2574
2621
  self.qqms_v2 = qqms_v2
@@ -2800,10 +2847,15 @@ class EmbeddingServer:
2800
2847
  KYS (Keep Yourself Safe) Watchdog - Two-way health check system.
2801
2848
 
2802
2849
  The MCP server sends {"type": "kys", "text": "kurt cobain t minus 25"} every 25 seconds.
2803
- If we don't receive this heartbeat within 30 seconds (25 + 5 grace), we commit suicide.
2850
+ If we don't receive this heartbeat within the configured timeout, we take action.
2804
2851
  This prevents orphan embedding servers when MCP crashes or is killed.
2805
2852
 
2806
2853
  Without this, crashed MCP leaves zombie embedding servers consuming RAM/CPU forever.
2854
+
2855
+ Modes (SPECMEM_KYS_MODE):
2856
+ - "kill": Process exit (original behavior)
2857
+ - "unload": Release ONNX model from memory but keep socket listener alive (default)
2858
+ - "standby": Keep everything loaded, just idle
2807
2859
  """
2808
2860
  def is_claude_alive_for_project():
2809
2861
  """Check if any Claude/node process is running for this project directory."""
@@ -2827,6 +2879,28 @@ class EmbeddingServer:
2827
2879
  except Exception:
2828
2880
  return False # Assume dead if we can't check
2829
2881
 
2882
+ def _kys_unload_model():
2883
+ """Unload the model to free RAM but keep the socket listener alive.
2884
+ On next request, _ensure_model_loaded() will reload it."""
2885
+ try:
2886
+ if hasattr(self.embedder, 'model') and self.embedder.model is not None:
2887
+ del self.embedder.model
2888
+ self.embedder.model = None
2889
+ import gc
2890
+ gc.collect()
2891
+ try:
2892
+ import torch
2893
+ if torch.cuda.is_available():
2894
+ torch.cuda.empty_cache()
2895
+ except Exception:
2896
+ pass
2897
+ print(f"[KYS-UNLOAD] Model released from memory. Socket still listening.", file=sys.stderr)
2898
+ print(f"[KYS-UNLOAD] Model will reload on next embedding request.", file=sys.stderr)
2899
+ else:
2900
+ print(f"[KYS-UNLOAD] Model already unloaded, nothing to do.", file=sys.stderr)
2901
+ except Exception as e:
2902
+ print(f"[KYS-UNLOAD] Error unloading model: {e}", file=sys.stderr)
2903
+
2830
2904
  def watchdog():
2831
2905
  # STARTUP GRACE PERIOD: Don't enforce KYS for first 60 seconds
2832
2906
  # This allows MCP server to fully initialize (can take 50-60+ seconds)
@@ -2849,49 +2923,79 @@ class EmbeddingServer:
2849
2923
  if is_claude_alive_for_project():
2850
2924
  # Claude is alive! Don't kill even without heartbeat
2851
2925
  if time_since_kys > self.kys_timeout and int(time_since_kys) % 120 < 10:
2852
- print(f"ℹ️ KYS: No heartbeat for {time_since_kys:.0f}s but Claude process detected - staying alive", file=sys.stderr)
2926
+ print(f"[KYS] No heartbeat for {time_since_kys:.0f}s but Claude process detected - staying alive", file=sys.stderr)
2853
2927
  continue
2854
2928
 
2855
- # NEW LOGIC: Only kill if BOTH conditions are true:
2856
- # 1. No heartbeat for kys_timeout (90s)
2929
+ # Only take action if BOTH conditions are true:
2930
+ # 1. No heartbeat for kys_timeout
2857
2931
  # 2. No embedding activity for activity_grace_period (5 min)
2858
- # This prevents killing active servers just because heartbeat stopped
2932
+ # This prevents acting on active servers just because heartbeat stopped
2859
2933
  if time_since_kys > self.kys_timeout:
2860
2934
  if time_since_activity < activity_grace_period:
2861
- # Recent activity - don't kill, just warn once per minute
2935
+ # Recent activity - don't act, just warn once per minute
2862
2936
  if int(time_since_kys) % 60 < 10:
2863
- print(f"⚠️ KYS: No heartbeat for {time_since_kys:.0f}s but recent activity ({time_since_activity:.0f}s ago) - staying alive", file=sys.stderr)
2937
+ print(f"[KYS] No heartbeat for {time_since_kys:.0f}s but recent activity ({time_since_activity:.0f}s ago) - staying alive", file=sys.stderr)
2864
2938
  continue
2865
2939
 
2866
- print(f"", file=sys.stderr)
2867
- print(f"💀 KYS WATCHDOG TRIGGERED", file=sys.stderr)
2868
- print(f" No heartbeat from MCP in {time_since_kys:.0f}s (timeout: {self.kys_timeout}s)", file=sys.stderr)
2869
- print(f" No embedding activity for {time_since_activity:.0f}s (grace: {activity_grace_period}s)", file=sys.stderr)
2870
- print(f" MCP server likely crashed - committing suicide to prevent zombie", file=sys.stderr)
2871
- print(f" 'kurt cobain t minus 0'", file=sys.stderr)
2872
- print(f"", file=sys.stderr)
2940
+ # --- KYS MODE DISPATCH ---
2941
+ if self.kys_mode == 'standby':
2942
+ # STANDBY MODE: Keep everything loaded, just log and continue
2943
+ if int(time_since_kys) % 120 < 10:
2944
+ print(f"[KYS-STANDBY] No heartbeat for {time_since_kys:.0f}s, no activity for {time_since_activity:.0f}s - idling in standby mode", file=sys.stderr)
2945
+ continue
2873
2946
 
2874
- # Write death reason file so clients know to auto-respawn
2875
- try:
2876
- death_reason_path = os.path.join(os.path.dirname(self.socket_path), 'embedding-death-reason.txt')
2877
- with open(death_reason_path, 'w') as f:
2878
- f.write(f"kys\n{time.time()}\nNo heartbeat ({time_since_kys:.0f}s) AND no activity ({time_since_activity:.0f}s)")
2879
- print(f" 📝 Death reason written to {death_reason_path}", file=sys.stderr)
2880
- except Exception as e:
2881
- print(f" ⚠️ Failed to write death reason: {e}", file=sys.stderr)
2947
+ elif self.kys_mode == 'unload':
2948
+ # UNLOAD MODE: Release model from memory but keep socket alive
2949
+ # Only unload once - check if model is still loaded
2950
+ if hasattr(self.embedder, 'model') and self.embedder.model is not None:
2951
+ print(f"", file=sys.stderr)
2952
+ print(f"[KYS-UNLOAD] WATCHDOG TRIGGERED (mode=unload)", file=sys.stderr)
2953
+ print(f" No heartbeat from MCP in {time_since_kys:.0f}s (timeout: {self.kys_timeout}s)", file=sys.stderr)
2954
+ print(f" No embedding activity for {time_since_activity:.0f}s (grace: {activity_grace_period}s)", file=sys.stderr)
2955
+ print(f" Unloading model to free RAM - socket stays alive for reconnection", file=sys.stderr)
2956
+ _kys_unload_model()
2957
+
2958
+ # Write status file so clients know state
2959
+ try:
2960
+ death_reason_path = os.path.join(os.path.dirname(self.socket_path), 'embedding-death-reason.txt')
2961
+ with open(death_reason_path, 'w') as f:
2962
+ f.write(f"kys-unload\n{time.time()}\nModel unloaded after no heartbeat ({time_since_kys:.0f}s) AND no activity ({time_since_activity:.0f}s). Socket still alive.")
2963
+ except Exception as e:
2964
+ print(f" [KYS-UNLOAD] Failed to write status file: {e}", file=sys.stderr)
2965
+ # Don't exit - keep looping. Model will reload on next request.
2966
+ continue
2882
2967
 
2883
- # Set shutdown flag and force exit
2884
- self.shutdown_requested = True
2968
+ else:
2969
+ # KILL MODE (original behavior): Process exit
2970
+ print(f"", file=sys.stderr)
2971
+ print(f"[KYS-KILL] WATCHDOG TRIGGERED (mode=kill)", file=sys.stderr)
2972
+ print(f" No heartbeat from MCP in {time_since_kys:.0f}s (timeout: {self.kys_timeout}s)", file=sys.stderr)
2973
+ print(f" No embedding activity for {time_since_activity:.0f}s (grace: {activity_grace_period}s)", file=sys.stderr)
2974
+ print(f" MCP server likely crashed - committing suicide to prevent zombie", file=sys.stderr)
2975
+ print(f" 'kurt cobain t minus 0'", file=sys.stderr)
2976
+ print(f"", file=sys.stderr)
2977
+
2978
+ # Write death reason file so clients know to auto-respawn
2979
+ try:
2980
+ death_reason_path = os.path.join(os.path.dirname(self.socket_path), 'embedding-death-reason.txt')
2981
+ with open(death_reason_path, 'w') as f:
2982
+ f.write(f"kys\n{time.time()}\nNo heartbeat ({time_since_kys:.0f}s) AND no activity ({time_since_activity:.0f}s)")
2983
+ print(f" Death reason written to {death_reason_path}", file=sys.stderr)
2984
+ except Exception as e:
2985
+ print(f" Failed to write death reason: {e}", file=sys.stderr)
2986
+
2987
+ # Set shutdown flag and force exit
2988
+ self.shutdown_requested = True
2885
2989
 
2886
- # Give a moment for cleanup
2887
- time.sleep(1)
2990
+ # Give a moment for cleanup
2991
+ time.sleep(1)
2888
2992
 
2889
- # Force exit - os._exit bypasses finally blocks for immediate death
2890
- os._exit(0)
2993
+ # Force exit - os._exit bypasses finally blocks for immediate death
2994
+ os._exit(0)
2891
2995
 
2892
2996
  thread = threading.Thread(target=watchdog, daemon=True)
2893
2997
  thread.start()
2894
- print(f" 🛡️ KYS Watchdog: ENABLED (suicide if no heartbeat in {self.kys_timeout}s)", file=sys.stderr)
2998
+ print(f" KYS Watchdog: ENABLED (mode={self.kys_mode}, timeout={self.kys_timeout}s)", file=sys.stderr)
2895
2999
 
2896
3000
  def _process_codebase_files(self, batch_size: int = 200, limit: int = 0, project_path: str = None) -> Dict:
2897
3001
  """
@@ -3277,11 +3381,13 @@ class EmbeddingServer:
3277
3381
  # Fast readiness check - just returns model loading state
3278
3382
  # Used by specmem-init for event-based startup instead of timeouts
3279
3383
  model_loaded = self.embedder.model is not None
3384
+ model_healthy = getattr(self.embedder, '_model_healthy', True)
3280
3385
  return {
3281
- 'ready': model_loaded,
3386
+ 'ready': model_loaded and model_healthy,
3282
3387
  'model_loaded': model_loaded,
3388
+ 'model_healthy': model_healthy,
3283
3389
  'lazy_loading': self.embedder.low_resource_config.lazy_loading,
3284
- 'status': 'ready' if model_loaded else 'loading'
3390
+ 'status': 'ready' if (model_loaded and model_healthy) else ('error' if not model_healthy else 'loading')
3285
3391
  }
3286
3392
  elif req_type == 'kys':
3287
3393
  # KYS (Keep Yourself Safe) heartbeat from MCP server
@@ -3292,6 +3398,9 @@ class EmbeddingServer:
3292
3398
  'status': 'alive',
3293
3399
  'ack': 'kurt cobain t minus reset',
3294
3400
  'timeout_remaining': self.kys_timeout,
3401
+ 'kys_mode': self.kys_mode,
3402
+ 'model_loaded': self.embedder.model is not None,
3403
+ 'model_healthy': getattr(self.embedder, '_model_healthy', True),
3295
3404
  'project': PROJECT_DIR_NAME
3296
3405
  }
3297
3406
  elif req_type == 'get_dimension':
@@ -3323,10 +3432,12 @@ class EmbeddingServer:
3323
3432
  # Stats request (or health check)
3324
3433
  if request.get('stats'):
3325
3434
  model_loaded = self.embedder.model is not None
3435
+ model_healthy = getattr(self.embedder, '_model_healthy', True)
3326
3436
  stats_response = {
3327
- 'status': 'healthy', # For health check compatibility
3328
- 'ready': model_loaded, # For event-based startup polling
3329
- 'model_loaded': model_loaded, # Explicit model loading state
3437
+ 'status': 'healthy' if model_healthy else 'degraded',
3438
+ 'ready': model_loaded and model_healthy,
3439
+ 'model_loaded': model_loaded,
3440
+ 'model_healthy': model_healthy,
3330
3441
  'stats': self.embedder.get_stats(),
3331
3442
  'model': 'frankenstein-v5-dynamic',
3332
3443
  'project': PROJECT_DIR_NAME,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "specmem-hardwicksoftware",
3
- "version": "3.5.99",
3
+ "version": "3.6.1",
4
4
  "type": "module",
5
5
  "description": "Persistent memory system for coding sessions - semantic search with pgvector, token compression, team coordination, file watching. Needs root: installs system-wide hooks, manages docker/PostgreSQL, writes global configs, handles screen sessions. justcalljon.pro",
6
6
  "main": "dist/index.js",