lollms-client 1.3.4__py3-none-any.whl → 1.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of lollms-client might be problematic.

@@ -8,6 +8,7 @@ import subprocess
  import sys
  import threading
  import time
+ import tempfile
  from pathlib import Path
  from typing import Optional, Callable, List, Union, Dict, Any, Set
  import base64
@@ -20,29 +21,62 @@ from ascii_colors import ASCIIColors, trace_exception
  import pipmaster as pm
  import platform

- # Ensure llama-cpp-binaries and requests are installed
- pm.ensure_packages(["requests", "pillow"]) # pillow for dummy image in test
+ # --- Multi-process locking for registry ---
+ # On Windows, we need msvcrt, on POSIX, fcntl
+ try:
+     if platform.system() == "Windows":
+         import msvcrt
+     else:
+         import fcntl
+ except ImportError:
+     # This might happen in some restricted environments.
+     # The binding will fall back to thread-safety only.
+     msvcrt = fcntl = None
+
+
+ class FileLock:
+     def __init__(self, lock_file_path):
+         self.lock_file_path = lock_file_path
+         self.lock_file = None
+         self._is_windows = platform.system() == "Windows"
+
+     def __enter__(self):
+         self.lock_file = open(self.lock_file_path, 'w')
+         if self._is_windows and msvcrt:
+             msvcrt.locking(self.lock_file.fileno(), msvcrt.LK_LOCK, 1)
+         elif not self._is_windows and fcntl:
+             fcntl.flock(self.lock_file.fileno(), fcntl.LOCK_EX)
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         if self.lock_file:
+             if self._is_windows and msvcrt:
+                 self.lock_file.seek(0)
+                 msvcrt.locking(self.lock_file.fileno(), msvcrt.LK_UNLCK, 1)
+             elif not self._is_windows and fcntl:
+                 fcntl.flock(self.lock_file.fileno(), fcntl.LOCK_UN)
+             self.lock_file.close()
+             self.lock_file = None
+
+ # --- End multi-process locking ---
+
+
+ # Ensure llama-cpp-binaries, requests, pillow, and psutil are installed
+ pm.ensure_packages(["requests", "pillow", "psutil"]) # pillow for dummy image in test, psutil for multi-process management
  if not pm.is_installed("llama-cpp-binaries"):
      def install_llama_cpp():
          system = platform.system()
          python_version_simple = f"py{sys.version_info.major}" # e.g. py310 for 3.10

-         # Determine CUDA suffix based on common recent versions. Adjust if needed.
-         # For simplicity, we'll target a common recent CUDA version.
-         # Users with specific needs might need to install manually.
-         # As of late 2023/early 2024, cu121 or cu118 are common.
-         # The oobabooga binaries often use +cu124 for recent builds. Let's try that.
          cuda_suffix = "+cu124"


          if system == "Windows":
-             # llama_cpp_binaries-0.14.0+cu124-py3-none-win_amd64.whl
-             url = f"https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.14.0{cuda_suffix}-{python_version_simple}-none-win_amd64.whl"
-             fallback_url = "https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-win_amd64.whl" # Generic py3
+             url = f"https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0{cuda_suffix}-{python_version_simple}-none-win_amd64.whl"
+             fallback_url = "https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cu124-py3-none-win_amd64.whl" # Generic py3
          elif system == "Linux":
-             # llama_cpp_binaries-0.14.0+cu124-py3-none-linux_x86_64.whl
-             url = f"https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0{cuda_suffix}-{python_version_simple}-none-linux_x86_64.whl"
-             fallback_url = "https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-linux_x86_64.whl" # Generic py3
+             url = f"https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0{cuda_suffix}-{python_version_simple}-none-linux_x86_64.whl"
+             fallback_url = "https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cu124-py3-none-linux_x86_64.whl" # Generic py3
          else:
              ASCIIColors.warning(f"Unsupported OS for prebuilt llama-cpp-binaries: {system}. Please install manually.")
              return
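For readers skimming the diff: the new FileLock above is a small cross-platform advisory lock (msvcrt on Windows, fcntl elsewhere) used to serialize access to a shared registry file. A minimal usage sketch, not part of the package; the directory and file names mirror the ServerRegistry class introduced further down in this diff:

    import json
    import tempfile
    from pathlib import Path

    registry_dir = Path(tempfile.gettempdir()) / "lollms_llamacpp_servers"
    registry_dir.mkdir(parents=True, exist_ok=True)

    with FileLock(registry_dir / "registry.lock"):
        # While the lock is held, no other process using the same lock file can
        # interleave its own update between this read and this write.
        registry_file = registry_dir / "registry.json"
        data = json.loads(registry_file.read_text()) if registry_file.exists() else {}
        data.setdefault("example_key", {})  # hypothetical entry, for illustration only
        registry_file.write_text(json.dumps(data, indent=2))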
@@ -63,11 +97,12 @@ if not pm.is_installed("llama-cpp-binaries"):

  try:
      import llama_cpp_binaries
+     import psutil
  except ImportError:
-     ASCIIColors.error("llama-cpp-binaries package not found. Please install it.")
-     ASCIIColors.error("You can try: pip install llama-cpp-python[server] (for server support)")
-     ASCIIColors.error("Or download a wheel from: https://github.com/oobabooga/llama-cpp-binaries/releases or https://pypi.org/project/llama-cpp-python/#files")
+     ASCIIColors.error("llama-cpp-binaries or psutil package not found. Please ensure they are installed.")
+     ASCIIColors.error("You can try: pip install llama-cpp-python[server] psutil")
      llama_cpp_binaries = None
+     psutil = None


  # --- Predefined patterns ---
@@ -112,10 +147,130 @@ def get_gguf_model_base_name(file_path_or_name: Union[str, Path]) -> str:
      while name_part and name_part[-1] in ['.', '-', '_']: name_part = name_part[:-1]
      return name_part

- # --- Global Server Registry ---
- _active_servers: Dict[tuple, 'LlamaCppServerProcess'] = {}
- _server_ref_counts: Dict[tuple, int] = {}
- _server_registry_lock = threading.Lock()
+ # --- Global Server Registry (File-based for multi-process support) ---
+
+ class ServerRegistry:
+     def __init__(self):
+         self.registry_dir = Path(tempfile.gettempdir()) / "lollms_llamacpp_servers"
+         self.registry_dir.mkdir(parents=True, exist_ok=True)
+         self.registry_file = self.registry_dir / "registry.json"
+         self.lock_file = self.registry_dir / "registry.lock"
+         self.my_pid = os.getpid()
+
+     def _is_pid_running(self, pid: int) -> bool:
+         if psutil is None: return True # Conservative default if psutil is missing
+         return psutil.pid_exists(pid)
+
+     def _read_registry(self) -> Dict[str, Any]:
+         if not self.registry_file.exists():
+             return {}
+         try:
+             with open(self.registry_file, 'r') as f:
+                 return json.load(f)
+         except (json.JSONDecodeError, FileNotFoundError):
+             return {}
+
+     def _write_registry(self, data: Dict[str, Any]):
+         with open(self.registry_file, 'w') as f:
+             json.dump(data, f, indent=2)
+
+     def _clean_stale_entries(self, registry_data: Dict[str, Any]) -> bool:
+         """Cleans stale servers and clients. Returns True if changes were made."""
+         changed = False
+         # Clean dead servers
+         dead_servers = [k for k, v in registry_data.items() if not self._is_pid_running(v['pid'])]
+         for key in dead_servers:
+             ASCIIColors.warning(f"Registry Cleaner: Found dead server process (PID: {registry_data[key]['pid']}). Removing entry {key}.")
+             del registry_data[key]
+             changed = True
+
+         # Clean dead clients from living servers
+         for key, server_info in list(registry_data.items()):
+             dead_clients = [pid for pid in server_info.get('client_pids', []) if not self._is_pid_running(pid)]
+             if dead_clients:
+                 ASCIIColors.warning(f"Registry Cleaner: Found dead client PIDs {dead_clients} for server {key}. Cleaning up.")
+                 server_info['client_pids'] = [pid for pid in server_info['client_pids'] if pid not in dead_clients]
+                 server_info['ref_count'] = len(server_info['client_pids'])
+                 changed = True
+
+             # If a server has no clients left after cleanup, it's an orphan. Remove it.
+             if server_info['ref_count'] <= 0:
+                 ASCIIColors.warning(f"Registry Cleaner: Server {key} (PID: {server_info['pid']}) has no clients left. Shutting it down.")
+                 try:
+                     p = psutil.Process(server_info['pid'])
+                     p.terminate()
+                     p.wait(timeout=5)
+                 except psutil.NoSuchProcess: pass
+                 except Exception as e: ASCIIColors.error(f"Error terminating orphaned server PID {server_info['pid']}: {e}")
+                 del registry_data[key]
+                 changed = True
+
+         return changed
+
+     def get_server(self, server_key: str) -> Optional[Dict[str, Any]]:
+         with FileLock(self.lock_file):
+             registry = self._read_registry()
+             self._clean_stale_entries(registry) # Always clean before read
+             server_info = registry.get(server_key)
+             if server_info:
+                 self._write_registry(registry) # Write back changes from cleaning
+             return server_info
+
+     def register_new_server(self, server_key: str, pid: int, port: int):
+         with FileLock(self.lock_file):
+             registry = self._read_registry()
+             # Clean just in case something happened between server start and registration
+             self._clean_stale_entries(registry)
+
+             registry[server_key] = {
+                 "pid": pid, "port": port,
+                 "ref_count": 1, "client_pids": [self.my_pid]
+             }
+             self._write_registry(registry)
+             ASCIIColors.info(f"Process {self.my_pid} registered new server {server_key} (PID: {pid}, Port: {port})")
+
+     def increment_ref_count(self, server_key: str):
+         with FileLock(self.lock_file):
+             registry = self._read_registry()
+             self._clean_stale_entries(registry)
+
+             server_info = registry.get(server_key)
+             if server_info:
+                 if self.my_pid not in server_info['client_pids']:
+                     server_info['client_pids'].append(self.my_pid)
+                 server_info['ref_count'] = len(server_info['client_pids'])
+                 self._write_registry(registry)
+                 ASCIIColors.info(f"Process {self.my_pid} attached to server {server_key}. New ref_count: {server_info['ref_count']}")
+             else:
+                 ASCIIColors.warning(f"Process {self.my_pid} tried to attach to non-existent server {server_key}.")
+
+     def decrement_ref_count(self, server_key: str):
+         with FileLock(self.lock_file):
+             registry = self._read_registry()
+             made_changes = self._clean_stale_entries(registry)
+
+             server_info = registry.get(server_key)
+             if server_info:
+                 if self.my_pid in server_info['client_pids']:
+                     server_info['client_pids'].remove(self.my_pid)
+                     server_info['ref_count'] = len(server_info['client_pids'])
+                     made_changes = True
+                     ASCIIColors.info(f"Process {self.my_pid} detached from server {server_key}. New ref_count: {server_info['ref_count']}")
+
+                 if server_info['ref_count'] <= 0:
+                     ASCIIColors.info(f"Last client (PID: {self.my_pid}) detached. Shutting down server {server_key} (PID: {server_info['pid']}).")
+                     try:
+                         p = psutil.Process(server_info['pid'])
+                         p.terminate()
+                         p.wait(timeout=10)
+                     except psutil.NoSuchProcess:
+                         ASCIIColors.warning(f"Server process {server_info['pid']} was already gone.")
+                     except Exception as e:
+                         ASCIIColors.error(f"Error terminating server process {server_info['pid']}: {e}")
+                     del registry[server_key]
+
+             if made_changes:
+                 self._write_registry(registry)

  BindingName = "LlamaCppServerBinding"
  DEFAULT_LLAMACPP_SERVER_HOST = "127.0.0.1"
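For orientation, the registry maintained by the class above is a plain JSON file keyed by a "model_path|clip_path" string (see the new_server_key construction in load_model later in this diff). A hypothetical snapshot, written here as a Python literal, with two client processes attached to one shared server (paths, PIDs and port are invented):

    {
        "/path/to/models/model.gguf|None": {
            "pid": 41237,                    # PID of the shared llama.cpp server process
            "port": 9641,                    # port that server listens on
            "ref_count": 2,                  # kept equal to len(client_pids)
            "client_pids": [50112, 50298],   # PIDs of the attached client processes
        }
    }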
@@ -125,9 +280,12 @@ class LlamaCppServerProcess:
                   model_path: Union[str, Path],
                   clip_model_path: Optional[Union[str, Path]] = None,
                   server_binary_path: Optional[Union[str, Path]]=None,
-                  server_args: Dict[str, Any]={}
+                  server_args: Dict[str, Any]={},
+                  process_pid: Optional[int]=None, # PID if we are attaching to existing process
+                  port: Optional[int]=None,
                   ):
-         """Initialize the Llama.cpp server process.
+         """Initialize the Llama.cpp server process wrapper.
+         Can either start a new process or wrap an existing one.
          """
          self.model_path = Path(model_path)
          self.clip_model_path = Path(clip_model_path) if clip_model_path else None
@@ -139,12 +297,14 @@ class LlamaCppServerProcess:
          else:
              raise FileNotFoundError("llama_cpp_binaries not found and no server_binary_path provided.")

-         self.port: Optional[int] = None # Set by start() method
+         self.port: Optional[int] = port
+         self.pid: Optional[int] = process_pid
          self.server_args = server_args
-         self.process: Optional[subprocess.Popen] = None
+         # The actual subprocess.Popen object. Will be None if this instance is just a client to a server started by another process.
+         self.process: Optional[subprocess.Popen] = None
          self.session = requests.Session()
          self.host = self.server_args.get("host",DEFAULT_LLAMACPP_SERVER_HOST)
-         self.base_url: Optional[str] = None # Set by start() method
+         self.base_url: Optional[str] = f"http://{self.host}:{self.port}" if self.port else None
          self.is_healthy = False
          self._stderr_lines: List[str] = []
          self._stderr_thread: Optional[threading.Thread] = None
@@ -156,6 +316,23 @@ class LlamaCppServerProcess:
          if not self.server_binary_path.exists():
              raise FileNotFoundError(f"Llama.cpp server binary not found: {self.server_binary_path}")

+     def attach(self):
+         """Attaches to an already running process by checking its health."""
+         if not self.pid or not self.port:
+             raise ValueError("Cannot attach without PID and port.")
+         self.base_url = f"http://{self.host}:{self.port}"
+         health_url = f"{self.base_url}/health"
+         try:
+             response = self.session.get(health_url, timeout=5)
+             if response.status_code == 200 and response.json().get("status") == "ok":
+                 self.is_healthy = True
+                 ASCIIColors.green(f"Successfully attached to Llama.cpp server on port {self.port} (PID: {self.pid}).")
+                 return
+         except requests.exceptions.RequestException as e:
+             ASCIIColors.warning(f"Failed to attach to server on port {self.port}: {e}")
+         self.is_healthy = False
+         raise ConnectionError(f"Could not connect to existing server at {health_url}")
+
      def _filter_stderr(self, stderr_pipe):
          try:
              for line in iter(stderr_pipe.readline, ''):
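What attach() boils down to is a single probe of the /health endpoint exposed by the llama.cpp server; anything other than a 200 response with {"status": "ok"} is treated as a failed attach. A standalone sketch of the same check (host, port and timeout are illustrative values):

    import requests

    try:
        resp = requests.get("http://127.0.0.1:9641/health", timeout=5)
        healthy = resp.status_code == 200 and resp.json().get("status") == "ok"
    except requests.exceptions.RequestException:
        healthy = False  # the caller then falls back to starting a fresh server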
@@ -216,6 +393,7 @@ class LlamaCppServerProcess:

          try:
              self.process = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE, text=True, bufsize=1, env=env)
+             self.pid = self.process.pid
          except Exception as e:
              ASCIIColors.error(f"Failed to start llama.cpp server process on port {self.port}: {e}"); trace_exception(e); raise

@@ -234,7 +412,7 @@ class LlamaCppServerProcess:
                  response = self.session.get(health_url, timeout=2)
                  if response.status_code == 200 and response.json().get("status") == "ok":
                      self.is_healthy = True
-                     ASCIIColors.green(f"Llama.cpp server started successfully on port {self.port}.")
+                     ASCIIColors.green(f"Llama.cpp server started successfully on port {self.port} (PID: {self.pid}).")
                      return
              except requests.exceptions.ConnectionError: time.sleep(1)
              except Exception as e: ASCIIColors.warning(f"Health check for port {self.port} failed: {e}"); time.sleep(1)
@@ -245,12 +423,13 @@ class LlamaCppServerProcess:
          raise TimeoutError(f"Llama.cpp server failed to become healthy on port {self.port} within {max_wait_time}s. Stderr:\n{stderr_output}")

      def shutdown(self):
+         """ This method only shuts down a server if this instance owns the Popen object.
+         The actual termination for multi-process is handled by the ServerRegistry. """
          self.is_healthy = False
          if self.process:
-             ASCIIColors.info(f"Shutting down Llama.cpp server (PID: {self.process.pid} on port {self.port})...")
+             ASCIIColors.info(f"Shutting down owned Llama.cpp server process (PID: {self.process.pid} on port {self.port})...")
              try:
-                 if os.name == 'nt': self.process.terminate()
-                 else: self.process.terminate()
+                 self.process.terminate()
                  self.process.wait(timeout=10)
              except subprocess.TimeoutExpired:
                  ASCIIColors.warning(f"Llama.cpp server (port {self.port}) did not terminate gracefully, killing...")
@@ -269,45 +448,31 @@ class LlamaCppServerBinding(LollmsLLMBinding):
          "n_gpu_layers": 0, "n_ctx": 128000, "n_batch": 512,
          "embedding": False, "verbose": False, "server_startup_timeout": 120,
          "parallel_slots": 4, # Default parallel slots for server
+         "stop_sequences": ["<|im_start|>"], # Default stop sequences
      }

-     def __init__(self,
-                  **kwargs
-                  ):
-         """Initialize the Llama.cpp server binding.
-         Args:
-             model_name (str): Name of the model to load. If None, will use initial_model_name_preference.
-             models_path (str): Path to the directory containing model files.
-             clip_model_name (str): Optional name of the clip model to use. If None, will try to auto-detect based on the main model.
-             config (dict): Additional configuration options for the server.
-             default_completion_format (ELF_COMPLETION_FORMAT): Default format for completions.
-
-         """
+     def __init__(self, **kwargs):
          super().__init__(BindingName, **kwargs)
-         if llama_cpp_binaries is None: raise ImportError("llama-cpp-binaries package is required but not found.")
+         if llama_cpp_binaries is None or psutil is None:
+             raise ImportError("llama-cpp-binaries and psutil packages are required.")

+         self.registry = ServerRegistry()
          models_path = kwargs.get("models_path", Path(__file__).parent/"models")
          self.models_path = Path(models_path)
-         # Store initial preferences, but do not load/start server yet.
          self.initial_model_name_preference: Optional[str] = kwargs.get("model_name")
-         self.user_provided_model_name: Optional[str] = kwargs.get("model_name") # Tracks the latest requested model
+         self.user_provided_model_name: Optional[str] = kwargs.get("model_name")
          self.initial_clip_model_name_preference: Optional[str] = kwargs.get("clip_model_name")
-
-         self._model_path_map: Dict[str, Path] = {} # Maps unique name to full Path
-
-         # Initial scan for available models (to populate listModels)
+         self._model_path_map: Dict[str, Path] = {}
          self._scan_models()
-
          self.default_completion_format = kwargs.get("default_completion_format", ELF_COMPLETION_FORMAT.Chat)
          self.server_args = {**self.DEFAULT_SERVER_ARGS, **(kwargs.get("config") or {}), **kwargs}
          self.server_binary_path = self._get_server_binary_path()

-         # Current state of the loaded model and server
          self.current_model_path: Optional[Path] = None
-         self.clip_model_path: Optional[Path] = None # Actual resolved path of loaded clip model
+         self.clip_model_path: Optional[Path] = None
          self.server_process: Optional[LlamaCppServerProcess] = None
          self.port: Optional[int] = None
-         self.server_key: Optional[tuple] = None
+         self.server_key: Optional[str] = None

          ASCIIColors.info("LlamaCppServerBinding initialized. Server will start on-demand with first generation call.")

@@ -324,7 +489,7 @@ class LlamaCppServerBinding(LollmsLLMBinding):
          bin_path = Path(bin_path_str)
          if bin_path.exists() and bin_path.is_file():
              ASCIIColors.info(f"Using binary from llama-cpp-binaries: {bin_path}"); return bin_path
-         raise FileNotFoundError("Llama.cpp server binary not found. Ensure 'llama-cpp-binaries' or 'llama-cpp-python[server]' is installed or provide 'llama_server_binary_path'.")
+         raise FileNotFoundError("Llama.cpp server binary not found.")

      def _resolve_model_path(self, model_name_or_path: str) -> Path:
          """
@@ -332,36 +497,16 @@ class LlamaCppServerBinding(LollmsLLMBinding):
          It prioritizes the internal map, then checks for absolute/relative paths,
          and rescans the models directory as a fallback.
          """
-         # 1. Check if the provided name is a key in our map
          if model_name_or_path in self._model_path_map:
-             resolved_path = self._model_path_map[model_name_or_path]
-             ASCIIColors.info(f"Resolved model name '{model_name_or_path}' to path: {resolved_path}")
-             return resolved_path
-
-         # 2. If not in map, treat it as a potential path (absolute or relative to models_path)
+             return self._model_path_map[model_name_or_path]
          model_p = Path(model_name_or_path)
-         if model_p.is_absolute():
-             if model_p.exists() and model_p.is_file():
-                 return model_p
-
+         if model_p.is_absolute() and model_p.exists(): return model_p
          path_in_models_dir = self.models_path / model_name_or_path
-         if path_in_models_dir.exists() and path_in_models_dir.is_file():
-             ASCIIColors.info(f"Found model at relative path: {path_in_models_dir}")
-             return path_in_models_dir
-
-         # 3. As a fallback, rescan the models directory in case the file was just added
-         ASCIIColors.info("Model not found in cache, rescanning directory...")
+         if path_in_models_dir.exists(): return path_in_models_dir
          self._scan_models()
          if model_name_or_path in self._model_path_map:
-             resolved_path = self._model_path_map[model_name_or_path]
-             ASCIIColors.info(f"Found model '{model_name_or_path}' after rescan: {resolved_path}")
-             return resolved_path
-
-         # Final check for absolute path after rescan
-         if model_p.is_absolute() and model_p.exists() and model_p.is_file():
-             return model_p
-
-         raise FileNotFoundError(f"Model '{model_name_or_path}' not found in the map, as an absolute path, or within '{self.models_path}'.")
+             return self._model_path_map[model_name_or_path]
+         raise FileNotFoundError(f"Model '{model_name_or_path}' not found.")

      def _find_available_port(self) -> int:
          with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
@@ -369,147 +514,105 @@ class LlamaCppServerBinding(LollmsLLMBinding):

      def _release_server_instance(self):
          if self.server_process and self.server_key:
-             with _server_registry_lock:
-                 if self.server_key in _server_ref_counts:
-                     _server_ref_counts[self.server_key] -= 1
-                     ASCIIColors.info(f"Decremented ref count for server {self.server_key}. New count: {_server_ref_counts[self.server_key]}")
-                     if _server_ref_counts[self.server_key] <= 0:
-                         ASCIIColors.info(f"Ref count for server {self.server_key} is zero. Shutting it down.")
-                         server_to_stop = _active_servers.pop(self.server_key, None)
-                         _server_ref_counts.pop(self.server_key, None)
-                         if server_to_stop:
-                             try: server_to_stop.shutdown()
-                             except Exception as e: ASCIIColors.error(f"Error shutting down server {self.server_key}: {e}")
-                 else:
-                     ASCIIColors.warning(f"Server key {self.server_key} not in ref counts during release. Might have been shut down already.")
-                     _active_servers.pop(self.server_key, None) # Ensure removal
-
+             self.registry.decrement_ref_count(self.server_key)
          self.server_process = None
          self.port = None
          self.server_key = None
-         self.current_model_path = None # Also clear this binding's model association
-         self.clip_model_path = None # And clip model association
+         self.current_model_path = None
+         self.clip_model_path = None

      def load_model(self, model_name_or_path: str) -> bool:
-         self.user_provided_model_name = model_name_or_path # Keep track of the selected model name
+         self.user_provided_model_name = model_name_or_path
          try:
              resolved_model_path = self._resolve_model_path(model_name_or_path)
          except Exception as ex:
-             trace_exception(ex)
-             return False
+             trace_exception(ex); return False

-         # Determine the final clip_model_path for this server instance
-         # Priority: 1. Explicit `initial_clip_model_name_preference` from __init__ (if valid path)
-         # 2. Auto-detection based on the resolved main model.
          final_clip_model_path: Optional[Path] = None
          if self.initial_clip_model_name_preference:
              p_clip_pref = Path(self.initial_clip_model_name_preference)
-             if p_clip_pref.is_absolute() and p_clip_pref.exists():
-                 final_clip_model_path = p_clip_pref
-                 ASCIIColors.info(f"Using explicitly configured LLaVA clip model: {final_clip_model_path}")
-             elif (self.models_path / self.initial_clip_model_name_preference).exists():
-                 final_clip_model_path = self.models_path / self.initial_clip_model_name_preference
-                 ASCIIColors.info(f"Using explicitly configured LLaVA clip model: {final_clip_model_path} (relative to models path)")
-             else:
-                 ASCIIColors.warning(f"Specified initial clip_model_name '{self.initial_clip_model_name_preference}' not found. Attempting auto-detection.")
+             if p_clip_pref.is_absolute() and p_clip_pref.exists(): final_clip_model_path = p_clip_pref
+             elif (self.models_path / p_clip_pref).exists(): final_clip_model_path = self.models_path / p_clip_pref
+             else: ASCIIColors.warning(f"Specified clip model '{self.initial_clip_model_name_preference}' not found.")

-         if not final_clip_model_path: # If no explicit path was provided or it was invalid, try auto-detection
+         if not final_clip_model_path:
              base_name = get_gguf_model_base_name(resolved_model_path.stem)
              potential_paths = [
                  resolved_model_path.parent / f"{base_name}.mmproj",
                  resolved_model_path.parent / f"mmproj-{base_name}.gguf",
-                 resolved_model_path.with_suffix(".mmproj"),
-                 self.models_path / f"{base_name}.mmproj", # Check in general models dir too
+                 self.models_path / f"{base_name}.mmproj",
                  self.models_path / f"mmproj-{base_name}.gguf",
              ]
              for p_clip in potential_paths:
-                 if p_clip.exists():
-                     final_clip_model_path = p_clip
-                     ASCIIColors.info(f"Auto-detected LLaVA clip model: {final_clip_model_path}")
-                     break
+                 if p_clip.exists(): final_clip_model_path = p_clip; break

-         final_clip_model_path_str = str(final_clip_model_path) if final_clip_model_path else None
-
-         # Server key based on model and essential server configurations (like clip model)
-         new_server_key = (str(resolved_model_path), final_clip_model_path_str)
+         final_clip_model_path_str = str(final_clip_model_path) if final_clip_model_path else "None"
+         new_server_key = f"{resolved_model_path}|{final_clip_model_path_str}"

-         with _server_registry_lock:
-             # If this binding instance is already using the exact same server, do nothing
-             if self.server_process and self.server_key == new_server_key and self.server_process.is_healthy:
-                 ASCIIColors.info(f"Model '{model_name_or_path}' with clip '{final_clip_model_path_str}' is already loaded and server is healthy on port {self.port}. No change.")
-                 return True
-
-             # If this binding was using a *different* server, release it first
-             if self.server_process and self.server_key != new_server_key:
-                 ASCIIColors.info(f"Switching models. Releasing previous server: {self.server_key}")
-                 self._release_server_instance() # This clears self.server_process, self.port, self.server_key
-
-             # Check if a suitable server already exists in the global registry
-             if new_server_key in _active_servers:
-                 existing_server = _active_servers[new_server_key]
-                 if existing_server.is_healthy:
-                     ASCIIColors.info(f"Reusing existing healthy server for {new_server_key} on port {existing_server.port}.")
-                     self.server_process = existing_server
-                     self.port = existing_server.port
-                     _server_ref_counts[new_server_key] += 1
-                     self.current_model_path = resolved_model_path
-                     self.clip_model_path = final_clip_model_path # Update binding's clip path
-                     self.server_key = new_server_key
-                     return True
-                 else: # Found existing but unhealthy server
-                     ASCIIColors.warning(f"Found unhealthy server for {new_server_key}. Attempting to remove and restart.")
-                     try: existing_server.shutdown()
-                     except Exception as e: ASCIIColors.error(f"Error shutting down unhealthy server {new_server_key}: {e}")
-                     _active_servers.pop(new_server_key, None)
-                     _server_ref_counts.pop(new_server_key, None)
-
-             # No suitable server found or existing was unhealthy: start a new one
-             ASCIIColors.info(f"Starting new server for {new_server_key}.")
-             self.current_model_path = resolved_model_path
-             self.clip_model_path = final_clip_model_path # Update binding's clip path for the new server
-             self.server_key = new_server_key # Set before potential failure to allow cleanup by _release_server_instance
-
-             new_port_for_server = self._find_available_port()
-
-             current_server_args_for_new_server = self.server_args.copy()
-             # Ensure parallel_slots is set; it's crucial for shared servers
-             if "parallel_slots" not in current_server_args_for_new_server or not isinstance(current_server_args_for_new_server["parallel_slots"], int) or current_server_args_for_new_server["parallel_slots"] <=0:
-                 current_server_args_for_new_server["parallel_slots"] = self.DEFAULT_SERVER_ARGS["parallel_slots"]
-
-             ASCIIColors.info(f"New Llama.cpp server: model={self.current_model_path}, clip={self.clip_model_path}, port={new_port_for_server}, slots={current_server_args_for_new_server['parallel_slots']}")
+         if self.server_process and self.server_key == new_server_key and self.server_process.is_healthy:
+             ASCIIColors.info(f"Model '{model_name_or_path}' is already loaded. No change.")
+             return True

+         if self.server_process and self.server_key != new_server_key:
+             self._release_server_instance()
+
+         # Check registry for an existing server
+         existing_server_info = self.registry.get_server(new_server_key)
+         if existing_server_info:
+             ASCIIColors.info(f"Found existing server for {new_server_key} in registry (PID: {existing_server_info['pid']}, Port: {existing_server_info['port']}). Attaching...")
              try:
-                 new_server = LlamaCppServerProcess(
-                     model_path=str(self.current_model_path),
-                     clip_model_path=str(self.clip_model_path) if self.clip_model_path else None,
-                     server_binary_path=str(self.server_binary_path),
-                     server_args=current_server_args_for_new_server,
+                 self.server_process = LlamaCppServerProcess(
+                     model_path=resolved_model_path, clip_model_path=final_clip_model_path,
+                     process_pid=existing_server_info['pid'], port=existing_server_info['port'],
+                     server_args=self.server_args
                  )
-                 new_server.start(port_to_use=new_port_for_server) # Actual server start
-
-                 if new_server.is_healthy:
-                     self.server_process = new_server
-                     self.port = new_port_for_server
-                     _active_servers[self.server_key] = new_server
-                     _server_ref_counts[self.server_key] = 1
-                     ASCIIColors.green(f"New server {self.server_key} started on port {self.port}.")
-                     return True
-                 else: # Should have been caught by new_server.start() raising an error
-                     ASCIIColors.error(f"New server {self.server_key} failed to become healthy (this state should be rare).")
-                     self._release_server_instance() # Clean up registry if something went very wrong
-                     return False
+                 self.server_process.attach() # This verifies health
+                 self.port = self.server_process.port
+                 self.current_model_path = resolved_model_path
+                 self.clip_model_path = final_clip_model_path
+                 self.server_key = new_server_key
+                 self.registry.increment_ref_count(new_server_key)
+                 return True
              except Exception as e:
-                 ASCIIColors.error(f"Failed to load model '{model_name_or_path}' and start server: {e}")
-                 trace_exception(e)
-                 self._release_server_instance() # Ensure cleanup if start failed
+                 ASCIIColors.error(f"Failed to attach to existing server: {e}. It might be stale. Will attempt to start a new one.")
+                 self.registry.decrement_ref_count(new_server_key) # Clean up failed attach
+
+         # Start a new server
+         ASCIIColors.info(f"No existing server found for {new_server_key}. Starting a new one.")
+         self.current_model_path = resolved_model_path
+         self.clip_model_path = final_clip_model_path
+         self.server_key = new_server_key
+
+         try:
+             new_port = self._find_available_port()
+             current_server_args = self.server_args.copy()
+             if "parallel_slots" not in current_server_args or current_server_args["parallel_slots"] <=0:
+                 current_server_args["parallel_slots"] = self.DEFAULT_SERVER_ARGS["parallel_slots"]
+
+             new_server = LlamaCppServerProcess(
+                 model_path=self.current_model_path, clip_model_path=self.clip_model_path,
+                 server_binary_path=self.server_binary_path, server_args=current_server_args
+             )
+             new_server.start(port_to_use=new_port)
+
+             if new_server.is_healthy:
+                 self.server_process = new_server
+                 self.port = new_port
+                 self.registry.register_new_server(self.server_key, new_server.pid, new_port)
+                 ASCIIColors.green(f"New server {self.server_key} started and registered.")
+                 return True
+             else:
                  return False
+         except Exception as e:
+             ASCIIColors.error(f"Failed to start new server for '{model_name_or_path}': {e}"); trace_exception(e)
+             self._release_server_instance()
+             return False

      def unload_model(self):
          if self.server_process:
-             ASCIIColors.info(f"Unloading model for binding. Current server: {self.server_key}, port: {self.port}")
-             self._release_server_instance() # Handles ref counting and actual shutdown if needed
+             self._release_server_instance()
          else:
-             ASCIIColors.info("Unload_model called, but no server process was active for this binding instance.")
+             ASCIIColors.info("Unload called, but no server was active for this binding instance.")

      def _ensure_server_is_running(self) -> bool:
          """
@@ -521,21 +624,18 @@ class LlamaCppServerBinding(LollmsLLMBinding):

          ASCIIColors.info("Server is not running. Attempting to start on-demand...")

-         # Determine which model to load
          model_to_load = self.user_provided_model_name or self.initial_model_name_preference

          if not model_to_load:
-             # No model specified, try to find one automatically
              self._scan_models()
              available_models = self.listModels()
              if not available_models:
                  ASCIIColors.error("No model specified and no GGUF models found in models path.")
                  return False

-             model_to_load = available_models[0]['name'] # Pick the first one
+             model_to_load = available_models[0]['name']
              ASCIIColors.info(f"No model was specified. Automatically selecting the first available model: '{model_to_load}'")

-         # Now, attempt to load the selected model
          if self.load_model(model_to_load):
              return True
          else:
@@ -543,7 +643,6 @@ class LlamaCppServerBinding(LollmsLLMBinding):
              return False

      def _get_request_url(self, endpoint: str) -> str:
-         # This function now assumes _ensure_server_is_running has been called.
          return f"{self.server_process.base_url}{endpoint}"

      def _prepare_generation_payload(self, prompt: str, system_prompt: str = "", n_predict: Optional[int] = None,
@@ -551,10 +650,10 @@ class LlamaCppServerBinding(LollmsLLMBinding):
                                       repeat_penalty: float = 1.1, repeat_last_n: Optional[int] = 64,
                                       seed: Optional[int] = None, stream: bool = False, use_chat_format: bool = True,
                                       images: Optional[List[str]] = None,
-                                      split:Optional[bool]=False, # put to true if the prompt is a discussion
-                                      user_keyword:Optional[str]="!@>user:",
-                                      ai_keyword:Optional[str]="!@>assistant:",
-
+                                      stop_sequences: Optional[List[str]] = None,
+                                      split:Optional[bool]=False,
+                                      user_keyword:Optional[str]="!@>user:",
+                                      ai_keyword:Optional[str]="!@>assistant:",
                                       **extra_params) -> Dict:
          payload_params = {
              "temperature": self.server_args.get("temperature", 0.7), "top_k": self.server_args.get("top_k", 40),
@@ -568,6 +667,15 @@ class LlamaCppServerBinding(LollmsLLMBinding):
          payload_params.update({"temperature": temperature, "top_k": top_k, "top_p": top_p, "repeat_penalty": repeat_penalty, "repeat_last_n": repeat_last_n})
          if n_predict is not None: payload_params['n_predict'] = n_predict
          if seed is not None: payload_params['seed'] = seed
+
+         # --- Handle stop sequences ---
+         all_stop_sequences = set(self.server_args.get("stop_sequences", []))
+         if stop_sequences:
+             all_stop_sequences.update(stop_sequences)
+         if all_stop_sequences:
+             payload_params['stop'] = list(all_stop_sequences)
+         # --- End stop sequences ---
+
          payload_params = {k: v for k, v in payload_params.items() if v is not None}
          payload_params.update(extra_params)
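The stop-sequence handling above is a plain set union of the binding-level default ("stop_sequences" in DEFAULT_SERVER_ARGS or the user config) and the per-call argument. A minimal illustration of the resulting payload field, using the default value from this diff plus a hypothetical per-call list:

    default_stops = ["<|im_start|>"]
    per_call_stops = ["\nUSER:", "<|im_start|>"]

    all_stop_sequences = set(default_stops)
    all_stop_sequences.update(per_call_stops)
    stop_field = list(all_stop_sequences)  # e.g. ["<|im_start|>", "\nUSER:"]; order is not guaranteed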

@@ -579,7 +687,7 @@ class LlamaCppServerBinding(LollmsLLMBinding):
              messages += self.split_discussion(user_content,user_keyword=user_keyword, ai_keyword=ai_keyword)
          else:
              messages.append({"role": "user", "content": user_content})
-         if images and self.clip_model_path: # Use the binding's current clip_model_path
+         if images and self.clip_model_path:
              image_parts = []
              for img_path in images:
                  try:
@@ -594,7 +702,7 @@ class LlamaCppServerBinding(LollmsLLMBinding):
          else:
              full_prompt = f"{system_prompt}\n\nUSER: {prompt}\nASSISTANT:" if system_prompt and system_prompt.strip() else prompt
              final_payload = {"prompt": full_prompt, "stream": stream, **payload_params}
-             if images and self.clip_model_path: # Use binding's clip_model_path
+             if images and self.clip_model_path:
                  image_data_list = []
                  for i, img_path in enumerate(images):
                      try:
@@ -620,6 +728,7 @@ class LlamaCppServerBinding(LollmsLLMBinding):
                         n_threads: Optional[int] = None,
                         ctx_size: int | None = None,
                         streaming_callback: Optional[Callable[[str, MSG_TYPE], None]] = None,
+                        stop_sequences: Optional[List[str]] = None,
                         split:Optional[bool]=False,
                         user_keyword:Optional[str]="!@>user:",
                         ai_keyword:Optional[str]="!@>assistant:",
@@ -639,6 +748,7 @@ class LlamaCppServerBinding(LollmsLLMBinding):
              repeat_last_n=repeat_last_n if repeat_last_n is not None else self.server_args.get("repeat_last_n",64),
              seed=seed if seed is not None else self.server_args.get("seed", -1), stream=stream,
              use_chat_format=_use_chat_format, images=images,
+             stop_sequences=stop_sequences,
              split= split, user_keyword=user_keyword, ai_keyword=ai_keyword, **generation_kwargs
          )
          endpoint = "/v1/chat/completions" if _use_chat_format else "/completion"
@@ -668,7 +778,7 @@ class LlamaCppServerBinding(LollmsLLMBinding):
              else:
                  response_data = response.json()
                  return response_data.get('choices', [{}])[0].get('message', {}).get('content', '') if _use_chat_format \
-                     else response_data.get('content','') # /completion has 'content' at top level for non-stream
+                     else response_data.get('content','')
          except requests.exceptions.RequestException as e:
              error_message = f"Llama.cpp server request error: {e}"
              if e.response is not None:
@@ -694,6 +804,7 @@ class LlamaCppServerBinding(LollmsLLMBinding):
               n_threads: Optional[int] = None,
               ctx_size: Optional[int] = None,
               streaming_callback: Optional[Callable[[str, MSG_TYPE], None]] = None,
+              stop_sequences: Optional[List[str]] = None,
               **generation_kwargs
               ) -> Union[str, dict]:

@@ -706,6 +817,13 @@ class LlamaCppServerBinding(LollmsLLMBinding):
              "top_k": top_k, "top_p": top_p, "repeat_penalty": repeat_penalty,
              "seed": seed, "stream": stream, **generation_kwargs
          }
+
+         all_stop_sequences = set(self.server_args.get("stop_sequences", []))
+         if stop_sequences:
+             all_stop_sequences.update(stop_sequences)
+         if all_stop_sequences:
+             payload['stop'] = list(all_stop_sequences)
+
          payload = {k: v for k, v in payload.items() if v is not None}

          endpoint = "/v1/chat/completions"
@@ -724,18 +842,20 @@ class LlamaCppServerBinding(LollmsLLMBinding):
                          if line_str == '[DONE]': break
                          try:
                              chunk_data = json.loads(line_str)
-                             chunk_content = chunk_data.get('choices', [{}])[0].get('delta', {}).get('content', '')
-                             if chunk_content:
-                                 full_response_text += chunk_content
-                                 if streaming_callback and not streaming_callback(chunk_content, MSG_TYPE.MSG_TYPE_CHUNK):
-                                     ASCIIColors.info("Streaming callback requested stop.")
-                                     response.close()
-                                     break
+                             choices = chunk_data.get('choices', [{}])
+                             if choices and len(choices)>0:
+                                 chunk_content = choices[0].get('delta', {}).get('content', '')
+                                 if chunk_content:
+                                     full_response_text += chunk_content
+                                     if streaming_callback and not streaming_callback(chunk_content, MSG_TYPE.MSG_TYPE_CHUNK):
+                                         ASCIIColors.info("Streaming callback requested stop.")
+                                         response.close()
+                                         break
                          except json.JSONDecodeError:
                              ASCIIColors.warning(f"Failed to decode JSON stream chunk: {line_str}")
                              continue
                  return full_response_text
-             else: # Not streaming
+             else:
                  response_data = response.json()
                  return response_data.get('choices', [{}])[0].get('message', {}).get('content', '')
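The reworked streaming loop above guards against empty choices lists and still honours the callback's return value. A sketch of a compatible callback; the final call is illustrative only:

    # Returning False asks the binding to stop streaming; the loop above then
    # closes the HTTP response and breaks.
    def on_chunk(chunk_text, msg_type):
        print(chunk_text, end="", flush=True)
        return True  # keep streaming

    # e.g. binding.generate_text("Tell me a story", stream=True, streaming_callback=on_chunk)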

@@ -794,7 +914,6 @@ class LlamaCppServerBinding(LollmsLLMBinding):
          return []

      def get_model_info(self) -> dict:
-         # This method reports the current state without triggering a server start
          is_loaded = self.server_process is not None and self.server_process.is_healthy
          info = {
              "name": self.binding_name,
@@ -893,7 +1012,10 @@ class LlamaCppServerBinding(LollmsLLMBinding):
          return None

  if __name__ == '__main__':
-     global full_streamed_text
+     # NOTE: This test block is designed for a single-process scenario to verify basic functionality.
+     # Testing the multi-process capabilities requires a separate script that launches multiple
+     # instances of a test program using this binding. The logic here, however, will now use the
+     # new file-based registry system.
      full_streamed_text = ""
      ASCIIColors.yellow("Testing LlamaCppServerBinding...")

@@ -917,6 +1039,7 @@ if __name__ == '__main__':
      binding_config = {
          "n_gpu_layers": 0, "n_ctx": 512, "embedding": True,
          "verbose": False, "server_startup_timeout": 180, "parallel_slots": 2,
+         "stop_sequences": ["<|user|>", "\nUSER:"], # Example default stop sequences
      }

      active_binding1: Optional[LlamaCppServerBinding] = None
@@ -933,12 +1056,18 @@ if __name__ == '__main__':
          ASCIIColors.info(f"Initial model info: {json.dumps(active_binding1.get_model_info(), indent=2)}")

          prompt_text = "What is the capital of France?"
-         generated_text = active_binding1.generate_text(prompt_text, system_prompt="Concise expert.", n_predict=20, stream=False)
+         generated_text = active_binding1.generate_text(
+             prompt_text,
+             system_prompt="Concise expert.",
+             n_predict=20,
+             stream=False,
+             stop_sequences=["Paris"] # Test per-call stop sequence
+         )

-         if isinstance(generated_text, str) and "Paris" in generated_text:
-             ASCIIColors.green(f"SUCCESS: Auto-start generation successful. Response: {generated_text}")
+         if isinstance(generated_text, str) and "Paris" not in generated_text: # Should stop *before* generating Paris
+             ASCIIColors.green(f"SUCCESS: Auto-start generation with stop sequence successful. Response: '{generated_text}'")
          else:
-             ASCIIColors.error(f"FAILURE: Auto-start generation failed. Response: {generated_text}")
+             ASCIIColors.error(f"FAILURE: Auto-start generation failed or stop sequence ignored. Response: {generated_text}")

          ASCIIColors.info(f"Model info after auto-start: {json.dumps(active_binding1.get_model_info(), indent=2)}")
          if not active_binding1.server_process or not active_binding1.server_process.is_healthy:
@@ -949,7 +1078,6 @@ if __name__ == '__main__':
          active_binding2 = LlamaCppServerBinding(
              model_name=model_name_str, models_path=str(models_path), config=binding_config
          )
-         # This call should reuse the server from binding1
          generated_text_b2 = active_binding2.generate_text("Ping", n_predict=5, stream=False)
          if isinstance(generated_text_b2, str):
              ASCIIColors.green(f"SUCCESS: Binding2 generation successful. Response: {generated_text_b2}")
@@ -966,14 +1094,6 @@ if __name__ == '__main__':
          active_binding1.unload_model()
          ASCIIColors.info("Binding1 unloaded. Ref count should be 1, server still up for binding2.")

-         # The server should still be up because binding2 holds a reference
-         with _server_registry_lock:
-             if not _active_servers:
-                 ASCIIColors.error("FAILURE: Server shut down prematurely while still referenced by binding2.")
-             else:
-                 ASCIIColors.green("SUCCESS: Server correctly remained active for binding2.")
-
-         # This call should re-acquire a reference to the same server for binding1
          generated_text_reloaded = active_binding1.generate_text("Test reload", n_predict=5, stream=False)
          if isinstance(generated_text_reloaded, str):
              ASCIIColors.green(f"SUCCESS: Generation after reload successful. Response: {generated_text_reloaded}")
@@ -1011,17 +1131,18 @@ if __name__ == '__main__':
          ASCIIColors.cyan("\n--- Unloading Models and Stopping Servers ---")
          if active_binding1: active_binding1.unload_model(); ASCIIColors.info("Binding1 unloaded.")
          if active_binding2: active_binding2.unload_model(); ASCIIColors.info("Binding2 unloaded.")
+         # Any other bindings will be cleaned up by __del__ on exit

-         with _server_registry_lock:
-             if _active_servers:
-                 ASCIIColors.warning(f"Warning: {_active_servers.keys()} servers still in registry after tests.")
-                 for key, server_proc in list(_active_servers.items()):
-                     ASCIIColors.info(f"Force shutting down stray server: {key}")
-                     try: server_proc.shutdown()
-                     except Exception as e_shutdown: ASCIIColors.error(f"Error shutting down stray server {key}: {e_shutdown}")
-                     _active_servers.pop(key, None)
-                     _server_ref_counts.pop(key, None)
+         registry = ServerRegistry()
+         with FileLock(registry.lock_file):
+             final_state = registry._read_registry()
+             if not final_state or not any(c for s in final_state.values() for c in s.get('client_pids',[])):
+                 ASCIIColors.green("All servers shut down correctly and registry is empty or has no clients.")
+                 if final_state: registry._write_registry({}) # Clean up for next run
              else:
-                 ASCIIColors.green("All servers shut down correctly.")
+                 ASCIIColors.warning(f"Warning: Registry is not empty after tests: {final_state}")
+                 registry._clean_stale_entries(final_state)
+                 registry._write_registry(final_state)
+                 ASCIIColors.info("Forced a final registry cleanup.")

      ASCIIColors.yellow("\nLlamaCppServerBinding test finished.")
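As the NOTE in the __main__ block says, exercising the multi-process path needs several independent processes. A hypothetical harness, not part of the package (the client script name and its arguments are assumptions):

    # Launch three copies of a small client script that builds the binding, loads
    # the same model and generates once; the file-based registry should result in
    # exactly one shared llama.cpp server across all of them.
    import subprocess
    import sys

    procs = [
        subprocess.Popen([sys.executable, "client_script.py", "--model", "some_model.gguf"])
        for _ in range(3)
    ]
    for p in procs:
        p.wait()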