lollms-client 0.15.1__py3-none-any.whl → 0.15.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lollms-client might be problematic.

@@ -24,211 +24,156 @@ pm.ensure_packages(["requests", "pillow"]) # pillow for dummy image in test
24
24
  if not pm.is_installed("llama-cpp-binaries"):
25
25
  def install_llama_cpp():
26
26
  system = platform.system()
27
+ python_version_simple = f"py{sys.version_info.major}{sys.version_info.minor}" # e.g. py310 for 3.10
28
+
29
+ # Determine CUDA suffix based on common recent versions. Adjust if needed.
30
+ # For simplicity, we'll target a common recent CUDA version.
31
+ # Users with specific needs might need to install manually.
32
+ # As of late 2023/early 2024, cu121 or cu118 are common.
33
+ # The oobabooga binaries often use +cu124 for recent builds. Let's try that.
34
+ cuda_suffix = "+cu124"
35
+
27
36
 
28
37
  if system == "Windows":
29
- url = "https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124-py3-none-win_amd64.whl"
38
+ # llama_cpp_binaries-0.12.0+cu124-py3-none-win_amd64.whl
39
+ url = f"https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0{cuda_suffix}-{python_version_simple}-none-win_amd64.whl"
40
+ fallback_url = "https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124-py3-none-win_amd64.whl" # Generic py3
30
41
  elif system == "Linux":
31
- url = "https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124-py3-none-linux_x86_64.whl"
42
+ # llama_cpp_binaries-0.12.0+cu124-py3-none-linux_x86_64.whl
43
+ url = f"https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0{cuda_suffix}-{python_version_simple}-none-linux_x86_64.whl"
44
+ fallback_url = "https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124-py3-none-linux_x86_64.whl" # Generic py3
32
45
  else:
33
- print(f"Unsupported OS: {system}")
46
+ ASCIIColors.warning(f"Unsupported OS for prebuilt llama-cpp-binaries: {system}. Please install manually.")
34
47
  return
35
- pm.install(url)
48
+
49
+ ASCIIColors.info(f"Attempting to install llama-cpp-binaries from: {url}")
50
+ try:
51
+ pm.install(url)
52
+ except Exception as e:
53
+ ASCIIColors.warning(f"Failed to install specific version from {url}: {e}")
54
+ ASCIIColors.info(f"Attempting fallback URL: {fallback_url}")
55
+ try:
56
+ pm.install(fallback_url)
57
+ except Exception as e_fallback:
58
+ ASCIIColors.error(f"Failed to install from fallback URL {fallback_url}: {e_fallback}")
59
+ ASCIIColors.error("Please try installing llama-cpp-binaries manually, e.g., 'pip install llama-cpp-python[server]' or from a wheel.")
60
+
36
61
  install_llama_cpp()
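# Illustration (not part of the package diff): the concrete URLs the install
# helper above would try on Linux with CPython 3.11, assuming cuda_suffix stays
# "+cu124". The version-specific wheel is attempted first; the generic py3
# wheel is the fallback.
import sys
python_version_simple = f"py{sys.version_info.major}{sys.version_info.minor}"  # e.g. "py311"
url = ("https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/"
       f"llama_cpp_binaries-0.12.0+cu124-{python_version_simple}-none-linux_x86_64.whl")
fallback_url = ("https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/"
                "llama_cpp_binaries-0.12.0+cu124-py3-none-linux_x86_64.whl")
print(url)
print(fallback_url)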
37
62
 
38
63
  try:
39
64
  import llama_cpp_binaries
40
65
  except ImportError:
41
66
  ASCIIColors.error("llama-cpp-binaries package not found. Please install it.")
42
- ASCIIColors.error("You can try: pip install llama-cpp-binaries")
43
- ASCIIColors.error("Or download a wheel from: https://github.com/oobabooga/llama-cpp-binaries/releases")
67
+ ASCIIColors.error("You can try: pip install llama-cpp-python[server] (for server support)")
68
+ ASCIIColors.error("Or download a wheel from: https://github.com/oobabooga/llama-cpp-binaries/releases or https://pypi.org/project/llama-cpp-python/#files")
44
69
  llama_cpp_binaries = None
45
70
 
46
71
 
47
72
  # --- Predefined patterns ---
48
-
49
- # Quantization type strings (derived from ggml.h, llama.cpp, and common usage)
50
- # These are the "core component" strings, without separators like '.', '-', or '_'
51
73
  _QUANT_COMPONENTS_SET: Set[str] = {
52
- # K-quants (most common, often with S/M/L suffix, and now XS/XXS)
53
- "Q2_K", "Q3_K", "Q4_K", "Q5_K", "Q6_K",
54
- "Q2_K_S", "Q3_K_S", "Q4_K_S", "Q5_K_S", # No Q6_K_S usually
55
- "Q3_K_M", "Q4_K_M", "Q5_K_M", # No Q2/Q6_K_M usually
56
- "Q3_K_L", # Only Q3_K_L is common
57
- # Adding XS and XXS variants for K-quants by analogy with IQ types
58
- "Q2_K_XS", "Q3_K_XS", "Q4_K_XS", "Q5_K_XS", "Q6_K_XS",
59
- "Q2_K_XXS", "Q3_K_XXS", "Q4_K_XXS", "Q5_K_XXS", "Q6_K_XXS",
60
-
61
- # Non-K-quant legacy types
62
- "Q4_0", "Q4_1", "Q5_0", "Q5_1", "Q8_0",
63
-
64
- # Floating point types
65
- "F16", "FP16", "F32", "FP32", "BF16",
66
-
67
- # IQ (Innovative Quantization) types
68
- "IQ1_S", "IQ1_M",
69
- "IQ2_XXS", "IQ2_XS", "IQ2_S", "IQ2_M",
70
- "IQ3_XXS", "IQ3_S", "IQ3_M",
71
- "IQ4_NL", "IQ4_XS",
72
-
73
- # Newer IQ K-Quant variants (IQ types using K-quant style super-blocks)
74
- "IQ3_M_K", "IQ3_S_K", # Adding IQ3_S_K as it's plausible
75
- "IQ4_XS_K", "IQ4_NL_K", # Adding IQ4_NL_K as it's plausible
76
-
77
- # Basic integer types (less common in user-facing LLM filenames as primary quantizer)
78
- "I8", "I16", "I32",
79
-
80
- # Special GGUF type names that might appear (from ggml.c `ggml_type_name`)
81
- "ALL_F32", "MOSTLY_F16", "MOSTLY_Q4_0", "MOSTLY_Q4_1", "MOSTLY_Q5_0", "MOSTLY_Q5_1",
82
- "MOSTLY_Q8_0",
83
- "MOSTLY_Q2_K", "MOSTLY_Q3_K_S", "MOSTLY_Q3_K_M", "MOSTLY_Q3_K_L",
74
+ "Q2_K", "Q3_K", "Q4_K", "Q5_K", "Q6_K", "Q2_K_S", "Q3_K_S", "Q4_K_S", "Q5_K_S",
75
+ "Q3_K_M", "Q4_K_M", "Q5_K_M", "Q3_K_L", "Q2_K_XS", "Q3_K_XS", "Q4_K_XS", "Q5_K_XS", "Q6_K_XS",
76
+ "Q2_K_XXS", "Q3_K_XXS", "Q4_K_XXS", "Q5_K_XXS", "Q6_K_XXS", "Q4_0", "Q4_1", "Q5_0", "Q5_1", "Q8_0",
77
+ "F16", "FP16", "F32", "FP32", "BF16", "IQ1_S", "IQ1_M", "IQ2_XXS", "IQ2_XS", "IQ2_S", "IQ2_M",
78
+ "IQ3_XXS", "IQ3_S", "IQ3_M", "IQ4_NL", "IQ4_XS", "IQ3_M_K", "IQ3_S_K", "IQ4_XS_K", "IQ4_NL_K",
79
+ "I8", "I16", "I32", "ALL_F32", "MOSTLY_F16", "MOSTLY_Q4_0", "MOSTLY_Q4_1", "MOSTLY_Q5_0", "MOSTLY_Q5_1",
80
+ "MOSTLY_Q8_0", "MOSTLY_Q2_K", "MOSTLY_Q3_K_S", "MOSTLY_Q3_K_M", "MOSTLY_Q3_K_L",
84
81
  "MOSTLY_Q4_K_S", "MOSTLY_Q4_K_M", "MOSTLY_Q5_K_S", "MOSTLY_Q5_K_M", "MOSTLY_Q6_K",
85
- "MOSTLY_IQ1_S", "MOSTLY_IQ1_M", # Adding these
86
- "MOSTLY_IQ2_XXS", "MOSTLY_IQ2_XS", "MOSTLY_IQ2_S", "MOSTLY_IQ2_M",
87
- "MOSTLY_IQ3_XXS", "MOSTLY_IQ3_S", "MOSTLY_IQ3_M", # Adding IQ3_M, IQ3_S
88
- "MOSTLY_IQ4_NL", "MOSTLY_IQ4_XS"
82
+ "MOSTLY_IQ1_S", "MOSTLY_IQ1_M", "MOSTLY_IQ2_XXS", "MOSTLY_IQ2_XS", "MOSTLY_IQ2_S", "MOSTLY_IQ2_M",
83
+ "MOSTLY_IQ3_XXS", "MOSTLY_IQ3_S", "MOSTLY_IQ3_M", "MOSTLY_IQ4_NL", "MOSTLY_IQ4_XS"
89
84
  }
90
-
91
- # Common descriptive suffixes for model names
92
85
  _MODEL_NAME_SUFFIX_COMPONENTS_SET: Set[str] = {
93
86
  "instruct", "chat", "GGUF", "HF", "ggml", "pytorch", "AWQ", "GPTQ", "EXL2",
94
- "base", "cont", "continue", "ft", # Fine-tuning related
95
- "v0.1", "v0.2", "v1.0", "v1.1", "v1.5", "v1.6", "v2.0", # Common version tags if they are truly suffixes
96
- # Be cautious with general version numbers (e.g., "v1", "v2") or model sizes (e.g., "7b")
97
- # as they are often integral parts of the base name. Only add if they are
98
- # *always* extraneous suffixes in your context.
99
- # The ones above are more specific and often appear as full suffix components.
87
+ "base", "cont", "continue", "ft", "v0.1", "v0.2", "v1.0", "v1.1", "v1.5", "v1.6", "v2.0"
100
88
  }
101
-
102
- # Combine, ensure uniqueness by using sets, then sort by length descending.
103
- # Sorting ensures longer patterns (e.g., "Q4_K_M") are checked before
104
- # shorter sub-patterns (e.g., "Q4_K" or "K_M").
105
89
  _ALL_REMOVABLE_COMPONENTS: List[str] = sorted(
106
- list(_QUANT_COMPONENTS_SET.union(_MODEL_NAME_SUFFIX_COMPONENTS_SET)),
107
- key=len,
108
- reverse=True
90
+ list(_QUANT_COMPONENTS_SET.union(_MODEL_NAME_SUFFIX_COMPONENTS_SET)), key=len, reverse=True
109
91
  )
110
92
 
111
93
  def get_gguf_model_base_name(file_path_or_name: Union[str, Path]) -> str:
112
- """
113
- Extracts a base model name from a GGUF filename or path by removing
114
- the .gguf extension and then iteratively stripping known quantization
115
- patterns and common descriptive suffixes from the end of the name.
116
-
117
- The stripping is case-insensitive and checks for patterns preceded
118
- by '.', '-', or '_'.
119
-
120
- Args:
121
- file_path_or_name: The file path (as a string or Path object)
122
- or just the filename string.
123
-
124
- Returns:
125
- The derived base model name string.
126
- """
127
- if isinstance(file_path_or_name, str):
128
- p = Path(file_path_or_name)
129
- elif isinstance(file_path_or_name, Path):
130
- p = file_path_or_name
131
- else:
132
- raise TypeError(
133
- "Input must be a string or Path object. "
134
- f"Got: {type(file_path_or_name)}"
135
- )
136
-
137
- name_part = p.name # Full filename, e.g., "MyModel-7B-chat.Q4_K_M.gguf"
138
-
139
- # 1. Remove .gguf extension (case-insensitive)
140
- if name_part.lower().endswith(".gguf"):
141
- name_part = name_part[:-5] # Remove last 5 chars: ".gguf"
142
-
143
- # 2. Iteratively strip known components (quantization, common suffixes)
144
- # These components are usually preceded by '.', '-', or '_'
94
+ if isinstance(file_path_or_name, str): p = Path(file_path_or_name)
95
+ elif isinstance(file_path_or_name, Path): p = file_path_or_name
96
+ else: raise TypeError(f"Input must be a string or Path object. Got: {type(file_path_or_name)}")
97
+ name_part = p.stem if p.suffix.lower() == ".gguf" else p.name
98
+ if name_part.lower().endswith(".gguf"): name_part = name_part[:-5]
145
99
  while True:
146
100
  original_name_part_len = len(name_part)
147
101
  stripped_in_this_iteration = False
148
-
149
102
  for component in _ALL_REMOVABLE_COMPONENTS:
150
103
  component_lower = component.lower()
151
- # Check for patterns like ".component", "-component", or "_component"
152
104
  for separator in [".", "-", "_"]:
153
105
  pattern_to_check = f"{separator}{component_lower}"
154
106
  if name_part.lower().endswith(pattern_to_check):
155
- # Remove from the original-case name_part
156
107
  name_part = name_part[:-(len(pattern_to_check))]
157
- stripped_in_this_iteration = True
158
- break # Break from separator loop
159
- if stripped_in_this_iteration:
160
- break # Break from component loop (found a match, restart while loop with shorter name_part)
161
-
162
- # If no component was stripped in a full pass through _ALL_REMOVABLE_COMPONENTS,
163
- # or if name_part became empty, we're done.
164
- if not stripped_in_this_iteration or not name_part:
165
- break
166
-
167
- # 3. Final cleanup: remove trailing separators if any are left after stripping
168
- while name_part and name_part[-1] in ['.', '-', '_']:
169
- name_part = name_part[:-1]
170
-
108
+ stripped_in_this_iteration = True; break
109
+ if stripped_in_this_iteration: break
110
+ if not stripped_in_this_iteration or not name_part: break
111
+ while name_part and name_part[-1] in ['.', '-', '_']: name_part = name_part[:-1]
171
112
  return name_part
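# Usage sketch (not part of the package diff): with get_gguf_model_base_name
# defined as above, trailing quantization tags and known suffixes are stripped
# case-insensitively, while the same tokens elsewhere in the name are kept.
print(get_gguf_model_base_name("Mistral-7B-Instruct-v0.2.Q4_K_M.gguf"))
# -> "Mistral-7B"      (".Q4_K_M", "-v0.2" and "-Instruct" are stripped in turn)
print(get_gguf_model_base_name(Path("/models/llava-v1.5-7b.Q5_K_S.gguf")))
# -> "llava-v1.5-7b"   ("v1.5" is not at the end of the name, so it is kept)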
172
113
 
114
+ # --- Global Server Registry ---
115
+ _active_servers: Dict[tuple, 'LlamaCppServerProcess'] = {}
116
+ _server_ref_counts: Dict[tuple, int] = {}
117
+ _server_registry_lock = threading.Lock()
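# Sketch (not part of the package diff) of the ref-counting pattern these three
# globals implement: servers are cached per (model_path, clip_model_path) key,
# each binding that reuses a server bumps the count, and the last release shuts
# the shared process down. The real logic lives in load_model() and
# _release_server_instance() below; `factory` here is a stand-in for starting
# a LlamaCppServerProcess.
import threading
from typing import Any, Callable, Dict

_demo_servers: Dict[tuple, Any] = {}
_demo_refs: Dict[tuple, int] = {}
_demo_lock = threading.Lock()

def acquire_server(key: tuple, factory: Callable[[], Any]) -> Any:
    with _demo_lock:
        if key not in _demo_servers:
            _demo_servers[key] = factory()   # start a new shared server
            _demo_refs[key] = 0
        _demo_refs[key] += 1
        return _demo_servers[key]

def release_server(key: tuple) -> None:
    with _demo_lock:
        _demo_refs[key] = _demo_refs.get(key, 1) - 1
        if _demo_refs[key] <= 0:
            server = _demo_servers.pop(key, None)
            _demo_refs.pop(key, None)
            if server is not None and hasattr(server, "shutdown"):
                server.shutdown()            # last user stops the shared server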
173
118
 
174
119
  BindingName = "LlamaCppServerBinding"
175
120
  DEFAULT_LLAMACPP_SERVER_HOST = "127.0.0.1"
176
- DEFAULT_LLAMACPP_SERVER_PORT = 9641
177
- # Based on the LlamaServer class provided in the prompt
121
+ # Port is now dynamic, this constant is less critical for direct use but good for reference.
122
+ # DEFAULT_LLAMACPP_SERVER_PORT = 9641
123
+
178
124
  class LlamaCppServerProcess:
179
- def __init__(self, model_path: str|Path, clip_model_path: str = None, server_binary_path: str=None, port: int=None, server_args: Dict[str, Any]={}):
125
+ def __init__(self, model_path: Union[str, Path], clip_model_path: Optional[Union[str, Path]] = None, server_binary_path: Optional[Union[str, Path]]=None, server_args: Dict[str, Any]={}):
180
126
  self.model_path = Path(model_path)
181
- self.clip_model_path = clip_model_path
182
- self.server_binary_path = Path(server_binary_path)
183
- if self.server_binary_path is None:
184
- self.server_binary_path = llama_cpp_binaries.get_binary_path()
185
- self.port = port if port else DEFAULT_LLAMACPP_SERVER_PORT
127
+ self.clip_model_path = Path(clip_model_path) if clip_model_path else None
128
+
129
+ if server_binary_path:
130
+ self.server_binary_path = Path(server_binary_path)
131
+ elif llama_cpp_binaries:
132
+ self.server_binary_path = Path(llama_cpp_binaries.get_binary_path())
133
+ else:
134
+ raise FileNotFoundError("llama_cpp_binaries not found and no server_binary_path provided.")
135
+
136
+ self.port: Optional[int] = None # Set by start() method
186
137
  self.server_args = server_args
187
138
  self.process: Optional[subprocess.Popen] = None
188
139
  self.session = requests.Session()
189
- self.host = DEFAULT_LLAMACPP_SERVER_HOST
190
- self.base_url = f"http://{self.host}:{self.port}"
140
+ self.host = self.server_args.get("host",DEFAULT_LLAMACPP_SERVER_HOST)
141
+ self.base_url: Optional[str] = None # Set by start() method
191
142
  self.is_healthy = False
192
- self._stderr_lines = [] # Store last few stderr lines for debugging
193
- self._stderr_thread = None
143
+ self._stderr_lines: List[str] = []
144
+ self._stderr_thread: Optional[threading.Thread] = None
194
145
 
195
146
  if not self.model_path.exists():
196
147
  raise FileNotFoundError(f"Model file not found: {self.model_path}")
148
+ if self.clip_model_path and not self.clip_model_path.exists():
149
+ ASCIIColors.warning(f"Clip model file '{self.clip_model_path}' not found. Vision features may not work or may use a different auto-detected clip model.")
197
150
  if not self.server_binary_path.exists():
198
151
  raise FileNotFoundError(f"Llama.cpp server binary not found: {self.server_binary_path}")
199
152
 
200
- self._start_server()
201
-
202
153
  def _filter_stderr(self, stderr_pipe):
203
154
  try:
204
155
  for line in iter(stderr_pipe.readline, ''):
205
156
  if line:
206
157
  self._stderr_lines.append(line.strip())
207
- if len(self._stderr_lines) > 50: # Keep last 50 lines
208
- self._stderr_lines.pop(0)
209
- # Simple progress or key info logging
158
+ if len(self._stderr_lines) > 50: self._stderr_lines.pop(0)
210
159
  if "llama_model_loaded" in line or "error" in line.lower() or "failed" in line.lower():
211
- ASCIIColors.debug(f"[LLAMA_SERVER_STDERR] {line.strip()}")
212
- elif "running" in line and "port" in line: # Server startup message
213
- ASCIIColors.info(f"[LLAMA_SERVER_STDERR] {line.strip()}")
214
-
215
- except ValueError: # Pipe closed
216
- pass
217
- except Exception as e:
218
- ASCIIColors.warning(f"Exception in stderr filter thread: {e}")
219
-
220
-
221
- def _start_server(self, is_embedding=False):
160
+ ASCIIColors.debug(f"[LLAMA_SERVER_STDERR:{self.port}] {line.strip()}")
161
+ elif "running on port" in line: # Server startup message
162
+ ASCIIColors.info(f"[LLAMA_SERVER_STDERR:{self.port}] {line.strip()}")
163
+ except ValueError: pass
164
+ except Exception as e: ASCIIColors.warning(f"Exception in stderr filter thread for port {self.port}: {e}")
165
+
166
+ def start(self, port_to_use: int):
167
+ self.port = port_to_use
168
+ self.base_url = f"http://{self.host}:{self.port}"
169
+
222
170
  cmd = [
223
171
  str(self.server_binary_path),
224
172
  "--model", str(self.model_path),
225
173
  "--host", self.host,
226
174
  "--port", str(self.port),
227
- # Add other common defaults or arguments from self.server_args
228
175
  ]
229
176
 
230
- # Common arguments mapping from LlamaCppBinding to server CLI args
231
- # (This needs to be kept in sync with llama.cpp server's CLI)
232
177
  arg_map = {
233
178
  "n_ctx": "--ctx-size", "n_gpu_layers": "--gpu-layers", "main_gpu": "--main-gpu",
234
179
  "tensor_split": "--tensor-split", "use_mmap": (lambda v: ["--no-mmap"] if not v else []),
@@ -236,446 +181,356 @@ class LlamaCppServerProcess:
236
181
  "n_batch": "--batch-size", "n_threads": "--threads", "n_threads_batch": "--threads-batch",
237
182
  "rope_scaling_type": "--rope-scaling", "rope_freq_base": "--rope-freq-base",
238
183
  "rope_freq_scale": "--rope-freq-scale",
239
- "embedding": (lambda v: ["--embedding"] if is_embedding else []), # Server needs to be started with embedding support
184
+ "embedding": (lambda v: ["--embedding"] if v else []),
240
185
  "verbose": (lambda v: ["--verbose"] if v else []),
241
- "chat_template": "--chat-template", # For newer servers if they support jinja chat templates
242
- # Old llama.cpp server used --chatml or specific format flags
186
+ "chat_template": "--chat-template",
187
+ "parallel_slots": "--parallel", # Number of parallel processing slots
243
188
  }
244
189
 
245
- # For LLaVA, specific args are needed
246
- if self.clip_model_path:
190
+ if self.clip_model_path: # This should be the actual path resolved by the binding
247
191
  cmd.extend(["--mmproj", str(self.clip_model_path)])
248
- # The server might automatically detect LLaVA chat format or need a specific flag
249
- # e.g., --chat-template llava-1.5 (if server supports templates)
250
- # For older servers, a specific chat format flag like --chatml with LLaVA prompt structure was used.
251
- # The server from llama-cpp-binaries is usually quite up-to-date.
252
192
 
253
193
  for key, cli_arg in arg_map.items():
254
194
  val = self.server_args.get(key)
255
195
  if val is not None:
256
- if callable(cli_arg): # For args like --no-mmap
257
- cmd.extend(cli_arg(val))
258
- else:
259
- cmd.extend([cli_arg, str(val)])
196
+ if callable(cli_arg): cmd.extend(cli_arg(val))
197
+ else: cmd.extend([cli_arg, str(val)])
260
198
 
261
- # Add any extra CLI flags directly
262
199
  extra_cli_flags = self.server_args.get("extra_cli_flags", [])
263
- if isinstance(extra_cli_flags, str): # If it's a string, split it
264
- extra_cli_flags = extra_cli_flags.split()
200
+ if isinstance(extra_cli_flags, str): extra_cli_flags = extra_cli_flags.split()
265
201
  cmd.extend(extra_cli_flags)
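# Illustration (not part of the package diff): for a hypothetical
#   server_args = {"n_ctx": 8192, "n_gpu_layers": 35, "use_mmap": False,
#                  "embedding": True, "parallel_slots": 4}
# the arg_map loop above extends cmd with:
#   --ctx-size 8192 --gpu-layers 35 --no-mmap --embedding --parallel 4
# Keys whose value is None are skipped entirely.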
266
202
 
267
-
268
- ASCIIColors.info(f"Starting Llama.cpp server with command: {' '.join(cmd)}")
203
+ ASCIIColors.info(f"Starting Llama.cpp server ({' '.join(cmd)})")
269
204
 
270
- # Prevent paths with spaces from breaking the command on some OS, though Popen usually handles this.
271
- # For safety, ensure paths are quoted if necessary, or rely on Popen's list-based command.
272
-
273
205
  env = os.environ.copy()
274
- # On Linux, it might be necessary to set LD_LIBRARY_PATH if server binary has shared lib dependencies in its folder
275
206
  if os.name == 'posix' and self.server_binary_path.parent != Path('.'):
276
207
  lib_path_str = str(self.server_binary_path.parent.resolve())
277
208
  current_ld_path = env.get('LD_LIBRARY_PATH', '')
278
- if current_ld_path:
279
- env['LD_LIBRARY_PATH'] = f"{lib_path_str}:{current_ld_path}"
280
- else:
281
- env['LD_LIBRARY_PATH'] = lib_path_str
209
+ env['LD_LIBRARY_PATH'] = f"{lib_path_str}:{current_ld_path}" if current_ld_path else lib_path_str
282
210
 
283
211
  try:
284
- ASCIIColors.green(f"running server: {' '.join(cmd)}")
285
- self.process = subprocess.Popen(
286
- cmd,
287
- stderr=subprocess.PIPE,
288
- stdout=subprocess.PIPE, # Capture stdout as well for debugging
289
- text=True,
290
- bufsize=1, # Line buffered
291
- env=env
292
- )
212
+ self.process = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE, text=True, bufsize=1, env=env)
293
213
  except Exception as e:
294
- ASCIIColors.error(f"Failed to start llama.cpp server process: {e}")
295
- trace_exception(e)
296
- raise
214
+ ASCIIColors.error(f"Failed to start llama.cpp server process on port {self.port}: {e}"); trace_exception(e); raise
297
215
 
298
- # Start stderr/stdout reading threads
299
216
  self._stderr_thread = threading.Thread(target=self._filter_stderr, args=(self.process.stderr,), daemon=True)
300
217
  self._stderr_thread.start()
301
- # self._stdout_thread = threading.Thread(target=self._filter_stderr, args=(self.process.stdout,), daemon=True) # can use same filter
302
- # self._stdout_thread.start()
303
-
304
218
 
305
- # Wait for server to be healthy
306
219
  health_url = f"{self.base_url}/health"
307
- max_wait_time = self.server_args.get("server_startup_timeout", 60) # seconds
220
+ max_wait_time = self.server_args.get("server_startup_timeout", 60)
308
221
  start_time = time.time()
309
222
 
310
223
  while time.time() - start_time < max_wait_time:
311
224
  if self.process.poll() is not None:
312
- exit_code = self.process.poll()
313
- stderr_output = "\n".join(self._stderr_lines[-10:]) # Last 10 lines
314
- raise RuntimeError(f"Llama.cpp server process terminated unexpectedly with exit code {exit_code} during startup. Stderr:\n{stderr_output}")
225
+ stderr_output = "\n".join(self._stderr_lines[-10:])
226
+ raise RuntimeError(f"Llama.cpp server (port {self.port}) terminated unexpectedly (exit code {self.process.poll()}) during startup. Stderr:\n{stderr_output}")
315
227
  try:
316
228
  response = self.session.get(health_url, timeout=2)
317
229
  if response.status_code == 200 and response.json().get("status") == "ok":
318
230
  self.is_healthy = True
319
231
  ASCIIColors.green(f"Llama.cpp server started successfully on port {self.port}.")
320
232
  return
321
- except requests.exceptions.ConnectionError:
322
- time.sleep(1) # Wait and retry
323
- except Exception as e:
324
- ASCIIColors.warning(f"Health check failed: {e}")
325
- time.sleep(1)
233
+ except requests.exceptions.ConnectionError: time.sleep(1)
234
+ except Exception as e: ASCIIColors.warning(f"Health check for port {self.port} failed: {e}"); time.sleep(1)
326
235
 
327
236
  self.is_healthy = False
328
- self.stop() # Ensure process is killed if health check failed
237
+ self.shutdown()
329
238
  stderr_output = "\n".join(self._stderr_lines[-10:])
330
239
  raise TimeoutError(f"Llama.cpp server failed to become healthy on port {self.port} within {max_wait_time}s. Stderr:\n{stderr_output}")
331
240
 
332
- def stop(self):
241
+ def shutdown(self):
333
242
  self.is_healthy = False
334
243
  if self.process:
335
- ASCIIColors.info(f"Stopping Llama.cpp server (PID: {self.process.pid})...")
244
+ ASCIIColors.info(f"Shutting down Llama.cpp server (PID: {self.process.pid} on port {self.port})...")
336
245
  try:
337
- # Try graceful termination first
338
- if os.name == 'nt': # Windows
339
- # Sending CTRL_C_EVENT to the process group might be more effective for console apps
340
- # self.process.send_signal(signal.CTRL_C_EVENT) # Requires creationflags=subprocess.CREATE_NEW_PROCESS_GROUP
341
- self.process.terminate() # For Windows, terminate is often like kill
342
- else: # POSIX
343
- self.process.terminate() # Sends SIGTERM
344
-
345
- self.process.wait(timeout=10) # Wait for graceful shutdown
246
+ if os.name == 'nt': self.process.terminate()
247
+ else: self.process.terminate()
248
+ self.process.wait(timeout=10)
346
249
  except subprocess.TimeoutExpired:
347
- ASCIIColors.warning("Llama.cpp server did not terminate gracefully, killing...")
348
- self.process.kill() # Force kill
349
- try:
350
- self.process.wait(timeout=5)
351
- except subprocess.TimeoutExpired:
352
- ASCIIColors.error("Failed to kill llama.cpp server process.")
353
- except Exception as e:
354
- ASCIIColors.error(f"Error during server stop: {e}")
250
+ ASCIIColors.warning(f"Llama.cpp server (port {self.port}) did not terminate gracefully, killing...")
251
+ self.process.kill()
252
+ try: self.process.wait(timeout=5)
253
+ except subprocess.TimeoutExpired: ASCIIColors.error(f"Failed to kill llama.cpp server process (port {self.port}).")
254
+ except Exception as e: ASCIIColors.error(f"Error during server shutdown (port {self.port}): {e}")
355
255
  finally:
356
256
  self.process = None
357
- if self._stderr_thread and self._stderr_thread.is_alive():
358
- self._stderr_thread.join(timeout=1) # Wait for thread to finish
359
- ASCIIColors.info("Llama.cpp server stopped.")
257
+ if self._stderr_thread and self._stderr_thread.is_alive(): self._stderr_thread.join(timeout=1)
258
+ ASCIIColors.info(f"Llama.cpp server on port {self.port} shut down.")
360
259
 
361
260
 
362
261
  class LlamaCppServerBinding(LollmsLLMBinding):
363
- """
364
- Binding for llama.cpp server using pre-compiled binaries.
365
- Manages a local llama.cpp server subprocess and communicates via HTTP.
366
- """
367
- # Default parameters for the llama.cpp server
368
262
  DEFAULT_SERVER_ARGS = {
369
- "n_gpu_layers": 0,
370
- "n_ctx": 128000,
371
- "n_batch": 512,
372
- "embedding": False, # Enable if embeddings are needed via /embedding or /v1/embeddings
373
- "verbose": False,
374
- "server_startup_timeout": 120, # seconds
375
- # "chat_format": "chatml", # Deprecated in favor of --chat-template, but some old servers might need it
376
- # For LLaVA
377
- # "clip_model_path": None,
378
- # "chat_template": "llava-1.5" # if server supports it. Or specific prompt structure.
263
+ "n_gpu_layers": 0, "n_ctx": 128000, "n_batch": 512,
264
+ "embedding": False, "verbose": False, "server_startup_timeout": 120,
265
+ "parallel_slots": 4, # Default parallel slots for server
379
266
  }
380
267
 
381
- def __init__(self,
382
- model_name: str, # Name of the GGUF file (e.g., "mistral-7b-instruct-v0.2.Q4_K_M.gguf")
383
- models_path: str,
384
- clip_model_name: str = None,
385
- config: Optional[Dict[str, Any]] = None, # Binding specific config from global_config.yaml
386
- default_completion_format: ELF_COMPLETION_FORMAT = ELF_COMPLETION_FORMAT.Chat,
387
- **kwargs # Overrides for server_args
388
- ):
389
-
268
+ def __init__(self, model_name: str, models_path: str, clip_model_name: Optional[str] = None,
269
+ config: Optional[Dict[str, Any]] = None, default_completion_format: ELF_COMPLETION_FORMAT = ELF_COMPLETION_FORMAT.Chat, **kwargs):
390
270
  super().__init__(binding_name=BindingName)
391
-
392
- if llama_cpp_binaries is None:
393
- raise ImportError("llama-cpp-binaries package is required but not found.")
271
+ if llama_cpp_binaries is None: raise ImportError("llama-cpp-binaries package is required but not found.")
394
272
 
395
273
  self.models_path = Path(models_path)
396
- self.model_name = model_name
397
- self.model_path = self.models_path/self.model_name
398
- self.clip_model_path = self.models_path/clip_model_name if clip_model_name else None
399
- self.default_completion_format = default_completion_format
274
+ self.user_provided_model_name = model_name # Store the name/path user gave
400
275
 
401
- self.server_args = {**self.DEFAULT_SERVER_ARGS, **(config or {})}
402
- self.server_args.update(kwargs) # Apply direct kwargs overrides
403
-
276
+ # Initial hint for clip_model_path, resolved fully in load_model
277
+ self.clip_model_path: Optional[Path] = None
278
+ if clip_model_name:
279
+ p_clip = Path(clip_model_name)
280
+ if p_clip.is_absolute() and p_clip.exists():
281
+ self.clip_model_path = p_clip
282
+ elif (self.models_path / clip_model_name).exists(): # Relative to models_path
283
+ self.clip_model_path = self.models_path / clip_model_name
284
+ else:
285
+ ASCIIColors.warning(f"Specified clip_model_name '{clip_model_name}' not found. Will rely on auto-detection if applicable.")
286
+
287
+ self.default_completion_format = default_completion_format
288
+ self.server_args = {**self.DEFAULT_SERVER_ARGS, **(config or {}), **kwargs}
404
289
  self.server_binary_path = self._get_server_binary_path()
405
- self.current_model_path: Optional[Path] = None
290
+
291
+ self.current_model_path: Optional[Path] = None # Actual resolved path of loaded model
406
292
  self.server_process: Optional[LlamaCppServerProcess] = None
407
293
  self.port: Optional[int] = None
294
+ self.server_key: Optional[tuple] = None
408
295
 
409
- # Attempt to load the model (which starts the server)
410
- self.load_model(str(self.model_path))
296
+ if not self.load_model(self.user_provided_model_name):
297
+ ASCIIColors.error(f"Initial model load for '{self.user_provided_model_name}' failed. Binding may not be functional.")
411
298
 
412
299
  def _get_server_binary_path(self) -> Path:
413
- try:
414
- # Check if a custom path is provided in config
415
- custom_path_str = self.server_args.get("llama_server_binary_path")
416
- if custom_path_str:
417
- custom_path = Path(custom_path_str)
418
- if custom_path.exists() and custom_path.is_file():
419
- ASCIIColors.info(f"Using custom llama.cpp server binary path: {custom_path}")
420
- return custom_path
421
- else:
422
- ASCIIColors.warning(f"Custom llama.cpp server binary path '{custom_path_str}' not found or not a file. Falling back.")
423
-
424
- # Default to using llama_cpp_binaries
425
- bin_path_str = llama_cpp_binaries.get_binary_path() # specify "server"
300
+ custom_path_str = self.server_args.get("llama_server_binary_path")
301
+ if custom_path_str:
302
+ custom_path = Path(custom_path_str)
303
+ if custom_path.exists() and custom_path.is_file():
304
+ ASCIIColors.info(f"Using custom llama.cpp server binary: {custom_path}"); return custom_path
305
+ else: ASCIIColors.warning(f"Custom binary '{custom_path_str}' not found. Falling back.")
306
+ if llama_cpp_binaries:
307
+ bin_path_str = llama_cpp_binaries.get_binary_path()
426
308
  if bin_path_str:
427
309
  bin_path = Path(bin_path_str)
428
310
  if bin_path.exists() and bin_path.is_file():
429
- ASCIIColors.info(f"Using llama.cpp server binary from llama-cpp-binaries: {bin_path}")
430
- return bin_path
431
-
432
- raise FileNotFoundError("Could not locate llama.cpp server binary via llama-cpp-binaries or custom path.")
433
-
434
- except Exception as e:
435
- ASCIIColors.error(f"Error getting llama.cpp server binary path: {e}")
436
- trace_exception(e)
437
- # As a last resort, try a common name in system PATH or a known location if Lollms ships one
438
- # For now, rely on llama-cpp-binaries or explicit config.
439
- raise FileNotFoundError(
440
- "Llama.cpp server binary not found. Ensure 'llama-cpp-binaries' is installed "
441
- "or provide 'llama_server_binary_path' in the binding's configuration."
442
- ) from e
443
-
444
- def _resolve_model_path(self, model_path: str) -> Path:
445
- # Search order:
446
- # 1. Absolute path
447
- # 2. Relative to binding-specific models path (e.g., personal_models_path/LlamaCppServerBinding/)
448
- # 3. Relative to personal_models_path
449
- # 4. Relative to models_zoo_path
311
+ ASCIIColors.info(f"Using binary from llama-cpp-binaries: {bin_path}"); return bin_path
312
+ raise FileNotFoundError("Llama.cpp server binary not found. Ensure 'llama-cpp-binaries' or 'llama-cpp-python[server]' is installed or provide 'llama_server_binary_path'.")
313
+
314
+ def _resolve_model_path(self, model_name_or_path: str) -> Path:
315
+ model_p = Path(model_name_or_path)
316
+ if model_p.is_absolute():
317
+ if model_p.exists(): return model_p
318
+ else: raise FileNotFoundError(f"Absolute model path specified but not found: {model_p}")
450
319
 
451
- model_p = Path(model_path)
452
- if model_p.is_absolute() and model_p.exists():
453
- return model_p
454
-
455
- paths_to_check = []
456
- binding_specific_folder_name = self.binding_name # "LlamaCppServerBinding"
457
- paths_to_check.append(self.models_path)
458
-
459
- for p in paths_to_check:
460
- if p.exists() and p.is_file():
461
- ASCIIColors.info(f"Found model at: {p}")
462
- return p
320
+ path_in_models_dir = self.models_path / model_name_or_path
321
+ if path_in_models_dir.exists() and path_in_models_dir.is_file():
322
+ ASCIIColors.info(f"Found model at: {path_in_models_dir}"); return path_in_models_dir
463
323
 
464
- raise FileNotFoundError(f"Model '{model_path}' not found in standard Lollms model paths or as an absolute path.")
324
+ raise FileNotFoundError(f"Model '{model_name_or_path}' not found as absolute path or within '{self.models_path}'.")
465
325
 
466
326
  def _find_available_port(self) -> int:
467
327
  with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
468
- s.bind(('', 0)) # Bind to port 0 to get an OS-assigned available port
469
- return s.getsockname()[1]
470
-
471
- def load_model(self, model_name: str) -> bool:
472
- resolved_path = self._resolve_model_path(model_name)
328
+ s.bind(('', 0)); return s.getsockname()[1]
329
+
330
+ def _release_server_instance(self):
331
+ if self.server_process and self.server_key:
332
+ with _server_registry_lock:
333
+ if self.server_key in _server_ref_counts:
334
+ _server_ref_counts[self.server_key] -= 1
335
+ ASCIIColors.info(f"Decremented ref count for server {self.server_key}. New count: {_server_ref_counts[self.server_key]}")
336
+ if _server_ref_counts[self.server_key] <= 0:
337
+ ASCIIColors.info(f"Ref count for server {self.server_key} is zero. Shutting it down.")
338
+ server_to_stop = _active_servers.pop(self.server_key, None)
339
+ _server_ref_counts.pop(self.server_key, None)
340
+ if server_to_stop:
341
+ try: server_to_stop.shutdown()
342
+ except Exception as e: ASCIIColors.error(f"Error shutting down server {self.server_key}: {e}")
343
+ # else: ASCIIColors.warning(f"Attempted to stop server {self.server_key} but it was not in _active_servers.") # Can be noisy
344
+ else:
345
+ ASCIIColors.warning(f"Server key {self.server_key} not in ref counts during release. Might have been shut down already.")
346
+ _active_servers.pop(self.server_key, None) # Ensure removal
473
347
 
474
- if self.server_process and self.server_process.is_healthy and self.current_model_path == resolved_path:
475
- ASCIIColors.info(f"Model '{model_name}' is already loaded and server is running.")
476
- return True
348
+ self.server_process = None
349
+ self.port = None
350
+ self.server_key = None
477
351
 
478
- if self.server_process:
479
- self.unload_model() # Stop existing server
480
352
 
481
- self.model_name = model_name # Store the name provided by user
482
- self.current_model_path = resolved_path
483
- self.port = self._find_available_port()
353
+ def load_model(self, model_name_or_path: str) -> bool:
354
+ resolved_model_path = self._resolve_model_path(model_name_or_path)
484
355
 
485
- ASCIIColors.info(f"Attempting to start Llama.cpp server for model: {self.current_model_path} on port {self.port}")
486
-
487
- # Prepare server_args specifically for this model load
488
- current_server_args = self.server_args.copy()
489
-
490
- if not self.clip_model_path:
491
- # Try to find a corresponding .mmproj file or allow user to specify in config
492
- # e.g. if model is llava-v1.5-7b.Q4_K_M.gguf, look for llava-v1.5-7b.mmproj or mmproj-modelname.gguf
493
- base_name = get_gguf_model_base_name(self.current_model_path.stem) # etc.
494
-
495
- potential_clip_paths = [
496
- self.current_model_path.parent / f"{base_name}.mmproj",
497
- self.current_model_path.parent / f"mmproj-{base_name}.gguf", # Common pattern
498
- self.current_model_path.with_suffix(".mmproj"),
356
+ # Determine the clip_model_path for this server instance
357
+ # Priority: 1. Explicit `clip_model_path` from init (if exists) 2. Auto-detection
358
+ final_clip_model_path: Optional[Path] = None
359
+ if self.clip_model_path and self.clip_model_path.exists(): # From __init__
360
+ final_clip_model_path = self.clip_model_path
361
+ ASCIIColors.info(f"Using explicitly configured LLaVA clip model: {final_clip_model_path}")
362
+ elif not self.clip_model_path or (self.clip_model_path and not self.clip_model_path.exists()): # if init path was bad or not given
363
+ if self.clip_model_path and not self.clip_model_path.exists():
364
+ ASCIIColors.warning(f"Initial clip model path '{self.clip_model_path}' not found. Attempting auto-detection.")
365
+ base_name = get_gguf_model_base_name(resolved_model_path.stem)
366
+ potential_paths = [
367
+ resolved_model_path.parent / f"{base_name}.mmproj",
368
+ resolved_model_path.parent / f"mmproj-{base_name}.gguf",
369
+ resolved_model_path.with_suffix(".mmproj"),
370
+ self.models_path / f"{base_name}.mmproj", # Check in general models dir too
371
+ self.models_path / f"mmproj-{base_name}.gguf",
499
372
  ]
500
- found_clip_path = None
501
- for p_clip in potential_clip_paths:
373
+ for p_clip in potential_paths:
502
374
  if p_clip.exists():
503
- found_clip_path = str(p_clip)
504
- ASCIIColors.info(f"Auto-detected LLaVA clip model: {found_clip_path}")
375
+ final_clip_model_path = p_clip
376
+ ASCIIColors.info(f"Auto-detected LLaVA clip model: {final_clip_model_path}")
505
377
  break
506
- if found_clip_path:
507
- self.clip_model_path = found_clip_path
508
- # Set a default LLaVA chat template if server supports it, or rely on server auto-detection
509
- #if not current_server_args.get("chat_template") and not current_server_args.get("chat_format"):
510
- # current_server_args["chat_template"] = "llava-1.5" # Common default
511
- else:
512
- ASCIIColors.warning("Vision capabilities will likely not work. Please ensure the .mmproj file is "
513
- "next to the model or specify 'clip_model_path' in binding config.")
378
+
379
+ final_clip_model_path_str = str(final_clip_model_path) if final_clip_model_path else None
380
+
381
+ # Server key based on model and essential server configurations (like clip model)
382
+ # More server_args could be added to the key if they necessitate separate server instances
383
+ # For example, different n_gpu_layers might require a server restart.
384
+ # For now, model and clip model are the main differentiators for distinct servers.
385
+ new_server_key = (str(resolved_model_path), final_clip_model_path_str)
386
+
387
+ with _server_registry_lock:
388
+ # If this binding instance is already using the exact same server, do nothing
389
+ if self.server_process and self.server_key == new_server_key and self.server_process.is_healthy:
390
+ ASCIIColors.info(f"Model '{model_name_or_path}' with clip '{final_clip_model_path_str}' is already loaded and server is healthy on port {self.port}. No change.")
391
+ return True
392
+
393
+ # If this binding was using a *different* server, release it first
394
+ if self.server_process and self.server_key != new_server_key:
395
+ ASCIIColors.info(f"Switching models. Releasing previous server: {self.server_key}")
396
+ self._release_server_instance() # This clears self.server_process, self.port, self.server_key
397
+
398
+ # Check if a suitable server already exists in the global registry
399
+ if new_server_key in _active_servers:
400
+ existing_server = _active_servers[new_server_key]
401
+ if existing_server.is_healthy:
402
+ ASCIIColors.info(f"Reusing existing healthy server for {new_server_key} on port {existing_server.port}.")
403
+ self.server_process = existing_server
404
+ self.port = existing_server.port
405
+ _server_ref_counts[new_server_key] += 1
406
+ self.current_model_path = resolved_model_path
407
+ self.clip_model_path = final_clip_model_path # Update binding's clip path
408
+ self.server_key = new_server_key
409
+ return True
410
+ else: # Found existing but unhealthy server
411
+ ASCIIColors.warning(f"Found unhealthy server for {new_server_key}. Attempting to remove and restart.")
412
+ try: existing_server.shutdown()
413
+ except Exception as e: ASCIIColors.error(f"Error shutting down unhealthy server {new_server_key}: {e}")
414
+ _active_servers.pop(new_server_key, None)
415
+ _server_ref_counts.pop(new_server_key, None)
416
+
417
+ # No suitable server found or existing was unhealthy: start a new one
418
+ ASCIIColors.info(f"Starting new server for {new_server_key}.")
419
+ self.current_model_path = resolved_model_path
420
+ self.clip_model_path = final_clip_model_path # Update binding's clip path for the new server
421
+ self.server_key = new_server_key # Set before potential failure to allow cleanup by _release_server_instance
514
422
 
423
+ new_port_for_server = self._find_available_port()
424
+
425
+ current_server_args_for_new_server = self.server_args.copy()
426
+ # Ensure parallel_slots is set; it's crucial for shared servers
427
+ if "parallel_slots" not in current_server_args_for_new_server or not isinstance(current_server_args_for_new_server["parallel_slots"], int) or current_server_args_for_new_server["parallel_slots"] <=0:
428
+ current_server_args_for_new_server["parallel_slots"] = self.DEFAULT_SERVER_ARGS["parallel_slots"]
429
+
430
+ ASCIIColors.info(f"New Llama.cpp server: model={self.current_model_path}, clip={self.clip_model_path}, port={new_port_for_server}, slots={current_server_args_for_new_server['parallel_slots']}")
431
+
432
+ try:
433
+ new_server = LlamaCppServerProcess(
434
+ model_path=str(self.current_model_path),
435
+ clip_model_path=str(self.clip_model_path) if self.clip_model_path else None,
436
+ server_binary_path=str(self.server_binary_path),
437
+ server_args=current_server_args_for_new_server,
438
+ )
439
+ new_server.start(port_to_use=new_port_for_server) # Actual server start
440
+
441
+ if new_server.is_healthy:
442
+ self.server_process = new_server
443
+ self.port = new_port_for_server
444
+ _active_servers[self.server_key] = new_server
445
+ _server_ref_counts[self.server_key] = 1
446
+ ASCIIColors.green(f"New server {self.server_key} started on port {self.port}.")
447
+ return True
448
+ else: # Should have been caught by new_server.start() raising an error
449
+ ASCIIColors.error(f"New server {self.server_key} failed to become healthy (this state should be rare).")
450
+ self._release_server_instance() # Clean up registry if something went very wrong
451
+ return False
452
+ except Exception as e:
453
+ ASCIIColors.error(f"Failed to load model '{model_name_or_path}' and start server: {e}")
454
+ trace_exception(e)
455
+ self._release_server_instance() # Ensure cleanup if start failed
456
+ return False
515
457
 
516
- try:
517
- self.server_process = LlamaCppServerProcess(
518
- model_path=str(self.current_model_path),
519
- clip_model_path = str(self.clip_model_path),
520
- server_binary_path=str(self.server_binary_path),
521
- port=self.port,
522
- server_args=current_server_args,
523
- )
524
- return self.server_process.is_healthy
525
- except Exception as e:
526
- ASCIIColors.error(f"Failed to load model '{model_name}' and start server: {e}")
527
- trace_exception(e)
528
- self.server_process = None
529
- self.current_model_path = None
530
- return False
531
458
 
532
459
  def unload_model(self):
533
460
  if self.server_process:
534
- self.server_process.stop()
535
- self.server_process = None
461
+ ASCIIColors.info(f"Unloading model for binding. Current server: {self.server_key}, port: {self.port}")
462
+ self._release_server_instance() # Handles ref counting and actual shutdown if needed
463
+ else:
464
+ ASCIIColors.info("Unload_model called, but no server process was active for this binding instance.")
536
465
  self.current_model_path = None
537
- self.port = None
538
- ASCIIColors.info("Llama.cpp server and model unloaded.")
539
-
466
+ self.clip_model_path = None # Also clear the instance's clip path idea
467
+ # self.port and self.server_key are cleared by _release_server_instance
468
+
540
469
  def _get_request_url(self, endpoint: str) -> str:
541
470
  if not self.server_process or not self.server_process.is_healthy:
542
471
  raise ConnectionError("Llama.cpp server is not running or not healthy.")
543
472
  return f"{self.server_process.base_url}{endpoint}"
544
473
 
545
- def _prepare_generation_payload(self,
546
- prompt: str,
547
- system_prompt: str = "",
548
- n_predict: Optional[int] = None,
549
- temperature: float = 0.7,
550
- top_k: int = 40,
551
- top_p: float = 0.9,
552
- repeat_penalty: float = 1.1,
553
- repeat_last_n: Optional[int] = 64, # Server calls this repeat_last_n or penalty_last_n
554
- seed: Optional[int] = None,
555
- stream: bool = False,
556
- use_chat_format: bool = True, # True for /v1/chat/completions, False for /completion
557
- images: Optional[List[str]] = None,
558
- **extra_params # For things like grammar, mirostat, etc from server_args
559
- ) -> Dict:
560
-
561
- # Start with defaults from server_args, then override with call params
474
+ def _prepare_generation_payload(self, prompt: str, system_prompt: str = "", n_predict: Optional[int] = None,
475
+ temperature: float = 0.7, top_k: int = 40, top_p: float = 0.9,
476
+ repeat_penalty: float = 1.1, repeat_last_n: Optional[int] = 64,
477
+ seed: Optional[int] = None, stream: bool = False, use_chat_format: bool = True,
478
+ images: Optional[List[str]] = None, **extra_params) -> Dict:
562
479
  payload_params = {
563
- "temperature": self.server_args.get("temperature", 0.7),
564
- "top_k": self.server_args.get("top_k", 40),
565
- "top_p": self.server_args.get("top_p", 0.9),
566
- "repeat_penalty": self.server_args.get("repeat_penalty", 1.1),
567
- "repeat_last_n": self.server_args.get("repeat_last_n", 64),
568
- "mirostat": self.server_args.get("mirostat_mode", 0), # llama.cpp server uses mirostat (0=disabled, 1=v1, 2=v2)
569
- "mirostat_tau": self.server_args.get("mirostat_tau", 5.0),
570
- "mirostat_eta": self.server_args.get("mirostat_eta", 0.1),
571
- # Add other mappable params from self.server_args like min_p, typical_p, grammar etc.
480
+ "temperature": self.server_args.get("temperature", 0.7), "top_k": self.server_args.get("top_k", 40),
481
+ "top_p": self.server_args.get("top_p", 0.9), "repeat_penalty": self.server_args.get("repeat_penalty", 1.1),
482
+ "repeat_last_n": self.server_args.get("repeat_last_n", 64), "mirostat": self.server_args.get("mirostat_mode", 0),
483
+ "mirostat_tau": self.server_args.get("mirostat_tau", 5.0), "mirostat_eta": self.server_args.get("mirostat_eta", 0.1),
572
484
  }
573
- if "grammar_string" in self.server_args and self.server_args["grammar_string"]: # From config
485
+ if "grammar_string" in self.server_args and self.server_args["grammar_string"]:
574
486
  payload_params["grammar"] = self.server_args["grammar_string"]
575
487
 
576
- # Override with specific call parameters
577
- payload_params.update({
578
- "temperature": temperature, "top_k": top_k, "top_p": top_p,
579
- "repeat_penalty": repeat_penalty, "repeat_last_n": repeat_last_n,
580
- })
581
- if n_predict is not None: payload_params['n_predict'] = n_predict # Server uses n_predict
488
+ payload_params.update({"temperature": temperature, "top_k": top_k, "top_p": top_p, "repeat_penalty": repeat_penalty, "repeat_last_n": repeat_last_n})
489
+ if n_predict is not None: payload_params['n_predict'] = n_predict
582
490
  if seed is not None: payload_params['seed'] = seed
583
-
584
- # Filter None values, as server might not like them
585
491
  payload_params = {k: v for k, v in payload_params.items() if v is not None}
586
- payload_params.update(extra_params) # Add any other specific params for this call
492
+ payload_params.update(extra_params)
587
493
 
588
494
  if use_chat_format and self.default_completion_format == ELF_COMPLETION_FORMAT.Chat:
589
- # Use /v1/chat/completions format
590
495
  messages = []
591
- if system_prompt and system_prompt.strip():
592
- messages.append({"role": "system", "content": system_prompt})
593
-
496
+ if system_prompt and system_prompt.strip(): messages.append({"role": "system", "content": system_prompt})
594
497
  user_content: Union[str, List[Dict[str, Any]]] = prompt
595
- if images and self.clip_model_path: # Check if it's a LLaVA setup
498
+ if images and self.clip_model_path: # Use the binding's current clip_model_path
596
499
  image_parts = []
597
500
  for img_path in images:
598
501
  try:
599
- with open(img_path, "rb") as image_file:
600
- encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
601
- image_type = Path(img_path).suffix[1:].lower() or "png"
602
- if image_type == "jpg": image_type = "jpeg"
603
- # Llama.cpp server expects image data directly for LLaVA with /completion
604
- # For /v1/chat/completions, it expects OpenAI's format for multimodal
605
- image_parts.append({
606
- "type": "image_url",
607
- "image_url": {"url": f"data:image/{image_type};base64,{encoded_string}"}
608
- })
609
- except Exception as ex:
610
- trace_exception(ex)
502
+ with open(img_path, "rb") as image_file: encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
503
+ image_type = Path(img_path).suffix[1:].lower() or "png"; image_type = "jpeg" if image_type == "jpg" else image_type
504
+ image_parts.append({"type": "image_url", "image_url": {"url": f"data:image/{image_type};base64,{encoded_string}"}})
505
+ except Exception as ex: trace_exception(ex)
611
506
  user_content = [{"type": "text", "text": prompt}] + image_parts # type: ignore
612
-
613
507
  messages.append({"role": "user", "content": user_content})
614
-
615
508
  final_payload = {"messages": messages, "stream": stream, **payload_params}
616
- # n_predict is max_tokens for OpenAI API
617
- if 'n_predict' in final_payload:
618
- final_payload['max_tokens'] = final_payload.pop('n_predict')
619
-
509
+ if 'n_predict' in final_payload: final_payload['max_tokens'] = final_payload.pop('n_predict')
620
510
  return final_payload
621
511
  else:
622
- # Use /completion format (legacy or for raw text)
623
- # For LLaVA with /completion, images are typically passed in a special way in the prompt
624
- # or via an 'image_data' field if the server supports it.
625
- # The example class uses tokenized prompt for /completion.
626
- # For simplicity here, we'll send text prompt, server tokenizes.
627
- # Llama.cpp server's /completion often expects 'prompt' as string or tokens.
628
- # If images are involved with /completion, it needs specific handling.
629
- # Example: 'prompt': "USER: <image>\nWhat is this?\nASSISTANT:", 'image_data': [{'data': base64_image, 'id': 10}]
630
-
631
- full_prompt = prompt
632
- if system_prompt and system_prompt.strip():
633
- # Heuristic for instruct models, actual formatting depends on model/template
634
- full_prompt = f"{system_prompt}\n\nUSER: {prompt}\nASSISTANT:"
635
-
512
+ full_prompt = f"{system_prompt}\n\nUSER: {prompt}\nASSISTANT:" if system_prompt and system_prompt.strip() else prompt
636
513
  final_payload = {"prompt": full_prompt, "stream": stream, **payload_params}
637
-
638
- if images and self.server_args.get("clip_model_path"):
514
+ if images and self.clip_model_path: # Use binding's clip_model_path
639
515
  image_data_list = []
640
516
  for i, img_path in enumerate(images):
641
517
  try:
642
- with open(img_path, "rb") as image_file:
643
- encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
644
- image_data_list.append({"data": encoded_string, "id": i + 10}) # ID needs to be > 9 for llama.cpp server
645
- except Exception as e_img:
646
- ASCIIColors.error(f"Could not encode image {img_path} for /completion: {e_img}")
647
- if image_data_list:
648
- final_payload["image_data"] = image_data_list
649
- # The prompt needs to contain placeholder like USER: <image 1>\n<prompt>\nASSISTANT:
650
- # This part is tricky and model-dependent. For now, we assume user's prompt is already formatted.
651
- # Or, the server (if new enough) might handle it with chat_template even for /completion.
652
-
518
+ with open(img_path, "rb") as image_file: encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
519
+ image_data_list.append({"data": encoded_string, "id": i + 10})
520
+ except Exception as e_img: ASCIIColors.error(f"Could not encode image {img_path}: {e_img}")
521
+ if image_data_list: final_payload["image_data"] = image_data_list
653
522
  return final_payload
654
523
 
655
-
656
- def generate_text(self,
657
- prompt: str,
658
- images: Optional[List[str]] = None,
659
- system_prompt: str = "",
660
- n_predict: Optional[int] = None,
661
- stream: bool = False,
662
- temperature: float = None, # Use binding's default if None
663
- top_k: int = None,
664
- top_p: float = None,
665
- repeat_penalty: float = None,
666
- repeat_last_n: Optional[int] = None,
667
- seed: Optional[int] = None,
524
+ def generate_text(self, prompt: str, images: Optional[List[str]] = None, system_prompt: str = "",
525
+ n_predict: Optional[int] = None, stream: bool = False, temperature: float = None,
526
+ top_k: int = None, top_p: float = None, repeat_penalty: float = None,
527
+ repeat_last_n: Optional[int] = None, seed: Optional[int] = None,
668
528
  streaming_callback: Optional[Callable[[str, int], bool]] = None,
669
- use_chat_format_override: Optional[bool] = None, # Allow overriding binding's default format
670
- **generation_kwargs
671
- ) -> Union[str, Dict[str, any]]:
672
-
529
+ use_chat_format_override: Optional[bool] = None, **generation_kwargs) -> Union[str, Dict[str, any]]:
673
530
  if not self.server_process or not self.server_process.is_healthy:
674
531
  return {"status": False, "error": "Llama.cpp server is not running or not healthy."}
675
532
 
676
- _use_chat_format = use_chat_format_override if use_chat_format_override is not None \
677
- else (self.default_completion_format == ELF_COMPLETION_FORMAT.Chat)
678
-
533
+ _use_chat_format = use_chat_format_override if use_chat_format_override is not None else (self.default_completion_format == ELF_COMPLETION_FORMAT.Chat)
679
534
  payload = self._prepare_generation_payload(
680
535
  prompt=prompt, system_prompt=system_prompt, n_predict=n_predict,
681
536
  temperature=temperature if temperature is not None else self.server_args.get("temperature",0.7),
@@ -683,359 +538,331 @@ class LlamaCppServerBinding(LollmsLLMBinding):
683
538
  top_p=top_p if top_p is not None else self.server_args.get("top_p",0.9),
684
539
  repeat_penalty=repeat_penalty if repeat_penalty is not None else self.server_args.get("repeat_penalty",1.1),
685
540
  repeat_last_n=repeat_last_n if repeat_last_n is not None else self.server_args.get("repeat_last_n",64),
686
- seed=seed if seed is not None else self.server_args.get("seed", -1), # Use server's default seed if not provided
687
- stream=stream, use_chat_format=_use_chat_format, images=images,
688
- **generation_kwargs
541
+ seed=seed if seed is not None else self.server_args.get("seed", -1), stream=stream,
542
+ use_chat_format=_use_chat_format, images=images, **generation_kwargs
689
543
  )
690
-
691
544
  endpoint = "/v1/chat/completions" if _use_chat_format else "/completion"
692
545
  request_url = self._get_request_url(endpoint)
693
546
 
694
- # For debugging, print payload (excluding potentially large image data)
695
- debug_payload = {k:v for k,v in payload.items() if k not in ["image_data"]}
696
- if "messages" in debug_payload:
697
- debug_payload["messages"] = [{k:v for k,v in msg.items() if k !="content" or not isinstance(v,list) or not any("image_url" in part for part in v)} for msg in debug_payload["messages"]]
698
- ASCIIColors.debug(f"Request to {request_url} with payload: {json.dumps(debug_payload, indent=2)[:500]}...")
547
+ # Debug payload (simplified)
548
+ # debug_payload = {k:v for k,v in payload.items() if k not in ["image_data","messages"] or (k=="messages" and not any("image_url" in part for item in v for part in (item.get("content") if isinstance(item.get("content"),list) else [])))} # Complex filter for brevity
549
+ # ASCIIColors.debug(f"Request to {request_url} with payload (simplified): {json.dumps(debug_payload, indent=2)[:500]}...")
550
+
699
551
 
700
552
  full_response_text = ""
701
553
  try:
702
554
  response = self.server_process.session.post(request_url, json=payload, stream=stream, timeout=self.server_args.get("generation_timeout", 300))
703
555
  response.raise_for_status()
704
-
705
556
  if stream:
706
557
  for line in response.iter_lines():
707
558
  if not line: continue
708
559
  line_str = line.decode('utf-8').strip()
709
560
  if line_str.startswith('data: '): line_str = line_str[6:]
710
- if line_str == '[DONE]': break # OpenAI stream end
711
-
561
+ if line_str == '[DONE]': break
712
562
  try:
713
563
  chunk_data = json.loads(line_str)
714
- chunk_content = ""
715
- if _use_chat_format: # OpenAI /v1/chat/completions format
716
- delta = chunk_data.get('choices', [{}])[0].get('delta', {})
717
- chunk_content = delta.get('content', '')
718
- else: # /completion format
719
- chunk_content = chunk_data.get('content', '')
720
-
564
+ chunk_content = (chunk_data.get('choices', [{}])[0].get('delta', {}).get('content', '') if _use_chat_format
565
+ else chunk_data.get('content', ''))
721
566
  if chunk_content:
722
567
  full_response_text += chunk_content
723
568
  if streaming_callback and not streaming_callback(chunk_content, MSG_TYPE.MSG_TYPE_CHUNK):
724
- # If callback returns False, we should try to stop generation.
725
- # Llama.cpp server's /completion doesn't have a direct way to stop mid-stream via API.
726
- # Closing the connection might be the only way if server supports it.
727
- ASCIIColors.info("Streaming callback requested stop.")
728
- response.close() # Attempt to signal server by closing connection
729
- break
730
- if chunk_data.get('stop', False) or chunk_data.get('stopped_eos',False) or chunk_data.get('stopped_limit',False): # /completion specific stop flags
731
- break
732
- except json.JSONDecodeError:
733
- ASCIIColors.warning(f"Failed to decode JSON stream chunk: {line_str}")
734
- continue # Or handle error
569
+ ASCIIColors.info("Streaming callback requested stop."); response.close(); break
570
+ if chunk_data.get('stop', False) or chunk_data.get('stopped_eos',False) or chunk_data.get('stopped_limit',False): break
571
+ except json.JSONDecodeError: ASCIIColors.warning(f"Failed to decode JSON stream chunk: {line_str}"); continue
735
572
  return full_response_text
736
- else: # Not streaming
573
+ else:
737
574
  response_data = response.json()
738
- return response_data.get('choices', [{}])[0].get('message', {}).get('content', '')
739
-
575
+ return response_data.get('choices', [{}])[0].get('message', {}).get('content', '') if _use_chat_format \
576
+ else response_data.get('content','') # /completion has 'content' at top level for non-stream
740
577
  except requests.exceptions.RequestException as e:
741
578
  error_message = f"Llama.cpp server request error: {e}"
742
579
  if e.response is not None:
743
- try:
744
- error_details = e.response.json()
745
- error_message += f" - Details: {error_details.get('error', e.response.text)}"
746
- except json.JSONDecodeError:
747
- error_message += f" - Response: {e.response.text[:200]}"
580
+ try: error_details = e.response.json(); error_message += f" - Details: {error_details.get('error', e.response.text)}"
581
+ except json.JSONDecodeError: error_message += f" - Response: {e.response.text[:200]}"
748
582
  ASCIIColors.error(error_message)
749
583
  return {"status": False, "error": error_message, "details": str(e.response.text if e.response else "No response text")}
750
584
  except Exception as ex:
751
- error_message = f"Llama.cpp generation error: {str(ex)}"
752
- trace_exception(ex)
585
+ error_message = f"Llama.cpp generation error: {str(ex)}"; trace_exception(ex)
753
586
  return {"status": False, "error": error_message}
754
587
 
755
588
  def tokenize(self, text: str) -> List[int]:
756
- if not self.server_process or not self.server_process.is_healthy:
757
- raise ConnectionError("Llama.cpp server is not running.")
589
+ if not self.server_process or not self.server_process.is_healthy: raise ConnectionError("Server not running.")
758
590
  try:
759
591
  response = self.server_process.session.post(self._get_request_url("/tokenize"), json={"content": text})
760
- response.raise_for_status()
761
- return response.json().get("tokens", [])
762
- except Exception as e:
763
- ASCIIColors.error(f"Tokenization error: {e}"); trace_exception(e)
764
- return [] # Or raise
592
+ response.raise_for_status(); return response.json().get("tokens", [])
593
+ except Exception as e: ASCIIColors.error(f"Tokenization error: {e}"); trace_exception(e); return []
765
594
 
766
595
  def detokenize(self, tokens: List[int]) -> str:
767
- if not self.server_process or not self.server_process.is_healthy:
768
- raise ConnectionError("Llama.cpp server is not running.")
596
+ if not self.server_process or not self.server_process.is_healthy: raise ConnectionError("Server not running.")
769
597
  try:
770
598
  response = self.server_process.session.post(self._get_request_url("/detokenize"), json={"tokens": tokens})
771
- response.raise_for_status()
772
- return response.json().get("content", "")
773
- except Exception as e:
774
- ASCIIColors.error(f"Detokenization error: {e}"); trace_exception(e)
775
- return "" # Or raise
599
+ response.raise_for_status(); return response.json().get("content", "")
600
+ except Exception as e: ASCIIColors.error(f"Detokenization error: {e}"); trace_exception(e); return ""
776
601
 
777
- def count_tokens(self, text: str) -> int:
778
- return len(self.tokenize(text))
602
+ def count_tokens(self, text: str) -> int: return len(self.tokenize(text))
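
The tokenize/detokenize helpers above are thin wrappers over two server endpoints, so they can also be exercised directly. A minimal sketch, assuming a server already running on 127.0.0.1:8080; the "content" and "tokens" payload keys match the requests made above.

# Hedged sketch: call /tokenize and /detokenize directly (server address assumed).
import requests

base = "http://127.0.0.1:8080"
tokens = requests.post(f"{base}/tokenize", json={"content": "Hello, Llama.cpp server world!"}).json().get("tokens", [])
text_back = requests.post(f"{base}/detokenize", json={"tokens": tokens}).json().get("content", "")
print(len(tokens), repr(text_back))   # token count and the round-tripped text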
779
603
 
780
604
  def embed(self, text: str, **kwargs) -> List[float]:
781
- if not self.server_process or not self.server_process.is_healthy:
782
- raise Exception("Llama.cpp server is not running.")
783
- if not self.server_args.get("embedding"):
784
- raise Exception("Embedding support was not enabled in server_args (set 'embedding: true').")
785
-
605
+ if not self.server_process or not self.server_process.is_healthy: raise Exception("Server not running.")
606
+ if not self.server_args.get("embedding"): raise Exception("Embedding not enabled in server_args.")
786
607
  try:
787
- # llama.cpp server has /embedding endpoint (non-OpenAI) and /v1/embeddings (OpenAI-compatible)
788
- # Let's try /v1/embeddings first for compatibility
789
- payload = {"input": text}
790
- if "model" in kwargs: payload["model"] = kwargs["model"] # Can specify model if server handles multiple embedding models (unlikely for llama.cpp server)
791
-
792
- request_url = self._get_request_url("/v1/embeddings")
608
+ payload = {"input": text}; request_url = self._get_request_url("/v1/embeddings")
793
609
  response = self.server_process.session.post(request_url, json=payload)
794
-
795
- if response.status_code == 404: # Fallback to /embedding if /v1/embeddings not found
796
- ASCIIColors.debug("Trying /embedding endpoint as /v1/embeddings was not found.")
610
+ if response.status_code == 404: # Fall back to /embedding if /v1/embeddings is unavailable
797
611
  request_url = self._get_request_url("/embedding")
798
- response = self.server_process.session.post(request_url, json={"content": text}) # /embedding uses "content"
799
-
800
- response.raise_for_status()
801
- data = response.json()
802
-
803
- if "data" in data and isinstance(data["data"], list) and "embedding" in data["data"][0]: # /v1/embeddings format
804
- return data["data"][0]["embedding"]
805
- elif "embedding" in data and isinstance(data["embedding"], list): # /embedding format
806
- return data["embedding"]
807
- else:
808
- raise ValueError(f"Unexpected embedding response format: {data}")
809
-
612
+ response = self.server_process.session.post(request_url, json={"content": text})
613
+ response.raise_for_status(); data = response.json()
614
+ if "data" in data and isinstance(data["data"], list) and "embedding" in data["data"][0]: return data["data"][0]["embedding"]
615
+ elif "embedding" in data and isinstance(data["embedding"], list): return data["embedding"]
616
+ else: raise ValueError(f"Unexpected embedding response: {data}")
810
617
  except requests.exceptions.RequestException as e:
811
- err_msg = f"Llama.cpp server embedding request error: {e}"
618
+ err_msg = f"Embedding request error: {e}";
812
619
  if e.response: err_msg += f" - {e.response.text[:200]}"
813
620
  raise Exception(err_msg) from e
814
- except Exception as ex:
815
- trace_exception(ex); raise Exception(f"Llama.cpp embedding failed: {str(ex)}") from ex
621
+ except Exception as ex: trace_exception(ex); raise Exception(f"Embedding failed: {str(ex)}") from ex
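
The 404 fallback above can be reproduced standalone. A minimal sketch, assuming the server was started with embeddings enabled and listens on a known port; the two response shapes checked here are the same ones handled by the method above.

# Hedged sketch: try /v1/embeddings first, fall back to /embedding on 404.
import requests

def embed_text(text: str, base: str = "http://127.0.0.1:8080") -> list:
    resp = requests.post(f"{base}/v1/embeddings", json={"input": text})
    if resp.status_code == 404:                          # server only exposes /embedding
        resp = requests.post(f"{base}/embedding", json={"content": text})
    resp.raise_for_status()
    data = resp.json()
    if isinstance(data.get("data"), list) and data["data"] and "embedding" in data["data"][0]:
        return data["data"][0]["embedding"]              # OpenAI-compatible shape
    if isinstance(data.get("embedding"), list):
        return data["embedding"]                         # native /embedding shape
    raise ValueError(f"Unexpected embedding response: {data}")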
816
622
 
817
623
  def get_model_info(self) -> dict:
818
624
  info = {
819
625
  "name": self.binding_name,
820
- "model_name": self.model_name, # User-provided name
626
+ "user_provided_model_name": self.user_provided_model_name,
821
627
  "model_path": str(self.current_model_path) if self.current_model_path else "Not loaded",
628
+ "clip_model_path": str(self.clip_model_path) if self.clip_model_path else "N/A",
822
629
  "loaded": self.server_process is not None and self.server_process.is_healthy,
823
- "server_args": self.server_args,
824
- "port": self.port if self.port else "N/A"
630
+ "server_args": self.server_args, "port": self.port if self.port else "N/A",
631
+ "server_key": str(self.server_key) if self.server_key else "N/A",
825
632
  }
826
- if info["loaded"]:
827
- # Try to get more info from server's /props or /v1/models
633
+ if info["loaded"] and self.server_process:
828
634
  try:
829
- props_url = self._get_request_url("/props") # llama.cpp specific
830
- props_resp = self.server_process.session.get(props_url, timeout=5).json()
635
+ props_resp = self.server_process.session.get(self._get_request_url("/props"), timeout=5).json()
831
636
  info.update({
832
- "server_n_ctx": props_resp.get("default_generation_settings",{}).get("n_ctx"), # Example path
637
+ "server_n_ctx": props_resp.get("default_generation_settings",{}).get("n_ctx"),
833
638
  "server_chat_format": props_resp.get("chat_format"),
834
- "server_clip_model": props_resp.get("mmproj"),
639
+ "server_clip_model_from_props": props_resp.get("mmproj"), # Server's view of clip model
835
640
  })
836
- except Exception: pass # Ignore if /props fails or data missing
837
-
838
- is_llava = ("llava" in self.model_name.lower() or "bakllava" in self.model_name.lower()) or \
839
- (self.server_args.get("clip_model_path") is not None) or \
840
- (info.get("server_clip_model") is not None)
641
+ except Exception: pass
841
642
 
643
+ is_llava = self.clip_model_path is not None or \
644
+ (info.get("server_clip_model_from_props") is not None) or \
645
+ ("llava" in self.current_model_path.name.lower() if self.current_model_path else False)
842
646
  info["supports_vision"] = is_llava
843
647
  info["supports_structured_output"] = self.server_args.get("grammar_string") is not None
844
648
  return info
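
The /props call used in get_model_info is llama.cpp-specific and its keys vary between server versions; the paths below are taken from the code above and should be treated as assumptions for other builds. A minimal sketch:

# Hedged sketch: read server properties the same way get_model_info does (address assumed).
import requests

props = requests.get("http://127.0.0.1:8080/props", timeout=5).json()
n_ctx = props.get("default_generation_settings", {}).get("n_ctx")
chat_format = props.get("chat_format")
mmproj = props.get("mmproj")    # set when a multimodal projector (CLIP) is loaded
print(n_ctx, chat_format, mmproj)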
845
649
 
846
650
  def listModels(self) -> List[Dict[str, str]]:
847
- # This binding manages one GGUF model at a time by starting a server for it.
848
- # To "list models", we could scan the Lollms model directories for .gguf files.
849
651
  models_found = []
850
- gguf_pattern = "*.gguf"
851
-
852
- search_paths = []
853
- binding_specific_folder_name = self.binding_name
854
-
855
- search_paths.append(self.models_path)
856
-
857
652
  unique_models = set()
858
- for spath in search_paths:
859
- if spath.exists() and spath.is_dir():
860
- for model_file in spath.rglob(gguf_pattern): # rglob for recursive
861
- if model_file.is_file() and model_file.name not in unique_models:
862
- models_found.append({
863
- 'model_name': model_file.name,
864
- # Path relative to one of the main model roots for display/selection
865
- 'path_hint': str(model_file.relative_to(spath.parent) if model_file.is_relative_to(spath.parent) else model_file),
866
- 'size_gb': f"{model_file.stat().st_size / (1024**3):.2f} GB"
867
- })
868
- unique_models.add(model_file.name)
653
+ if self.models_path.exists() and self.models_path.is_dir():
654
+ for model_file in self.models_path.rglob("*.gguf"):
655
+ if model_file.is_file() and model_file.name not in unique_models:
656
+ models_found.append({
657
+ 'model_name': model_file.name,
658
+ 'path_hint': str(model_file.relative_to(self.models_path.parent) if model_file.is_relative_to(self.models_path.parent) else model_file),
659
+ 'size_gb': f"{model_file.stat().st_size / (1024**3):.2f} GB"
660
+ })
661
+ unique_models.add(model_file.name)
869
662
  return models_found
870
663
 
871
664
  def __del__(self):
872
- self.unload_model() # Ensure server is stopped when binding is deleted
665
+ self.unload_model()
873
666
 
874
667
 
875
668
  if __name__ == '__main__':
876
- global full_streamed_text
669
+ global full_streamed_text # Define for the callback
670
+ full_streamed_text = ""
877
671
  ASCIIColors.yellow("Testing LlamaCppServerBinding...")
878
672
 
879
673
  # --- Configuration ---
880
- # This should be the NAME of your GGUF model file. The binding will search for it.
881
- # e.g., "Mistral-7B-Instruct-v0.2-Q4_K_M.gguf"
882
- # Ensure this model is placed in one of the Lollms model directories.
883
- # For testing, you can put a small GGUF model in the same directory as this script
884
- # and set personal_models_path to "."
885
-
886
- # Adjust current_directory if your models are elsewhere for testing
887
- current_directory = Path(__file__).parent
888
- models_path = "E:\lollms\models\gguf\Mistral-Nemo-Instruct-2407-GGUF" #replace with your own model path
889
- model_name = "Mistral-Nemo-Instruct-2407-Q2_K.gguf"
890
-
891
- # Binding config (passed to server_args)
674
+ # This should be the NAME of your GGUF model file.
675
+ # Ensure this model is placed in your models_path directory.
676
+ # Example: models_path = "E:\\lollms\\models\\gguf" (Windows)
677
+ # model_name = "Mistral-Nemo-Instruct-2407-Q2_K.gguf"
678
+
679
+ # For CI/local testing without specific paths, you might download a tiny model
680
+ # or require user to set environment variables for these.
681
+ # For this example, replace with your actual paths/model.
682
+ try:
683
+ models_path_str = os.environ.get("LOLLMS_MODELS_PATH", str(Path(__file__).parent / "test_models"))
684
+ model_name_str = os.environ.get("LOLLMS_TEST_MODEL_GGUF", "tinyllama-1.1b-chat-v1.0.Q2_K.gguf") # A small model
685
+ llava_model_name_str = os.environ.get("LOLLMS_TEST_LLAVA_MODEL_GGUF", "llava-v1.5-7b.Q2_K.gguf") # Placeholder
686
+ llava_clip_name_str = os.environ.get("LOLLMS_TEST_LLAVA_CLIP", "mmproj-model2-q4_0.gguf") # Placeholder
687
+
688
+ models_path = Path(models_path_str)
689
+ models_path.mkdir(parents=True, exist_ok=True) # Ensure test_models dir exists
690
+
691
+ # Verify the model exists, or skip tests gracefully
692
+ test_model_path = models_path / model_name_str
693
+ if not test_model_path.exists():
694
+ ASCIIColors.warning(f"Test model {test_model_path} not found. Please place a GGUF model there or set LOLLMS_TEST_MODEL_GGUF and LOLLMS_MODELS_PATH env vars.")
695
+ ASCIIColors.warning("Some tests will be skipped.")
696
+ # sys.exit(1) # Or allow to continue with skips
697
+ primary_model_available = False
698
+ else:
699
+ primary_model_available = True
700
+
701
+ except Exception as e:
702
+ ASCIIColors.error(f"Error setting up test paths: {e}"); trace_exception(e)
703
+ sys.exit(1)
704
+
892
705
  binding_config = {
893
- "n_gpu_layers": 0, # Set to -1 or a number for GPU offload
894
- "n_ctx": 512, # Short context for testing
895
- "embedding": True, # Enable for embedding tests
896
- "verbose": False, # llama.cpp server verbose logs
897
- # "extra_cli_flags": ["--cont-batching"] # Example of extra flags
898
- "server_startup_timeout": 180 # Give more time for server to start, esp. with large models
706
+ "n_gpu_layers": 0, "n_ctx": 512, "embedding": True,
707
+ "verbose": False, "server_startup_timeout": 180, "parallel_slots": 2,
899
708
  }
900
709
 
901
- active_binding = None
710
+ active_binding1: Optional[LlamaCppServerBinding] = None
711
+ active_binding2: Optional[LlamaCppServerBinding] = None
712
+ active_binding_llava: Optional[LlamaCppServerBinding] = None
713
+
902
714
  try:
903
- ASCIIColors.cyan("\n--- Initializing LlamaCppServerBinding ---")
904
- active_binding = LlamaCppServerBinding(
905
- model_name=model_name,
906
- models_path=models_path,
907
- config=binding_config
908
- )
909
- if not active_binding.server_process or not active_binding.server_process.is_healthy:
910
- raise RuntimeError("Server process failed to start or become healthy.")
715
+ if primary_model_available:
716
+ ASCIIColors.cyan("\n--- Initializing First LlamaCppServerBinding Instance ---")
717
+ active_binding1 = LlamaCppServerBinding(
718
+ model_name=model_name_str, models_path=str(models_path), config=binding_config
719
+ )
720
+ if not active_binding1.server_process or not active_binding1.server_process.is_healthy:
721
+ raise RuntimeError("Server for binding1 failed to start or become healthy.")
722
+ ASCIIColors.green(f"Binding1 initialized. Server for '{active_binding1.current_model_path.name}' running on port {active_binding1.port}.")
723
+ ASCIIColors.info(f"Binding1 Model Info: {json.dumps(active_binding1.get_model_info(), indent=2)}")
724
+
725
+ ASCIIColors.cyan("\n--- Initializing Second LlamaCppServerBinding Instance (Same Model) ---")
726
+ active_binding2 = LlamaCppServerBinding(
727
+ model_name=model_name_str, models_path=str(models_path), config=binding_config # Same model and config
728
+ )
729
+ if not active_binding2.server_process or not active_binding2.server_process.is_healthy:
730
+ raise RuntimeError("Server for binding2 failed to start or become healthy (should reuse).")
731
+ ASCIIColors.green(f"Binding2 initialized. Server for '{active_binding2.current_model_path.name}' running on port {active_binding2.port}.")
732
+ ASCIIColors.info(f"Binding2 Model Info: {json.dumps(active_binding2.get_model_info(), indent=2)}")
911
733
 
912
- ASCIIColors.green(f"Binding initialized. Server for '{active_binding.model_name}' running on port {active_binding.port}.")
913
- ASCIIColors.info(f"Model Info: {json.dumps(active_binding.get_model_info(), indent=2)}")
734
+ if active_binding1.port != active_binding2.port:
735
+ ASCIIColors.error("ERROR: Bindings for the same model are using different ports! Server sharing failed.")
736
+ else:
737
+ ASCIIColors.green("SUCCESS: Both bindings use the same server port. Server sharing appears to work.")
738
+
739
+ # --- List Models (scans configured directories) ---
740
+ ASCIIColors.cyan("\n--- Listing Models (from search paths, using binding1) ---")
741
+ listed_models = active_binding1.listModels()
742
+ if listed_models: ASCIIColors.green(f"Found {len(listed_models)} GGUF files. First 5: {listed_models[:5]}")
743
+ else: ASCIIColors.warning("No GGUF models found in search paths.")
744
+
745
+ # --- Tokenize/Detokenize ---
746
+ ASCIIColors.cyan("\n--- Tokenize/Detokenize (using binding1) ---")
747
+ sample_text = "Hello, Llama.cpp server world!"
748
+ tokens = active_binding1.tokenize(sample_text)
749
+ ASCIIColors.green(f"Tokens for '{sample_text}': {tokens[:10]}...")
750
+ if tokens:
751
+ detokenized_text = active_binding1.detokenize(tokens)
752
+ ASCIIColors.green(f"Detokenized text: {detokenized_text}")
753
+ else: ASCIIColors.warning("Tokenization returned empty list.")
754
+
755
+ # --- Text Generation (Non-Streaming, Chat API, binding1) ---
756
+ ASCIIColors.cyan("\n--- Text Generation (Non-Streaming, Chat API, binding1) ---")
757
+ prompt_text = "What is the capital of Germany?"
758
+ generated_text = active_binding1.generate_text(prompt_text, system_prompt="Concise expert.", n_predict=20, stream=False, use_chat_format_override=True)
759
+ if isinstance(generated_text, str): ASCIIColors.green(f"Generated text (binding1): {generated_text}")
760
+ else: ASCIIColors.error(f"Generation failed (binding1): {generated_text}")
761
+
762
+ # --- Text Generation (Streaming, Completion API, binding2) ---
763
+ ASCIIColors.cyan("\n--- Text Generation (Streaming, Completion API, binding2) ---")
764
+ full_streamed_text = "" # Reset global
765
+ def stream_callback(chunk: str, msg_type: int): global full_streamed_text; ASCIIColors.green(f"{chunk}", end="", flush=True); full_streamed_text += chunk; return True
766
+
767
+ result_b2 = active_binding2.generate_text(prompt_text, system_prompt="Concise expert.", n_predict=30, stream=True, streaming_callback=stream_callback, use_chat_format_override=False)
768
+ print("\n--- End of Stream (binding2) ---")
769
+ if isinstance(result_b2, str): ASCIIColors.green(f"Full streamed text (binding2): {result_b2}")
770
+ else: ASCIIColors.error(f"Streaming generation failed (binding2): {result_b2}")
771
+
772
+ # --- Embeddings (binding1) ---
773
+ if binding_config.get("embedding"):
774
+ ASCIIColors.cyan("\n--- Embeddings (binding1) ---")
775
+ try:
776
+ embedding_vector = active_binding1.embed("Test embedding.")
777
+ ASCIIColors.green(f"Embedding (first 3 dims): {embedding_vector[:3]}... Dim: {len(embedding_vector)}")
778
+ except Exception as e_emb: ASCIIColors.warning(f"Could not get embedding: {e_emb}")
779
+ else: ASCIIColors.yellow("\n--- Embeddings Skipped (embedding: false) ---")
780
+
781
+ else: # primary_model_available is False
782
+ ASCIIColors.warning("Primary test model not available. Skipping most tests.")
914
783
 
915
-
916
- # --- List Models (scans configured directories) ---
917
- ASCIIColors.cyan("\n--- Listing Models (from search paths) ---")
918
- listed_models = active_binding.listModels()
919
- if listed_models:
920
- ASCIIColors.green(f"Found {len(listed_models)} GGUF files. First 5:")
921
- for m in listed_models[:5]: print(m)
922
- else: ASCIIColors.warning("No GGUF models found in search paths.")
923
-
924
- # --- Tokenize/Detokenize ---
925
- ASCIIColors.cyan("\n--- Tokenize/Detokenize ---")
926
- sample_text = "Hello, Llama.cpp server world!"
927
- tokens = active_binding.tokenize(sample_text)
928
- ASCIIColors.green(f"Tokens for '{sample_text}': {tokens[:10]}...")
929
- token_count = active_binding.count_tokens(sample_text)
930
- ASCIIColors.green(f"Token count: {token_count}")
931
- if tokens: # Only detokenize if tokenization worked
932
- detokenized_text = active_binding.detokenize(tokens)
933
- ASCIIColors.green(f"Detokenized text: {detokenized_text}")
934
- # Note: exact match might depend on BOS/EOS handling by server's tokenizer
935
- # assert detokenized_text.strip() == sample_text.strip(), "Tokenization/Detokenization mismatch!"
936
- else: ASCIIColors.warning("Tokenization returned empty list, skipping detokenization.")
937
-
938
- # --- Text Generation (Non-Streaming, Chat Format using /v1/chat/completions) ---
939
- ASCIIColors.cyan("\n--- Text Generation (Non-Streaming, Chat API) ---")
940
- prompt_text = "What is the capital of Germany?"
941
- system_prompt_text = "You are a concise geography expert."
942
- generated_text = active_binding.generate_text(
943
- prompt_text, system_prompt=system_prompt_text, n_predict=20, stream=False,
944
- use_chat_format_override=True # Force /v1/chat/completions
945
- )
946
- if isinstance(generated_text, str): ASCIIColors.green(f"Generated text: {generated_text}")
947
- else: ASCIIColors.error(f"Generation failed: {generated_text}")
948
-
949
- # --- Text Generation (Streaming, /completion API) ---
950
- ASCIIColors.cyan("\n--- Text Generation (Streaming, Completion API) ---")
951
- full_streamed_text = ""
952
- def stream_callback(chunk: str, msg_type: int):
953
- global full_streamed_text; ASCIIColors.green(f"{chunk}", end="", flush=True)
954
- full_streamed_text += chunk; return True
955
-
956
- result = active_binding.generate_text(
957
- prompt_text, system_prompt=system_prompt_text, n_predict=30, stream=True,
958
- streaming_callback=stream_callback, use_chat_format_override=False # Force /completion
959
- )
960
- print("\n--- End of Stream ---")
961
- if isinstance(result, str): ASCIIColors.green(f"Full streamed text: {result}")
962
- else: ASCIIColors.error(f"Streaming generation failed: {result}")
963
-
964
- # --- Embeddings ---
965
- if binding_config.get("embedding"):
966
- ASCIIColors.cyan("\n--- Embeddings ---")
967
- embedding_text = "Test sentence for server-based embeddings."
968
- try:
969
- embedding_vector = active_binding.embed(embedding_text)
970
- ASCIIColors.green(f"Embedding for '{embedding_text}' (first 3 dims): {embedding_vector[:3]}...")
971
- ASCIIColors.info(f"Embedding vector dimension: {len(embedding_vector)}")
972
- except Exception as e_emb: ASCIIColors.warning(f"Could not get embedding: {e_emb}")
973
- else: ASCIIColors.yellow("\n--- Embeddings Skipped (embedding: false in config) ---")
974
784
 
975
785
  # --- LLaVA Test (Conceptual - requires a LLaVA model and mmproj) ---
976
- # To test LLaVA:
977
- models_path = "E:\drumber" #replace with your own model path
978
- model_name = "llava-v1.6-mistral-7b.Q3_K_XS.gguf"
979
- model_path = Path(models_path)/model_name
980
- ASCIIColors.cyan("\n--- LLaVA Vision Test ---")
981
- dummy_image_path = Path("E:\\drumber\\drumber.png")
982
- try:
983
- from PIL import Image, ImageDraw
984
- img = Image.new('RGB', (150, 70), color = ('magenta'))
985
- d = ImageDraw.Draw(img); d.text((10,10), "Server LLaVA", fill=('white'))
986
- img.save(dummy_image_path)
987
- ASCIIColors.info(f"Created dummy image for LLaVA: {dummy_image_path}")
988
-
989
- llava_prompt = "Describe this image."
990
- # For /v1/chat/completions with LLaVA, images are passed in messages.
991
- # For /completion with LLaVA, prompt needs <image> placeholder and image_data field.
992
- llava_response = active_binding.generate_text(
993
- prompt=llava_prompt, images=[str(dummy_image_path)], n_predict=40, stream=False,
994
- use_chat_format_override=True # Use /v1/chat/completions for easier multimodal
995
- )
996
- if isinstance(llava_response, str): ASCIIColors.green(f"LLaVA response: {llava_response}")
997
- else: ASCIIColors.error(f"LLaVA generation failed: {llava_response}")
998
- except ImportError: ASCIIColors.warning("Pillow not found. Cannot create dummy image for LLaVA.")
999
- except Exception as e_llava: ASCIIColors.error(f"LLaVA test error: {e_llava}"); trace_exception(e_llava)
1000
- finally:
1001
- if dummy_image_path.exists(): dummy_image_path.unlink()
1002
-
1003
- # --- Test changing model ---
1004
- # This part is conceptual. You'd need another GGUF model file for a real test.
1005
- # For now, we'll just call load_model with the same model to test the logic.
1006
-
1007
- ASCIIColors.cyan("\n--- Testing Model Change (reloading same model) ---")
1008
- reload_success = active_binding.load_model(str(model_path))
1009
- if reload_success and active_binding.server_process and active_binding.server_process.is_healthy:
1010
- ASCIIColors.green(f"Model reloaded/re-confirmed successfully. Server on port {active_binding.port}.")
1011
- # Quick generation test after reload
1012
- reloaded_gen = active_binding.generate_text("Ping", n_predict=5, stream=False)
1013
- if isinstance(reloaded_gen, str): ASCIIColors.green(f"Post-reload ping response: {reloaded_gen.strip()}")
1014
- else: ASCIIColors.error(f"Post-reload generation failed: {reloaded_gen}")
1015
- else:
1016
- ASCIIColors.error("Failed to reload model or server not healthy after reload attempt.")
786
+ ASCIIColors.cyan("\n--- LLaVA Vision Test (if model available) ---")
787
+ llava_model_path = models_path / llava_model_name_str
788
+ llava_clip_path_actual = models_path / llava_clip_name_str # Assuming clip is in models_path too
1017
789
 
790
+ if llava_model_path.exists() and llava_clip_path_actual.exists():
791
+ dummy_image_path = models_path / "dummy_llava_image.png"
792
+ try:
793
+ from PIL import Image, ImageDraw
794
+ img = Image.new('RGB', (150, 70), color = ('magenta')); d = ImageDraw.Draw(img); d.text((10,10), "LLaVA Test", fill=('white')); img.save(dummy_image_path)
795
+ ASCIIColors.info(f"Created dummy image for LLaVA: {dummy_image_path}")
1018
796
 
1019
- except ImportError as e_imp:
1020
- ASCIIColors.error(f"Import error: {e_imp}. Ensure llama-cpp-binaries is installed.")
1021
- except FileNotFoundError as e_fnf:
1022
- ASCIIColors.error(f"File not found error: {e_fnf}. Check model or server binary paths.")
1023
- except ConnectionError as e_conn:
1024
- ASCIIColors.error(f"Connection error (server might have failed to start or is unresponsive): {e_conn}")
1025
- except RuntimeError as e_rt:
1026
- ASCIIColors.error(f"Runtime error (often server process issue): {e_rt}")
1027
- if active_binding and active_binding.server_process:
1028
- ASCIIColors.error("Last stderr lines from server:")
1029
- for line in active_binding.server_process._stderr_lines[-20:]: print(line) # Print last 20
1030
- except Exception as e_main:
1031
- ASCIIColors.error(f"An unexpected error occurred: {e_main}")
1032
- trace_exception(e_main)
1033
- finally:
1034
- if active_binding:
1035
- ASCIIColors.cyan("\n--- Unloading Model and Stopping Server ---")
1036
- active_binding.unload_model()
1037
- ASCIIColors.green("Server stopped and model unloaded.")
797
+ llava_binding_config = binding_config.copy()
798
+ # LLaVA might need a specific chat template if the server does not auto-detect it well.
799
+ # llava_binding_config["chat_template"] = "llava-1.5"
1038
800
 
801
+ active_binding_llava = LlamaCppServerBinding(
802
+ model_name=str(llava_model_path), # Pass full path for clarity in test
803
+ models_path=str(models_path),
804
+ clip_model_name=str(llava_clip_path_actual), # Pass full path for clip
805
+ config=llava_binding_config
806
+ )
807
+ if not active_binding_llava.server_process or not active_binding_llava.server_process.is_healthy:
808
+ raise RuntimeError("LLaVA server failed to start or become healthy.")
809
+ ASCIIColors.green(f"LLaVA Binding initialized. Server for '{active_binding_llava.current_model_path.name}' running on port {active_binding_llava.port}.")
810
+ ASCIIColors.info(f"LLaVA Binding Model Info: {json.dumps(active_binding_llava.get_model_info(), indent=2)}")
811
+
812
+
813
+ llava_prompt = "Describe this image."
814
+ llava_response = active_binding_llava.generate_text(
815
+ prompt=llava_prompt, images=[str(dummy_image_path)], n_predict=40, stream=False, use_chat_format_override=True
816
+ )
817
+ if isinstance(llava_response, str): ASCIIColors.green(f"LLaVA response: {llava_response}")
818
+ else: ASCIIColors.error(f"LLaVA generation failed: {llava_response}")
819
+
820
+ except ImportError: ASCIIColors.warning("Pillow not found. Cannot create dummy image for LLaVA.")
821
+ except Exception as e_llava: ASCIIColors.error(f"LLaVA test error: {e_llava}"); trace_exception(e_llava)
822
+ finally:
823
+ if dummy_image_path.exists(): dummy_image_path.unlink()
824
+ else:
825
+ ASCIIColors.warning(f"LLaVA model '{llava_model_path.name}' or clip model '{llava_clip_path_actual.name}' not found in '{models_path}'. Skipping LLaVA test.")
826
+
827
+ if primary_model_available and active_binding1:
828
+ # --- Test changing model (using binding1 to load a different or the same model) ---
829
+ ASCIIColors.cyan("\n--- Testing Model Change (binding1 reloads its model) ---")
830
+ # For a real change, use a different model name if available. Here, we reload the same model.
831
+ reload_success = active_binding1.load_model(model_name_str) # Reload original model
832
+ if reload_success and active_binding1.server_process and active_binding1.server_process.is_healthy:
833
+ ASCIIColors.green(f"Model reloaded/re-confirmed successfully by binding1. Server on port {active_binding1.port}.")
834
+ reloaded_gen = active_binding1.generate_text("Ping", n_predict=5, stream=False)
835
+ if isinstance(reloaded_gen, str): ASCIIColors.green(f"Post-reload ping (binding1): {reloaded_gen.strip()}")
836
+ else: ASCIIColors.error(f"Post-reload generation failed (binding1): {reloaded_gen}")
837
+ else:
838
+ ASCIIColors.error("Failed to reload model or server not healthy after reload attempt by binding1.")
1039
839
 
840
+ except ImportError as e_imp: ASCIIColors.error(f"Import error: {e_imp}.")
841
+ except FileNotFoundError as e_fnf: ASCIIColors.error(f"File not found error: {e_fnf}.")
842
+ except ConnectionError as e_conn: ASCIIColors.error(f"Connection error: {e_conn}")
843
+ except RuntimeError as e_rt:
844
+ ASCIIColors.error(f"Runtime error: {e_rt}")
845
+ if active_binding1 and active_binding1.server_process: ASCIIColors.error(f"Binding1 stderr:\n{active_binding1.server_process._stderr_lines[-20:]}")
846
+ if active_binding2 and active_binding2.server_process: ASCIIColors.error(f"Binding2 stderr:\n{active_binding2.server_process._stderr_lines[-20:]}")
847
+ if active_binding_llava and active_binding_llava.server_process: ASCIIColors.error(f"LLaVA Binding stderr:\n{active_binding_llava.server_process._stderr_lines[-20:]}")
848
+ except Exception as e_main: ASCIIColors.error(f"An unexpected error occurred: {e_main}"); trace_exception(e_main)
849
+ finally:
850
+ ASCIIColors.cyan("\n--- Unloading Models and Stopping Servers ---")
851
+ if active_binding1: active_binding1.unload_model(); ASCIIColors.info("Binding1 unloaded.")
852
+ if active_binding2: active_binding2.unload_model(); ASCIIColors.info("Binding2 unloaded.")
853
+ if active_binding_llava: active_binding_llava.unload_model(); ASCIIColors.info("LLaVA Binding unloaded.")
854
+
855
+ # Check if any servers remain (should be none if all bindings unloaded)
856
+ with _server_registry_lock:
857
+ if _active_servers:
858
+ ASCIIColors.warning(f"Warning: {_active_servers.keys()} servers still in registry after all known bindings unloaded.")
859
+ for key, server_proc in list(_active_servers.items()): # list() for safe iteration if modifying
860
+ ASCIIColors.info(f"Force shutting down stray server: {key}")
861
+ try: server_proc.shutdown()
862
+ except Exception as e_shutdown: ASCIIColors.error(f"Error shutting down stray server {key}: {e_shutdown}")
863
+ _active_servers.pop(key,None)
864
+ _server_ref_counts.pop(key,None)
865
+ else:
866
+ ASCIIColors.green("All servers shut down correctly.")
1040
867
 
1041
868
  ASCIIColors.yellow("\nLlamaCppServerBinding test finished.")