lollms-client 0.32.1__py3-none-any.whl → 0.33.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lollms-client might be problematic.

@@ -119,8 +119,6 @@ _server_registry_lock = threading.Lock()

  BindingName = "LlamaCppServerBinding"
  DEFAULT_LLAMACPP_SERVER_HOST = "127.0.0.1"
- # Port is now dynamic, this constant is less critical for direct use but good for reference.
- # DEFAULT_LLAMACPP_SERVER_PORT = 9641

  class LlamaCppServerProcess:
  def __init__(self, model_path: Union[str, Path], clip_model_path: Optional[Union[str, Path]] = None, server_binary_path: Optional[Union[str, Path]]=None, server_args: Dict[str, Any]={}):
@@ -266,54 +264,34 @@ class LlamaCppServerBinding(LollmsLLMBinding):
  "parallel_slots": 4, # Default parallel slots for server
  }

- def __init__(self, model_name: str, models_path: str, clip_model_name: Optional[str] = None,
+ def __init__(self, model_name: Optional[str], models_path: str, clip_model_name: Optional[str] = None,
  config: Optional[Dict[str, Any]] = None, default_completion_format: ELF_COMPLETION_FORMAT = ELF_COMPLETION_FORMAT.Chat, **kwargs):
  super().__init__(binding_name=BindingName)
  if llama_cpp_binaries is None: raise ImportError("llama-cpp-binaries package is required but not found.")

  self.models_path = Path(models_path)
- self.user_provided_model_name = model_name # Store the name/path user gave
+ # Store initial preferences, but do not load/start server yet.
+ self.initial_model_name_preference: Optional[str] = model_name
+ self.user_provided_model_name: Optional[str] = model_name # Tracks the latest requested model
+ self.initial_clip_model_name_preference: Optional[str] = clip_model_name
+
  self._model_path_map: Dict[str, Path] = {} # Maps unique name to full Path

- # Initial scan for available models
+ # Initial scan for available models (to populate listModels)
  self._scan_models()

- # Determine the model to load
- effective_model_to_load = model_name
- if not effective_model_to_load and self._model_path_map:
- # If no model was specified and we have models, pick the first one
- # Sorting ensures a deterministic choice
- first_model_name = sorted(self._model_path_map.keys())[0]
- effective_model_to_load = first_model_name
- ASCIIColors.info(f"No model was specified. Automatically selecting the first available model: '{effective_model_to_load}'")
- self.user_provided_model_name = effective_model_to_load # Update for get_model_info etc.
-
- # Initial hint for clip_model_path, resolved fully in load_model
- self.clip_model_path: Optional[Path] = None
- if clip_model_name:
- p_clip = Path(clip_model_name)
- if p_clip.is_absolute() and p_clip.exists():
- self.clip_model_path = p_clip
- elif (self.models_path / clip_model_name).exists(): # Relative to models_path
- self.clip_model_path = self.models_path / clip_model_name
- else:
- ASCIIColors.warning(f"Specified clip_model_name '{clip_model_name}' not found. Will rely on auto-detection if applicable.")
-
  self.default_completion_format = default_completion_format
  self.server_args = {**self.DEFAULT_SERVER_ARGS, **(config or {}), **kwargs}
  self.server_binary_path = self._get_server_binary_path()

- self.current_model_path: Optional[Path] = None # Actual resolved path of loaded model
+ # Current state of the loaded model and server
+ self.current_model_path: Optional[Path] = None
+ self.clip_model_path: Optional[Path] = None # Actual resolved path of loaded clip model
  self.server_process: Optional[LlamaCppServerProcess] = None
  self.port: Optional[int] = None
  self.server_key: Optional[tuple] = None

- # Now, attempt to load the selected model
- if effective_model_to_load:
- if not self.load_model(effective_model_to_load):
- ASCIIColors.error(f"Initial model load for '{effective_model_to_load}' failed. Binding may not be functional.")
- else:
- ASCIIColors.warning("No models found in the models path. The binding will be idle until a model is loaded.")
+ ASCIIColors.info("LlamaCppServerBinding initialized. Server will start on-demand with first generation call.")

  def _get_server_binary_path(self) -> Path:
  custom_path_str = self.server_args.get("llama_server_binary_path")
@@ -384,7 +362,6 @@ class LlamaCppServerBinding(LollmsLLMBinding):
  if server_to_stop:
  try: server_to_stop.shutdown()
  except Exception as e: ASCIIColors.error(f"Error shutting down server {self.server_key}: {e}")
- # else: ASCIIColors.warning(f"Attempted to stop server {self.server_key} but it was not in _active_servers.") # Can be noisy
  else:
  ASCIIColors.warning(f"Server key {self.server_key} not in ref counts during release. Might have been shut down already.")
  _active_servers.pop(self.server_key, None) # Ensure removal
@@ -392,7 +369,8 @@ class LlamaCppServerBinding(LollmsLLMBinding):
  self.server_process = None
  self.port = None
  self.server_key = None
-
+ self.current_model_path = None # Also clear this binding's model association
+ self.clip_model_path = None # And clip model association

  def load_model(self, model_name_or_path: str) -> bool:
  self.user_provided_model_name = model_name_or_path # Keep track of the selected model name
@@ -401,15 +379,23 @@ class LlamaCppServerBinding(LollmsLLMBinding):
  except Exception as ex:
  trace_exception(ex)
  return False
- # Determine the clip_model_path for this server instance
- # Priority: 1. Explicit `clip_model_path` from init (if exists) 2. Auto-detection
+
+ # Determine the final clip_model_path for this server instance
+ # Priority: 1. Explicit `initial_clip_model_name_preference` from __init__ (if valid path)
+ # 2. Auto-detection based on the resolved main model.
  final_clip_model_path: Optional[Path] = None
- if self.clip_model_path and self.clip_model_path.exists(): # From __init__
- final_clip_model_path = self.clip_model_path
- ASCIIColors.info(f"Using explicitly configured LLaVA clip model: {final_clip_model_path}")
- elif not self.clip_model_path or (self.clip_model_path and not self.clip_model_path.exists()): # if init path was bad or not given
- if self.clip_model_path and not self.clip_model_path.exists():
- ASCIIColors.warning(f"Initial clip model path '{self.clip_model_path}' not found. Attempting auto-detection.")
+ if self.initial_clip_model_name_preference:
+ p_clip_pref = Path(self.initial_clip_model_name_preference)
+ if p_clip_pref.is_absolute() and p_clip_pref.exists():
+ final_clip_model_path = p_clip_pref
+ ASCIIColors.info(f"Using explicitly configured LLaVA clip model: {final_clip_model_path}")
+ elif (self.models_path / self.initial_clip_model_name_preference).exists():
+ final_clip_model_path = self.models_path / self.initial_clip_model_name_preference
+ ASCIIColors.info(f"Using explicitly configured LLaVA clip model: {final_clip_model_path} (relative to models path)")
+ else:
+ ASCIIColors.warning(f"Specified initial clip_model_name '{self.initial_clip_model_name_preference}' not found. Attempting auto-detection.")
+
+ if not final_clip_model_path: # If no explicit path was provided or it was invalid, try auto-detection
  base_name = get_gguf_model_base_name(resolved_model_path.stem)
  potential_paths = [
  resolved_model_path.parent / f"{base_name}.mmproj",
@@ -427,9 +413,6 @@ class LlamaCppServerBinding(LollmsLLMBinding):
  final_clip_model_path_str = str(final_clip_model_path) if final_clip_model_path else None

  # Server key based on model and essential server configurations (like clip model)
- # More server_args could be added to the key if they necessitate separate server instances
- # For example, different n_gpu_layers might require a server restart.
- # For now, model and clip model are the main differentiators for distinct servers.
  new_server_key = (str(resolved_model_path), final_clip_model_path_str)

  with _server_registry_lock:
@@ -503,20 +486,46 @@ class LlamaCppServerBinding(LollmsLLMBinding):
  self._release_server_instance() # Ensure cleanup if start failed
  return False

-
  def unload_model(self):
  if self.server_process:
  ASCIIColors.info(f"Unloading model for binding. Current server: {self.server_key}, port: {self.port}")
  self._release_server_instance() # Handles ref counting and actual shutdown if needed
  else:
  ASCIIColors.info("Unload_model called, but no server process was active for this binding instance.")
- self.current_model_path = None
- self.clip_model_path = None # Also clear the instance's clip path idea
- # self.port and self.server_key are cleared by _release_server_instance
+
+ def _ensure_server_is_running(self) -> bool:
+ """
+ Checks if the server is healthy. If not, it attempts to load the configured model.
+ Returns True if the server is healthy and ready, False otherwise.
+ """
+ if self.server_process and self.server_process.is_healthy:
+ return True
+
+ ASCIIColors.info("Server is not running. Attempting to start on-demand...")
+
+ # Determine which model to load
+ model_to_load = self.user_provided_model_name or self.initial_model_name_preference
+
+ if not model_to_load:
+ # No model specified, try to find one automatically
+ self._scan_models()
+ available_models = self.listModels()
+ if not available_models:
+ ASCIIColors.error("No model specified and no GGUF models found in models path.")
+ return False
+
+ model_to_load = available_models[0]['name'] # Pick the first one
+ ASCIIColors.info(f"No model was specified. Automatically selecting the first available model: '{model_to_load}'")
+
+ # Now, attempt to load the selected model
+ if self.load_model(model_to_load):
+ return True
+ else:
+ ASCIIColors.error(f"Automatic model load for '{model_to_load}' failed.")
+ return False

  def _get_request_url(self, endpoint: str) -> str:
- if not self.server_process or not self.server_process.is_healthy:
- raise ConnectionError("Llama.cpp server is not running or not healthy.")
+ # This function now assumes _ensure_server_is_running has been called.
  return f"{self.server_process.base_url}{endpoint}"

  def _prepare_generation_payload(self, prompt: str, system_prompt: str = "", n_predict: Optional[int] = None,
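Note: the hunk above moves server startup out of __init__ and into _ensure_server_is_running, which is called lazily by the generation, tokenization, and embedding paths. The sketch below illustrates the new flow from a caller's perspective; the import path and the models directory are assumptions for illustration only, not part of this diff.

# Sketch of the on-demand startup behavior introduced above.
# Assumption: LlamaCppServerBinding is importable from the installed lollms-client
# package (adjust the import to your install); the models path is a placeholder.
from lollms_client.llm_bindings.llamacpp_server import LlamaCppServerBinding  # hypothetical import path

binding = LlamaCppServerBinding(
    model_name=None,                      # may now be None; a model is picked on first use
    models_path="/path/to/gguf/models",   # placeholder directory containing *.gguf files
)
# At this point no llama.cpp server process has been spawned; __init__ only records preferences.

# The first generation call runs _ensure_server_is_running(): it selects a model
# (the requested one, or the first GGUF found), starts or reuses a shared server,
# then issues the request.
result = binding.generate_text("Hello", n_predict=16, stream=False)
print(result if isinstance(result, str) else result.get("error"))

binding.unload_model()  # releases this binding's reference to the shared server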
@@ -584,48 +593,23 @@ class LlamaCppServerBinding(LollmsLLMBinding):
  system_prompt: str = "",
  n_predict: Optional[int] = None,
  stream: Optional[bool] = None,
- temperature: float = 0.7, # Ollama default is 0.8, common default 0.7
- top_k: int = 40, # Ollama default is 40
- top_p: float = 0.9, # Ollama default is 0.9
- repeat_penalty: float = 1.1, # Ollama default is 1.1
- repeat_last_n: int = 64, # Ollama default is 64
+ temperature: float = 0.7,
+ top_k: int = 40,
+ top_p: float = 0.9,
+ repeat_penalty: float = 1.1,
+ repeat_last_n: int = 64,
  seed: Optional[int] = None,
  n_threads: Optional[int] = None,
  ctx_size: int | None = None,
  streaming_callback: Optional[Callable[[str, MSG_TYPE], None]] = None,
- split:Optional[bool]=False, # put to true if the prompt is a discussion
+ split:Optional[bool]=False,
  user_keyword:Optional[str]="!@>user:",
  ai_keyword:Optional[str]="!@>assistant:",
  **generation_kwargs
  ) -> Union[str, dict]:
- """
- Generate text using the active LLM binding, using instance defaults if parameters are not provided.
-
- Args:
- prompt (str): The input prompt for text generation.
- images (Optional[List[str]]): List of image file paths for multimodal generation.
- n_predict (Optional[int]): Maximum number of tokens to generate. Uses instance default if None.
- stream (Optional[bool]): Whether to stream the output. Uses instance default if None.
- temperature (Optional[float]): Sampling temperature. Uses instance default if None.
- top_k (Optional[int]): Top-k sampling parameter. Uses instance default if None.
- top_p (Optional[float]): Top-p sampling parameter. Uses instance default if None.
- repeat_penalty (Optional[float]): Penalty for repeated tokens. Uses instance default if None.
- repeat_last_n (Optional[int]): Number of previous tokens to consider for repeat penalty. Uses instance default if None.
- seed (Optional[int]): Random seed for generation. Uses instance default if None.
- n_threads (Optional[int]): Number of threads to use. Uses instance default if None.
- ctx_size (int | None): Context size override for this generation.
- streaming_callback (Optional[Callable[[str, str], None]]): Callback function for streaming output.
- - First parameter (str): The chunk of text received.
- - Second parameter (str): The message type (e.g., MSG_TYPE.MSG_TYPE_CHUNK).
- split:Optional[bool]: put to true if the prompt is a discussion
- user_keyword:Optional[str]: when splitting we use this to extract user prompt
- ai_keyword:Optional[str]": when splitting we use this to extract ai prompt
-
- Returns:
- Union[str, dict]: Generated text or error dictionary if failed.
- """
- if not self.server_process or not self.server_process.is_healthy:
- return {"status": False, "error": "Llama.cpp server is not running or not healthy."}
+
+ if not self._ensure_server_is_running():
+ return {"status": False, "error": "Llama.cpp server could not be started. Please check model configuration and logs."}

  _use_chat_format = True
  payload = self._prepare_generation_payload(
@@ -642,11 +626,6 @@ class LlamaCppServerBinding(LollmsLLMBinding):
  endpoint = "/v1/chat/completions" if _use_chat_format else "/completion"
  request_url = self._get_request_url(endpoint)

- # Debug payload (simplified)
- # debug_payload = {k:v for k,v in payload.items() if k not in ["image_data","messages"] or (k=="messages" and not any("image_url" in part for item in v for part in (item.get("content") if isinstance(item.get("content"),list) else [])))} # Complex filter for brevity
- # ASCIIColors.debug(f"Request to {request_url} with payload (simplified): {json.dumps(debug_payload, indent=2)[:500]}...")
-
-
  full_response_text = ""
  try:
  response = self.server_process.session.post(request_url, json=payload, stream=stream, timeout=self.server_args.get("generation_timeout", 300))
@@ -699,45 +678,16 @@ class LlamaCppServerBinding(LollmsLLMBinding):
  streaming_callback: Optional[Callable[[str, MSG_TYPE], None]] = None,
  **generation_kwargs
  ) -> Union[str, dict]:
- """
- Conduct a chat session with the llama.cpp server using a LollmsDiscussion object.
-
- Args:
- discussion (LollmsDiscussion): The discussion object containing the conversation history.
- branch_tip_id (Optional[str]): The ID of the message to use as the tip of the conversation branch. Defaults to the active branch.
- n_predict (Optional[int]): Maximum number of tokens to generate.
- stream (Optional[bool]): Whether to stream the output.
- temperature (float): Sampling temperature.
- top_k (int): Top-k sampling parameter.
- top_p (float): Top-p sampling parameter.
- repeat_penalty (float): Penalty for repeated tokens.
- repeat_last_n (int): Number of previous tokens to consider for repeat penalty.
- seed (Optional[int]): Random seed for generation.
- streaming_callback (Optional[Callable[[str, MSG_TYPE], None]]): Callback for streaming output.
-
- Returns:
- Union[str, dict]: The generated text or an error dictionary.
- """
- if not self.server_process or not self.server_process.is_healthy:
- return {"status": "error", "message": "Llama.cpp server is not running or not healthy."}

- # 1. Export the discussion to the OpenAI chat format, which llama.cpp server understands.
- # This handles system prompts, user/assistant roles, and multi-modal content.
- messages = discussion.export("openai_chat", branch_tip_id)
+ if not self._ensure_server_is_running():
+ return {"status": "error", "message": "Llama.cpp server could not be started. Please check model configuration and logs."}

- # 2. Build the generation payload for the server
+ messages = discussion.export("openai_chat", branch_tip_id)
  payload = {
- "messages": messages,
- "max_tokens": n_predict,
- "temperature": temperature,
- "top_k": top_k,
- "top_p": top_p,
- "repeat_penalty": repeat_penalty,
- "seed": seed,
- "stream": stream,
- **generation_kwargs # Pass any extra parameters
+ "messages": messages, "max_tokens": n_predict, "temperature": temperature,
+ "top_k": top_k, "top_p": top_p, "repeat_penalty": repeat_penalty,
+ "seed": seed, "stream": stream, **generation_kwargs
  }
- # Remove None values, as the API expects them to be absent
  payload = {k: v for k, v in payload.items() if v is not None}

  endpoint = "/v1/chat/completions"
745
695
  full_response_text = ""
746
696
 
747
697
  try:
748
- # 3. Make the request to the server
749
698
  response = self.server_process.session.post(request_url, json=payload, stream=stream, timeout=self.server_args.get("generation_timeout", 300))
750
699
  response.raise_for_status()
751
700
 
@@ -788,14 +737,14 @@ class LlamaCppServerBinding(LollmsLLMBinding):
  return {"status": "error", "message": error_message}

  def tokenize(self, text: str) -> List[int]:
- if not self.server_process or not self.server_process.is_healthy: raise ConnectionError("Server not running.")
+ if not self._ensure_server_is_running(): return []
  try:
  response = self.server_process.session.post(self._get_request_url("/tokenize"), json={"content": text})
  response.raise_for_status(); return response.json().get("tokens", [])
  except Exception as e: ASCIIColors.error(f"Tokenization error: {e}"); trace_exception(e); return []

  def detokenize(self, tokens: List[int]) -> str:
- if not self.server_process or not self.server_process.is_healthy: raise ConnectionError("Server not running.")
+ if not self._ensure_server_is_running(): return ""
  try:
  response = self.server_process.session.post(self._get_request_url("/detokenize"), json={"tokens": tokens})
  response.raise_for_status(); return response.json().get("content", "")
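Note: tokenize and detokenize (and embed in the following hunks) now fail soft, returning an empty list or empty string instead of raising ConnectionError when the server cannot be started. A short defensive pattern for callers, assuming a binding instance constructed as in the earlier sketch:

# Callers should now treat empty results as "server unavailable or request failed"
# rather than relying on an exception being raised.
tokens = binding.tokenize("Hello, world!")
if not tokens:
    print("Tokenization unavailable; check the binding's model configuration and logs.")
else:
    round_tripped = binding.detokenize(tokens)
    print(f"{len(tokens)} tokens -> {round_tripped!r}")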
@@ -804,8 +753,9 @@ class LlamaCppServerBinding(LollmsLLMBinding):
  def count_tokens(self, text: str) -> int: return len(self.tokenize(text))

  def embed(self, text: str, **kwargs) -> List[float]:
- if not self.server_process or not self.server_process.is_healthy: raise Exception("Server not running.")
- if not self.server_args.get("embedding"): raise Exception("Embedding not enabled in server_args.")
+ if not self._ensure_server_is_running(): return []
+ if not self.server_args.get("embedding"):
+ ASCIIColors.warning("Embedding not enabled in server_args. Please set 'embedding' to True in config."); return []
  try:
  payload = {"input": text}; request_url = self._get_request_url("/v1/embeddings")
  response = self.server_process.session.post(request_url, json=payload)
@@ -819,26 +769,31 @@ class LlamaCppServerBinding(LollmsLLMBinding):
  except requests.exceptions.RequestException as e:
  err_msg = f"Embedding request error: {e}";
  if e.response: err_msg += f" - {e.response.text[:200]}"
- raise Exception(err_msg) from e
- except Exception as ex: trace_exception(ex); raise Exception(f"Embedding failed: {str(ex)}") from ex
+ ASCIIColors.error(err_msg)
+ return []
+ except Exception as ex:
+ trace_exception(ex); ASCIIColors.error(f"Embedding failed: {str(ex)}")
+ return []

  def get_model_info(self) -> dict:
+ # This method reports the current state without triggering a server start
+ is_loaded = self.server_process is not None and self.server_process.is_healthy
  info = {
  "name": self.binding_name,
  "user_provided_model_name": self.user_provided_model_name,
  "model_path": str(self.current_model_path) if self.current_model_path else "Not loaded",
  "clip_model_path": str(self.clip_model_path) if self.clip_model_path else "N/A",
- "loaded": self.server_process is not None and self.server_process.is_healthy,
+ "loaded": is_loaded,
  "server_args": self.server_args, "port": self.port if self.port else "N/A",
  "server_key": str(self.server_key) if self.server_key else "N/A",
  }
- if info["loaded"] and self.server_process:
+ if is_loaded:
  try:
  props_resp = self.server_process.session.get(self._get_request_url("/props"), timeout=5).json()
  info.update({
  "server_n_ctx": props_resp.get("default_generation_settings",{}).get("n_ctx"),
  "server_chat_format": props_resp.get("chat_format"),
- "server_clip_model_from_props": props_resp.get("mmproj"), # Server's view of clip model
+ "server_clip_model_from_props": props_resp.get("mmproj"),
  })
  except Exception: pass

@@ -850,10 +805,6 @@ class LlamaCppServerBinding(LollmsLLMBinding):
  return info

  def _scan_models(self):
- """
- Scans the models_path for GGUF files and populates the model map.
- Handles duplicate filenames by prefixing them with their parent directory path.
- """
  self._model_path_map = {}
  if not self.models_path.exists() or not self.models_path.is_dir():
  ASCIIColors.warning(f"Models path does not exist or is not a directory: {self.models_path}")
@@ -867,144 +818,79 @@ class LlamaCppServerBinding(LollmsLLMBinding):

  for model_file in all_paths:
  if model_file.is_file():
- # On Windows, path separators can be tricky. Convert to generic format.
  relative_path_str = str(model_file.relative_to(self.models_path).as_posix())
  if filenames_count[model_file.name] > 1:
- # Duplicate filename, use relative path as the unique name
  unique_name = relative_path_str
  else:
- # Unique filename, use the name itself
  unique_name = model_file.name
-
  self._model_path_map[unique_name] = model_file

  ASCIIColors.info(f"Scanned {len(self._model_path_map)} models from {self.models_path}.")

  def listModels(self) -> List[Dict[str, Any]]:
- """
- Lists all available GGUF models, rescanning the directory first.
- """
- self._scan_models() # Always rescan when asked for the list
-
+ self._scan_models()
  models_found = []
  for unique_name, model_path in self._model_path_map.items():
  models_found.append({
- 'name': unique_name, # The unique name for selection
- 'model_name': model_path.name, # The original filename for display
- 'path': str(model_path), # The full path
- 'size': model_path.stat().st_size
+ 'name': unique_name, 'model_name': model_path.name,
+ 'path': str(model_path), 'size': model_path.stat().st_size
  })
-
- # Sort the list alphabetically by the unique name for consistent ordering
  return sorted(models_found, key=lambda x: x['name'])

  def __del__(self):
  self.unload_model()

  def get_ctx_size(self, model_name: Optional[str] = None) -> Optional[int]:
- """
- Retrieves context size for a model from a hardcoded list.
-
- This method checks if the model name contains a known base model identifier
- (e.g., 'llama3.1', 'gemma2') to determine its context length. It's intended
- as a failsafe when the context size cannot be retrieved directly from the
- Ollama API.
- """
  if model_name is None:
- model_name = self.model_name
+ model_name = self.user_provided_model_name or self.initial_model_name_preference
+ if not model_name and self.current_model_path:
+ model_name = self.current_model_path.name
+
+ if model_name is None:
+ ASCIIColors.warning("Cannot determine context size without a model name.")
+ return None

- # Hardcoded context sizes for popular models. More specific names (e.g., 'llama3.1')
- # should appear, as they will be checked first due to the sorting logic below.
  known_contexts = {
- 'llama3.1': 131072, # Llama 3.1 extended context
- 'llama3.2': 131072, # Llama 3.2 extended context
- 'llama3.3': 131072, # Assuming similar to 3.1/3.2
- 'llama3': 8192, # Llama 3 default
- 'llama2': 4096, # Llama 2 default
- 'mixtral8x22b': 65536, # Mixtral 8x22B default
- 'mixtral': 32768, # Mixtral 8x7B default
- 'mistral': 32768, # Mistral 7B v0.2+ default
- 'gemma3': 131072, # Gemma 3 with 128K context
- 'gemma2': 8192, # Gemma 2 default
- 'gemma': 8192, # Gemma default
- 'phi3': 131072, # Phi-3 variants often use 128K (mini/medium extended)
- 'phi2': 2048, # Phi-2 default
- 'phi': 2048, # Phi default (older)
- 'qwen2.5': 131072, # Qwen2.5 with 128K
- 'qwen2': 32768, # Qwen2 default for 7B
- 'qwen': 8192, # Qwen default
- 'codellama': 16384, # CodeLlama extended
- 'codegemma': 8192, # CodeGemma default
- 'deepseek-coder-v2': 131072, # DeepSeek-Coder V2 with 128K
- 'deepseek-coder': 16384, # DeepSeek-Coder V1 default
- 'deepseek-v2': 131072, # DeepSeek-V2 with 128K
- 'deepseek-llm': 4096, # DeepSeek-LLM default
- 'yi1.5': 32768, # Yi-1.5 with 32K
- 'yi': 4096, # Yi base default
- 'command-r': 131072, # Command-R with 128K
- 'wizardlm2': 32768, # WizardLM2 (Mistral-based)
- 'wizardlm': 16384, # WizardLM default
- 'zephyr': 65536, # Zephyr beta (Mistral-based extended)
- 'vicuna': 2048, # Vicuna default (up to 16K in some variants)
- 'falcon': 2048, # Falcon default
- 'starcoder': 8192, # StarCoder default
- 'stablelm': 4096, # StableLM default
- 'orca2': 4096, # Orca 2 default
- 'orca': 4096, # Orca default
- 'dolphin': 32768, # Dolphin (often Mistral-based)
- 'openhermes': 8192, # OpenHermes default
+ 'llama3.1': 131072, 'llama3.2': 131072, 'llama3.3': 131072, 'llama3': 8192,
+ 'llama2': 4096, 'mixtral8x22b': 65536, 'mixtral': 32768, 'mistral': 32768,
+ 'gemma3': 131072, 'gemma2': 8192, 'gemma': 8192, 'phi3': 131072, 'phi2': 2048,
+ 'phi': 2048, 'qwen2.5': 131072, 'qwen2': 32768, 'qwen': 8192,
+ 'codellama': 16384, 'codegemma': 8192, 'deepseek-coder-v2': 131072,
+ 'deepseek-coder': 16384, 'deepseek-v2': 131072, 'deepseek-llm': 4096,
+ 'yi1.5': 32768, 'yi': 4096, 'command-r': 131072, 'wizardlm2': 32768,
+ 'wizardlm': 16384, 'zephyr': 65536, 'vicuna': 2048, 'falcon': 2048,
+ 'starcoder': 8192, 'stablelm': 4096, 'orca2': 4096, 'orca': 4096,
+ 'dolphin': 32768, 'openhermes': 8192,
  }
-
  normalized_model_name = model_name.lower().strip()
-
- # Sort keys by length in descending order. This ensures that a more specific
- # name like 'llama3.1' is checked before a less specific name like 'llama3'.
  sorted_base_models = sorted(known_contexts.keys(), key=len, reverse=True)

  for base_name in sorted_base_models:
  if base_name in normalized_model_name:
  context_size = known_contexts[base_name]
- ASCIIColors.warning(
- f"Using hardcoded context size for model '{model_name}' "
- f"based on base name '{base_name}': {context_size}"
- )
+ ASCIIColors.info(f"Using hardcoded context size for '{model_name}' based on '{base_name}': {context_size}")
  return context_size

  ASCIIColors.warning(f"Context size not found for model '{model_name}' in the hardcoded list.")
  return None

  if __name__ == '__main__':
- global full_streamed_text # Define for the callback
+ global full_streamed_text
  full_streamed_text = ""
  ASCIIColors.yellow("Testing LlamaCppServerBinding...")

- # --- Configuration ---
- # This should be the NAME of your GGUF model file.
- # Ensure this model is placed in your models_path directory.
- # Example: models_path = "E:\\lollms\\models\\gguf" (Windows)
- # model_name = "Mistral-Nemo-Instruct-2407-Q2_K.gguf"
-
- # For CI/local testing without specific paths, you might download a tiny model
- # or require user to set environment variables for these.
- # For this example, replace with your actual paths/model.
  try:
  models_path_str = os.environ.get("LOLLMS_MODELS_PATH", str(Path(__file__).parent / "test_models"))
- model_name_str = os.environ.get("LOLLMS_TEST_MODEL_GGUF", "tinyllama-1.1b-chat-v1.0.Q2_K.gguf") # A small model
- llava_model_name_str = os.environ.get("LOLLMS_TEST_LLAVA_MODEL_GGUF", "llava-v1.5-7b.Q2_K.gguf") # Placeholder
- llava_clip_name_str = os.environ.get("LOLLMS_TEST_LLAVA_CLIP", "mmproj-model2-q4_0.gguf") # Placeholder
-
- models_path = Path(models_path_str)
- models_path.mkdir(parents=True, exist_ok=True) # Ensure test_models dir exists
+ model_name_str = os.environ.get("LOLLMS_TEST_MODEL_GGUF", "tinyllama-1.1b-chat-v1.0.Q2_K.gguf")

- # Verify model exists, or skip tests gracefully
+ models_path = Path(models_path_str)
+ models_path.mkdir(parents=True, exist_ok=True)
  test_model_path = models_path / model_name_str
- if not test_model_path.exists():
- ASCIIColors.warning(f"Test model {test_model_path} not found. Please place a GGUF model there or set LOLLMS_TEST_MODEL_GGUF and LOLLMS_MODELS_PATH env vars.")
+
+ primary_model_available = test_model_path.exists()
+ if not primary_model_available:
+ ASCIIColors.warning(f"Test model {test_model_path} not found. Please place a GGUF model there or set env vars.")
  ASCIIColors.warning("Some tests will be skipped.")
- # sys.exit(1) # Or allow to continue with skips
- primary_model_available = False
- else:
- primary_model_available = True

  except Exception as e:
  ASCIIColors.error(f"Error setting up test paths: {e}"); trace_exception(e)
@@ -1017,184 +903,106 @@ if __name__ == '__main__':

  active_binding1: Optional[LlamaCppServerBinding] = None
  active_binding2: Optional[LlamaCppServerBinding] = None
- active_binding_llava: Optional[LlamaCppServerBinding] = None
-
+
  try:
  if primary_model_available:
- ASCIIColors.cyan("\n--- Initializing First LlamaCppServerBinding Instance ---")
- # Test default model selection by passing model_name=None
- ASCIIColors.info("Testing default model selection (model_name=None)")
+ # --- Test 1: Auto-start server on first generation call ---
+ ASCIIColors.cyan("\n--- Test 1: Auto-start server with specified model name ---")
  active_binding1 = LlamaCppServerBinding(
- model_name=None, models_path=str(models_path), config=binding_config
+ model_name=model_name_str, models_path=str(models_path), config=binding_config
  )
+ ASCIIColors.info("Binding1 initialized. No server should be running yet.")
+ ASCIIColors.info(f"Initial model info: {json.dumps(active_binding1.get_model_info(), indent=2)}")
+
+ prompt_text = "What is the capital of France?"
+ generated_text = active_binding1.generate_text(prompt_text, system_prompt="Concise expert.", n_predict=20, stream=False)
+
+ if isinstance(generated_text, str) and "Paris" in generated_text:
+ ASCIIColors.green(f"SUCCESS: Auto-start generation successful. Response: {generated_text}")
+ else:
+ ASCIIColors.error(f"FAILURE: Auto-start generation failed. Response: {generated_text}")
+
+ ASCIIColors.info(f"Model info after auto-start: {json.dumps(active_binding1.get_model_info(), indent=2)}")
  if not active_binding1.server_process or not active_binding1.server_process.is_healthy:
- raise RuntimeError("Server for binding1 failed to start or become healthy.")
- ASCIIColors.green(f"Binding1 initialized with default model. Server for '{active_binding1.current_model_path.name}' running on port {active_binding1.port}.")
- ASCIIColors.info(f"Binding1 Model Info: {json.dumps(active_binding1.get_model_info(), indent=2)}")
+ raise RuntimeError("Server for binding1 did not seem to start correctly.")

- ASCIIColors.cyan("\n--- Initializing Second LlamaCppServerBinding Instance (Same Model, explicit name) ---")
- # Load the same model explicitly now
- model_to_load_explicitly = active_binding1.user_provided_model_name
+ # --- Test 2: Server reuse with a second binding ---
+ ASCIIColors.cyan("\n--- Test 2: Server reuse with a second binding ---")
  active_binding2 = LlamaCppServerBinding(
- model_name=model_to_load_explicitly, models_path=str(models_path), config=binding_config
+ model_name=model_name_str, models_path=str(models_path), config=binding_config
  )
- if not active_binding2.server_process or not active_binding2.server_process.is_healthy:
- raise RuntimeError("Server for binding2 failed to start or become healthy (should reuse).")
- ASCIIColors.green(f"Binding2 initialized. Server for '{active_binding2.current_model_path.name}' running on port {active_binding2.port}.")
- ASCIIColors.info(f"Binding2 Model Info: {json.dumps(active_binding2.get_model_info(), indent=2)}")
+ # This call should reuse the server from binding1
+ generated_text_b2 = active_binding2.generate_text("Ping", n_predict=5, stream=False)
+ if isinstance(generated_text_b2, str):
+ ASCIIColors.green(f"SUCCESS: Binding2 generation successful. Response: {generated_text_b2}")
+ else:
+ ASCIIColors.error(f"FAILURE: Binding2 generation failed. Response: {generated_text_b2}")

  if active_binding1.port != active_binding2.port:
- ASCIIColors.error("ERROR: Bindings for the same model are using different ports! Server sharing failed.")
+ ASCIIColors.error("FAILURE: Bindings for the same model are using different ports! Server sharing failed.")
  else:
- ASCIIColors.green("SUCCESS: Both bindings use the same server port. Server sharing appears to work.")
-
- # --- List Models (scans configured directories) ---
- ASCIIColors.cyan("\n--- Listing Models (from search paths, using binding1) ---")
- # Create a dummy duplicate model to test unique naming
- duplicate_folder = models_path / "subdir"
- duplicate_folder.mkdir(exist_ok=True)
- duplicate_model_path = duplicate_folder / test_model_path.name
- import shutil
- shutil.copy(test_model_path, duplicate_model_path)
- ASCIIColors.info(f"Created a duplicate model for testing: {duplicate_model_path}")
-
- listed_models = active_binding1.listModels()
- if listed_models:
- ASCIIColors.green(f"Found {len(listed_models)} GGUF files.")
- pprint.pprint(listed_models)
- # Check if the duplicate was handled
- names = [m['name'] for m in listed_models]
- if test_model_path.name in names and f"subdir/{test_model_path.name}" in names:
- ASCIIColors.green("SUCCESS: Duplicate model names were correctly handled.")
- else:
- ASCIIColors.error("FAILURE: Duplicate model names were not handled correctly.")
- else: ASCIIColors.warning("No GGUF models found in search paths.")
-
- # Clean up dummy duplicate
- duplicate_model_path.unlink()
- duplicate_folder.rmdir()
-
-
- # --- Tokenize/Detokenize ---
- ASCIIColors.cyan("\n--- Tokenize/Detokenize (using binding1) ---")
- sample_text = "Hello, Llama.cpp server world!"
- tokens = active_binding1.tokenize(sample_text)
- ASCIIColors.green(f"Tokens for '{sample_text}': {tokens[:10]}...")
- if tokens:
- detokenized_text = active_binding1.detokenize(tokens)
- ASCIIColors.green(f"Detokenized text: {detokenized_text}")
- else: ASCIIColors.warning("Tokenization returned empty list.")
-
- # --- Text Generation (Non-Streaming, Chat API, binding1) ---
- ASCIIColors.cyan("\n--- Text Generation (Non-Streaming, Chat API, binding1) ---")
- prompt_text = "What is the capital of Germany?"
- generated_text = active_binding1.generate_text(prompt_text, system_prompt="Concise expert.", n_predict=20, stream=False)
- if isinstance(generated_text, str): ASCIIColors.green(f"Generated text (binding1): {generated_text}")
- else: ASCIIColors.error(f"Generation failed (binding1): {generated_text}")
+ ASCIIColors.green("SUCCESS: Both bindings use the same server port. Server sharing works.")

- # --- Text Generation (Streaming, Completion API, binding2) ---
- ASCIIColors.cyan("\n--- Text Generation (Streaming, Chat API, binding2) ---")
- full_streamed_text = "" # Reset global
- def stream_callback(chunk: str, msg_type: int): global full_streamed_text; ASCIIColors.green(f"{chunk}", end="", flush=True); full_streamed_text += chunk; return True
+ # --- Test 3: Unload and auto-reload ---
+ ASCIIColors.cyan("\n--- Test 3: Unload and auto-reload ---")
+ active_binding1.unload_model()
+ ASCIIColors.info("Binding1 unloaded. Ref count should be 1, server still up for binding2.")

- result_b2 = active_binding2.generate_text(prompt_text, system_prompt="Concise expert.", n_predict=30, stream=True, streaming_callback=stream_callback)
- print("\n--- End of Stream (binding2) ---")
- if isinstance(result_b2, str): ASCIIColors.green(f"Full streamed text (binding2): {result_b2}")
- else: ASCIIColors.error(f"Streaming generation failed (binding2): {result_b2}")
-
- # --- Embeddings (binding1) ---
- if binding_config.get("embedding"):
- ASCIIColors.cyan("\n--- Embeddings (binding1) ---")
- try:
- embedding_vector = active_binding1.embed("Test embedding.")
- ASCIIColors.green(f"Embedding (first 3 dims): {embedding_vector[:3]}... Dim: {len(embedding_vector)}")
- except Exception as e_emb: ASCIIColors.warning(f"Could not get embedding: {e_emb}")
- else: ASCIIColors.yellow("\n--- Embeddings Skipped (embedding: false) ---")
-
- else: # primary_model_available is False
- ASCIIColors.warning("Primary test model not available. Skipping most tests.")
-
-
- # --- LLaVA Test (Conceptual - requires a LLaVA model and mmproj) ---
- ASCIIColors.cyan("\n--- LLaVA Vision Test (if model available) ---")
- llava_model_path = models_path / llava_model_name_str
- llava_clip_path_actual = models_path / llava_clip_name_str # Assuming clip is in models_path too
-
- if llava_model_path.exists() and llava_clip_path_actual.exists():
- dummy_image_path = models_path / "dummy_llava_image.png"
- try:
- from PIL import Image, ImageDraw
- img = Image.new('RGB', (150, 70), color = ('magenta')); d = ImageDraw.Draw(img); d.text((10,10), "LLaVA Test", fill=('white')); img.save(dummy_image_path)
- ASCIIColors.info(f"Created dummy image for LLaVA: {dummy_image_path}")
-
- llava_binding_config = binding_config.copy()
- # LLaVA might need specific chat template if server doesn't auto-detect well.
- # llava_binding_config["chat_template"] = "llava-1.5"
-
- active_binding_llava = LlamaCppServerBinding(
- model_name=str(llava_model_path.name), # Pass filename, let it resolve
- models_path=str(models_path),
- clip_model_name=str(llava_clip_path_actual.name), # Pass filename for clip
- config=llava_binding_config
- )
- if not active_binding_llava.server_process or not active_binding_llava.server_process.is_healthy:
- raise RuntimeError("LLaVA server failed to start or become healthy.")
- ASCIIColors.green(f"LLaVA Binding initialized. Server for '{active_binding_llava.current_model_path.name}' running on port {active_binding_llava.port}.")
- ASCIIColors.info(f"LLaVA Binding Model Info: {json.dumps(active_binding_llava.get_model_info(), indent=2)}")
+ # The server should still be up because binding2 holds a reference
+ with _server_registry_lock:
+ if not _active_servers:
+ ASCIIColors.error("FAILURE: Server shut down prematurely while still referenced by binding2.")
+ else:
+ ASCIIColors.green("SUCCESS: Server correctly remained active for binding2.")

+ # This call should re-acquire a reference to the same server for binding1
+ generated_text_reloaded = active_binding1.generate_text("Test reload", n_predict=5, stream=False)
+ if isinstance(generated_text_reloaded, str):
+ ASCIIColors.green(f"SUCCESS: Generation after reload successful. Response: {generated_text_reloaded}")
+ else:
+ ASCIIColors.error(f"FAILURE: Generation after reload failed. Response: {generated_text_reloaded}")

- llava_prompt = "Describe this image."
- llava_response = active_binding_llava.generate_text(
- prompt=llava_prompt, images=[str(dummy_image_path)], n_predict=40, stream=False
- )
- if isinstance(llava_response, str): ASCIIColors.green(f"LLaVA response: {llava_response}")
- else: ASCIIColors.error(f"LLaVA generation failed: {llava_response}")
+ if active_binding1.port != active_binding2.port:
+ ASCIIColors.error("FAILURE: Port mismatch after reload.")
+ else:
+ ASCIIColors.green("SUCCESS: Correctly re-used same server after reload.")

- except ImportError: ASCIIColors.warning("Pillow not found. Cannot create dummy image for LLaVA.")
- except Exception as e_llava: ASCIIColors.error(f"LLaVA test error: {e_llava}"); trace_exception(e_llava)
- finally:
- if dummy_image_path.exists(): dummy_image_path.unlink()
  else:
- ASCIIColors.warning(f"LLaVA model '{llava_model_path.name}' or clip model '{llava_clip_path_actual.name}' not found in '{models_path}'. Skipping LLaVA test.")
-
- if primary_model_available and active_binding1:
- # --- Test changing model (using binding1 to load a different or same model) ---
- ASCIIColors.cyan("\n--- Testing Model Change (binding1 reloads its model) ---")
- # For a real change, use a different model name if available. Here, we reload the same.
- reload_success = active_binding1.load_model(active_binding1.user_provided_model_name) # Reload original model
- if reload_success and active_binding1.server_process and active_binding1.server_process.is_healthy:
- ASCIIColors.green(f"Model reloaded/re-confirmed successfully by binding1. Server on port {active_binding1.port}.")
- reloaded_gen = active_binding1.generate_text("Ping", n_predict=5, stream=False)
- if isinstance(reloaded_gen, str): ASCIIColors.green(f"Post-reload ping (binding1): {reloaded_gen.strip()}")
- else: ASCIIColors.error(f"Post-reload generation failed (binding1): {reloaded_gen}")
+ ASCIIColors.warning("\n--- Primary model not available, skipping most tests ---")
+
+ # --- Test 4: Initialize with model_name=None and auto-find ---
+ ASCIIColors.cyan("\n--- Test 4: Initialize with model_name=None and auto-find ---")
+ unspecified_binding = LlamaCppServerBinding(model_name=None, models_path=str(models_path), config=binding_config)
+ gen_unspec = unspecified_binding.generate_text("Ping", n_predict=5, stream=False)
+ if primary_model_available:
+ if isinstance(gen_unspec, str):
+ ASCIIColors.green(f"SUCCESS: Auto-find generation successful. Response: {gen_unspec}")
+ ASCIIColors.info(f"Model auto-selected: {unspecified_binding.user_provided_model_name}")
  else:
- ASCIIColors.error("Failed to reload model or server not healthy after reload attempt by binding1.")
-
- except ImportError as e_imp: ASCIIColors.error(f"Import error: {e_imp}.")
- except FileNotFoundError as e_fnf: ASCIIColors.error(f"File not found error: {e_fnf}.")
- except ConnectionError as e_conn: ASCIIColors.error(f"Connection error: {e_conn}")
- except RuntimeError as e_rt:
- ASCIIColors.error(f"Runtime error: {e_rt}")
- if active_binding1 and active_binding1.server_process: ASCIIColors.error(f"Binding1 stderr:\n{active_binding1.server_process._stderr_lines[-20:]}")
- if active_binding2 and active_binding2.server_process: ASCIIColors.error(f"Binding2 stderr:\n{active_binding2.server_process._stderr_lines[-20:]}")
- if active_binding_llava and active_binding_llava.server_process: ASCIIColors.error(f"LLaVA Binding stderr:\n{active_binding_llava.server_process._stderr_lines[-20:]}")
- except Exception as e_main: ASCIIColors.error(f"An unexpected error occurred: {e_main}"); trace_exception(e_main)
+ ASCIIColors.error(f"FAILURE: Auto-find generation failed. Response: {gen_unspec}")
+ else: # If no models, this should fail gracefully
+ if isinstance(gen_unspec, dict) and 'error' in gen_unspec:
+ ASCIIColors.green("SUCCESS: Correctly failed to generate when no models are available.")
+ else:
+ ASCIIColors.error(f"FAILURE: Incorrect behavior when no models are available. Response: {gen_unspec}")
+
+ except Exception as e_main:
+ ASCIIColors.error(f"An unexpected error occurred during testing: {e_main}")
+ trace_exception(e_main)
  finally:
  ASCIIColors.cyan("\n--- Unloading Models and Stopping Servers ---")
  if active_binding1: active_binding1.unload_model(); ASCIIColors.info("Binding1 unloaded.")
  if active_binding2: active_binding2.unload_model(); ASCIIColors.info("Binding2 unloaded.")
- if active_binding_llava: active_binding_llava.unload_model(); ASCIIColors.info("LLaVA Binding unloaded.")

- # Check if any servers remain (should be none if all bindings unloaded)
  with _server_registry_lock:
  if _active_servers:
- ASCIIColors.warning(f"Warning: {_active_servers.keys()} servers still in registry after all known bindings unloaded.")
- for key, server_proc in list(_active_servers.items()): # list() for safe iteration if modifying
+ ASCIIColors.warning(f"Warning: {_active_servers.keys()} servers still in registry after tests.")
+ for key, server_proc in list(_active_servers.items()):
  ASCIIColors.info(f"Force shutting down stray server: {key}")
  try: server_proc.shutdown()
  except Exception as e_shutdown: ASCIIColors.error(f"Error shutting down stray server {key}: {e_shutdown}")
- _active_servers.pop(key,None)
- _server_ref_counts.pop(key,None)
+ _active_servers.pop(key, None)
+ _server_ref_counts.pop(key, None)
  else:
  ASCIIColors.green("All servers shut down correctly.")