lollms-client 0.32.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lollms-client might be problematic.

Files changed (73)
  1. lollms_client/__init__.py +1 -1
  2. lollms_client/llm_bindings/azure_openai/__init__.py +6 -10
  3. lollms_client/llm_bindings/claude/__init__.py +4 -7
  4. lollms_client/llm_bindings/gemini/__init__.py +3 -7
  5. lollms_client/llm_bindings/grok/__init__.py +3 -7
  6. lollms_client/llm_bindings/groq/__init__.py +4 -7
  7. lollms_client/llm_bindings/hugging_face_inference_api/__init__.py +4 -6
  8. lollms_client/llm_bindings/litellm/__init__.py +15 -6
  9. lollms_client/llm_bindings/llamacpp/__init__.py +214 -388
  10. lollms_client/llm_bindings/lollms/__init__.py +24 -14
  11. lollms_client/llm_bindings/lollms_webui/__init__.py +6 -12
  12. lollms_client/llm_bindings/mistral/__init__.py +58 -29
  13. lollms_client/llm_bindings/ollama/__init__.py +6 -11
  14. lollms_client/llm_bindings/open_router/__init__.py +45 -14
  15. lollms_client/llm_bindings/openai/__init__.py +7 -14
  16. lollms_client/llm_bindings/openllm/__init__.py +12 -12
  17. lollms_client/llm_bindings/pythonllamacpp/__init__.py +1 -1
  18. lollms_client/llm_bindings/tensor_rt/__init__.py +8 -13
  19. lollms_client/llm_bindings/transformers/__init__.py +14 -6
  20. lollms_client/llm_bindings/vllm/__init__.py +16 -12
  21. lollms_client/lollms_core.py +296 -487
  22. lollms_client/lollms_discussion.py +436 -78
  23. lollms_client/lollms_llm_binding.py +223 -11
  24. lollms_client/lollms_mcp_binding.py +33 -2
  25. lollms_client/mcp_bindings/local_mcp/__init__.py +3 -2
  26. lollms_client/mcp_bindings/remote_mcp/__init__.py +6 -5
  27. lollms_client/mcp_bindings/standard_mcp/__init__.py +3 -5
  28. lollms_client/stt_bindings/lollms/__init__.py +6 -8
  29. lollms_client/stt_bindings/whisper/__init__.py +2 -4
  30. lollms_client/stt_bindings/whispercpp/__init__.py +15 -16
  31. lollms_client/tti_bindings/dalle/__init__.py +29 -28
  32. lollms_client/tti_bindings/diffusers/__init__.py +25 -21
  33. lollms_client/tti_bindings/gemini/__init__.py +215 -0
  34. lollms_client/tti_bindings/lollms/__init__.py +8 -9
  35. lollms_client-1.0.0.dist-info/METADATA +1214 -0
  36. lollms_client-1.0.0.dist-info/RECORD +69 -0
  37. {lollms_client-0.32.1.dist-info → lollms_client-1.0.0.dist-info}/top_level.txt +0 -2
  38. examples/article_summary/article_summary.py +0 -58
  39. examples/console_discussion/console_app.py +0 -266
  40. examples/console_discussion.py +0 -448
  41. examples/deep_analyze/deep_analyse.py +0 -30
  42. examples/deep_analyze/deep_analyze_multiple_files.py +0 -32
  43. examples/function_calling_with_local_custom_mcp.py +0 -250
  44. examples/generate_a_benchmark_for_safe_store.py +0 -89
  45. examples/generate_and_speak/generate_and_speak.py +0 -251
  46. examples/generate_game_sfx/generate_game_fx.py +0 -240
  47. examples/generate_text_with_multihop_rag_example.py +0 -210
  48. examples/gradio_chat_app.py +0 -228
  49. examples/gradio_lollms_chat.py +0 -259
  50. examples/internet_search_with_rag.py +0 -226
  51. examples/lollms_chat/calculator.py +0 -59
  52. examples/lollms_chat/derivative.py +0 -48
  53. examples/lollms_chat/test_openai_compatible_with_lollms_chat.py +0 -12
  54. examples/lollms_discussions_test.py +0 -155
  55. examples/mcp_examples/external_mcp.py +0 -267
  56. examples/mcp_examples/local_mcp.py +0 -171
  57. examples/mcp_examples/openai_mcp.py +0 -203
  58. examples/mcp_examples/run_remote_mcp_example_v2.py +0 -290
  59. examples/mcp_examples/run_standard_mcp_example.py +0 -204
  60. examples/simple_text_gen_test.py +0 -173
  61. examples/simple_text_gen_with_image_test.py +0 -178
  62. examples/test_local_models/local_chat.py +0 -9
  63. examples/text_2_audio.py +0 -77
  64. examples/text_2_image.py +0 -144
  65. examples/text_2_image_diffusers.py +0 -274
  66. examples/text_and_image_2_audio.py +0 -59
  67. examples/text_gen.py +0 -30
  68. examples/text_gen_system_prompt.py +0 -29
  69. lollms_client-0.32.1.dist-info/METADATA +0 -854
  70. lollms_client-0.32.1.dist-info/RECORD +0 -101
  71. test/test_lollms_discussion.py +0 -368
  72. {lollms_client-0.32.1.dist-info → lollms_client-1.0.0.dist-info}/WHEEL +0 -0
  73. {lollms_client-0.32.1.dist-info → lollms_client-1.0.0.dist-info}/licenses/LICENSE +0 -0
lollms_client/llm_bindings/llamacpp/__init__.py
@@ -119,11 +119,16 @@ _server_registry_lock = threading.Lock()
 
  BindingName = "LlamaCppServerBinding"
  DEFAULT_LLAMACPP_SERVER_HOST = "127.0.0.1"
- # Port is now dynamic, this constant is less critical for direct use but good for reference.
- # DEFAULT_LLAMACPP_SERVER_PORT = 9641
 
  class LlamaCppServerProcess:
- def __init__(self, model_path: Union[str, Path], clip_model_path: Optional[Union[str, Path]] = None, server_binary_path: Optional[Union[str, Path]]=None, server_args: Dict[str, Any]={}):
+ def __init__(self,
+ model_path: Union[str, Path],
+ clip_model_path: Optional[Union[str, Path]] = None,
+ server_binary_path: Optional[Union[str, Path]]=None,
+ server_args: Dict[str, Any]={}
+ ):
+ """Initialize the Llama.cpp server process.
+ """
  self.model_path = Path(model_path)
  self.clip_model_path = Path(clip_model_path) if clip_model_path else None
 
@@ -266,54 +271,45 @@ class LlamaCppServerBinding(LollmsLLMBinding):
  "parallel_slots": 4, # Default parallel slots for server
  }
 
- def __init__(self, model_name: str, models_path: str, clip_model_name: Optional[str] = None,
- config: Optional[Dict[str, Any]] = None, default_completion_format: ELF_COMPLETION_FORMAT = ELF_COMPLETION_FORMAT.Chat, **kwargs):
- super().__init__(binding_name=BindingName)
+ def __init__(self,
+ **kwargs
+ ):
+ """Initialize the Llama.cpp server binding.
+ Args:
+ model_name (str): Name of the model to load. If None, will use initial_model_name_preference.
+ models_path (str): Path to the directory containing model files.
+ clip_model_name (str): Optional name of the clip model to use. If None, will try to auto-detect based on the main model.
+ config (dict): Additional configuration options for the server.
+ default_completion_format (ELF_COMPLETION_FORMAT): Default format for completions.
+
+ """
+ super().__init__(BindingName, **kwargs)
  if llama_cpp_binaries is None: raise ImportError("llama-cpp-binaries package is required but not found.")
 
+ models_path = kwargs.get("models_path", Path(__file__).parent/"models")
  self.models_path = Path(models_path)
- self.user_provided_model_name = model_name # Store the name/path user gave
+ # Store initial preferences, but do not load/start server yet.
+ self.initial_model_name_preference: Optional[str] = kwargs.get("model_name")
+ self.user_provided_model_name: Optional[str] = kwargs.get("model_name") # Tracks the latest requested model
+ self.initial_clip_model_name_preference: Optional[str] = kwargs.get("clip_model_name")
+
  self._model_path_map: Dict[str, Path] = {} # Maps unique name to full Path
 
- # Initial scan for available models
+ # Initial scan for available models (to populate listModels)
  self._scan_models()
 
- # Determine the model to load
- effective_model_to_load = model_name
- if not effective_model_to_load and self._model_path_map:
- # If no model was specified and we have models, pick the first one
- # Sorting ensures a deterministic choice
- first_model_name = sorted(self._model_path_map.keys())[0]
- effective_model_to_load = first_model_name
- ASCIIColors.info(f"No model was specified. Automatically selecting the first available model: '{effective_model_to_load}'")
- self.user_provided_model_name = effective_model_to_load # Update for get_model_info etc.
-
- # Initial hint for clip_model_path, resolved fully in load_model
- self.clip_model_path: Optional[Path] = None
- if clip_model_name:
- p_clip = Path(clip_model_name)
- if p_clip.is_absolute() and p_clip.exists():
- self.clip_model_path = p_clip
- elif (self.models_path / clip_model_name).exists(): # Relative to models_path
- self.clip_model_path = self.models_path / clip_model_name
- else:
- ASCIIColors.warning(f"Specified clip_model_name '{clip_model_name}' not found. Will rely on auto-detection if applicable.")
-
- self.default_completion_format = default_completion_format
- self.server_args = {**self.DEFAULT_SERVER_ARGS, **(config or {}), **kwargs}
+ self.default_completion_format = kwargs.get("default_completion_format", ELF_COMPLETION_FORMAT.Chat)
+ self.server_args = {**self.DEFAULT_SERVER_ARGS, **(kwargs.get("config") or {}), **kwargs}
  self.server_binary_path = self._get_server_binary_path()
 
- self.current_model_path: Optional[Path] = None # Actual resolved path of loaded model
+ # Current state of the loaded model and server
+ self.current_model_path: Optional[Path] = None
+ self.clip_model_path: Optional[Path] = None # Actual resolved path of loaded clip model
  self.server_process: Optional[LlamaCppServerProcess] = None
  self.port: Optional[int] = None
  self.server_key: Optional[tuple] = None
 
- # Now, attempt to load the selected model
- if effective_model_to_load:
- if not self.load_model(effective_model_to_load):
- ASCIIColors.error(f"Initial model load for '{effective_model_to_load}' failed. Binding may not be functional.")
- else:
- ASCIIColors.warning("No models found in the models path. The binding will be idle until a model is loaded.")
+ ASCIIColors.info("LlamaCppServerBinding initialized. Server will start on-demand with first generation call.")
 
  def _get_server_binary_path(self) -> Path:
  custom_path_str = self.server_args.get("llama_server_binary_path")
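
Reviewer note: the constructor above no longer starts a llama.cpp server; it only records preferences and scans the models folder. A minimal usage sketch, not taken from the package docs; only model_name, models_path and config come from the hunk above, and the model file name is the one the test block at the bottom of this file uses as a placeholder:

    # Sketch only: construct the binding; no llama.cpp server is started here.
    from lollms_client.llm_bindings.llamacpp import LlamaCppServerBinding

    binding = LlamaCppServerBinding(
        model_name="tinyllama-1.1b-chat-v1.0.Q2_K.gguf",  # optional; None lets the binding pick the first scanned GGUF
        models_path="./models",                            # directory scanned for GGUF files
        config={"embedding": True},                        # merged into DEFAULT_SERVER_ARGS / server_args
    )
    # The server process is created lazily by the first generation call.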
@@ -384,7 +380,6 @@ class LlamaCppServerBinding(LollmsLLMBinding):
  if server_to_stop:
  try: server_to_stop.shutdown()
  except Exception as e: ASCIIColors.error(f"Error shutting down server {self.server_key}: {e}")
- # else: ASCIIColors.warning(f"Attempted to stop server {self.server_key} but it was not in _active_servers.") # Can be noisy
  else:
  ASCIIColors.warning(f"Server key {self.server_key} not in ref counts during release. Might have been shut down already.")
  _active_servers.pop(self.server_key, None) # Ensure removal
@@ -392,7 +387,8 @@ class LlamaCppServerBinding(LollmsLLMBinding):
  self.server_process = None
  self.port = None
  self.server_key = None
-
+ self.current_model_path = None # Also clear this binding's model association
+ self.clip_model_path = None # And clip model association
 
  def load_model(self, model_name_or_path: str) -> bool:
  self.user_provided_model_name = model_name_or_path # Keep track of the selected model name
@@ -401,15 +397,23 @@ class LlamaCppServerBinding(LollmsLLMBinding):
  except Exception as ex:
  trace_exception(ex)
  return False
- # Determine the clip_model_path for this server instance
- # Priority: 1. Explicit `clip_model_path` from init (if exists) 2. Auto-detection
+
+ # Determine the final clip_model_path for this server instance
+ # Priority: 1. Explicit `initial_clip_model_name_preference` from __init__ (if valid path)
+ # 2. Auto-detection based on the resolved main model.
  final_clip_model_path: Optional[Path] = None
- if self.clip_model_path and self.clip_model_path.exists(): # From __init__
- final_clip_model_path = self.clip_model_path
- ASCIIColors.info(f"Using explicitly configured LLaVA clip model: {final_clip_model_path}")
- elif not self.clip_model_path or (self.clip_model_path and not self.clip_model_path.exists()): # if init path was bad or not given
- if self.clip_model_path and not self.clip_model_path.exists():
- ASCIIColors.warning(f"Initial clip model path '{self.clip_model_path}' not found. Attempting auto-detection.")
+ if self.initial_clip_model_name_preference:
+ p_clip_pref = Path(self.initial_clip_model_name_preference)
+ if p_clip_pref.is_absolute() and p_clip_pref.exists():
+ final_clip_model_path = p_clip_pref
+ ASCIIColors.info(f"Using explicitly configured LLaVA clip model: {final_clip_model_path}")
+ elif (self.models_path / self.initial_clip_model_name_preference).exists():
+ final_clip_model_path = self.models_path / self.initial_clip_model_name_preference
+ ASCIIColors.info(f"Using explicitly configured LLaVA clip model: {final_clip_model_path} (relative to models path)")
+ else:
+ ASCIIColors.warning(f"Specified initial clip_model_name '{self.initial_clip_model_name_preference}' not found. Attempting auto-detection.")
+
+ if not final_clip_model_path: # If no explicit path was provided or it was invalid, try auto-detection
  base_name = get_gguf_model_base_name(resolved_model_path.stem)
  potential_paths = [
  resolved_model_path.parent / f"{base_name}.mmproj",
@@ -427,9 +431,6 @@ class LlamaCppServerBinding(LollmsLLMBinding):
  final_clip_model_path_str = str(final_clip_model_path) if final_clip_model_path else None
 
  # Server key based on model and essential server configurations (like clip model)
- # More server_args could be added to the key if they necessitate separate server instances
- # For example, different n_gpu_layers might require a server restart.
- # For now, model and clip model are the main differentiators for distinct servers.
  new_server_key = (str(resolved_model_path), final_clip_model_path_str)
 
  with _server_registry_lock:
@@ -503,20 +504,46 @@ class LlamaCppServerBinding(LollmsLLMBinding):
  self._release_server_instance() # Ensure cleanup if start failed
  return False
 
-
  def unload_model(self):
  if self.server_process:
  ASCIIColors.info(f"Unloading model for binding. Current server: {self.server_key}, port: {self.port}")
  self._release_server_instance() # Handles ref counting and actual shutdown if needed
  else:
  ASCIIColors.info("Unload_model called, but no server process was active for this binding instance.")
- self.current_model_path = None
- self.clip_model_path = None # Also clear the instance's clip path idea
- # self.port and self.server_key are cleared by _release_server_instance
+
+ def _ensure_server_is_running(self) -> bool:
+ """
+ Checks if the server is healthy. If not, it attempts to load the configured model.
+ Returns True if the server is healthy and ready, False otherwise.
+ """
+ if self.server_process and self.server_process.is_healthy:
+ return True
+
+ ASCIIColors.info("Server is not running. Attempting to start on-demand...")
+
+ # Determine which model to load
+ model_to_load = self.user_provided_model_name or self.initial_model_name_preference
+
+ if not model_to_load:
+ # No model specified, try to find one automatically
+ self._scan_models()
+ available_models = self.listModels()
+ if not available_models:
+ ASCIIColors.error("No model specified and no GGUF models found in models path.")
+ return False
+
+ model_to_load = available_models[0]['name'] # Pick the first one
+ ASCIIColors.info(f"No model was specified. Automatically selecting the first available model: '{model_to_load}'")
+
+ # Now, attempt to load the selected model
+ if self.load_model(model_to_load):
+ return True
+ else:
+ ASCIIColors.error(f"Automatic model load for '{model_to_load}' failed.")
+ return False
 
  def _get_request_url(self, endpoint: str) -> str:
- if not self.server_process or not self.server_process.is_healthy:
- raise ConnectionError("Llama.cpp server is not running or not healthy.")
+ # This function now assumes _ensure_server_is_running has been called.
  return f"{self.server_process.base_url}{endpoint}"
 
  def _prepare_generation_payload(self, prompt: str, system_prompt: str = "", n_predict: Optional[int] = None,
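
Reviewer note: with _ensure_server_is_running in place, the first generation call is what resolves the model (the explicit name, or the first scanned GGUF) and starts or joins a shared server. A sketch of the resulting call pattern, continuing the binding object from the earlier sketch; the prompt and printed messages are illustrative, not from the package:

    # First call: the health check fails, so the binding loads the model and starts (or joins) a server.
    result = binding.generate_text("Hello", n_predict=16, stream=False)

    if isinstance(result, dict) and result.get("status") is False:
        # Returned when the server could not be started (no model found, load failure, ...).
        print("generation unavailable:", result.get("error"))
    else:
        print(result)  # plain string on success

    binding.unload_model()  # drops this binding's reference; the shared server stops once unreferenced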
@@ -584,48 +611,23 @@ class LlamaCppServerBinding(LollmsLLMBinding):
  system_prompt: str = "",
  n_predict: Optional[int] = None,
  stream: Optional[bool] = None,
- temperature: float = 0.7, # Ollama default is 0.8, common default 0.7
- top_k: int = 40, # Ollama default is 40
- top_p: float = 0.9, # Ollama default is 0.9
- repeat_penalty: float = 1.1, # Ollama default is 1.1
- repeat_last_n: int = 64, # Ollama default is 64
+ temperature: float = 0.7,
+ top_k: int = 40,
+ top_p: float = 0.9,
+ repeat_penalty: float = 1.1,
+ repeat_last_n: int = 64,
  seed: Optional[int] = None,
  n_threads: Optional[int] = None,
  ctx_size: int | None = None,
  streaming_callback: Optional[Callable[[str, MSG_TYPE], None]] = None,
- split:Optional[bool]=False, # put to true if the prompt is a discussion
+ split:Optional[bool]=False,
  user_keyword:Optional[str]="!@>user:",
  ai_keyword:Optional[str]="!@>assistant:",
  **generation_kwargs
  ) -> Union[str, dict]:
- """
- Generate text using the active LLM binding, using instance defaults if parameters are not provided.
-
- Args:
- prompt (str): The input prompt for text generation.
- images (Optional[List[str]]): List of image file paths for multimodal generation.
- n_predict (Optional[int]): Maximum number of tokens to generate. Uses instance default if None.
- stream (Optional[bool]): Whether to stream the output. Uses instance default if None.
- temperature (Optional[float]): Sampling temperature. Uses instance default if None.
- top_k (Optional[int]): Top-k sampling parameter. Uses instance default if None.
- top_p (Optional[float]): Top-p sampling parameter. Uses instance default if None.
- repeat_penalty (Optional[float]): Penalty for repeated tokens. Uses instance default if None.
- repeat_last_n (Optional[int]): Number of previous tokens to consider for repeat penalty. Uses instance default if None.
- seed (Optional[int]): Random seed for generation. Uses instance default if None.
- n_threads (Optional[int]): Number of threads to use. Uses instance default if None.
- ctx_size (int | None): Context size override for this generation.
- streaming_callback (Optional[Callable[[str, str], None]]): Callback function for streaming output.
- - First parameter (str): The chunk of text received.
- - Second parameter (str): The message type (e.g., MSG_TYPE.MSG_TYPE_CHUNK).
- split:Optional[bool]: put to true if the prompt is a discussion
- user_keyword:Optional[str]: when splitting we use this to extract user prompt
- ai_keyword:Optional[str]": when splitting we use this to extract ai prompt
-
- Returns:
- Union[str, dict]: Generated text or error dictionary if failed.
- """
- if not self.server_process or not self.server_process.is_healthy:
- return {"status": False, "error": "Llama.cpp server is not running or not healthy."}
+
+ if not self._ensure_server_is_running():
+ return {"status": False, "error": "Llama.cpp server could not be started. Please check model configuration and logs."}
 
  _use_chat_format = True
  payload = self._prepare_generation_payload(
@@ -642,11 +644,6 @@ class LlamaCppServerBinding(LollmsLLMBinding):
  endpoint = "/v1/chat/completions" if _use_chat_format else "/completion"
  request_url = self._get_request_url(endpoint)
 
- # Debug payload (simplified)
- # debug_payload = {k:v for k,v in payload.items() if k not in ["image_data","messages"] or (k=="messages" and not any("image_url" in part for item in v for part in (item.get("content") if isinstance(item.get("content"),list) else [])))} # Complex filter for brevity
- # ASCIIColors.debug(f"Request to {request_url} with payload (simplified): {json.dumps(debug_payload, indent=2)[:500]}...")
-
-
  full_response_text = ""
  try:
  response = self.server_process.session.post(request_url, json=payload, stream=stream, timeout=self.server_args.get("generation_timeout", 300))
@@ -699,45 +696,16 @@ class LlamaCppServerBinding(LollmsLLMBinding):
  streaming_callback: Optional[Callable[[str, MSG_TYPE], None]] = None,
  **generation_kwargs
  ) -> Union[str, dict]:
- """
- Conduct a chat session with the llama.cpp server using a LollmsDiscussion object.
 
- Args:
- discussion (LollmsDiscussion): The discussion object containing the conversation history.
- branch_tip_id (Optional[str]): The ID of the message to use as the tip of the conversation branch. Defaults to the active branch.
- n_predict (Optional[int]): Maximum number of tokens to generate.
- stream (Optional[bool]): Whether to stream the output.
- temperature (float): Sampling temperature.
- top_k (int): Top-k sampling parameter.
- top_p (float): Top-p sampling parameter.
- repeat_penalty (float): Penalty for repeated tokens.
- repeat_last_n (int): Number of previous tokens to consider for repeat penalty.
- seed (Optional[int]): Random seed for generation.
- streaming_callback (Optional[Callable[[str, MSG_TYPE], None]]): Callback for streaming output.
-
- Returns:
- Union[str, dict]: The generated text or an error dictionary.
- """
- if not self.server_process or not self.server_process.is_healthy:
- return {"status": "error", "message": "Llama.cpp server is not running or not healthy."}
+ if not self._ensure_server_is_running():
+ return {"status": "error", "message": "Llama.cpp server could not be started. Please check model configuration and logs."}
 
- # 1. Export the discussion to the OpenAI chat format, which llama.cpp server understands.
- # This handles system prompts, user/assistant roles, and multi-modal content.
  messages = discussion.export("openai_chat", branch_tip_id)
-
- # 2. Build the generation payload for the server
  payload = {
- "messages": messages,
- "max_tokens": n_predict,
- "temperature": temperature,
- "top_k": top_k,
- "top_p": top_p,
- "repeat_penalty": repeat_penalty,
- "seed": seed,
- "stream": stream,
- **generation_kwargs # Pass any extra parameters
+ "messages": messages, "max_tokens": n_predict, "temperature": temperature,
+ "top_k": top_k, "top_p": top_p, "repeat_penalty": repeat_penalty,
+ "seed": seed, "stream": stream, **generation_kwargs
  }
- # Remove None values, as the API expects them to be absent
  payload = {k: v for k, v in payload.items() if v is not None}
 
  endpoint = "/v1/chat/completions"
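
Reviewer note: the chat path above serializes the discussion with discussion.export("openai_chat", branch_tip_id) and posts a compact payload to /v1/chat/completions. A sketch of that payload as the hunk builds it, with illustrative message content; None entries are stripped before sending:

    messages = [
        {"role": "system", "content": "You are concise."},   # illustrative export output
        {"role": "user", "content": "Ping"},
    ]
    payload = {
        "messages": messages, "max_tokens": 64, "temperature": 0.7,
        "top_k": 40, "top_p": 0.9, "repeat_penalty": 1.1,
        "seed": None, "stream": False,
    }
    payload = {k: v for k, v in payload.items() if v is not None}  # the API expects absent, not null, options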
@@ -745,7 +713,6 @@ class LlamaCppServerBinding(LollmsLLMBinding):
  full_response_text = ""
 
  try:
- # 3. Make the request to the server
  response = self.server_process.session.post(request_url, json=payload, stream=stream, timeout=self.server_args.get("generation_timeout", 300))
  response.raise_for_status()
 
@@ -788,14 +755,14 @@ class LlamaCppServerBinding(LollmsLLMBinding):
  return {"status": "error", "message": error_message}
 
  def tokenize(self, text: str) -> List[int]:
- if not self.server_process or not self.server_process.is_healthy: raise ConnectionError("Server not running.")
+ if not self._ensure_server_is_running(): return []
  try:
  response = self.server_process.session.post(self._get_request_url("/tokenize"), json={"content": text})
  response.raise_for_status(); return response.json().get("tokens", [])
  except Exception as e: ASCIIColors.error(f"Tokenization error: {e}"); trace_exception(e); return []
 
  def detokenize(self, tokens: List[int]) -> str:
- if not self.server_process or not self.server_process.is_healthy: raise ConnectionError("Server not running.")
+ if not self._ensure_server_is_running(): return ""
  try:
  response = self.server_process.session.post(self._get_request_url("/detokenize"), json={"tokens": tokens})
  response.raise_for_status(); return response.json().get("content", "")
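
Reviewer note: the failure contract changes here. tokenize and detokenize (and, in the hunks that follow, embed) now return empty results instead of raising ConnectionError when no server can be brought up. A sketch of the calling pattern this implies, assumed usage rather than package documentation, continuing the binding object from the earlier sketches:

    tokens = binding.tokenize("Hello, Llama.cpp server world!")
    if not tokens:
        # Either the server could not start or tokenization failed; details go to the logs.
        print("tokenization unavailable")
    else:
        print(len(tokens), "tokens ->", binding.detokenize(tokens))

    vector = binding.embed("Test embedding.")  # [] unless 'embedding' is enabled in the config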
@@ -804,8 +771,9 @@ class LlamaCppServerBinding(LollmsLLMBinding):
  def count_tokens(self, text: str) -> int: return len(self.tokenize(text))
 
  def embed(self, text: str, **kwargs) -> List[float]:
- if not self.server_process or not self.server_process.is_healthy: raise Exception("Server not running.")
- if not self.server_args.get("embedding"): raise Exception("Embedding not enabled in server_args.")
+ if not self._ensure_server_is_running(): return []
+ if not self.server_args.get("embedding"):
+ ASCIIColors.warning("Embedding not enabled in server_args. Please set 'embedding' to True in config."); return []
  try:
  payload = {"input": text}; request_url = self._get_request_url("/v1/embeddings")
  response = self.server_process.session.post(request_url, json=payload)
@@ -819,26 +787,31 @@ class LlamaCppServerBinding(LollmsLLMBinding):
  except requests.exceptions.RequestException as e:
  err_msg = f"Embedding request error: {e}";
  if e.response: err_msg += f" - {e.response.text[:200]}"
- raise Exception(err_msg) from e
- except Exception as ex: trace_exception(ex); raise Exception(f"Embedding failed: {str(ex)}") from ex
+ ASCIIColors.error(err_msg)
+ return []
+ except Exception as ex:
+ trace_exception(ex); ASCIIColors.error(f"Embedding failed: {str(ex)}")
+ return []
 
  def get_model_info(self) -> dict:
+ # This method reports the current state without triggering a server start
+ is_loaded = self.server_process is not None and self.server_process.is_healthy
  info = {
  "name": self.binding_name,
  "user_provided_model_name": self.user_provided_model_name,
  "model_path": str(self.current_model_path) if self.current_model_path else "Not loaded",
  "clip_model_path": str(self.clip_model_path) if self.clip_model_path else "N/A",
- "loaded": self.server_process is not None and self.server_process.is_healthy,
+ "loaded": is_loaded,
  "server_args": self.server_args, "port": self.port if self.port else "N/A",
  "server_key": str(self.server_key) if self.server_key else "N/A",
  }
- if info["loaded"] and self.server_process:
+ if is_loaded:
  try:
  props_resp = self.server_process.session.get(self._get_request_url("/props"), timeout=5).json()
  info.update({
  "server_n_ctx": props_resp.get("default_generation_settings",{}).get("n_ctx"),
  "server_chat_format": props_resp.get("chat_format"),
- "server_clip_model_from_props": props_resp.get("mmproj"), # Server's view of clip model
+ "server_clip_model_from_props": props_resp.get("mmproj"),
  })
  except Exception: pass
 
@@ -850,10 +823,6 @@ class LlamaCppServerBinding(LollmsLLMBinding):
  return info
 
  def _scan_models(self):
- """
- Scans the models_path for GGUF files and populates the model map.
- Handles duplicate filenames by prefixing them with their parent directory path.
- """
  self._model_path_map = {}
  if not self.models_path.exists() or not self.models_path.is_dir():
  ASCIIColors.warning(f"Models path does not exist or is not a directory: {self.models_path}")
@@ -867,144 +836,79 @@ class LlamaCppServerBinding(LollmsLLMBinding):
 
  for model_file in all_paths:
  if model_file.is_file():
- # On Windows, path separators can be tricky. Convert to generic format.
  relative_path_str = str(model_file.relative_to(self.models_path).as_posix())
  if filenames_count[model_file.name] > 1:
- # Duplicate filename, use relative path as the unique name
  unique_name = relative_path_str
  else:
- # Unique filename, use the name itself
  unique_name = model_file.name
-
  self._model_path_map[unique_name] = model_file
 
  ASCIIColors.info(f"Scanned {len(self._model_path_map)} models from {self.models_path}.")
 
  def listModels(self) -> List[Dict[str, Any]]:
- """
- Lists all available GGUF models, rescanning the directory first.
- """
- self._scan_models() # Always rescan when asked for the list
-
+ self._scan_models()
  models_found = []
  for unique_name, model_path in self._model_path_map.items():
  models_found.append({
- 'name': unique_name, # The unique name for selection
- 'model_name': model_path.name, # The original filename for display
- 'path': str(model_path), # The full path
- 'size': model_path.stat().st_size
+ 'name': unique_name, 'model_name': model_path.name,
+ 'path': str(model_path), 'size': model_path.stat().st_size
  })
-
- # Sort the list alphabetically by the unique name for consistent ordering
  return sorted(models_found, key=lambda x: x['name'])
 
  def __del__(self):
  self.unload_model()
 
  def get_ctx_size(self, model_name: Optional[str] = None) -> Optional[int]:
- """
- Retrieves context size for a model from a hardcoded list.
-
- This method checks if the model name contains a known base model identifier
- (e.g., 'llama3.1', 'gemma2') to determine its context length. It's intended
- as a failsafe when the context size cannot be retrieved directly from the
- Ollama API.
- """
  if model_name is None:
- model_name = self.model_name
+ model_name = self.user_provided_model_name or self.initial_model_name_preference
+ if not model_name and self.current_model_path:
+ model_name = self.current_model_path.name
+
+ if model_name is None:
+ ASCIIColors.warning("Cannot determine context size without a model name.")
+ return None
 
- # Hardcoded context sizes for popular models. More specific names (e.g., 'llama3.1')
- # should appear, as they will be checked first due to the sorting logic below.
  known_contexts = {
- 'llama3.1': 131072, # Llama 3.1 extended context
- 'llama3.2': 131072, # Llama 3.2 extended context
- 'llama3.3': 131072, # Assuming similar to 3.1/3.2
- 'llama3': 8192, # Llama 3 default
- 'llama2': 4096, # Llama 2 default
- 'mixtral8x22b': 65536, # Mixtral 8x22B default
- 'mixtral': 32768, # Mixtral 8x7B default
- 'mistral': 32768, # Mistral 7B v0.2+ default
- 'gemma3': 131072, # Gemma 3 with 128K context
- 'gemma2': 8192, # Gemma 2 default
- 'gemma': 8192, # Gemma default
- 'phi3': 131072, # Phi-3 variants often use 128K (mini/medium extended)
- 'phi2': 2048, # Phi-2 default
- 'phi': 2048, # Phi default (older)
- 'qwen2.5': 131072, # Qwen2.5 with 128K
- 'qwen2': 32768, # Qwen2 default for 7B
- 'qwen': 8192, # Qwen default
- 'codellama': 16384, # CodeLlama extended
- 'codegemma': 8192, # CodeGemma default
- 'deepseek-coder-v2': 131072, # DeepSeek-Coder V2 with 128K
- 'deepseek-coder': 16384, # DeepSeek-Coder V1 default
- 'deepseek-v2': 131072, # DeepSeek-V2 with 128K
- 'deepseek-llm': 4096, # DeepSeek-LLM default
- 'yi1.5': 32768, # Yi-1.5 with 32K
- 'yi': 4096, # Yi base default
- 'command-r': 131072, # Command-R with 128K
- 'wizardlm2': 32768, # WizardLM2 (Mistral-based)
- 'wizardlm': 16384, # WizardLM default
- 'zephyr': 65536, # Zephyr beta (Mistral-based extended)
- 'vicuna': 2048, # Vicuna default (up to 16K in some variants)
- 'falcon': 2048, # Falcon default
- 'starcoder': 8192, # StarCoder default
- 'stablelm': 4096, # StableLM default
- 'orca2': 4096, # Orca 2 default
- 'orca': 4096, # Orca default
- 'dolphin': 32768, # Dolphin (often Mistral-based)
- 'openhermes': 8192, # OpenHermes default
+ 'llama3.1': 131072, 'llama3.2': 131072, 'llama3.3': 131072, 'llama3': 8192,
+ 'llama2': 4096, 'mixtral8x22b': 65536, 'mixtral': 32768, 'mistral': 32768,
+ 'gemma3': 131072, 'gemma2': 8192, 'gemma': 8192, 'phi3': 131072, 'phi2': 2048,
+ 'phi': 2048, 'qwen2.5': 131072, 'qwen2': 32768, 'qwen': 8192,
+ 'codellama': 16384, 'codegemma': 8192, 'deepseek-coder-v2': 131072,
+ 'deepseek-coder': 16384, 'deepseek-v2': 131072, 'deepseek-llm': 4096,
+ 'yi1.5': 32768, 'yi': 4096, 'command-r': 131072, 'wizardlm2': 32768,
+ 'wizardlm': 16384, 'zephyr': 65536, 'vicuna': 2048, 'falcon': 2048,
+ 'starcoder': 8192, 'stablelm': 4096, 'orca2': 4096, 'orca': 4096,
+ 'dolphin': 32768, 'openhermes': 8192,
  }
-
  normalized_model_name = model_name.lower().strip()
-
- # Sort keys by length in descending order. This ensures that a more specific
- # name like 'llama3.1' is checked before a less specific name like 'llama3'.
  sorted_base_models = sorted(known_contexts.keys(), key=len, reverse=True)
 
  for base_name in sorted_base_models:
  if base_name in normalized_model_name:
  context_size = known_contexts[base_name]
- ASCIIColors.warning(
- f"Using hardcoded context size for model '{model_name}' "
- f"based on base name '{base_name}': {context_size}"
- )
+ ASCIIColors.info(f"Using hardcoded context size for '{model_name}' based on '{base_name}': {context_size}")
  return context_size
 
  ASCIIColors.warning(f"Context size not found for model '{model_name}' in the hardcoded list.")
  return None
 
  if __name__ == '__main__':
- global full_streamed_text # Define for the callback
+ global full_streamed_text
  full_streamed_text = ""
  ASCIIColors.yellow("Testing LlamaCppServerBinding...")
 
- # --- Configuration ---
- # This should be the NAME of your GGUF model file.
- # Ensure this model is placed in your models_path directory.
- # Example: models_path = "E:\\lollms\\models\\gguf" (Windows)
- # model_name = "Mistral-Nemo-Instruct-2407-Q2_K.gguf"
-
- # For CI/local testing without specific paths, you might download a tiny model
- # or require user to set environment variables for these.
- # For this example, replace with your actual paths/model.
  try:
  models_path_str = os.environ.get("LOLLMS_MODELS_PATH", str(Path(__file__).parent / "test_models"))
- model_name_str = os.environ.get("LOLLMS_TEST_MODEL_GGUF", "tinyllama-1.1b-chat-v1.0.Q2_K.gguf") # A small model
- llava_model_name_str = os.environ.get("LOLLMS_TEST_LLAVA_MODEL_GGUF", "llava-v1.5-7b.Q2_K.gguf") # Placeholder
- llava_clip_name_str = os.environ.get("LOLLMS_TEST_LLAVA_CLIP", "mmproj-model2-q4_0.gguf") # Placeholder
-
- models_path = Path(models_path_str)
- models_path.mkdir(parents=True, exist_ok=True) # Ensure test_models dir exists
+ model_name_str = os.environ.get("LOLLMS_TEST_MODEL_GGUF", "tinyllama-1.1b-chat-v1.0.Q2_K.gguf")
 
- # Verify model exists, or skip tests gracefully
+ models_path = Path(models_path_str)
+ models_path.mkdir(parents=True, exist_ok=True)
  test_model_path = models_path / model_name_str
- if not test_model_path.exists():
- ASCIIColors.warning(f"Test model {test_model_path} not found. Please place a GGUF model there or set LOLLMS_TEST_MODEL_GGUF and LOLLMS_MODELS_PATH env vars.")
+
+ primary_model_available = test_model_path.exists()
+ if not primary_model_available:
+ ASCIIColors.warning(f"Test model {test_model_path} not found. Please place a GGUF model there or set env vars.")
  ASCIIColors.warning("Some tests will be skipped.")
- # sys.exit(1) # Or allow to continue with skips
- primary_model_available = False
- else:
- primary_model_available = True
 
  except Exception as e:
  ASCIIColors.error(f"Error setting up test paths: {e}"); trace_exception(e)
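
Reviewer note: earlier in this hunk, get_ctx_size keeps its hardcoded fallback table but compacts it and checks the longest base names first, so that 'llama3.1' wins over 'llama3'. A standalone sketch of that matching rule, using a three-entry excerpt of the table (the example file names are illustrative):

    known_contexts = {'llama3.1': 131072, 'llama3': 8192, 'mistral': 32768}

    def lookup_ctx(model_name: str):
        normalized = model_name.lower().strip()
        # Longer keys first, so the most specific base name matches.
        for base in sorted(known_contexts, key=len, reverse=True):
            if base in normalized:
                return known_contexts[base]
        return None

    assert lookup_ctx("Meta-Llama3.1-8B-Instruct.Q4_K_M.gguf") == 131072
    assert lookup_ctx("unknown-model.gguf") is None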
@@ -1017,184 +921,106 @@ if __name__ == '__main__':
 
  active_binding1: Optional[LlamaCppServerBinding] = None
  active_binding2: Optional[LlamaCppServerBinding] = None
- active_binding_llava: Optional[LlamaCppServerBinding] = None
-
+
  try:
  if primary_model_available:
- ASCIIColors.cyan("\n--- Initializing First LlamaCppServerBinding Instance ---")
- # Test default model selection by passing model_name=None
- ASCIIColors.info("Testing default model selection (model_name=None)")
+ # --- Test 1: Auto-start server on first generation call ---
+ ASCIIColors.cyan("\n--- Test 1: Auto-start server with specified model name ---")
  active_binding1 = LlamaCppServerBinding(
- model_name=None, models_path=str(models_path), config=binding_config
+ model_name=model_name_str, models_path=str(models_path), config=binding_config
  )
+ ASCIIColors.info("Binding1 initialized. No server should be running yet.")
+ ASCIIColors.info(f"Initial model info: {json.dumps(active_binding1.get_model_info(), indent=2)}")
+
+ prompt_text = "What is the capital of France?"
+ generated_text = active_binding1.generate_text(prompt_text, system_prompt="Concise expert.", n_predict=20, stream=False)
+
+ if isinstance(generated_text, str) and "Paris" in generated_text:
+ ASCIIColors.green(f"SUCCESS: Auto-start generation successful. Response: {generated_text}")
+ else:
+ ASCIIColors.error(f"FAILURE: Auto-start generation failed. Response: {generated_text}")
+
+ ASCIIColors.info(f"Model info after auto-start: {json.dumps(active_binding1.get_model_info(), indent=2)}")
  if not active_binding1.server_process or not active_binding1.server_process.is_healthy:
- raise RuntimeError("Server for binding1 failed to start or become healthy.")
- ASCIIColors.green(f"Binding1 initialized with default model. Server for '{active_binding1.current_model_path.name}' running on port {active_binding1.port}.")
- ASCIIColors.info(f"Binding1 Model Info: {json.dumps(active_binding1.get_model_info(), indent=2)}")
+ raise RuntimeError("Server for binding1 did not seem to start correctly.")
 
- ASCIIColors.cyan("\n--- Initializing Second LlamaCppServerBinding Instance (Same Model, explicit name) ---")
- # Load the same model explicitly now
- model_to_load_explicitly = active_binding1.user_provided_model_name
+ # --- Test 2: Server reuse with a second binding ---
+ ASCIIColors.cyan("\n--- Test 2: Server reuse with a second binding ---")
  active_binding2 = LlamaCppServerBinding(
- model_name=model_to_load_explicitly, models_path=str(models_path), config=binding_config
+ model_name=model_name_str, models_path=str(models_path), config=binding_config
  )
- if not active_binding2.server_process or not active_binding2.server_process.is_healthy:
- raise RuntimeError("Server for binding2 failed to start or become healthy (should reuse).")
- ASCIIColors.green(f"Binding2 initialized. Server for '{active_binding2.current_model_path.name}' running on port {active_binding2.port}.")
- ASCIIColors.info(f"Binding2 Model Info: {json.dumps(active_binding2.get_model_info(), indent=2)}")
+ # This call should reuse the server from binding1
+ generated_text_b2 = active_binding2.generate_text("Ping", n_predict=5, stream=False)
+ if isinstance(generated_text_b2, str):
+ ASCIIColors.green(f"SUCCESS: Binding2 generation successful. Response: {generated_text_b2}")
+ else:
+ ASCIIColors.error(f"FAILURE: Binding2 generation failed. Response: {generated_text_b2}")
 
  if active_binding1.port != active_binding2.port:
- ASCIIColors.error("ERROR: Bindings for the same model are using different ports! Server sharing failed.")
+ ASCIIColors.error("FAILURE: Bindings for the same model are using different ports! Server sharing failed.")
  else:
- ASCIIColors.green("SUCCESS: Both bindings use the same server port. Server sharing appears to work.")
-
- # --- List Models (scans configured directories) ---
- ASCIIColors.cyan("\n--- Listing Models (from search paths, using binding1) ---")
- # Create a dummy duplicate model to test unique naming
- duplicate_folder = models_path / "subdir"
- duplicate_folder.mkdir(exist_ok=True)
- duplicate_model_path = duplicate_folder / test_model_path.name
- import shutil
- shutil.copy(test_model_path, duplicate_model_path)
- ASCIIColors.info(f"Created a duplicate model for testing: {duplicate_model_path}")
-
- listed_models = active_binding1.listModels()
- if listed_models:
- ASCIIColors.green(f"Found {len(listed_models)} GGUF files.")
- pprint.pprint(listed_models)
- # Check if the duplicate was handled
- names = [m['name'] for m in listed_models]
- if test_model_path.name in names and f"subdir/{test_model_path.name}" in names:
- ASCIIColors.green("SUCCESS: Duplicate model names were correctly handled.")
- else:
- ASCIIColors.error("FAILURE: Duplicate model names were not handled correctly.")
- else: ASCIIColors.warning("No GGUF models found in search paths.")
-
- # Clean up dummy duplicate
- duplicate_model_path.unlink()
- duplicate_folder.rmdir()
-
-
- # --- Tokenize/Detokenize ---
- ASCIIColors.cyan("\n--- Tokenize/Detokenize (using binding1) ---")
- sample_text = "Hello, Llama.cpp server world!"
- tokens = active_binding1.tokenize(sample_text)
- ASCIIColors.green(f"Tokens for '{sample_text}': {tokens[:10]}...")
- if tokens:
- detokenized_text = active_binding1.detokenize(tokens)
- ASCIIColors.green(f"Detokenized text: {detokenized_text}")
- else: ASCIIColors.warning("Tokenization returned empty list.")
-
- # --- Text Generation (Non-Streaming, Chat API, binding1) ---
- ASCIIColors.cyan("\n--- Text Generation (Non-Streaming, Chat API, binding1) ---")
- prompt_text = "What is the capital of Germany?"
- generated_text = active_binding1.generate_text(prompt_text, system_prompt="Concise expert.", n_predict=20, stream=False)
- if isinstance(generated_text, str): ASCIIColors.green(f"Generated text (binding1): {generated_text}")
- else: ASCIIColors.error(f"Generation failed (binding1): {generated_text}")
+ ASCIIColors.green("SUCCESS: Both bindings use the same server port. Server sharing works.")
 
- # --- Text Generation (Streaming, Completion API, binding2) ---
- ASCIIColors.cyan("\n--- Text Generation (Streaming, Chat API, binding2) ---")
- full_streamed_text = "" # Reset global
- def stream_callback(chunk: str, msg_type: int): global full_streamed_text; ASCIIColors.green(f"{chunk}", end="", flush=True); full_streamed_text += chunk; return True
+ # --- Test 3: Unload and auto-reload ---
+ ASCIIColors.cyan("\n--- Test 3: Unload and auto-reload ---")
+ active_binding1.unload_model()
+ ASCIIColors.info("Binding1 unloaded. Ref count should be 1, server still up for binding2.")
 
- result_b2 = active_binding2.generate_text(prompt_text, system_prompt="Concise expert.", n_predict=30, stream=True, streaming_callback=stream_callback)
- print("\n--- End of Stream (binding2) ---")
- if isinstance(result_b2, str): ASCIIColors.green(f"Full streamed text (binding2): {result_b2}")
- else: ASCIIColors.error(f"Streaming generation failed (binding2): {result_b2}")
-
- # --- Embeddings (binding1) ---
- if binding_config.get("embedding"):
- ASCIIColors.cyan("\n--- Embeddings (binding1) ---")
- try:
- embedding_vector = active_binding1.embed("Test embedding.")
- ASCIIColors.green(f"Embedding (first 3 dims): {embedding_vector[:3]}... Dim: {len(embedding_vector)}")
- except Exception as e_emb: ASCIIColors.warning(f"Could not get embedding: {e_emb}")
- else: ASCIIColors.yellow("\n--- Embeddings Skipped (embedding: false) ---")
-
- else: # primary_model_available is False
- ASCIIColors.warning("Primary test model not available. Skipping most tests.")
-
-
- # --- LLaVA Test (Conceptual - requires a LLaVA model and mmproj) ---
- ASCIIColors.cyan("\n--- LLaVA Vision Test (if model available) ---")
- llava_model_path = models_path / llava_model_name_str
- llava_clip_path_actual = models_path / llava_clip_name_str # Assuming clip is in models_path too
-
- if llava_model_path.exists() and llava_clip_path_actual.exists():
- dummy_image_path = models_path / "dummy_llava_image.png"
- try:
- from PIL import Image, ImageDraw
- img = Image.new('RGB', (150, 70), color = ('magenta')); d = ImageDraw.Draw(img); d.text((10,10), "LLaVA Test", fill=('white')); img.save(dummy_image_path)
- ASCIIColors.info(f"Created dummy image for LLaVA: {dummy_image_path}")
-
- llava_binding_config = binding_config.copy()
- # LLaVA might need specific chat template if server doesn't auto-detect well.
- # llava_binding_config["chat_template"] = "llava-1.5"
-
- active_binding_llava = LlamaCppServerBinding(
- model_name=str(llava_model_path.name), # Pass filename, let it resolve
- models_path=str(models_path),
- clip_model_name=str(llava_clip_path_actual.name), # Pass filename for clip
- config=llava_binding_config
- )
- if not active_binding_llava.server_process or not active_binding_llava.server_process.is_healthy:
- raise RuntimeError("LLaVA server failed to start or become healthy.")
- ASCIIColors.green(f"LLaVA Binding initialized. Server for '{active_binding_llava.current_model_path.name}' running on port {active_binding_llava.port}.")
- ASCIIColors.info(f"LLaVA Binding Model Info: {json.dumps(active_binding_llava.get_model_info(), indent=2)}")
+ # The server should still be up because binding2 holds a reference
+ with _server_registry_lock:
+ if not _active_servers:
+ ASCIIColors.error("FAILURE: Server shut down prematurely while still referenced by binding2.")
+ else:
+ ASCIIColors.green("SUCCESS: Server correctly remained active for binding2.")
 
+ # This call should re-acquire a reference to the same server for binding1
+ generated_text_reloaded = active_binding1.generate_text("Test reload", n_predict=5, stream=False)
+ if isinstance(generated_text_reloaded, str):
+ ASCIIColors.green(f"SUCCESS: Generation after reload successful. Response: {generated_text_reloaded}")
+ else:
+ ASCIIColors.error(f"FAILURE: Generation after reload failed. Response: {generated_text_reloaded}")
 
- llava_prompt = "Describe this image."
- llava_response = active_binding_llava.generate_text(
- prompt=llava_prompt, images=[str(dummy_image_path)], n_predict=40, stream=False
- )
- if isinstance(llava_response, str): ASCIIColors.green(f"LLaVA response: {llava_response}")
- else: ASCIIColors.error(f"LLaVA generation failed: {llava_response}")
+ if active_binding1.port != active_binding2.port:
+ ASCIIColors.error("FAILURE: Port mismatch after reload.")
+ else:
+ ASCIIColors.green("SUCCESS: Correctly re-used same server after reload.")
 
- except ImportError: ASCIIColors.warning("Pillow not found. Cannot create dummy image for LLaVA.")
- except Exception as e_llava: ASCIIColors.error(f"LLaVA test error: {e_llava}"); trace_exception(e_llava)
- finally:
- if dummy_image_path.exists(): dummy_image_path.unlink()
  else:
- ASCIIColors.warning(f"LLaVA model '{llava_model_path.name}' or clip model '{llava_clip_path_actual.name}' not found in '{models_path}'. Skipping LLaVA test.")
-
- if primary_model_available and active_binding1:
- # --- Test changing model (using binding1 to load a different or same model) ---
- ASCIIColors.cyan("\n--- Testing Model Change (binding1 reloads its model) ---")
- # For a real change, use a different model name if available. Here, we reload the same.
- reload_success = active_binding1.load_model(active_binding1.user_provided_model_name) # Reload original model
- if reload_success and active_binding1.server_process and active_binding1.server_process.is_healthy:
- ASCIIColors.green(f"Model reloaded/re-confirmed successfully by binding1. Server on port {active_binding1.port}.")
- reloaded_gen = active_binding1.generate_text("Ping", n_predict=5, stream=False)
- if isinstance(reloaded_gen, str): ASCIIColors.green(f"Post-reload ping (binding1): {reloaded_gen.strip()}")
- else: ASCIIColors.error(f"Post-reload generation failed (binding1): {reloaded_gen}")
+ ASCIIColors.warning("\n--- Primary model not available, skipping most tests ---")
+
+ # --- Test 4: Initialize with model_name=None and auto-find ---
+ ASCIIColors.cyan("\n--- Test 4: Initialize with model_name=None and auto-find ---")
+ unspecified_binding = LlamaCppServerBinding(model_name=None, models_path=str(models_path), config=binding_config)
+ gen_unspec = unspecified_binding.generate_text("Ping", n_predict=5, stream=False)
+ if primary_model_available:
+ if isinstance(gen_unspec, str):
+ ASCIIColors.green(f"SUCCESS: Auto-find generation successful. Response: {gen_unspec}")
+ ASCIIColors.info(f"Model auto-selected: {unspecified_binding.user_provided_model_name}")
  else:
- ASCIIColors.error("Failed to reload model or server not healthy after reload attempt by binding1.")
-
- except ImportError as e_imp: ASCIIColors.error(f"Import error: {e_imp}.")
- except FileNotFoundError as e_fnf: ASCIIColors.error(f"File not found error: {e_fnf}.")
- except ConnectionError as e_conn: ASCIIColors.error(f"Connection error: {e_conn}")
- except RuntimeError as e_rt:
- ASCIIColors.error(f"Runtime error: {e_rt}")
- if active_binding1 and active_binding1.server_process: ASCIIColors.error(f"Binding1 stderr:\n{active_binding1.server_process._stderr_lines[-20:]}")
- if active_binding2 and active_binding2.server_process: ASCIIColors.error(f"Binding2 stderr:\n{active_binding2.server_process._stderr_lines[-20:]}")
- if active_binding_llava and active_binding_llava.server_process: ASCIIColors.error(f"LLaVA Binding stderr:\n{active_binding_llava.server_process._stderr_lines[-20:]}")
- except Exception as e_main: ASCIIColors.error(f"An unexpected error occurred: {e_main}"); trace_exception(e_main)
+ ASCIIColors.error(f"FAILURE: Auto-find generation failed. Response: {gen_unspec}")
+ else: # If no models, this should fail gracefully
+ if isinstance(gen_unspec, dict) and 'error' in gen_unspec:
+ ASCIIColors.green("SUCCESS: Correctly failed to generate when no models are available.")
+ else:
+ ASCIIColors.error(f"FAILURE: Incorrect behavior when no models are available. Response: {gen_unspec}")
+
+ except Exception as e_main:
+ ASCIIColors.error(f"An unexpected error occurred during testing: {e_main}")
+ trace_exception(e_main)
  finally:
  ASCIIColors.cyan("\n--- Unloading Models and Stopping Servers ---")
  if active_binding1: active_binding1.unload_model(); ASCIIColors.info("Binding1 unloaded.")
  if active_binding2: active_binding2.unload_model(); ASCIIColors.info("Binding2 unloaded.")
- if active_binding_llava: active_binding_llava.unload_model(); ASCIIColors.info("LLaVA Binding unloaded.")
 
- # Check if any servers remain (should be none if all bindings unloaded)
  with _server_registry_lock:
  if _active_servers:
- ASCIIColors.warning(f"Warning: {_active_servers.keys()} servers still in registry after all known bindings unloaded.")
- for key, server_proc in list(_active_servers.items()): # list() for safe iteration if modifying
+ ASCIIColors.warning(f"Warning: {_active_servers.keys()} servers still in registry after tests.")
+ for key, server_proc in list(_active_servers.items()):
  ASCIIColors.info(f"Force shutting down stray server: {key}")
  try: server_proc.shutdown()
  except Exception as e_shutdown: ASCIIColors.error(f"Error shutting down stray server {key}: {e_shutdown}")
- _active_servers.pop(key,None)
- _server_ref_counts.pop(key,None)
+ _active_servers.pop(key, None)
+ _server_ref_counts.pop(key, None)
  else:
  ASCIIColors.green("All servers shut down correctly.")