lollms-client 0.32.0__py3-none-any.whl → 0.33.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of lollms-client might be problematic.
- lollms_client/__init__.py +1 -1
- lollms_client/llm_bindings/groq/__init__.py +2 -3
- lollms_client/llm_bindings/llamacpp/__init__.py +207 -328
- lollms_client/llm_bindings/mistral/__init__.py +57 -26
- lollms_client/llm_bindings/ollama/__init__.py +88 -0
- lollms_client/llm_bindings/open_router/__init__.py +43 -10
- lollms_client/lollms_discussion.py +6 -1
- lollms_client/lollms_llm_binding.py +403 -2
- {lollms_client-0.32.0.dist-info → lollms_client-0.33.0.dist-info}/METADATA +1 -1
- {lollms_client-0.32.0.dist-info → lollms_client-0.33.0.dist-info}/RECORD +13 -13
- {lollms_client-0.32.0.dist-info → lollms_client-0.33.0.dist-info}/WHEEL +0 -0
- {lollms_client-0.32.0.dist-info → lollms_client-0.33.0.dist-info}/licenses/LICENSE +0 -0
- {lollms_client-0.32.0.dist-info → lollms_client-0.33.0.dist-info}/top_level.txt +0 -0
@@ -119,8 +119,6 @@ _server_registry_lock = threading.Lock()
 
 BindingName = "LlamaCppServerBinding"
 DEFAULT_LLAMACPP_SERVER_HOST = "127.0.0.1"
-# Port is now dynamic, this constant is less critical for direct use but good for reference.
-# DEFAULT_LLAMACPP_SERVER_PORT = 9641
 
 class LlamaCppServerProcess:
     def __init__(self, model_path: Union[str, Path], clip_model_path: Optional[Union[str, Path]] = None, server_binary_path: Optional[Union[str, Path]]=None, server_args: Dict[str, Any]={}):
@@ -266,54 +264,34 @@ class LlamaCppServerBinding(LollmsLLMBinding):
         "parallel_slots": 4, # Default parallel slots for server
     }
 
-    def __init__(self, model_name: str, models_path: str, clip_model_name: Optional[str] = None,
+    def __init__(self, model_name: Optional[str], models_path: str, clip_model_name: Optional[str] = None,
                  config: Optional[Dict[str, Any]] = None, default_completion_format: ELF_COMPLETION_FORMAT = ELF_COMPLETION_FORMAT.Chat, **kwargs):
         super().__init__(binding_name=BindingName)
         if llama_cpp_binaries is None: raise ImportError("llama-cpp-binaries package is required but not found.")
 
         self.models_path = Path(models_path)
-
+        # Store initial preferences, but do not load/start server yet.
+        self.initial_model_name_preference: Optional[str] = model_name
+        self.user_provided_model_name: Optional[str] = model_name # Tracks the latest requested model
+        self.initial_clip_model_name_preference: Optional[str] = clip_model_name
+
         self._model_path_map: Dict[str, Path] = {} # Maps unique name to full Path
 
-        # Initial scan for available models
+        # Initial scan for available models (to populate listModels)
        self._scan_models()
 
-        # Determine the model to load
-        effective_model_to_load = model_name
-        if not effective_model_to_load and self._model_path_map:
-            # If no model was specified and we have models, pick the first one
-            # Sorting ensures a deterministic choice
-            first_model_name = sorted(self._model_path_map.keys())[0]
-            effective_model_to_load = first_model_name
-            ASCIIColors.info(f"No model was specified. Automatically selecting the first available model: '{effective_model_to_load}'")
-            self.user_provided_model_name = effective_model_to_load # Update for get_model_info etc.
-
-        # Initial hint for clip_model_path, resolved fully in load_model
-        self.clip_model_path: Optional[Path] = None
-        if clip_model_name:
-            p_clip = Path(clip_model_name)
-            if p_clip.is_absolute() and p_clip.exists():
-                self.clip_model_path = p_clip
-            elif (self.models_path / clip_model_name).exists(): # Relative to models_path
-                self.clip_model_path = self.models_path / clip_model_name
-            else:
-                ASCIIColors.warning(f"Specified clip_model_name '{clip_model_name}' not found. Will rely on auto-detection if applicable.")
-
         self.default_completion_format = default_completion_format
         self.server_args = {**self.DEFAULT_SERVER_ARGS, **(config or {}), **kwargs}
         self.server_binary_path = self._get_server_binary_path()
 
-
+        # Current state of the loaded model and server
+        self.current_model_path: Optional[Path] = None
+        self.clip_model_path: Optional[Path] = None # Actual resolved path of loaded clip model
         self.server_process: Optional[LlamaCppServerProcess] = None
         self.port: Optional[int] = None
         self.server_key: Optional[tuple] = None
 
-
-        if effective_model_to_load:
-            if not self.load_model(effective_model_to_load):
-                ASCIIColors.error(f"Initial model load for '{effective_model_to_load}' failed. Binding may not be functional.")
-        else:
-            ASCIIColors.warning("No models found in the models path. The binding will be idle until a model is loaded.")
+        ASCIIColors.info("LlamaCppServerBinding initialized. Server will start on-demand with first generation call.")
 
     def _get_server_binary_path(self) -> Path:
         custom_path_str = self.server_args.get("llama_server_binary_path")
@@ -384,7 +362,6 @@ class LlamaCppServerBinding(LollmsLLMBinding):
             if server_to_stop:
                 try: server_to_stop.shutdown()
                 except Exception as e: ASCIIColors.error(f"Error shutting down server {self.server_key}: {e}")
-                # else: ASCIIColors.warning(f"Attempted to stop server {self.server_key} but it was not in _active_servers.") # Can be noisy
             else:
                 ASCIIColors.warning(f"Server key {self.server_key} not in ref counts during release. Might have been shut down already.")
             _active_servers.pop(self.server_key, None) # Ensure removal
@@ -392,7 +369,8 @@ class LlamaCppServerBinding(LollmsLLMBinding):
         self.server_process = None
         self.port = None
         self.server_key = None
-
+        self.current_model_path = None # Also clear this binding's model association
+        self.clip_model_path = None # And clip model association
 
     def load_model(self, model_name_or_path: str) -> bool:
         self.user_provided_model_name = model_name_or_path # Keep track of the selected model name
@@ -401,15 +379,23 @@ class LlamaCppServerBinding(LollmsLLMBinding):
         except Exception as ex:
             trace_exception(ex)
             return False
-
-        #
+
+        # Determine the final clip_model_path for this server instance
+        # Priority: 1. Explicit `initial_clip_model_name_preference` from __init__ (if valid path)
+        #           2. Auto-detection based on the resolved main model.
         final_clip_model_path: Optional[Path] = None
-        if self.
-
-
-
-
-
+        if self.initial_clip_model_name_preference:
+            p_clip_pref = Path(self.initial_clip_model_name_preference)
+            if p_clip_pref.is_absolute() and p_clip_pref.exists():
+                final_clip_model_path = p_clip_pref
+                ASCIIColors.info(f"Using explicitly configured LLaVA clip model: {final_clip_model_path}")
+            elif (self.models_path / self.initial_clip_model_name_preference).exists():
+                final_clip_model_path = self.models_path / self.initial_clip_model_name_preference
+                ASCIIColors.info(f"Using explicitly configured LLaVA clip model: {final_clip_model_path} (relative to models path)")
+            else:
+                ASCIIColors.warning(f"Specified initial clip_model_name '{self.initial_clip_model_name_preference}' not found. Attempting auto-detection.")
+
+        if not final_clip_model_path: # If no explicit path was provided or it was invalid, try auto-detection
             base_name = get_gguf_model_base_name(resolved_model_path.stem)
             potential_paths = [
                 resolved_model_path.parent / f"{base_name}.mmproj",
@@ -427,9 +413,6 @@ class LlamaCppServerBinding(LollmsLLMBinding):
         final_clip_model_path_str = str(final_clip_model_path) if final_clip_model_path else None
 
         # Server key based on model and essential server configurations (like clip model)
-        # More server_args could be added to the key if they necessitate separate server instances
-        # For example, different n_gpu_layers might require a server restart.
-        # For now, model and clip model are the main differentiators for distinct servers.
         new_server_key = (str(resolved_model_path), final_clip_model_path_str)
 
         with _server_registry_lock:
@@ -503,20 +486,46 @@ class LlamaCppServerBinding(LollmsLLMBinding):
             self._release_server_instance() # Ensure cleanup if start failed
             return False
 
-
     def unload_model(self):
         if self.server_process:
             ASCIIColors.info(f"Unloading model for binding. Current server: {self.server_key}, port: {self.port}")
             self._release_server_instance() # Handles ref counting and actual shutdown if needed
         else:
             ASCIIColors.info("Unload_model called, but no server process was active for this binding instance.")
-
-
-
+
+    def _ensure_server_is_running(self) -> bool:
+        """
+        Checks if the server is healthy. If not, it attempts to load the configured model.
+        Returns True if the server is healthy and ready, False otherwise.
+        """
+        if self.server_process and self.server_process.is_healthy:
+            return True
+
+        ASCIIColors.info("Server is not running. Attempting to start on-demand...")
+
+        # Determine which model to load
+        model_to_load = self.user_provided_model_name or self.initial_model_name_preference
+
+        if not model_to_load:
+            # No model specified, try to find one automatically
+            self._scan_models()
+            available_models = self.listModels()
+            if not available_models:
+                ASCIIColors.error("No model specified and no GGUF models found in models path.")
+                return False
+
+            model_to_load = available_models[0]['name'] # Pick the first one
+            ASCIIColors.info(f"No model was specified. Automatically selecting the first available model: '{model_to_load}'")
+
+        # Now, attempt to load the selected model
+        if self.load_model(model_to_load):
+            return True
+        else:
+            ASCIIColors.error(f"Automatic model load for '{model_to_load}' failed.")
+            return False
 
     def _get_request_url(self, endpoint: str) -> str:
-
-        raise ConnectionError("Llama.cpp server is not running or not healthy.")
+        # This function now assumes _ensure_server_is_running has been called.
         return f"{self.server_process.base_url}{endpoint}"
 
     def _prepare_generation_payload(self, prompt: str, system_prompt: str = "", n_predict: Optional[int] = None,
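Note on the hunk above: server startup is now lazy. The following is a minimal usage sketch of the resulting behavior; the models directory and GGUF file name are illustrative placeholders, not values taken from this diff.

from lollms_client.llm_bindings.llamacpp import LlamaCppServerBinding

# __init__ now only records preferences and scans the models folder; no server is started here.
binding = LlamaCppServerBinding(model_name="my-model.Q4_K_M.gguf", models_path="/path/to/gguf_models")
print(binding.get_model_info()["loaded"])  # expected: False, nothing is running yet

# The first generation call runs _ensure_server_is_running(), which resolves the model
# (or auto-picks the first listed GGUF when model_name is None) and starts the llama.cpp server.
print(binding.generate_text("Say hello.", n_predict=16, stream=False))

binding.unload_model()  # releases this binding's reference to the shared server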
@@ -584,48 +593,23 @@ class LlamaCppServerBinding(LollmsLLMBinding):
                       system_prompt: str = "",
                       n_predict: Optional[int] = None,
                       stream: Optional[bool] = None,
-                      temperature: float = 0.7,
-                      top_k: int = 40,
-                      top_p: float = 0.9,
-                      repeat_penalty: float = 1.1,
-                      repeat_last_n: int = 64,
+                      temperature: float = 0.7,
+                      top_k: int = 40,
+                      top_p: float = 0.9,
+                      repeat_penalty: float = 1.1,
+                      repeat_last_n: int = 64,
                       seed: Optional[int] = None,
                       n_threads: Optional[int] = None,
                       ctx_size: int | None = None,
                       streaming_callback: Optional[Callable[[str, MSG_TYPE], None]] = None,
-                      split:Optional[bool]=False,
+                      split:Optional[bool]=False,
                       user_keyword:Optional[str]="!@>user:",
                       ai_keyword:Optional[str]="!@>assistant:",
                       **generation_kwargs
                       ) -> Union[str, dict]:
-
-
-
-        Args:
-            prompt (str): The input prompt for text generation.
-            images (Optional[List[str]]): List of image file paths for multimodal generation.
-            n_predict (Optional[int]): Maximum number of tokens to generate. Uses instance default if None.
-            stream (Optional[bool]): Whether to stream the output. Uses instance default if None.
-            temperature (Optional[float]): Sampling temperature. Uses instance default if None.
-            top_k (Optional[int]): Top-k sampling parameter. Uses instance default if None.
-            top_p (Optional[float]): Top-p sampling parameter. Uses instance default if None.
-            repeat_penalty (Optional[float]): Penalty for repeated tokens. Uses instance default if None.
-            repeat_last_n (Optional[int]): Number of previous tokens to consider for repeat penalty. Uses instance default if None.
-            seed (Optional[int]): Random seed for generation. Uses instance default if None.
-            n_threads (Optional[int]): Number of threads to use. Uses instance default if None.
-            ctx_size (int | None): Context size override for this generation.
-            streaming_callback (Optional[Callable[[str, str], None]]): Callback function for streaming output.
-                - First parameter (str): The chunk of text received.
-                - Second parameter (str): The message type (e.g., MSG_TYPE.MSG_TYPE_CHUNK).
-            split:Optional[bool]: put to true if the prompt is a discussion
-            user_keyword:Optional[str]: when splitting we use this to extract user prompt
-            ai_keyword:Optional[str]": when splitting we use this to extract ai prompt
-
-        Returns:
-            Union[str, dict]: Generated text or error dictionary if failed.
-        """
-        if not self.server_process or not self.server_process.is_healthy:
-            return {"status": False, "error": "Llama.cpp server is not running or not healthy."}
+
+        if not self._ensure_server_is_running():
+            return {"status": False, "error": "Llama.cpp server could not be started. Please check model configuration and logs."}
 
         _use_chat_format = True
         payload = self._prepare_generation_payload(
@@ -642,11 +626,6 @@ class LlamaCppServerBinding(LollmsLLMBinding):
         endpoint = "/v1/chat/completions" if _use_chat_format else "/completion"
         request_url = self._get_request_url(endpoint)
 
-        # Debug payload (simplified)
-        # debug_payload = {k:v for k,v in payload.items() if k not in ["image_data","messages"] or (k=="messages" and not any("image_url" in part for item in v for part in (item.get("content") if isinstance(item.get("content"),list) else [])))} # Complex filter for brevity
-        # ASCIIColors.debug(f"Request to {request_url} with payload (simplified): {json.dumps(debug_payload, indent=2)[:500]}...")
-
-
         full_response_text = ""
         try:
             response = self.server_process.session.post(request_url, json=payload, stream=stream, timeout=self.server_args.get("generation_timeout", 300))
@@ -699,45 +678,16 @@ class LlamaCppServerBinding(LollmsLLMBinding):
              streaming_callback: Optional[Callable[[str, MSG_TYPE], None]] = None,
              **generation_kwargs
              ) -> Union[str, dict]:
-        """
-        Conduct a chat session with the llama.cpp server using a LollmsDiscussion object.
-
-        Args:
-            discussion (LollmsDiscussion): The discussion object containing the conversation history.
-            branch_tip_id (Optional[str]): The ID of the message to use as the tip of the conversation branch. Defaults to the active branch.
-            n_predict (Optional[int]): Maximum number of tokens to generate.
-            stream (Optional[bool]): Whether to stream the output.
-            temperature (float): Sampling temperature.
-            top_k (int): Top-k sampling parameter.
-            top_p (float): Top-p sampling parameter.
-            repeat_penalty (float): Penalty for repeated tokens.
-            repeat_last_n (int): Number of previous tokens to consider for repeat penalty.
-            seed (Optional[int]): Random seed for generation.
-            streaming_callback (Optional[Callable[[str, MSG_TYPE], None]]): Callback for streaming output.
-
-        Returns:
-            Union[str, dict]: The generated text or an error dictionary.
-        """
-        if not self.server_process or not self.server_process.is_healthy:
-            return {"status": "error", "message": "Llama.cpp server is not running or not healthy."}
 
-
-        messages = discussion.export("openai_chat", branch_tip_id)
+        if not self._ensure_server_is_running():
+            return {"status": "error", "message": "Llama.cpp server could not be started. Please check model configuration and logs."}
 
-
+        messages = discussion.export("openai_chat", branch_tip_id)
         payload = {
-            "messages": messages,
-            "
-            "
-            "top_k": top_k,
-            "top_p": top_p,
-            "repeat_penalty": repeat_penalty,
-            "seed": seed,
-            "stream": stream,
-            **generation_kwargs # Pass any extra parameters
+            "messages": messages, "max_tokens": n_predict, "temperature": temperature,
+            "top_k": top_k, "top_p": top_p, "repeat_penalty": repeat_penalty,
+            "seed": seed, "stream": stream, **generation_kwargs
         }
-        # Remove None values, as the API expects them to be absent
         payload = {k: v for k, v in payload.items() if v is not None}
 
         endpoint = "/v1/chat/completions"
@@ -745,7 +695,6 @@ class LlamaCppServerBinding(LollmsLLMBinding):
         full_response_text = ""
 
         try:
-            # 3. Make the request to the server
             response = self.server_process.session.post(request_url, json=payload, stream=stream, timeout=self.server_args.get("generation_timeout", 300))
             response.raise_for_status()
 
@@ -788,14 +737,14 @@ class LlamaCppServerBinding(LollmsLLMBinding):
             return {"status": "error", "message": error_message}
 
     def tokenize(self, text: str) -> List[int]:
-        if not self.
+        if not self._ensure_server_is_running(): return []
         try:
             response = self.server_process.session.post(self._get_request_url("/tokenize"), json={"content": text})
             response.raise_for_status(); return response.json().get("tokens", [])
         except Exception as e: ASCIIColors.error(f"Tokenization error: {e}"); trace_exception(e); return []
 
     def detokenize(self, tokens: List[int]) -> str:
-        if not self.
+        if not self._ensure_server_is_running(): return ""
         try:
             response = self.server_process.session.post(self._get_request_url("/detokenize"), json={"tokens": tokens})
             response.raise_for_status(); return response.json().get("content", "")
@@ -804,8 +753,9 @@ class LlamaCppServerBinding(LollmsLLMBinding):
     def count_tokens(self, text: str) -> int: return len(self.tokenize(text))
 
     def embed(self, text: str, **kwargs) -> List[float]:
-        if not self.
-        if not self.server_args.get("embedding"):
+        if not self._ensure_server_is_running(): return []
+        if not self.server_args.get("embedding"):
+            ASCIIColors.warning("Embedding not enabled in server_args. Please set 'embedding' to True in config."); return []
         try:
             payload = {"input": text}; request_url = self._get_request_url("/v1/embeddings")
             response = self.server_process.session.post(request_url, json=payload)
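As the hunk above shows, embed() now returns an empty list unless the "embedding" flag is present in the server arguments. A short sketch of enabling it through the merged config dict (model name and path are placeholders, not taken from this diff):

from lollms_client.llm_bindings.llamacpp import LlamaCppServerBinding

# config entries are merged into DEFAULT_SERVER_ARGS, so "embedding": True ends up in server_args.
binding = LlamaCppServerBinding(
    model_name="my-model.Q4_K_M.gguf",
    models_path="/path/to/gguf_models",
    config={"embedding": True},
)
vector = binding.embed("Test embedding.")  # [] if the server cannot start or embedding is disabled
print(len(vector))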
@@ -819,26 +769,31 @@ class LlamaCppServerBinding(LollmsLLMBinding):
         except requests.exceptions.RequestException as e:
             err_msg = f"Embedding request error: {e}";
             if e.response: err_msg += f" - {e.response.text[:200]}"
-
-
+            ASCIIColors.error(err_msg)
+            return []
+        except Exception as ex:
+            trace_exception(ex); ASCIIColors.error(f"Embedding failed: {str(ex)}")
+            return []
 
     def get_model_info(self) -> dict:
+        # This method reports the current state without triggering a server start
+        is_loaded = self.server_process is not None and self.server_process.is_healthy
         info = {
             "name": self.binding_name,
             "user_provided_model_name": self.user_provided_model_name,
             "model_path": str(self.current_model_path) if self.current_model_path else "Not loaded",
             "clip_model_path": str(self.clip_model_path) if self.clip_model_path else "N/A",
-            "loaded":
+            "loaded": is_loaded,
             "server_args": self.server_args, "port": self.port if self.port else "N/A",
             "server_key": str(self.server_key) if self.server_key else "N/A",
         }
-        if
+        if is_loaded:
            try:
                props_resp = self.server_process.session.get(self._get_request_url("/props"), timeout=5).json()
                info.update({
                    "server_n_ctx": props_resp.get("default_generation_settings",{}).get("n_ctx"),
                    "server_chat_format": props_resp.get("chat_format"),
-                    "server_clip_model_from_props": props_resp.get("mmproj"),
+                    "server_clip_model_from_props": props_resp.get("mmproj"),
                })
            except Exception: pass
 
@@ -850,10 +805,6 @@ class LlamaCppServerBinding(LollmsLLMBinding):
         return info
 
     def _scan_models(self):
-        """
-        Scans the models_path for GGUF files and populates the model map.
-        Handles duplicate filenames by prefixing them with their parent directory path.
-        """
         self._model_path_map = {}
         if not self.models_path.exists() or not self.models_path.is_dir():
             ASCIIColors.warning(f"Models path does not exist or is not a directory: {self.models_path}")
@@ -867,73 +818,79 @@ class LlamaCppServerBinding(LollmsLLMBinding):
 
         for model_file in all_paths:
             if model_file.is_file():
-                # On Windows, path separators can be tricky. Convert to generic format.
                 relative_path_str = str(model_file.relative_to(self.models_path).as_posix())
                 if filenames_count[model_file.name] > 1:
-                    # Duplicate filename, use relative path as the unique name
                     unique_name = relative_path_str
                 else:
-                    # Unique filename, use the name itself
                     unique_name = model_file.name
-
                 self._model_path_map[unique_name] = model_file
 
         ASCIIColors.info(f"Scanned {len(self._model_path_map)} models from {self.models_path}.")
 
     def listModels(self) -> List[Dict[str, Any]]:
-
-        Lists all available GGUF models, rescanning the directory first.
-        """
-        self._scan_models() # Always rescan when asked for the list
-
+        self._scan_models()
         models_found = []
         for unique_name, model_path in self._model_path_map.items():
             models_found.append({
-                'name': unique_name,
-                '
-                'path': str(model_path), # The full path
-                'size': model_path.stat().st_size
+                'name': unique_name, 'model_name': model_path.name,
+                'path': str(model_path), 'size': model_path.stat().st_size
             })
-
-        # Sort the list alphabetically by the unique name for consistent ordering
         return sorted(models_found, key=lambda x: x['name'])
 
     def __del__(self):
         self.unload_model()
 
+    def get_ctx_size(self, model_name: Optional[str] = None) -> Optional[int]:
+        if model_name is None:
+            model_name = self.user_provided_model_name or self.initial_model_name_preference
+            if not model_name and self.current_model_path:
+                model_name = self.current_model_path.name
+
+        if model_name is None:
+            ASCIIColors.warning("Cannot determine context size without a model name.")
+            return None
+
+        known_contexts = {
+            'llama3.1': 131072, 'llama3.2': 131072, 'llama3.3': 131072, 'llama3': 8192,
+            'llama2': 4096, 'mixtral8x22b': 65536, 'mixtral': 32768, 'mistral': 32768,
+            'gemma3': 131072, 'gemma2': 8192, 'gemma': 8192, 'phi3': 131072, 'phi2': 2048,
+            'phi': 2048, 'qwen2.5': 131072, 'qwen2': 32768, 'qwen': 8192,
+            'codellama': 16384, 'codegemma': 8192, 'deepseek-coder-v2': 131072,
+            'deepseek-coder': 16384, 'deepseek-v2': 131072, 'deepseek-llm': 4096,
+            'yi1.5': 32768, 'yi': 4096, 'command-r': 131072, 'wizardlm2': 32768,
+            'wizardlm': 16384, 'zephyr': 65536, 'vicuna': 2048, 'falcon': 2048,
+            'starcoder': 8192, 'stablelm': 4096, 'orca2': 4096, 'orca': 4096,
+            'dolphin': 32768, 'openhermes': 8192,
+        }
+        normalized_model_name = model_name.lower().strip()
+        sorted_base_models = sorted(known_contexts.keys(), key=len, reverse=True)
+
+        for base_name in sorted_base_models:
+            if base_name in normalized_model_name:
+                context_size = known_contexts[base_name]
+                ASCIIColors.info(f"Using hardcoded context size for '{model_name}' based on '{base_name}': {context_size}")
+                return context_size
+
+        ASCIIColors.warning(f"Context size not found for model '{model_name}' in the hardcoded list.")
+        return None
 
 if __name__ == '__main__':
-    global full_streamed_text
+    global full_streamed_text
     full_streamed_text = ""
     ASCIIColors.yellow("Testing LlamaCppServerBinding...")
 
-    # --- Configuration ---
-    # This should be the NAME of your GGUF model file.
-    # Ensure this model is placed in your models_path directory.
-    # Example: models_path = "E:\\lollms\\models\\gguf" (Windows)
-    # model_name = "Mistral-Nemo-Instruct-2407-Q2_K.gguf"
-
-    # For CI/local testing without specific paths, you might download a tiny model
-    # or require user to set environment variables for these.
-    # For this example, replace with your actual paths/model.
     try:
         models_path_str = os.environ.get("LOLLMS_MODELS_PATH", str(Path(__file__).parent / "test_models"))
-        model_name_str = os.environ.get("LOLLMS_TEST_MODEL_GGUF", "tinyllama-1.1b-chat-v1.0.Q2_K.gguf")
-        llava_model_name_str = os.environ.get("LOLLMS_TEST_LLAVA_MODEL_GGUF", "llava-v1.5-7b.Q2_K.gguf") # Placeholder
-        llava_clip_name_str = os.environ.get("LOLLMS_TEST_LLAVA_CLIP", "mmproj-model2-q4_0.gguf") # Placeholder
-
-        models_path = Path(models_path_str)
-        models_path.mkdir(parents=True, exist_ok=True) # Ensure test_models dir exists
+        model_name_str = os.environ.get("LOLLMS_TEST_MODEL_GGUF", "tinyllama-1.1b-chat-v1.0.Q2_K.gguf")
 
-
+        models_path = Path(models_path_str)
+        models_path.mkdir(parents=True, exist_ok=True)
         test_model_path = models_path / model_name_str
-
-
+
+        primary_model_available = test_model_path.exists()
+        if not primary_model_available:
+            ASCIIColors.warning(f"Test model {test_model_path} not found. Please place a GGUF model there or set env vars.")
             ASCIIColors.warning("Some tests will be skipped.")
-            # sys.exit(1) # Or allow to continue with skips
-            primary_model_available = False
-        else:
-            primary_model_available = True
 
     except Exception as e:
         ASCIIColors.error(f"Error setting up test paths: {e}"); trace_exception(e)
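The new get_ctx_size() resolves a context length by longest-substring match against a hardcoded table. A standalone sketch of that lookup rule, with the table truncated to three entries for brevity:

# Longest base names are tried first so 'llama3.1' wins over the shorter 'llama3'.
known_contexts = {'llama3.1': 131072, 'llama3': 8192, 'mistral': 32768}

def lookup_ctx(model_name: str):
    name = model_name.lower().strip()
    for base in sorted(known_contexts, key=len, reverse=True):
        if base in name:
            return known_contexts[base]
    return None

print(lookup_ctx("Meta-Llama3.1-8B-Instruct.Q4_K_M.gguf"))  # 131072
print(lookup_ctx("unknown-model.gguf"))                     # None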
@@ -946,184 +903,106 @@ if __name__ == '__main__':
 
     active_binding1: Optional[LlamaCppServerBinding] = None
     active_binding2: Optional[LlamaCppServerBinding] = None
-
-
+
     try:
         if primary_model_available:
-
-
-            ASCIIColors.info("Testing default model selection (model_name=None)")
+            # --- Test 1: Auto-start server on first generation call ---
+            ASCIIColors.cyan("\n--- Test 1: Auto-start server with specified model name ---")
             active_binding1 = LlamaCppServerBinding(
-                model_name=
+                model_name=model_name_str, models_path=str(models_path), config=binding_config
             )
+            ASCIIColors.info("Binding1 initialized. No server should be running yet.")
+            ASCIIColors.info(f"Initial model info: {json.dumps(active_binding1.get_model_info(), indent=2)}")
+
+            prompt_text = "What is the capital of France?"
+            generated_text = active_binding1.generate_text(prompt_text, system_prompt="Concise expert.", n_predict=20, stream=False)
+
+            if isinstance(generated_text, str) and "Paris" in generated_text:
+                ASCIIColors.green(f"SUCCESS: Auto-start generation successful. Response: {generated_text}")
+            else:
+                ASCIIColors.error(f"FAILURE: Auto-start generation failed. Response: {generated_text}")
+
+            ASCIIColors.info(f"Model info after auto-start: {json.dumps(active_binding1.get_model_info(), indent=2)}")
             if not active_binding1.server_process or not active_binding1.server_process.is_healthy:
-
-                ASCIIColors.green(f"Binding1 initialized with default model. Server for '{active_binding1.current_model_path.name}' running on port {active_binding1.port}.")
-                ASCIIColors.info(f"Binding1 Model Info: {json.dumps(active_binding1.get_model_info(), indent=2)}")
+                raise RuntimeError("Server for binding1 did not seem to start correctly.")
 
-
-
-            model_to_load_explicitly = active_binding1.user_provided_model_name
+            # --- Test 2: Server reuse with a second binding ---
+            ASCIIColors.cyan("\n--- Test 2: Server reuse with a second binding ---")
             active_binding2 = LlamaCppServerBinding(
-                model_name=
+                model_name=model_name_str, models_path=str(models_path), config=binding_config
             )
-
-
-
-
+            # This call should reuse the server from binding1
+            generated_text_b2 = active_binding2.generate_text("Ping", n_predict=5, stream=False)
+            if isinstance(generated_text_b2, str):
+                ASCIIColors.green(f"SUCCESS: Binding2 generation successful. Response: {generated_text_b2}")
+            else:
+                ASCIIColors.error(f"FAILURE: Binding2 generation failed. Response: {generated_text_b2}")
 
             if active_binding1.port != active_binding2.port:
-                ASCIIColors.error("
+                ASCIIColors.error("FAILURE: Bindings for the same model are using different ports! Server sharing failed.")
             else:
-                ASCIIColors.green("SUCCESS: Both bindings use the same server port. Server sharing
-
-            # --- List Models (scans configured directories) ---
-            ASCIIColors.cyan("\n--- Listing Models (from search paths, using binding1) ---")
-            # Create a dummy duplicate model to test unique naming
-            duplicate_folder = models_path / "subdir"
-            duplicate_folder.mkdir(exist_ok=True)
-            duplicate_model_path = duplicate_folder / test_model_path.name
-            import shutil
-            shutil.copy(test_model_path, duplicate_model_path)
-            ASCIIColors.info(f"Created a duplicate model for testing: {duplicate_model_path}")
-
-            listed_models = active_binding1.listModels()
-            if listed_models:
-                ASCIIColors.green(f"Found {len(listed_models)} GGUF files.")
-                pprint.pprint(listed_models)
-                # Check if the duplicate was handled
-                names = [m['name'] for m in listed_models]
-                if test_model_path.name in names and f"subdir/{test_model_path.name}" in names:
-                    ASCIIColors.green("SUCCESS: Duplicate model names were correctly handled.")
-                else:
-                    ASCIIColors.error("FAILURE: Duplicate model names were not handled correctly.")
-            else: ASCIIColors.warning("No GGUF models found in search paths.")
-
-            # Clean up dummy duplicate
-            duplicate_model_path.unlink()
-            duplicate_folder.rmdir()
-
-
-            # --- Tokenize/Detokenize ---
-            ASCIIColors.cyan("\n--- Tokenize/Detokenize (using binding1) ---")
-            sample_text = "Hello, Llama.cpp server world!"
-            tokens = active_binding1.tokenize(sample_text)
-            ASCIIColors.green(f"Tokens for '{sample_text}': {tokens[:10]}...")
-            if tokens:
-                detokenized_text = active_binding1.detokenize(tokens)
-                ASCIIColors.green(f"Detokenized text: {detokenized_text}")
-            else: ASCIIColors.warning("Tokenization returned empty list.")
-
-            # --- Text Generation (Non-Streaming, Chat API, binding1) ---
-            ASCIIColors.cyan("\n--- Text Generation (Non-Streaming, Chat API, binding1) ---")
-            prompt_text = "What is the capital of Germany?"
-            generated_text = active_binding1.generate_text(prompt_text, system_prompt="Concise expert.", n_predict=20, stream=False)
-            if isinstance(generated_text, str): ASCIIColors.green(f"Generated text (binding1): {generated_text}")
-            else: ASCIIColors.error(f"Generation failed (binding1): {generated_text}")
+                ASCIIColors.green("SUCCESS: Both bindings use the same server port. Server sharing works.")
 
-            # ---
-            ASCIIColors.cyan("\n---
-
-
+            # --- Test 3: Unload and auto-reload ---
+            ASCIIColors.cyan("\n--- Test 3: Unload and auto-reload ---")
+            active_binding1.unload_model()
+            ASCIIColors.info("Binding1 unloaded. Ref count should be 1, server still up for binding2.")
 
-
-
-
-
-
-
-            if binding_config.get("embedding"):
-                ASCIIColors.cyan("\n--- Embeddings (binding1) ---")
-                try:
-                    embedding_vector = active_binding1.embed("Test embedding.")
-                    ASCIIColors.green(f"Embedding (first 3 dims): {embedding_vector[:3]}... Dim: {len(embedding_vector)}")
-                except Exception as e_emb: ASCIIColors.warning(f"Could not get embedding: {e_emb}")
-            else: ASCIIColors.yellow("\n--- Embeddings Skipped (embedding: false) ---")
-
-        else: # primary_model_available is False
-            ASCIIColors.warning("Primary test model not available. Skipping most tests.")
-
-
-        # --- LLaVA Test (Conceptual - requires a LLaVA model and mmproj) ---
-        ASCIIColors.cyan("\n--- LLaVA Vision Test (if model available) ---")
-        llava_model_path = models_path / llava_model_name_str
-        llava_clip_path_actual = models_path / llava_clip_name_str # Assuming clip is in models_path too
-
-        if llava_model_path.exists() and llava_clip_path_actual.exists():
-            dummy_image_path = models_path / "dummy_llava_image.png"
-            try:
-                from PIL import Image, ImageDraw
-                img = Image.new('RGB', (150, 70), color = ('magenta')); d = ImageDraw.Draw(img); d.text((10,10), "LLaVA Test", fill=('white')); img.save(dummy_image_path)
-                ASCIIColors.info(f"Created dummy image for LLaVA: {dummy_image_path}")
-
-                llava_binding_config = binding_config.copy()
-                # LLaVA might need specific chat template if server doesn't auto-detect well.
-                # llava_binding_config["chat_template"] = "llava-1.5"
-
-                active_binding_llava = LlamaCppServerBinding(
-                    model_name=str(llava_model_path.name), # Pass filename, let it resolve
-                    models_path=str(models_path),
-                    clip_model_name=str(llava_clip_path_actual.name), # Pass filename for clip
-                    config=llava_binding_config
-                )
-                if not active_binding_llava.server_process or not active_binding_llava.server_process.is_healthy:
-                    raise RuntimeError("LLaVA server failed to start or become healthy.")
-                ASCIIColors.green(f"LLaVA Binding initialized. Server for '{active_binding_llava.current_model_path.name}' running on port {active_binding_llava.port}.")
-                ASCIIColors.info(f"LLaVA Binding Model Info: {json.dumps(active_binding_llava.get_model_info(), indent=2)}")
+            # The server should still be up because binding2 holds a reference
+            with _server_registry_lock:
+                if not _active_servers:
+                    ASCIIColors.error("FAILURE: Server shut down prematurely while still referenced by binding2.")
+                else:
+                    ASCIIColors.green("SUCCESS: Server correctly remained active for binding2.")
 
+            # This call should re-acquire a reference to the same server for binding1
+            generated_text_reloaded = active_binding1.generate_text("Test reload", n_predict=5, stream=False)
+            if isinstance(generated_text_reloaded, str):
+                ASCIIColors.green(f"SUCCESS: Generation after reload successful. Response: {generated_text_reloaded}")
+            else:
+                ASCIIColors.error(f"FAILURE: Generation after reload failed. Response: {generated_text_reloaded}")
 
-
-
-
-                if isinstance(llava_response, str): ASCIIColors.green(f"LLaVA response: {llava_response}")
-                else: ASCIIColors.error(f"LLaVA generation failed: {llava_response}")
+            if active_binding1.port != active_binding2.port:
+                ASCIIColors.error("FAILURE: Port mismatch after reload.")
+            else:
+                ASCIIColors.green("SUCCESS: Correctly re-used same server after reload.")
 
-            except ImportError: ASCIIColors.warning("Pillow not found. Cannot create dummy image for LLaVA.")
-            except Exception as e_llava: ASCIIColors.error(f"LLaVA test error: {e_llava}"); trace_exception(e_llava)
-            finally:
-                if dummy_image_path.exists(): dummy_image_path.unlink()
         else:
-            ASCIIColors.warning(
-
-
-
-
-
-
-            if
-            ASCIIColors.green(f"
-
-            if isinstance(reloaded_gen, str): ASCIIColors.green(f"Post-reload ping (binding1): {reloaded_gen.strip()}")
-            else: ASCIIColors.error(f"Post-reload generation failed (binding1): {reloaded_gen}")
+            ASCIIColors.warning("\n--- Primary model not available, skipping most tests ---")
+
+        # --- Test 4: Initialize with model_name=None and auto-find ---
+        ASCIIColors.cyan("\n--- Test 4: Initialize with model_name=None and auto-find ---")
+        unspecified_binding = LlamaCppServerBinding(model_name=None, models_path=str(models_path), config=binding_config)
+        gen_unspec = unspecified_binding.generate_text("Ping", n_predict=5, stream=False)
+        if primary_model_available:
+            if isinstance(gen_unspec, str):
+                ASCIIColors.green(f"SUCCESS: Auto-find generation successful. Response: {gen_unspec}")
+                ASCIIColors.info(f"Model auto-selected: {unspecified_binding.user_provided_model_name}")
            else:
-                ASCIIColors.error("
-
-
-
-
-
-
-
-
-
-    except Exception as e_main: ASCIIColors.error(f"An unexpected error occurred: {e_main}"); trace_exception(e_main)
+                ASCIIColors.error(f"FAILURE: Auto-find generation failed. Response: {gen_unspec}")
+        else: # If no models, this should fail gracefully
+            if isinstance(gen_unspec, dict) and 'error' in gen_unspec:
+                ASCIIColors.green("SUCCESS: Correctly failed to generate when no models are available.")
+            else:
+                ASCIIColors.error(f"FAILURE: Incorrect behavior when no models are available. Response: {gen_unspec}")
+
+    except Exception as e_main:
+        ASCIIColors.error(f"An unexpected error occurred during testing: {e_main}")
+        trace_exception(e_main)
     finally:
         ASCIIColors.cyan("\n--- Unloading Models and Stopping Servers ---")
         if active_binding1: active_binding1.unload_model(); ASCIIColors.info("Binding1 unloaded.")
         if active_binding2: active_binding2.unload_model(); ASCIIColors.info("Binding2 unloaded.")
-        if active_binding_llava: active_binding_llava.unload_model(); ASCIIColors.info("LLaVA Binding unloaded.")
 
-        # Check if any servers remain (should be none if all bindings unloaded)
        with _server_registry_lock:
            if _active_servers:
-                ASCIIColors.warning(f"Warning: {_active_servers.keys()} servers still in registry after
-                for key, server_proc in list(_active_servers.items()):
+                ASCIIColors.warning(f"Warning: {_active_servers.keys()} servers still in registry after tests.")
+                for key, server_proc in list(_active_servers.items()):
                    ASCIIColors.info(f"Force shutting down stray server: {key}")
                    try: server_proc.shutdown()
                    except Exception as e_shutdown: ASCIIColors.error(f"Error shutting down stray server {key}: {e_shutdown}")
-                    _active_servers.pop(key,None)
-                    _server_ref_counts.pop(key,None)
+                    _active_servers.pop(key, None)
+                    _server_ref_counts.pop(key, None)
            else:
                ASCIIColors.green("All servers shut down correctly.")
 