lollms-client 0.32.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lollms-client might be problematic.
- lollms_client/__init__.py +1 -1
- lollms_client/llm_bindings/azure_openai/__init__.py +6 -10
- lollms_client/llm_bindings/claude/__init__.py +4 -7
- lollms_client/llm_bindings/gemini/__init__.py +3 -7
- lollms_client/llm_bindings/grok/__init__.py +3 -7
- lollms_client/llm_bindings/groq/__init__.py +4 -7
- lollms_client/llm_bindings/hugging_face_inference_api/__init__.py +4 -6
- lollms_client/llm_bindings/litellm/__init__.py +15 -6
- lollms_client/llm_bindings/llamacpp/__init__.py +214 -388
- lollms_client/llm_bindings/lollms/__init__.py +24 -14
- lollms_client/llm_bindings/lollms_webui/__init__.py +6 -12
- lollms_client/llm_bindings/mistral/__init__.py +58 -29
- lollms_client/llm_bindings/ollama/__init__.py +6 -11
- lollms_client/llm_bindings/open_router/__init__.py +45 -14
- lollms_client/llm_bindings/openai/__init__.py +7 -14
- lollms_client/llm_bindings/openllm/__init__.py +12 -12
- lollms_client/llm_bindings/pythonllamacpp/__init__.py +1 -1
- lollms_client/llm_bindings/tensor_rt/__init__.py +8 -13
- lollms_client/llm_bindings/transformers/__init__.py +14 -6
- lollms_client/llm_bindings/vllm/__init__.py +16 -12
- lollms_client/lollms_core.py +296 -487
- lollms_client/lollms_discussion.py +436 -78
- lollms_client/lollms_llm_binding.py +223 -11
- lollms_client/lollms_mcp_binding.py +33 -2
- lollms_client/mcp_bindings/local_mcp/__init__.py +3 -2
- lollms_client/mcp_bindings/remote_mcp/__init__.py +6 -5
- lollms_client/mcp_bindings/standard_mcp/__init__.py +3 -5
- lollms_client/stt_bindings/lollms/__init__.py +6 -8
- lollms_client/stt_bindings/whisper/__init__.py +2 -4
- lollms_client/stt_bindings/whispercpp/__init__.py +15 -16
- lollms_client/tti_bindings/dalle/__init__.py +29 -28
- lollms_client/tti_bindings/diffusers/__init__.py +25 -21
- lollms_client/tti_bindings/gemini/__init__.py +215 -0
- lollms_client/tti_bindings/lollms/__init__.py +8 -9
- lollms_client-1.0.0.dist-info/METADATA +1214 -0
- lollms_client-1.0.0.dist-info/RECORD +69 -0
- {lollms_client-0.32.1.dist-info → lollms_client-1.0.0.dist-info}/top_level.txt +0 -2
- examples/article_summary/article_summary.py +0 -58
- examples/console_discussion/console_app.py +0 -266
- examples/console_discussion.py +0 -448
- examples/deep_analyze/deep_analyse.py +0 -30
- examples/deep_analyze/deep_analyze_multiple_files.py +0 -32
- examples/function_calling_with_local_custom_mcp.py +0 -250
- examples/generate_a_benchmark_for_safe_store.py +0 -89
- examples/generate_and_speak/generate_and_speak.py +0 -251
- examples/generate_game_sfx/generate_game_fx.py +0 -240
- examples/generate_text_with_multihop_rag_example.py +0 -210
- examples/gradio_chat_app.py +0 -228
- examples/gradio_lollms_chat.py +0 -259
- examples/internet_search_with_rag.py +0 -226
- examples/lollms_chat/calculator.py +0 -59
- examples/lollms_chat/derivative.py +0 -48
- examples/lollms_chat/test_openai_compatible_with_lollms_chat.py +0 -12
- examples/lollms_discussions_test.py +0 -155
- examples/mcp_examples/external_mcp.py +0 -267
- examples/mcp_examples/local_mcp.py +0 -171
- examples/mcp_examples/openai_mcp.py +0 -203
- examples/mcp_examples/run_remote_mcp_example_v2.py +0 -290
- examples/mcp_examples/run_standard_mcp_example.py +0 -204
- examples/simple_text_gen_test.py +0 -173
- examples/simple_text_gen_with_image_test.py +0 -178
- examples/test_local_models/local_chat.py +0 -9
- examples/text_2_audio.py +0 -77
- examples/text_2_image.py +0 -144
- examples/text_2_image_diffusers.py +0 -274
- examples/text_and_image_2_audio.py +0 -59
- examples/text_gen.py +0 -30
- examples/text_gen_system_prompt.py +0 -29
- lollms_client-0.32.1.dist-info/METADATA +0 -854
- lollms_client-0.32.1.dist-info/RECORD +0 -101
- test/test_lollms_discussion.py +0 -368
- {lollms_client-0.32.1.dist-info → lollms_client-1.0.0.dist-info}/WHEEL +0 -0
- {lollms_client-0.32.1.dist-info → lollms_client-1.0.0.dist-info}/licenses/LICENSE +0 -0
@@ -119,11 +119,16 @@ _server_registry_lock = threading.Lock()
 
 BindingName = "LlamaCppServerBinding"
 DEFAULT_LLAMACPP_SERVER_HOST = "127.0.0.1"
-# Port is now dynamic, this constant is less critical for direct use but good for reference.
-# DEFAULT_LLAMACPP_SERVER_PORT = 9641
 
 class LlamaCppServerProcess:
-    def __init__(self,
+    def __init__(self,
+                 model_path: Union[str, Path],
+                 clip_model_path: Optional[Union[str, Path]] = None,
+                 server_binary_path: Optional[Union[str, Path]]=None,
+                 server_args: Dict[str, Any]={}
+                 ):
+        """Initialize the Llama.cpp server process.
+        """
         self.model_path = Path(model_path)
         self.clip_model_path = Path(clip_model_path) if clip_model_path else None
 
@@ -266,54 +271,45 @@ class LlamaCppServerBinding(LollmsLLMBinding):
         "parallel_slots": 4, # Default parallel slots for server
     }
 
-    def __init__(self,
-
-
+    def __init__(self,
+                 **kwargs
+                 ):
+        """Initialize the Llama.cpp server binding.
+        Args:
+            model_name (str): Name of the model to load. If None, will use initial_model_name_preference.
+            models_path (str): Path to the directory containing model files.
+            clip_model_name (str): Optional name of the clip model to use. If None, will try to auto-detect based on the main model.
+            config (dict): Additional configuration options for the server.
+            default_completion_format (ELF_COMPLETION_FORMAT): Default format for completions.
+
+        """
+        super().__init__(BindingName, **kwargs)
         if llama_cpp_binaries is None: raise ImportError("llama-cpp-binaries package is required but not found.")
 
+        models_path = kwargs.get("models_path", Path(__file__).parent/"models")
         self.models_path = Path(models_path)
-
+        # Store initial preferences, but do not load/start server yet.
+        self.initial_model_name_preference: Optional[str] = kwargs.get("model_name")
+        self.user_provided_model_name: Optional[str] = kwargs.get("model_name") # Tracks the latest requested model
+        self.initial_clip_model_name_preference: Optional[str] = kwargs.get("clip_model_name")
+
         self._model_path_map: Dict[str, Path] = {} # Maps unique name to full Path
 
-        # Initial scan for available models
+        # Initial scan for available models (to populate listModels)
        self._scan_models()
 
-
-
-        if not effective_model_to_load and self._model_path_map:
-            # If no model was specified and we have models, pick the first one
-            # Sorting ensures a deterministic choice
-            first_model_name = sorted(self._model_path_map.keys())[0]
-            effective_model_to_load = first_model_name
-            ASCIIColors.info(f"No model was specified. Automatically selecting the first available model: '{effective_model_to_load}'")
-            self.user_provided_model_name = effective_model_to_load # Update for get_model_info etc.
-
-        # Initial hint for clip_model_path, resolved fully in load_model
-        self.clip_model_path: Optional[Path] = None
-        if clip_model_name:
-            p_clip = Path(clip_model_name)
-            if p_clip.is_absolute() and p_clip.exists():
-                self.clip_model_path = p_clip
-            elif (self.models_path / clip_model_name).exists(): # Relative to models_path
-                self.clip_model_path = self.models_path / clip_model_name
-            else:
-                ASCIIColors.warning(f"Specified clip_model_name '{clip_model_name}' not found. Will rely on auto-detection if applicable.")
-
-        self.default_completion_format = default_completion_format
-        self.server_args = {**self.DEFAULT_SERVER_ARGS, **(config or {}), **kwargs}
+        self.default_completion_format = kwargs.get("default_completion_format", ELF_COMPLETION_FORMAT.Chat)
+        self.server_args = {**self.DEFAULT_SERVER_ARGS, **(kwargs.get("config") or {}), **kwargs}
         self.server_binary_path = self._get_server_binary_path()
 
-
+        # Current state of the loaded model and server
+        self.current_model_path: Optional[Path] = None
+        self.clip_model_path: Optional[Path] = None # Actual resolved path of loaded clip model
         self.server_process: Optional[LlamaCppServerProcess] = None
         self.port: Optional[int] = None
         self.server_key: Optional[tuple] = None
 
-
-        if effective_model_to_load:
-            if not self.load_model(effective_model_to_load):
-                ASCIIColors.error(f"Initial model load for '{effective_model_to_load}' failed. Binding may not be functional.")
-        else:
-            ASCIIColors.warning("No models found in the models path. The binding will be idle until a model is loaded.")
+        ASCIIColors.info("LlamaCppServerBinding initialized. Server will start on-demand with first generation call.")
 
     def _get_server_binary_path(self) -> Path:
         custom_path_str = self.server_args.get("llama_server_binary_path")
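For orientation while reading the hunks below: the 1.0.0 constructor shown above takes a single **kwargs bag and defers server startup to the first generation call. A minimal usage sketch mirroring the test code at the bottom of this file; the direct import path, the GGUF file name, and the config keys are placeholders and assumptions rather than part of the diff:

from pathlib import Path
from lollms_client.llm_bindings.llamacpp import LlamaCppServerBinding  # assumed import path

models_path = Path("./models")  # placeholder: directory containing your GGUF files
binding = LlamaCppServerBinding(
    model_name="tinyllama-1.1b-chat-v1.0.Q2_K.gguf",  # resolved lazily; no server starts here
    models_path=str(models_path),
    config={"embedding": False},  # merged into DEFAULT_SERVER_ARGS by the binding
)
# The llama.cpp server is only launched by the first generation call.
reply = binding.generate_text("What is the capital of France?", n_predict=20, stream=False)
print(reply)
binding.unload_model()  # releases this binding's reference to the shared server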
@@ -384,7 +380,6 @@ class LlamaCppServerBinding(LollmsLLMBinding):
             if server_to_stop:
                 try: server_to_stop.shutdown()
                 except Exception as e: ASCIIColors.error(f"Error shutting down server {self.server_key}: {e}")
-            # else: ASCIIColors.warning(f"Attempted to stop server {self.server_key} but it was not in _active_servers.") # Can be noisy
         else:
             ASCIIColors.warning(f"Server key {self.server_key} not in ref counts during release. Might have been shut down already.")
             _active_servers.pop(self.server_key, None) # Ensure removal
@@ -392,7 +387,8 @@ class LlamaCppServerBinding(LollmsLLMBinding):
         self.server_process = None
         self.port = None
         self.server_key = None
-
+        self.current_model_path = None # Also clear this binding's model association
+        self.clip_model_path = None # And clip model association
 
     def load_model(self, model_name_or_path: str) -> bool:
         self.user_provided_model_name = model_name_or_path # Keep track of the selected model name
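The two hunks above are part of the shared-server bookkeeping: each binding holds a key into the module-level _active_servers / _server_ref_counts registry, and releasing a binding only shuts the server down once the last reference is gone. As a rough, generic illustration of that acquire/release pattern (not the package's actual implementation):

import threading

_registry_lock = threading.Lock()
_servers = {}      # key -> running server handle
_ref_counts = {}   # key -> number of bindings currently using that server

def acquire(key, start_server):
    # Return a shared server for `key`, starting it on first use.
    with _registry_lock:
        if key not in _servers:
            _servers[key] = start_server()
            _ref_counts[key] = 0
        _ref_counts[key] += 1
        return _servers[key]

def release(key):
    # Drop one reference; shut the server down once nobody uses it.
    with _registry_lock:
        if key not in _ref_counts:
            return  # already shut down elsewhere
        _ref_counts[key] -= 1
        if _ref_counts[key] <= 0:
            server = _servers.pop(key, None)
            _ref_counts.pop(key, None)
            if server is not None:
                server.shutdown()  # assumes the handle exposes shutdown(), as LlamaCppServerProcess does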
@@ -401,15 +397,23 @@ class LlamaCppServerBinding(LollmsLLMBinding):
         except Exception as ex:
             trace_exception(ex)
             return False
-
-        #
+
+        # Determine the final clip_model_path for this server instance
+        # Priority: 1. Explicit `initial_clip_model_name_preference` from __init__ (if valid path)
+        #           2. Auto-detection based on the resolved main model.
         final_clip_model_path: Optional[Path] = None
-        if self.
-
-
-
-
-
+        if self.initial_clip_model_name_preference:
+            p_clip_pref = Path(self.initial_clip_model_name_preference)
+            if p_clip_pref.is_absolute() and p_clip_pref.exists():
+                final_clip_model_path = p_clip_pref
+                ASCIIColors.info(f"Using explicitly configured LLaVA clip model: {final_clip_model_path}")
+            elif (self.models_path / self.initial_clip_model_name_preference).exists():
+                final_clip_model_path = self.models_path / self.initial_clip_model_name_preference
+                ASCIIColors.info(f"Using explicitly configured LLaVA clip model: {final_clip_model_path} (relative to models path)")
+            else:
+                ASCIIColors.warning(f"Specified initial clip_model_name '{self.initial_clip_model_name_preference}' not found. Attempting auto-detection.")
+
+        if not final_clip_model_path: # If no explicit path was provided or it was invalid, try auto-detection
             base_name = get_gguf_model_base_name(resolved_model_path.stem)
             potential_paths = [
                 resolved_model_path.parent / f"{base_name}.mmproj",
@@ -427,9 +431,6 @@ class LlamaCppServerBinding(LollmsLLMBinding):
         final_clip_model_path_str = str(final_clip_model_path) if final_clip_model_path else None
 
         # Server key based on model and essential server configurations (like clip model)
-        # More server_args could be added to the key if they necessitate separate server instances
-        # For example, different n_gpu_layers might require a server restart.
-        # For now, model and clip model are the main differentiators for distinct servers.
         new_server_key = (str(resolved_model_path), final_clip_model_path_str)
 
         with _server_registry_lock:
@@ -503,20 +504,46 @@ class LlamaCppServerBinding(LollmsLLMBinding):
             self._release_server_instance() # Ensure cleanup if start failed
             return False
 
-
     def unload_model(self):
         if self.server_process:
             ASCIIColors.info(f"Unloading model for binding. Current server: {self.server_key}, port: {self.port}")
             self._release_server_instance() # Handles ref counting and actual shutdown if needed
         else:
             ASCIIColors.info("Unload_model called, but no server process was active for this binding instance.")
-
-
-
+
+    def _ensure_server_is_running(self) -> bool:
+        """
+        Checks if the server is healthy. If not, it attempts to load the configured model.
+        Returns True if the server is healthy and ready, False otherwise.
+        """
+        if self.server_process and self.server_process.is_healthy:
+            return True
+
+        ASCIIColors.info("Server is not running. Attempting to start on-demand...")
+
+        # Determine which model to load
+        model_to_load = self.user_provided_model_name or self.initial_model_name_preference
+
+        if not model_to_load:
+            # No model specified, try to find one automatically
+            self._scan_models()
+            available_models = self.listModels()
+            if not available_models:
+                ASCIIColors.error("No model specified and no GGUF models found in models path.")
+                return False
+
+            model_to_load = available_models[0]['name'] # Pick the first one
+            ASCIIColors.info(f"No model was specified. Automatically selecting the first available model: '{model_to_load}'")
+
+        # Now, attempt to load the selected model
+        if self.load_model(model_to_load):
+            return True
+        else:
+            ASCIIColors.error(f"Automatic model load for '{model_to_load}' failed.")
+            return False
 
     def _get_request_url(self, endpoint: str) -> str:
-
-            raise ConnectionError("Llama.cpp server is not running or not healthy.")
+        # This function now assumes _ensure_server_is_running has been called.
         return f"{self.server_process.base_url}{endpoint}"
 
     def _prepare_generation_payload(self, prompt: str, system_prompt: str = "", n_predict: Optional[int] = None,
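The new _ensure_server_is_running helper is the heart of the 1.0.0 behaviour change: public entry points now lazily boot (or auto-select) a model instead of relying on __init__ having done it. A self-contained, simplified sketch of the same guard pattern, with stand-ins for the model scan and server launch (illustration only, not the binding's code):

from typing import Optional

class LazyBackend:
    # Illustrates the on-demand start guard used by _ensure_server_is_running.

    def __init__(self, model_name: Optional[str] = None):
        self.model_name = model_name
        self.started_with: Optional[str] = None  # stands in for a live server process

    def _available_models(self):
        # Stand-in for _scan_models()/listModels(); the real binding scans *.gguf files.
        return ["model_a.gguf", "model_b.gguf"]

    def _ensure_running(self) -> bool:
        if self.started_with is not None:
            return True  # already healthy, nothing to do
        model = self.model_name
        if model is None:
            models = self._available_models()
            if not models:
                return False  # no model specified and none found: fail gracefully
            model = models[0]  # deterministic first pick, as in the hunk above
        self.started_with = model  # the real binding launches a llama.cpp server here
        return True

    def generate(self, prompt: str):
        if not self._ensure_running():
            return {"status": False, "error": "backend could not be started"}
        return f"[{self.started_with}] reply to: {prompt}"

backend = LazyBackend()          # no model given, nothing started yet
print(backend.generate("ping"))  # the first call triggers the start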
@@ -584,48 +611,23 @@ class LlamaCppServerBinding(LollmsLLMBinding):
                       system_prompt: str = "",
                       n_predict: Optional[int] = None,
                       stream: Optional[bool] = None,
-                      temperature: float = 0.7,
-                      top_k: int = 40,
-                      top_p: float = 0.9,
-                      repeat_penalty: float = 1.1,
-                      repeat_last_n: int = 64,
+                      temperature: float = 0.7,
+                      top_k: int = 40,
+                      top_p: float = 0.9,
+                      repeat_penalty: float = 1.1,
+                      repeat_last_n: int = 64,
                       seed: Optional[int] = None,
                       n_threads: Optional[int] = None,
                       ctx_size: int | None = None,
                       streaming_callback: Optional[Callable[[str, MSG_TYPE], None]] = None,
-                      split:Optional[bool]=False,
+                      split:Optional[bool]=False,
                       user_keyword:Optional[str]="!@>user:",
                       ai_keyword:Optional[str]="!@>assistant:",
                       **generation_kwargs
                       ) -> Union[str, dict]:
-
-
-
-        Args:
-            prompt (str): The input prompt for text generation.
-            images (Optional[List[str]]): List of image file paths for multimodal generation.
-            n_predict (Optional[int]): Maximum number of tokens to generate. Uses instance default if None.
-            stream (Optional[bool]): Whether to stream the output. Uses instance default if None.
-            temperature (Optional[float]): Sampling temperature. Uses instance default if None.
-            top_k (Optional[int]): Top-k sampling parameter. Uses instance default if None.
-            top_p (Optional[float]): Top-p sampling parameter. Uses instance default if None.
-            repeat_penalty (Optional[float]): Penalty for repeated tokens. Uses instance default if None.
-            repeat_last_n (Optional[int]): Number of previous tokens to consider for repeat penalty. Uses instance default if None.
-            seed (Optional[int]): Random seed for generation. Uses instance default if None.
-            n_threads (Optional[int]): Number of threads to use. Uses instance default if None.
-            ctx_size (int | None): Context size override for this generation.
-            streaming_callback (Optional[Callable[[str, str], None]]): Callback function for streaming output.
-                - First parameter (str): The chunk of text received.
-                - Second parameter (str): The message type (e.g., MSG_TYPE.MSG_TYPE_CHUNK).
-            split:Optional[bool]: put to true if the prompt is a discussion
-            user_keyword:Optional[str]: when splitting we use this to extract user prompt
-            ai_keyword:Optional[str]": when splitting we use this to extract ai prompt
-
-        Returns:
-            Union[str, dict]: Generated text or error dictionary if failed.
-        """
-        if not self.server_process or not self.server_process.is_healthy:
-            return {"status": False, "error": "Llama.cpp server is not running or not healthy."}
+
+        if not self._ensure_server_is_running():
+            return {"status": False, "error": "Llama.cpp server could not be started. Please check model configuration and logs."}
 
         _use_chat_format = True
         payload = self._prepare_generation_payload(
@@ -642,11 +644,6 @@ class LlamaCppServerBinding(LollmsLLMBinding):
         endpoint = "/v1/chat/completions" if _use_chat_format else "/completion"
         request_url = self._get_request_url(endpoint)
 
-        # Debug payload (simplified)
-        # debug_payload = {k:v for k,v in payload.items() if k not in ["image_data","messages"] or (k=="messages" and not any("image_url" in part for item in v for part in (item.get("content") if isinstance(item.get("content"),list) else [])))} # Complex filter for brevity
-        # ASCIIColors.debug(f"Request to {request_url} with payload (simplified): {json.dumps(debug_payload, indent=2)[:500]}...")
-
-
         full_response_text = ""
         try:
             response = self.server_process.session.post(request_url, json=payload, stream=stream, timeout=self.server_args.get("generation_timeout", 300))
@@ -699,45 +696,16 @@ class LlamaCppServerBinding(LollmsLLMBinding):
              streaming_callback: Optional[Callable[[str, MSG_TYPE], None]] = None,
              **generation_kwargs
              ) -> Union[str, dict]:
-        """
-        Conduct a chat session with the llama.cpp server using a LollmsDiscussion object.
 
-
-            branch_tip_id (Optional[str]): The ID of the message to use as the tip of the conversation branch. Defaults to the active branch.
-            n_predict (Optional[int]): Maximum number of tokens to generate.
-            stream (Optional[bool]): Whether to stream the output.
-            temperature (float): Sampling temperature.
-            top_k (int): Top-k sampling parameter.
-            top_p (float): Top-p sampling parameter.
-            repeat_penalty (float): Penalty for repeated tokens.
-            repeat_last_n (int): Number of previous tokens to consider for repeat penalty.
-            seed (Optional[int]): Random seed for generation.
-            streaming_callback (Optional[Callable[[str, MSG_TYPE], None]]): Callback for streaming output.
-
-        Returns:
-            Union[str, dict]: The generated text or an error dictionary.
-        """
-        if not self.server_process or not self.server_process.is_healthy:
-            return {"status": "error", "message": "Llama.cpp server is not running or not healthy."}
+        if not self._ensure_server_is_running():
+            return {"status": "error", "message": "Llama.cpp server could not be started. Please check model configuration and logs."}
 
-        # 1. Export the discussion to the OpenAI chat format, which llama.cpp server understands.
-        # This handles system prompts, user/assistant roles, and multi-modal content.
         messages = discussion.export("openai_chat", branch_tip_id)
-
-        # 2. Build the generation payload for the server
         payload = {
-            "messages": messages,
-            "
-            "
-            "top_k": top_k,
-            "top_p": top_p,
-            "repeat_penalty": repeat_penalty,
-            "seed": seed,
-            "stream": stream,
-            **generation_kwargs # Pass any extra parameters
+            "messages": messages, "max_tokens": n_predict, "temperature": temperature,
+            "top_k": top_k, "top_p": top_p, "repeat_penalty": repeat_penalty,
+            "seed": seed, "stream": stream, **generation_kwargs
         }
-        # Remove None values, as the API expects them to be absent
         payload = {k: v for k, v in payload.items() if v is not None}
 
         endpoint = "/v1/chat/completions"
@@ -745,7 +713,6 @@ class LlamaCppServerBinding(LollmsLLMBinding):
         full_response_text = ""
 
         try:
-            # 3. Make the request to the server
             response = self.server_process.session.post(request_url, json=payload, stream=stream, timeout=self.server_args.get("generation_timeout", 300))
             response.raise_for_status()
 
@@ -788,14 +755,14 @@ class LlamaCppServerBinding(LollmsLLMBinding):
             return {"status": "error", "message": error_message}
 
     def tokenize(self, text: str) -> List[int]:
-        if not self.
+        if not self._ensure_server_is_running(): return []
         try:
             response = self.server_process.session.post(self._get_request_url("/tokenize"), json={"content": text})
             response.raise_for_status(); return response.json().get("tokens", [])
         except Exception as e: ASCIIColors.error(f"Tokenization error: {e}"); trace_exception(e); return []
 
     def detokenize(self, tokens: List[int]) -> str:
-        if not self.
+        if not self._ensure_server_is_running(): return ""
         try:
             response = self.server_process.session.post(self._get_request_url("/detokenize"), json={"tokens": tokens})
             response.raise_for_status(); return response.json().get("content", "")
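tokenize and detokenize above are thin proxies over the llama.cpp server's /tokenize and /detokenize HTTP endpoints, now sitting behind the lazy-start guard. If you need to hit those endpoints directly (for example while debugging an already-running server), a minimal requests sketch; the base URL is a placeholder, since the server port is chosen dynamically in 1.0.0:

import requests

base_url = "http://127.0.0.1:9641"  # placeholder: use the base_url of your running server

text = "Hello, Llama.cpp server world!"
tokens = requests.post(f"{base_url}/tokenize", json={"content": text}).json().get("tokens", [])
print("tokens:", tokens[:10])

roundtrip = requests.post(f"{base_url}/detokenize", json={"tokens": tokens}).json().get("content", "")
print("roundtrip:", roundtrip)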
@@ -804,8 +771,9 @@ class LlamaCppServerBinding(LollmsLLMBinding):
     def count_tokens(self, text: str) -> int: return len(self.tokenize(text))
 
     def embed(self, text: str, **kwargs) -> List[float]:
-        if not self.
-        if not self.server_args.get("embedding"):
+        if not self._ensure_server_is_running(): return []
+        if not self.server_args.get("embedding"):
+            ASCIIColors.warning("Embedding not enabled in server_args. Please set 'embedding' to True in config."); return []
         try:
             payload = {"input": text}; request_url = self._get_request_url("/v1/embeddings")
             response = self.server_process.session.post(request_url, json=payload)
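The embed path works the same way over HTTP: the binding posts {"input": text} to the server's OpenAI-style /v1/embeddings route, and only when the server was launched with embedding enabled in its config. A direct-request sketch under the same placeholder-URL assumption as the tokenize example above; the exact response shape is not shown in this diff, so it is simply printed:

import requests

base_url = "http://127.0.0.1:9641"  # placeholder

resp = requests.post(f"{base_url}/v1/embeddings", json={"input": "Test embedding."})
resp.raise_for_status()
print(resp.json())  # inspect the returned structure before indexing into it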
@@ -819,26 +787,31 @@ class LlamaCppServerBinding(LollmsLLMBinding):
         except requests.exceptions.RequestException as e:
             err_msg = f"Embedding request error: {e}";
             if e.response: err_msg += f" - {e.response.text[:200]}"
-
-
+            ASCIIColors.error(err_msg)
+            return []
+        except Exception as ex:
+            trace_exception(ex); ASCIIColors.error(f"Embedding failed: {str(ex)}")
+            return []
 
     def get_model_info(self) -> dict:
+        # This method reports the current state without triggering a server start
+        is_loaded = self.server_process is not None and self.server_process.is_healthy
         info = {
             "name": self.binding_name,
             "user_provided_model_name": self.user_provided_model_name,
             "model_path": str(self.current_model_path) if self.current_model_path else "Not loaded",
             "clip_model_path": str(self.clip_model_path) if self.clip_model_path else "N/A",
-            "loaded":
+            "loaded": is_loaded,
             "server_args": self.server_args, "port": self.port if self.port else "N/A",
             "server_key": str(self.server_key) if self.server_key else "N/A",
         }
-        if
+        if is_loaded:
             try:
                 props_resp = self.server_process.session.get(self._get_request_url("/props"), timeout=5).json()
                 info.update({
                     "server_n_ctx": props_resp.get("default_generation_settings",{}).get("n_ctx"),
                     "server_chat_format": props_resp.get("chat_format"),
-                    "server_clip_model_from_props": props_resp.get("mmproj"),
+                    "server_clip_model_from_props": props_resp.get("mmproj"),
                 })
             except Exception: pass
 
@@ -850,10 +823,6 @@ class LlamaCppServerBinding(LollmsLLMBinding):
         return info
 
     def _scan_models(self):
-        """
-        Scans the models_path for GGUF files and populates the model map.
-        Handles duplicate filenames by prefixing them with their parent directory path.
-        """
         self._model_path_map = {}
         if not self.models_path.exists() or not self.models_path.is_dir():
             ASCIIColors.warning(f"Models path does not exist or is not a directory: {self.models_path}")
@@ -867,144 +836,79 @@ class LlamaCppServerBinding(LollmsLLMBinding):
 
         for model_file in all_paths:
             if model_file.is_file():
-                # On Windows, path separators can be tricky. Convert to generic format.
                 relative_path_str = str(model_file.relative_to(self.models_path).as_posix())
                 if filenames_count[model_file.name] > 1:
-                    # Duplicate filename, use relative path as the unique name
                     unique_name = relative_path_str
                 else:
-                    # Unique filename, use the name itself
                     unique_name = model_file.name
-
                 self._model_path_map[unique_name] = model_file
 
         ASCIIColors.info(f"Scanned {len(self._model_path_map)} models from {self.models_path}.")
 
     def listModels(self) -> List[Dict[str, Any]]:
-
-        Lists all available GGUF models, rescanning the directory first.
-        """
-        self._scan_models() # Always rescan when asked for the list
-
+        self._scan_models()
         models_found = []
         for unique_name, model_path in self._model_path_map.items():
             models_found.append({
-                'name': unique_name,
-                '
-                'path': str(model_path), # The full path
-                'size': model_path.stat().st_size
+                'name': unique_name, 'model_name': model_path.name,
+                'path': str(model_path), 'size': model_path.stat().st_size
             })
-
-        # Sort the list alphabetically by the unique name for consistent ordering
         return sorted(models_found, key=lambda x: x['name'])
 
     def __del__(self):
         self.unload_model()
 
     def get_ctx_size(self, model_name: Optional[str] = None) -> Optional[int]:
-        """
-        Retrieves context size for a model from a hardcoded list.
-
-        This method checks if the model name contains a known base model identifier
-        (e.g., 'llama3.1', 'gemma2') to determine its context length. It's intended
-        as a failsafe when the context size cannot be retrieved directly from the
-        Ollama API.
-        """
         if model_name is None:
-            model_name = self.
+            model_name = self.user_provided_model_name or self.initial_model_name_preference
+            if not model_name and self.current_model_path:
+                model_name = self.current_model_path.name
+
+        if model_name is None:
+            ASCIIColors.warning("Cannot determine context size without a model name.")
+            return None
 
-        # Hardcoded context sizes for popular models. More specific names (e.g., 'llama3.1')
-        # should appear, as they will be checked first due to the sorting logic below.
         known_contexts = {
-            'llama3.1': 131072,
-            '
-            '
-            '
-            '
-            '
-            '
-            '
-            '
-            '
-            'gemma': 8192, # Gemma default
-            'phi3': 131072, # Phi-3 variants often use 128K (mini/medium extended)
-            'phi2': 2048, # Phi-2 default
-            'phi': 2048, # Phi default (older)
-            'qwen2.5': 131072, # Qwen2.5 with 128K
-            'qwen2': 32768, # Qwen2 default for 7B
-            'qwen': 8192, # Qwen default
-            'codellama': 16384, # CodeLlama extended
-            'codegemma': 8192, # CodeGemma default
-            'deepseek-coder-v2': 131072, # DeepSeek-Coder V2 with 128K
-            'deepseek-coder': 16384, # DeepSeek-Coder V1 default
-            'deepseek-v2': 131072, # DeepSeek-V2 with 128K
-            'deepseek-llm': 4096, # DeepSeek-LLM default
-            'yi1.5': 32768, # Yi-1.5 with 32K
-            'yi': 4096, # Yi base default
-            'command-r': 131072, # Command-R with 128K
-            'wizardlm2': 32768, # WizardLM2 (Mistral-based)
-            'wizardlm': 16384, # WizardLM default
-            'zephyr': 65536, # Zephyr beta (Mistral-based extended)
-            'vicuna': 2048, # Vicuna default (up to 16K in some variants)
-            'falcon': 2048, # Falcon default
-            'starcoder': 8192, # StarCoder default
-            'stablelm': 4096, # StableLM default
-            'orca2': 4096, # Orca 2 default
-            'orca': 4096, # Orca default
-            'dolphin': 32768, # Dolphin (often Mistral-based)
-            'openhermes': 8192, # OpenHermes default
+            'llama3.1': 131072, 'llama3.2': 131072, 'llama3.3': 131072, 'llama3': 8192,
+            'llama2': 4096, 'mixtral8x22b': 65536, 'mixtral': 32768, 'mistral': 32768,
+            'gemma3': 131072, 'gemma2': 8192, 'gemma': 8192, 'phi3': 131072, 'phi2': 2048,
+            'phi': 2048, 'qwen2.5': 131072, 'qwen2': 32768, 'qwen': 8192,
+            'codellama': 16384, 'codegemma': 8192, 'deepseek-coder-v2': 131072,
+            'deepseek-coder': 16384, 'deepseek-v2': 131072, 'deepseek-llm': 4096,
+            'yi1.5': 32768, 'yi': 4096, 'command-r': 131072, 'wizardlm2': 32768,
+            'wizardlm': 16384, 'zephyr': 65536, 'vicuna': 2048, 'falcon': 2048,
+            'starcoder': 8192, 'stablelm': 4096, 'orca2': 4096, 'orca': 4096,
+            'dolphin': 32768, 'openhermes': 8192,
         }
-
         normalized_model_name = model_name.lower().strip()
-
-        # Sort keys by length in descending order. This ensures that a more specific
-        # name like 'llama3.1' is checked before a less specific name like 'llama3'.
         sorted_base_models = sorted(known_contexts.keys(), key=len, reverse=True)
 
         for base_name in sorted_base_models:
             if base_name in normalized_model_name:
                 context_size = known_contexts[base_name]
-                ASCIIColors.
-                    f"Using hardcoded context size for model '{model_name}' "
-                    f"based on base name '{base_name}': {context_size}"
-                )
+                ASCIIColors.info(f"Using hardcoded context size for '{model_name}' based on '{base_name}': {context_size}")
                 return context_size
 
         ASCIIColors.warning(f"Context size not found for model '{model_name}' in the hardcoded list.")
         return None
 
 if __name__ == '__main__':
-    global full_streamed_text
+    global full_streamed_text
     full_streamed_text = ""
     ASCIIColors.yellow("Testing LlamaCppServerBinding...")
 
-    # --- Configuration ---
-    # This should be the NAME of your GGUF model file.
-    # Ensure this model is placed in your models_path directory.
-    # Example: models_path = "E:\\lollms\\models\\gguf" (Windows)
-    # model_name = "Mistral-Nemo-Instruct-2407-Q2_K.gguf"
-
-    # For CI/local testing without specific paths, you might download a tiny model
-    # or require user to set environment variables for these.
-    # For this example, replace with your actual paths/model.
     try:
         models_path_str = os.environ.get("LOLLMS_MODELS_PATH", str(Path(__file__).parent / "test_models"))
-        model_name_str = os.environ.get("LOLLMS_TEST_MODEL_GGUF", "tinyllama-1.1b-chat-v1.0.Q2_K.gguf")
-        llava_model_name_str = os.environ.get("LOLLMS_TEST_LLAVA_MODEL_GGUF", "llava-v1.5-7b.Q2_K.gguf") # Placeholder
-        llava_clip_name_str = os.environ.get("LOLLMS_TEST_LLAVA_CLIP", "mmproj-model2-q4_0.gguf") # Placeholder
-
-        models_path = Path(models_path_str)
-        models_path.mkdir(parents=True, exist_ok=True) # Ensure test_models dir exists
+        model_name_str = os.environ.get("LOLLMS_TEST_MODEL_GGUF", "tinyllama-1.1b-chat-v1.0.Q2_K.gguf")
 
-
+        models_path = Path(models_path_str)
+        models_path.mkdir(parents=True, exist_ok=True)
         test_model_path = models_path / model_name_str
-
-
+
+        primary_model_available = test_model_path.exists()
+        if not primary_model_available:
+            ASCIIColors.warning(f"Test model {test_model_path} not found. Please place a GGUF model there or set env vars.")
             ASCIIColors.warning("Some tests will be skipped.")
-            # sys.exit(1) # Or allow to continue with skips
-            primary_model_available = False
-        else:
-            primary_model_available = True
 
     except Exception as e:
         ASCIIColors.error(f"Error setting up test paths: {e}"); trace_exception(e)
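The rewritten get_ctx_size keeps the pre-1.0.0 lookup strategy, just with a condensed table: normalize the model name, then test known base names longest-first so that 'llama3.1' wins over 'llama3'. A self-contained sketch of that matching rule with a trimmed table (values copied from the hunk above):

from typing import Optional

KNOWN_CONTEXTS = {
    'llama3.1': 131072, 'llama3': 8192,
    'qwen2.5': 131072, 'qwen2': 32768, 'qwen': 8192,
}

def guess_ctx_size(model_name: str) -> Optional[int]:
    # Return the context size of the first known base name contained in model_name.
    normalized = model_name.lower().strip()
    # Longest keys first, so 'qwen2.5' is tested before 'qwen2' and 'qwen'.
    for base_name in sorted(KNOWN_CONTEXTS, key=len, reverse=True):
        if base_name in normalized:
            return KNOWN_CONTEXTS[base_name]
    return None

print(guess_ctx_size("Qwen2.5-7B-Instruct-Q4_K_M.gguf"))  # 131072, not 32768
print(guess_ctx_size("some-unknown-model.gguf"))          # None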
@@ -1017,184 +921,106 @@ if __name__ == '__main__':
 
     active_binding1: Optional[LlamaCppServerBinding] = None
     active_binding2: Optional[LlamaCppServerBinding] = None
-
-
+
     try:
         if primary_model_available:
-
-
-            ASCIIColors.info("Testing default model selection (model_name=None)")
+            # --- Test 1: Auto-start server on first generation call ---
+            ASCIIColors.cyan("\n--- Test 1: Auto-start server with specified model name ---")
             active_binding1 = LlamaCppServerBinding(
-                model_name=
+                model_name=model_name_str, models_path=str(models_path), config=binding_config
             )
+            ASCIIColors.info("Binding1 initialized. No server should be running yet.")
+            ASCIIColors.info(f"Initial model info: {json.dumps(active_binding1.get_model_info(), indent=2)}")
+
+            prompt_text = "What is the capital of France?"
+            generated_text = active_binding1.generate_text(prompt_text, system_prompt="Concise expert.", n_predict=20, stream=False)
+
+            if isinstance(generated_text, str) and "Paris" in generated_text:
+                ASCIIColors.green(f"SUCCESS: Auto-start generation successful. Response: {generated_text}")
+            else:
+                ASCIIColors.error(f"FAILURE: Auto-start generation failed. Response: {generated_text}")
+
+            ASCIIColors.info(f"Model info after auto-start: {json.dumps(active_binding1.get_model_info(), indent=2)}")
             if not active_binding1.server_process or not active_binding1.server_process.is_healthy:
-
-            ASCIIColors.green(f"Binding1 initialized with default model. Server for '{active_binding1.current_model_path.name}' running on port {active_binding1.port}.")
-            ASCIIColors.info(f"Binding1 Model Info: {json.dumps(active_binding1.get_model_info(), indent=2)}")
+                raise RuntimeError("Server for binding1 did not seem to start correctly.")
 
-
-
-            model_to_load_explicitly = active_binding1.user_provided_model_name
+            # --- Test 2: Server reuse with a second binding ---
+            ASCIIColors.cyan("\n--- Test 2: Server reuse with a second binding ---")
             active_binding2 = LlamaCppServerBinding(
-                model_name=
+                model_name=model_name_str, models_path=str(models_path), config=binding_config
             )
-
-
-
-
+            # This call should reuse the server from binding1
+            generated_text_b2 = active_binding2.generate_text("Ping", n_predict=5, stream=False)
+            if isinstance(generated_text_b2, str):
+                ASCIIColors.green(f"SUCCESS: Binding2 generation successful. Response: {generated_text_b2}")
+            else:
+                ASCIIColors.error(f"FAILURE: Binding2 generation failed. Response: {generated_text_b2}")
 
             if active_binding1.port != active_binding2.port:
-                ASCIIColors.error("
+                ASCIIColors.error("FAILURE: Bindings for the same model are using different ports! Server sharing failed.")
             else:
-                ASCIIColors.green("SUCCESS: Both bindings use the same server port. Server sharing
-
-            # --- List Models (scans configured directories) ---
-            ASCIIColors.cyan("\n--- Listing Models (from search paths, using binding1) ---")
-            # Create a dummy duplicate model to test unique naming
-            duplicate_folder = models_path / "subdir"
-            duplicate_folder.mkdir(exist_ok=True)
-            duplicate_model_path = duplicate_folder / test_model_path.name
-            import shutil
-            shutil.copy(test_model_path, duplicate_model_path)
-            ASCIIColors.info(f"Created a duplicate model for testing: {duplicate_model_path}")
-
-            listed_models = active_binding1.listModels()
-            if listed_models:
-                ASCIIColors.green(f"Found {len(listed_models)} GGUF files.")
-                pprint.pprint(listed_models)
-                # Check if the duplicate was handled
-                names = [m['name'] for m in listed_models]
-                if test_model_path.name in names and f"subdir/{test_model_path.name}" in names:
-                    ASCIIColors.green("SUCCESS: Duplicate model names were correctly handled.")
-                else:
-                    ASCIIColors.error("FAILURE: Duplicate model names were not handled correctly.")
-            else: ASCIIColors.warning("No GGUF models found in search paths.")
-
-            # Clean up dummy duplicate
-            duplicate_model_path.unlink()
-            duplicate_folder.rmdir()
-
-
-            # --- Tokenize/Detokenize ---
-            ASCIIColors.cyan("\n--- Tokenize/Detokenize (using binding1) ---")
-            sample_text = "Hello, Llama.cpp server world!"
-            tokens = active_binding1.tokenize(sample_text)
-            ASCIIColors.green(f"Tokens for '{sample_text}': {tokens[:10]}...")
-            if tokens:
-                detokenized_text = active_binding1.detokenize(tokens)
-                ASCIIColors.green(f"Detokenized text: {detokenized_text}")
-            else: ASCIIColors.warning("Tokenization returned empty list.")
-
-            # --- Text Generation (Non-Streaming, Chat API, binding1) ---
-            ASCIIColors.cyan("\n--- Text Generation (Non-Streaming, Chat API, binding1) ---")
-            prompt_text = "What is the capital of Germany?"
-            generated_text = active_binding1.generate_text(prompt_text, system_prompt="Concise expert.", n_predict=20, stream=False)
-            if isinstance(generated_text, str): ASCIIColors.green(f"Generated text (binding1): {generated_text}")
-            else: ASCIIColors.error(f"Generation failed (binding1): {generated_text}")
+                ASCIIColors.green("SUCCESS: Both bindings use the same server port. Server sharing works.")
 
-            # ---
-            ASCIIColors.cyan("\n---
-
-
+            # --- Test 3: Unload and auto-reload ---
+            ASCIIColors.cyan("\n--- Test 3: Unload and auto-reload ---")
+            active_binding1.unload_model()
+            ASCIIColors.info("Binding1 unloaded. Ref count should be 1, server still up for binding2.")
 
-
-
-
-
-
-
-            if binding_config.get("embedding"):
-                ASCIIColors.cyan("\n--- Embeddings (binding1) ---")
-                try:
-                    embedding_vector = active_binding1.embed("Test embedding.")
-                    ASCIIColors.green(f"Embedding (first 3 dims): {embedding_vector[:3]}... Dim: {len(embedding_vector)}")
-                except Exception as e_emb: ASCIIColors.warning(f"Could not get embedding: {e_emb}")
-            else: ASCIIColors.yellow("\n--- Embeddings Skipped (embedding: false) ---")
-
-        else: # primary_model_available is False
-            ASCIIColors.warning("Primary test model not available. Skipping most tests.")
-
-
-        # --- LLaVA Test (Conceptual - requires a LLaVA model and mmproj) ---
-        ASCIIColors.cyan("\n--- LLaVA Vision Test (if model available) ---")
-        llava_model_path = models_path / llava_model_name_str
-        llava_clip_path_actual = models_path / llava_clip_name_str # Assuming clip is in models_path too
-
-        if llava_model_path.exists() and llava_clip_path_actual.exists():
-            dummy_image_path = models_path / "dummy_llava_image.png"
-            try:
-                from PIL import Image, ImageDraw
-                img = Image.new('RGB', (150, 70), color = ('magenta')); d = ImageDraw.Draw(img); d.text((10,10), "LLaVA Test", fill=('white')); img.save(dummy_image_path)
-                ASCIIColors.info(f"Created dummy image for LLaVA: {dummy_image_path}")
-
-                llava_binding_config = binding_config.copy()
-                # LLaVA might need specific chat template if server doesn't auto-detect well.
-                # llava_binding_config["chat_template"] = "llava-1.5"
-
-                active_binding_llava = LlamaCppServerBinding(
-                    model_name=str(llava_model_path.name), # Pass filename, let it resolve
-                    models_path=str(models_path),
-                    clip_model_name=str(llava_clip_path_actual.name), # Pass filename for clip
-                    config=llava_binding_config
-                )
-                if not active_binding_llava.server_process or not active_binding_llava.server_process.is_healthy:
-                    raise RuntimeError("LLaVA server failed to start or become healthy.")
-                ASCIIColors.green(f"LLaVA Binding initialized. Server for '{active_binding_llava.current_model_path.name}' running on port {active_binding_llava.port}.")
-                ASCIIColors.info(f"LLaVA Binding Model Info: {json.dumps(active_binding_llava.get_model_info(), indent=2)}")
+            # The server should still be up because binding2 holds a reference
+            with _server_registry_lock:
+                if not _active_servers:
+                    ASCIIColors.error("FAILURE: Server shut down prematurely while still referenced by binding2.")
+                else:
+                    ASCIIColors.green("SUCCESS: Server correctly remained active for binding2.")
 
+            # This call should re-acquire a reference to the same server for binding1
+            generated_text_reloaded = active_binding1.generate_text("Test reload", n_predict=5, stream=False)
+            if isinstance(generated_text_reloaded, str):
+                ASCIIColors.green(f"SUCCESS: Generation after reload successful. Response: {generated_text_reloaded}")
+            else:
+                ASCIIColors.error(f"FAILURE: Generation after reload failed. Response: {generated_text_reloaded}")
 
-
-
-
-                if isinstance(llava_response, str): ASCIIColors.green(f"LLaVA response: {llava_response}")
-                else: ASCIIColors.error(f"LLaVA generation failed: {llava_response}")
+            if active_binding1.port != active_binding2.port:
+                ASCIIColors.error("FAILURE: Port mismatch after reload.")
+            else:
+                ASCIIColors.green("SUCCESS: Correctly re-used same server after reload.")
 
-            except ImportError: ASCIIColors.warning("Pillow not found. Cannot create dummy image for LLaVA.")
-            except Exception as e_llava: ASCIIColors.error(f"LLaVA test error: {e_llava}"); trace_exception(e_llava)
-            finally:
-                if dummy_image_path.exists(): dummy_image_path.unlink()
         else:
-            ASCIIColors.warning(
-
-
-
-
-
-
-            if
-                ASCIIColors.green(f"
-
-            if isinstance(reloaded_gen, str): ASCIIColors.green(f"Post-reload ping (binding1): {reloaded_gen.strip()}")
-            else: ASCIIColors.error(f"Post-reload generation failed (binding1): {reloaded_gen}")
+            ASCIIColors.warning("\n--- Primary model not available, skipping most tests ---")
+
+        # --- Test 4: Initialize with model_name=None and auto-find ---
+        ASCIIColors.cyan("\n--- Test 4: Initialize with model_name=None and auto-find ---")
+        unspecified_binding = LlamaCppServerBinding(model_name=None, models_path=str(models_path), config=binding_config)
+        gen_unspec = unspecified_binding.generate_text("Ping", n_predict=5, stream=False)
+        if primary_model_available:
+            if isinstance(gen_unspec, str):
+                ASCIIColors.green(f"SUCCESS: Auto-find generation successful. Response: {gen_unspec}")
+                ASCIIColors.info(f"Model auto-selected: {unspecified_binding.user_provided_model_name}")
             else:
-                ASCIIColors.error("
-
-
-
-
-
-
-
-
-
-        except Exception as e_main: ASCIIColors.error(f"An unexpected error occurred: {e_main}"); trace_exception(e_main)
+                ASCIIColors.error(f"FAILURE: Auto-find generation failed. Response: {gen_unspec}")
+        else: # If no models, this should fail gracefully
+            if isinstance(gen_unspec, dict) and 'error' in gen_unspec:
+                ASCIIColors.green("SUCCESS: Correctly failed to generate when no models are available.")
+            else:
+                ASCIIColors.error(f"FAILURE: Incorrect behavior when no models are available. Response: {gen_unspec}")
+
+    except Exception as e_main:
+        ASCIIColors.error(f"An unexpected error occurred during testing: {e_main}")
+        trace_exception(e_main)
     finally:
         ASCIIColors.cyan("\n--- Unloading Models and Stopping Servers ---")
         if active_binding1: active_binding1.unload_model(); ASCIIColors.info("Binding1 unloaded.")
         if active_binding2: active_binding2.unload_model(); ASCIIColors.info("Binding2 unloaded.")
-        if active_binding_llava: active_binding_llava.unload_model(); ASCIIColors.info("LLaVA Binding unloaded.")
 
-        # Check if any servers remain (should be none if all bindings unloaded)
         with _server_registry_lock:
             if _active_servers:
-                ASCIIColors.warning(f"Warning: {_active_servers.keys()} servers still in registry after
-                for key, server_proc in list(_active_servers.items()):
+                ASCIIColors.warning(f"Warning: {_active_servers.keys()} servers still in registry after tests.")
+                for key, server_proc in list(_active_servers.items()):
                     ASCIIColors.info(f"Force shutting down stray server: {key}")
                     try: server_proc.shutdown()
                    except Exception as e_shutdown: ASCIIColors.error(f"Error shutting down stray server {key}: {e_shutdown}")
-                    _active_servers.pop(key,None)
-                    _server_ref_counts.pop(key,None)
+                    _active_servers.pop(key, None)
+                    _server_ref_counts.pop(key, None)
             else:
                 ASCIIColors.green("All servers shut down correctly.")
 