lollms-client 0.14.1__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

@@ -0,0 +1,1041 @@
1
+ # bindings/llamacpp_server/binding.py
2
+ import json
3
+ import os
4
+ import pprint
5
+ import re
6
+ import socket
7
+ import subprocess
8
+ import sys
9
+ import threading
10
+ import time
11
+ from pathlib import Path
12
+ from typing import Optional, Callable, List, Union, Dict, Any, Set
13
+ import base64
14
+ import requests # For HTTP client
15
+ from lollms_client.lollms_llm_binding import LollmsLLMBinding
16
+ from lollms_client.lollms_types import MSG_TYPE, ELF_COMPLETION_FORMAT
17
+
18
+ from ascii_colors import ASCIIColors, trace_exception
19
+ import pipmaster as pm
20
+ import platform
21
+
22
+ # Ensure llama-cpp-binaries and requests are installed
23
+ pm.ensure_packages(["requests", "pillow"]) # pillow for dummy image in test
24
+ if not pm.is_installed("llama-cpp-binaries"):
25
+ def install_llama_cpp():
26
+ system = platform.system()
27
+
28
+ if system == "Windows":
29
+ url = "https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124-py3-none-win_amd64.whl"
30
+ elif system == "Linux":
31
+ url = "https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124-py3-none-linux_x86_64.whl"
32
+ else:
33
+ print(f"Unsupported OS: {system}")
34
+ return
35
+ pm.install(url)
36
+ install_llama_cpp()
37
+
38
+ try:
39
+ import llama_cpp_binaries
40
+ except ImportError:
41
+ ASCIIColors.error("llama-cpp-binaries package not found. Please install it.")
42
+ ASCIIColors.error("You can try: pip install llama-cpp-binaries")
43
+ ASCIIColors.error("Or download a wheel from: https://github.com/oobabooga/llama-cpp-binaries/releases")
44
+ llama_cpp_binaries = None
45
+
46
+
47
+ # --- Predefined patterns ---
48
+
49
+ # Quantization type strings (derived from ggml.h, llama.cpp, and common usage)
50
+ # These are the "core component" strings, without separators like '.', '-', or '_'
51
+ _QUANT_COMPONENTS_SET: Set[str] = {
52
+ # K-quants (most common, often with S/M/L suffix, and now XS/XXS)
53
+ "Q2_K", "Q3_K", "Q4_K", "Q5_K", "Q6_K",
54
+ "Q2_K_S", "Q3_K_S", "Q4_K_S", "Q5_K_S", # No Q6_K_S usually
55
+ "Q3_K_M", "Q4_K_M", "Q5_K_M", # No Q2/Q6_K_M usually
56
+ "Q3_K_L", # Only Q3_K_L is common
57
+ # Adding XS and XXS variants for K-quants by analogy with IQ types
58
+ "Q2_K_XS", "Q3_K_XS", "Q4_K_XS", "Q5_K_XS", "Q6_K_XS",
59
+ "Q2_K_XXS", "Q3_K_XXS", "Q4_K_XXS", "Q5_K_XXS", "Q6_K_XXS",
60
+
61
+ # Non-K-quant legacy types
62
+ "Q4_0", "Q4_1", "Q5_0", "Q5_1", "Q8_0",
63
+
64
+ # Floating point types
65
+ "F16", "FP16", "F32", "FP32", "BF16",
66
+
67
+ # IQ (Innovative Quantization) types
68
+ "IQ1_S", "IQ1_M",
69
+ "IQ2_XXS", "IQ2_XS", "IQ2_S", "IQ2_M",
70
+ "IQ3_XXS", "IQ3_S", "IQ3_M",
71
+ "IQ4_NL", "IQ4_XS",
72
+
73
+ # Newer IQ K-Quant variants (IQ types using K-quant style super-blocks)
74
+ "IQ3_M_K", "IQ3_S_K", # Adding IQ3_S_K as it's plausible
75
+ "IQ4_XS_K", "IQ4_NL_K", # Adding IQ4_NL_K as it's plausible
76
+
77
+ # Basic integer types (less common in user-facing LLM filenames as primary quantizer)
78
+ "I8", "I16", "I32",
79
+
80
+ # Special GGUF type names that might appear (from ggml.c `ggml_type_name`)
81
+ "ALL_F32", "MOSTLY_F16", "MOSTLY_Q4_0", "MOSTLY_Q4_1", "MOSTLY_Q5_0", "MOSTLY_Q5_1",
82
+ "MOSTLY_Q8_0",
83
+ "MOSTLY_Q2_K", "MOSTLY_Q3_K_S", "MOSTLY_Q3_K_M", "MOSTLY_Q3_K_L",
84
+ "MOSTLY_Q4_K_S", "MOSTLY_Q4_K_M", "MOSTLY_Q5_K_S", "MOSTLY_Q5_K_M", "MOSTLY_Q6_K",
85
+ "MOSTLY_IQ1_S", "MOSTLY_IQ1_M", # Adding these
86
+ "MOSTLY_IQ2_XXS", "MOSTLY_IQ2_XS", "MOSTLY_IQ2_S", "MOSTLY_IQ2_M",
87
+ "MOSTLY_IQ3_XXS", "MOSTLY_IQ3_S", "MOSTLY_IQ3_M", # Adding IQ3_M, IQ3_S
88
+ "MOSTLY_IQ4_NL", "MOSTLY_IQ4_XS"
89
+ }
90
+
91
+ # Common descriptive suffixes for model names
92
+ _MODEL_NAME_SUFFIX_COMPONENTS_SET: Set[str] = {
93
+ "instruct", "chat", "GGUF", "HF", "ggml", "pytorch", "AWQ", "GPTQ", "EXL2",
94
+ "base", "cont", "continue", "ft", # Fine-tuning related
95
+ "v0.1", "v0.2", "v1.0", "v1.1", "v1.5", "v1.6", "v2.0", # Common version tags if they are truly suffixes
96
+ # Be cautious with general version numbers (e.g., "v1", "v2") or model sizes (e.g., "7b")
97
+ # as they are often integral parts of the base name. Only add if they are
98
+ # *always* extraneous suffixes in your context.
99
+ # The ones above are more specific and often appear as full suffix components.
100
+ }
101
+
102
+ # Combine, ensure uniqueness by using sets, then sort by length descending.
103
+ # Sorting ensures longer patterns (e.g., "Q4_K_M") are checked before
104
+ # shorter sub-patterns (e.g., "Q4_K" or "K_M").
105
+ _ALL_REMOVABLE_COMPONENTS: List[str] = sorted(
106
+ list(_QUANT_COMPONENTS_SET.union(_MODEL_NAME_SUFFIX_COMPONENTS_SET)),
107
+ key=len,
108
+ reverse=True
109
+ )
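+ # e.g. "Q4_K_M" is checked before its substring "Q4_K", so a "-Q4_K_M" tail is stripped as one unit.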
110
+
111
+ def get_gguf_model_base_name(file_path_or_name: Union[str, Path]) -> str:
112
+ """
113
+ Extracts a base model name from a GGUF filename or path by removing
114
+ the .gguf extension and then iteratively stripping known quantization
115
+ patterns and common descriptive suffixes from the end of the name.
116
+
117
+ The stripping is case-insensitive and checks for patterns preceded
118
+ by '.', '-', or '_'.
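+
+ Example: "MyModel-7B-chat.Q4_K_M.gguf" -> "MyModel-7B" (the quant tag and the "chat" suffix are stripped).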
119
+
120
+ Args:
121
+ file_path_or_name: The file path (as a string or Path object)
122
+ or just the filename string.
123
+
124
+ Returns:
125
+ The derived base model name string.
126
+ """
127
+ if isinstance(file_path_or_name, str):
128
+ p = Path(file_path_or_name)
129
+ elif isinstance(file_path_or_name, Path):
130
+ p = file_path_or_name
131
+ else:
132
+ raise TypeError(
133
+ "Input must be a string or Path object. "
134
+ f"Got: {type(file_path_or_name)}"
135
+ )
136
+
137
+ name_part = p.name # Full filename, e.g., "MyModel-7B-chat.Q4_K_M.gguf"
138
+
139
+ # 1. Remove .gguf extension (case-insensitive)
140
+ if name_part.lower().endswith(".gguf"):
141
+ name_part = name_part[:-5] # Remove last 5 chars: ".gguf"
142
+
143
+ # 2. Iteratively strip known components (quantization, common suffixes)
144
+ # These components are usually preceded by '.', '-', or '_'
145
+ while True:
146
+ original_name_part_len = len(name_part)
147
+ stripped_in_this_iteration = False
148
+
149
+ for component in _ALL_REMOVABLE_COMPONENTS:
150
+ component_lower = component.lower()
151
+ # Check for patterns like ".component", "-component", or "_component"
152
+ for separator in [".", "-", "_"]:
153
+ pattern_to_check = f"{separator}{component_lower}"
154
+ if name_part.lower().endswith(pattern_to_check):
155
+ # Remove from the original-case name_part
156
+ name_part = name_part[:-(len(pattern_to_check))]
157
+ stripped_in_this_iteration = True
158
+ break # Break from separator loop
159
+ if stripped_in_this_iteration:
160
+ break # Break from component loop (found a match, restart while loop with shorter name_part)
161
+
162
+ # If no component was stripped in a full pass through _ALL_REMOVABLE_COMPONENTS,
163
+ # or if name_part became empty, we're done.
164
+ if not stripped_in_this_iteration or not name_part:
165
+ break
166
+
167
+ # 3. Final cleanup: remove trailing separators if any are left after stripping
168
+ while name_part and name_part[-1] in ['.', '-', '_']:
169
+ name_part = name_part[:-1]
170
+
171
+ return name_part
172
+
173
+
174
+ BindingName = "LlamaCppServerBinding"
175
+ DEFAULT_LLAMACPP_SERVER_HOST = "127.0.0.1"
176
+ DEFAULT_LLAMACPP_SERVER_PORT = 9641
177
+ # Manages a local llama.cpp server subprocess (start, health check, stop).
178
+ class LlamaCppServerProcess:
179
+ def __init__(self, model_path: Union[str, Path], clip_model_path: Optional[str] = None, server_binary_path: Optional[str] = None, port: Optional[int] = None, server_args: Optional[Dict[str, Any]] = None):
180
+ self.model_path = Path(model_path)
181
+ self.clip_model_path = clip_model_path
182
+ if server_binary_path is None:
183
+ server_binary_path = llama_cpp_binaries.get_binary_path()
184
+ self.server_binary_path = Path(server_binary_path)
185
+ self.port = port if port else DEFAULT_LLAMACPP_SERVER_PORT
186
+ self.server_args = server_args or {}
187
+ self.process: Optional[subprocess.Popen] = None
188
+ self.session = requests.Session()
189
+ self.host = DEFAULT_LLAMACPP_SERVER_HOST
190
+ self.base_url = f"http://{self.host}:{self.port}"
191
+ self.is_healthy = False
192
+ self._stderr_lines = [] # Store last few stderr lines for debugging
193
+ self._stderr_thread = None
194
+
195
+ if not self.model_path.exists():
196
+ raise FileNotFoundError(f"Model file not found: {self.model_path}")
197
+ if not self.server_binary_path.exists():
198
+ raise FileNotFoundError(f"Llama.cpp server binary not found: {self.server_binary_path}")
199
+
200
+ self._start_server()
201
+
202
+ def _filter_stderr(self, stderr_pipe):
203
+ try:
204
+ for line in iter(stderr_pipe.readline, ''):
205
+ if line:
206
+ self._stderr_lines.append(line.strip())
207
+ if len(self._stderr_lines) > 50: # Keep last 50 lines
208
+ self._stderr_lines.pop(0)
209
+ # Simple progress or key info logging
210
+ if "llama_model_loaded" in line or "error" in line.lower() or "failed" in line.lower():
211
+ ASCIIColors.debug(f"[LLAMA_SERVER_STDERR] {line.strip()}")
212
+ elif "running" in line and "port" in line: # Server startup message
213
+ ASCIIColors.info(f"[LLAMA_SERVER_STDERR] {line.strip()}")
214
+
215
+ except ValueError: # Pipe closed
216
+ pass
217
+ except Exception as e:
218
+ ASCIIColors.warning(f"Exception in stderr filter thread: {e}")
219
+
220
+
221
+ def _start_server(self, is_embedding=False):
222
+ cmd = [
223
+ str(self.server_binary_path),
224
+ "--model", str(self.model_path),
225
+ "--host", self.host,
226
+ "--port", str(self.port),
227
+ # Add other common defaults or arguments from self.server_args
228
+ ]
229
+
230
+ # Common arguments mapping from LlamaCppBinding to server CLI args
231
+ # (This needs to be kept in sync with llama.cpp server's CLI)
232
+ arg_map = {
233
+ "n_ctx": "--ctx-size", "n_gpu_layers": "--gpu-layers", "main_gpu": "--main-gpu",
234
+ "tensor_split": "--tensor-split", "use_mmap": (lambda v: ["--no-mmap"] if not v else []),
235
+ "use_mlock": (lambda v: ["--mlock"] if v else []), "seed": "--seed",
236
+ "n_batch": "--batch-size", "n_threads": "--threads", "n_threads_batch": "--threads-batch",
237
+ "rope_scaling_type": "--rope-scaling", "rope_freq_base": "--rope-freq-base",
238
+ "rope_freq_scale": "--rope-freq-scale",
239
+ "embedding": (lambda v: ["--embedding"] if is_embedding else []), # Server needs to be started with embedding support
240
+ "verbose": (lambda v: ["--verbose"] if v else []),
241
+ "chat_template": "--chat-template", # For newer servers if they support jinja chat templates
242
+ # Old llama.cpp server used --chatml or specific format flags
243
+ }
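+ # Illustrative mapping: server_args {"n_ctx": 4096, "n_gpu_layers": 0, "use_mlock": True}
+ # would append: --ctx-size 4096 --gpu-layers 0 --mlock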
244
+
245
+ # For LLaVA, specific args are needed
246
+ if self.clip_model_path:
247
+ cmd.extend(["--mmproj", str(self.clip_model_path)])
248
+ # The server might automatically detect LLaVA chat format or need a specific flag
249
+ # e.g., --chat-template llava-1.5 (if server supports templates)
250
+ # For older servers, a specific chat format flag like --chatml with LLaVA prompt structure was used.
251
+ # The server from llama-cpp-binaries is usually quite up-to-date.
252
+
253
+ for key, cli_arg in arg_map.items():
254
+ val = self.server_args.get(key)
255
+ if val is not None:
256
+ if callable(cli_arg): # For args like --no-mmap
257
+ cmd.extend(cli_arg(val))
258
+ else:
259
+ cmd.extend([cli_arg, str(val)])
260
+
261
+ # Add any extra CLI flags directly
262
+ extra_cli_flags = self.server_args.get("extra_cli_flags", [])
263
+ if isinstance(extra_cli_flags, str): # If it's a string, split it
264
+ extra_cli_flags = extra_cli_flags.split()
265
+ cmd.extend(extra_cli_flags)
266
+
267
+
268
+ ASCIIColors.info(f"Starting Llama.cpp server with command: {' '.join(cmd)}")
269
+
270
+ # Prevent paths with spaces from breaking the command on some OS, though Popen usually handles this.
271
+ # For safety, ensure paths are quoted if necessary, or rely on Popen's list-based command.
272
+
273
+ env = os.environ.copy()
274
+ # On Linux, it might be necessary to set LD_LIBRARY_PATH if server binary has shared lib dependencies in its folder
275
+ if os.name == 'posix' and self.server_binary_path.parent != Path('.'):
276
+ lib_path_str = str(self.server_binary_path.parent.resolve())
277
+ current_ld_path = env.get('LD_LIBRARY_PATH', '')
278
+ if current_ld_path:
279
+ env['LD_LIBRARY_PATH'] = f"{lib_path_str}:{current_ld_path}"
280
+ else:
281
+ env['LD_LIBRARY_PATH'] = lib_path_str
282
+
283
+ try:
284
+ ASCIIColors.green(f"running server: {' '.join(cmd)}")
285
+ self.process = subprocess.Popen(
286
+ cmd,
287
+ stderr=subprocess.PIPE,
288
+ stdout=subprocess.PIPE, # Capture stdout as well for debugging
289
+ text=True,
290
+ bufsize=1, # Line buffered
291
+ env=env
292
+ )
293
+ except Exception as e:
294
+ ASCIIColors.error(f"Failed to start llama.cpp server process: {e}")
295
+ trace_exception(e)
296
+ raise
297
+
298
+ # Start stderr/stdout reading threads
299
+ self._stderr_thread = threading.Thread(target=self._filter_stderr, args=(self.process.stderr,), daemon=True)
300
+ self._stderr_thread.start()
301
+ # stdout is captured too; drain it with the same filter so a full pipe buffer cannot block the server
302
+ self._stdout_thread = threading.Thread(target=self._filter_stderr, args=(self.process.stdout,), daemon=True)
+ self._stdout_thread.start()
303
+
304
+
305
+ # Wait for server to be healthy
306
+ health_url = f"{self.base_url}/health"
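+ # Poll /health until it returns HTTP 200 with {"status": "ok"} (checked below) or the timeout expires.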
307
+ max_wait_time = self.server_args.get("server_startup_timeout", 60) # seconds
308
+ start_time = time.time()
309
+
310
+ while time.time() - start_time < max_wait_time:
311
+ if self.process.poll() is not None:
312
+ exit_code = self.process.poll()
313
+ stderr_output = "\n".join(self._stderr_lines[-10:]) # Last 10 lines
314
+ raise RuntimeError(f"Llama.cpp server process terminated unexpectedly with exit code {exit_code} during startup. Stderr:\n{stderr_output}")
315
+ try:
316
+ response = self.session.get(health_url, timeout=2)
317
+ if response.status_code == 200 and response.json().get("status") == "ok":
318
+ self.is_healthy = True
319
+ ASCIIColors.green(f"Llama.cpp server started successfully on port {self.port}.")
320
+ return
321
+ except requests.exceptions.ConnectionError:
322
+ time.sleep(1) # Wait and retry
323
+ except Exception as e:
324
+ ASCIIColors.warning(f"Health check failed: {e}")
325
+ time.sleep(1)
326
+
327
+ self.is_healthy = False
328
+ self.stop() # Ensure process is killed if health check failed
329
+ stderr_output = "\n".join(self._stderr_lines[-10:])
330
+ raise TimeoutError(f"Llama.cpp server failed to become healthy on port {self.port} within {max_wait_time}s. Stderr:\n{stderr_output}")
331
+
332
+ def stop(self):
333
+ self.is_healthy = False
334
+ if self.process:
335
+ ASCIIColors.info(f"Stopping Llama.cpp server (PID: {self.process.pid})...")
336
+ try:
337
+ # Try graceful termination first
338
+ if os.name == 'nt': # Windows
339
+ # Sending CTRL_C_EVENT to the process group might be more effective for console apps
340
+ # self.process.send_signal(signal.CTRL_C_EVENT) # Requires creationflags=subprocess.CREATE_NEW_PROCESS_GROUP
341
+ self.process.terminate() # For Windows, terminate is often like kill
342
+ else: # POSIX
343
+ self.process.terminate() # Sends SIGTERM
344
+
345
+ self.process.wait(timeout=10) # Wait for graceful shutdown
346
+ except subprocess.TimeoutExpired:
347
+ ASCIIColors.warning("Llama.cpp server did not terminate gracefully, killing...")
348
+ self.process.kill() # Force kill
349
+ try:
350
+ self.process.wait(timeout=5)
351
+ except subprocess.TimeoutExpired:
352
+ ASCIIColors.error("Failed to kill llama.cpp server process.")
353
+ except Exception as e:
354
+ ASCIIColors.error(f"Error during server stop: {e}")
355
+ finally:
356
+ self.process = None
357
+ if self._stderr_thread and self._stderr_thread.is_alive():
358
+ self._stderr_thread.join(timeout=1) # Wait for thread to finish
359
+ ASCIIColors.info("Llama.cpp server stopped.")
360
+
361
+
362
+ class LlamaCppServerBinding(LollmsLLMBinding):
363
+ """
364
+ Binding for llama.cpp server using pre-compiled binaries.
365
+ Manages a local llama.cpp server subprocess and communicates via HTTP.
366
+ """
367
+ # Default parameters for the llama.cpp server
368
+ DEFAULT_SERVER_ARGS = {
369
+ "n_gpu_layers": 0,
370
+ "n_ctx": 128000,
371
+ "n_batch": 512,
372
+ "embedding": False, # Enable if embeddings are needed via /embedding or /v1/embeddings
373
+ "verbose": False,
374
+ "server_startup_timeout": 120, # seconds
375
+ # "chat_format": "chatml", # Deprecated in favor of --chat-template, but some old servers might need it
376
+ # For LLaVA
377
+ # "clip_model_path": None,
378
+ # "chat_template": "llava-1.5" # if server supports it. Or specific prompt structure.
379
+ }
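+ # Illustrative override (config/kwargs are merged over these defaults in __init__ below), e.g.:
+ # LlamaCppServerBinding(model_name="model.gguf", models_path="/path/to/models", config={"n_gpu_layers": -1, "n_ctx": 4096})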
380
+
381
+ def __init__(self,
382
+ model_name: str, # Name of the GGUF file (e.g., "mistral-7b-instruct-v0.2.Q4_K_M.gguf")
383
+ models_path: str,
384
+ clip_model_name: str = None,
385
+ config: Optional[Dict[str, Any]] = None, # Binding specific config from global_config.yaml
386
+ default_completion_format: ELF_COMPLETION_FORMAT = ELF_COMPLETION_FORMAT.Chat,
387
+ **kwargs # Overrides for server_args
388
+ ):
389
+
390
+ super().__init__(binding_name=BindingName)
391
+
392
+ if llama_cpp_binaries is None:
393
+ raise ImportError("llama-cpp-binaries package is required but not found.")
394
+
395
+ self.models_path = Path(models_path)
396
+ self.model_name = model_name
397
+ self.model_path = self.models_path/self.model_name
398
+ self.clip_model_path = self.models_path/clip_model_name if clip_model_name else None
399
+ self.default_completion_format = default_completion_format
400
+
401
+ self.server_args = {**self.DEFAULT_SERVER_ARGS, **(config or {})}
402
+ self.server_args.update(kwargs) # Apply direct kwargs overrides
403
+
404
+ self.server_binary_path = self._get_server_binary_path()
405
+ self.current_model_path: Optional[Path] = None
406
+ self.server_process: Optional[LlamaCppServerProcess] = None
407
+ self.port: Optional[int] = None
408
+
409
+ # Attempt to load the model (which starts the server)
410
+ self.load_model(str(self.model_path))
411
+
412
+ def _get_server_binary_path(self) -> Path:
413
+ try:
414
+ # Check if a custom path is provided in config
415
+ custom_path_str = self.server_args.get("llama_server_binary_path")
416
+ if custom_path_str:
417
+ custom_path = Path(custom_path_str)
418
+ if custom_path.exists() and custom_path.is_file():
419
+ ASCIIColors.info(f"Using custom llama.cpp server binary path: {custom_path}")
420
+ return custom_path
421
+ else:
422
+ ASCIIColors.warning(f"Custom llama.cpp server binary path '{custom_path_str}' not found or not a file. Falling back.")
423
+
424
+ # Default to using llama_cpp_binaries
425
+ bin_path_str = llama_cpp_binaries.get_binary_path() # specify "server"
426
+ if bin_path_str:
427
+ bin_path = Path(bin_path_str)
428
+ if bin_path.exists() and bin_path.is_file():
429
+ ASCIIColors.info(f"Using llama.cpp server binary from llama-cpp-binaries: {bin_path}")
430
+ return bin_path
431
+
432
+ raise FileNotFoundError("Could not locate llama.cpp server binary via llama-cpp-binaries or custom path.")
433
+
434
+ except Exception as e:
435
+ ASCIIColors.error(f"Error getting llama.cpp server binary path: {e}")
436
+ trace_exception(e)
437
+ # As a last resort, try a common name in system PATH or a known location if Lollms ships one
438
+ # For now, rely on llama-cpp-binaries or explicit config.
439
+ raise FileNotFoundError(
440
+ "Llama.cpp server binary not found. Ensure 'llama-cpp-binaries' is installed "
441
+ "or provide 'llama_server_binary_path' in the binding's configuration."
442
+ ) from e
443
+
444
+ def _resolve_model_path(self, model_path: str) -> Path:
445
+ # Search order:
446
+ # 1. Absolute path
447
+ # 2. Relative to binding-specific models path (e.g., personal_models_path/LlamaCppServerBinding/)
448
+ # 3. Relative to personal_models_path
449
+ # 4. Relative to models_zoo_path
450
+
451
+ model_p = Path(model_path)
452
+ if model_p.is_absolute() and model_p.exists():
453
+ return model_p
454
+
455
+ paths_to_check = []
456
+ binding_specific_folder_name = self.binding_name # "LlamaCppServerBinding"
457
+ paths_to_check.append(self.models_path)
458
+
459
+ for p in paths_to_check:
460
+ candidate = p / model_p
+ if candidate.exists() and candidate.is_file():
461
+ ASCIIColors.info(f"Found model at: {candidate}")
462
+ return candidate
463
+
464
+ raise FileNotFoundError(f"Model '{model_path}' not found in standard Lollms model paths or as an absolute path.")
465
+
466
+ def _find_available_port(self) -> int:
467
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
468
+ s.bind(('', 0)) # Bind to port 0 to get an OS-assigned available port
469
+ return s.getsockname()[1]
470
+
471
+ def load_model(self, model_name: str) -> bool:
472
+ resolved_path = self._resolve_model_path(model_name)
473
+
474
+ if self.server_process and self.server_process.is_healthy and self.current_model_path == resolved_path:
475
+ ASCIIColors.info(f"Model '{model_name}' is already loaded and server is running.")
476
+ return True
477
+
478
+ if self.server_process:
479
+ self.unload_model() # Stop existing server
480
+
481
+ self.model_name = model_name # Store the name provided by user
482
+ self.current_model_path = resolved_path
483
+ self.port = self._find_available_port()
484
+
485
+ ASCIIColors.info(f"Attempting to start Llama.cpp server for model: {self.current_model_path} on port {self.port}")
486
+
487
+ # Prepare server_args specifically for this model load
488
+ current_server_args = self.server_args.copy()
489
+
490
+ if not self.clip_model_path:
491
+ # Try to find a corresponding .mmproj file or allow user to specify in config
492
+ # e.g. if model is llava-v1.5-7b.Q4_K_M.gguf, look for llava-v1.5-7b.mmproj or mmproj-modelname.gguf
493
+ base_name = get_gguf_model_base_name(self.current_model_path.stem) # etc.
494
+
495
+ potential_clip_paths = [
496
+ self.current_model_path.parent / f"{base_name}.mmproj",
497
+ self.current_model_path.parent / f"mmproj-{base_name}.gguf", # Common pattern
498
+ self.current_model_path.with_suffix(".mmproj"),
499
+ ]
500
+ found_clip_path = None
501
+ for p_clip in potential_clip_paths:
502
+ if p_clip.exists():
503
+ found_clip_path = str(p_clip)
504
+ ASCIIColors.info(f"Auto-detected LLaVA clip model: {found_clip_path}")
505
+ break
506
+ if found_clip_path:
507
+ self.clip_model_path = found_clip_path
508
+ # Set a default LLaVA chat template if server supports it, or rely on server auto-detection
509
+ #if not current_server_args.get("chat_template") and not current_server_args.get("chat_format"):
510
+ # current_server_args["chat_template"] = "llava-1.5" # Common default
511
+ else:
512
+ ASCIIColors.warning("Vision capabilities will likely not work. Please ensure the .mmproj file is "
513
+ "next to the model or specify 'clip_model_path' in binding config.")
514
+
515
+
516
+ try:
517
+ self.server_process = LlamaCppServerProcess(
518
+ model_path=str(self.current_model_path),
519
+ clip_model_path = str(self.clip_model_path) if self.clip_model_path else None,
520
+ server_binary_path=str(self.server_binary_path),
521
+ port=self.port,
522
+ server_args=current_server_args,
523
+ )
524
+ return self.server_process.is_healthy
525
+ except Exception as e:
526
+ ASCIIColors.error(f"Failed to load model '{model_name}' and start server: {e}")
527
+ trace_exception(e)
528
+ self.server_process = None
529
+ self.current_model_path = None
530
+ return False
531
+
532
+ def unload_model(self):
533
+ if self.server_process:
534
+ self.server_process.stop()
535
+ self.server_process = None
536
+ self.current_model_path = None
537
+ self.port = None
538
+ ASCIIColors.info("Llama.cpp server and model unloaded.")
539
+
540
+ def _get_request_url(self, endpoint: str) -> str:
541
+ if not self.server_process or not self.server_process.is_healthy:
542
+ raise ConnectionError("Llama.cpp server is not running or not healthy.")
543
+ return f"{self.server_process.base_url}{endpoint}"
544
+
545
+ def _prepare_generation_payload(self,
546
+ prompt: str,
547
+ system_prompt: str = "",
548
+ n_predict: Optional[int] = None,
549
+ temperature: float = 0.7,
550
+ top_k: int = 40,
551
+ top_p: float = 0.9,
552
+ repeat_penalty: float = 1.1,
553
+ repeat_last_n: Optional[int] = 64, # Server calls this repeat_last_n or penalty_last_n
554
+ seed: Optional[int] = None,
555
+ stream: bool = False,
556
+ use_chat_format: bool = True, # True for /v1/chat/completions, False for /completion
557
+ images: Optional[List[str]] = None,
558
+ **extra_params # For things like grammar, mirostat, etc from server_args
559
+ ) -> Dict:
560
+
561
+ # Start with defaults from server_args, then override with call params
562
+ payload_params = {
563
+ "temperature": self.server_args.get("temperature", 0.7),
564
+ "top_k": self.server_args.get("top_k", 40),
565
+ "top_p": self.server_args.get("top_p", 0.9),
566
+ "repeat_penalty": self.server_args.get("repeat_penalty", 1.1),
567
+ "repeat_last_n": self.server_args.get("repeat_last_n", 64),
568
+ "mirostat": self.server_args.get("mirostat_mode", 0), # llama.cpp server uses mirostat (0=disabled, 1=v1, 2=v2)
569
+ "mirostat_tau": self.server_args.get("mirostat_tau", 5.0),
570
+ "mirostat_eta": self.server_args.get("mirostat_eta", 0.1),
571
+ # Add other mappable params from self.server_args like min_p, typical_p, grammar etc.
572
+ }
573
+ if "grammar_string" in self.server_args and self.server_args["grammar_string"]: # From config
574
+ payload_params["grammar"] = self.server_args["grammar_string"]
575
+
576
+ # Override with specific call parameters
577
+ payload_params.update({
578
+ "temperature": temperature, "top_k": top_k, "top_p": top_p,
579
+ "repeat_penalty": repeat_penalty, "repeat_last_n": repeat_last_n,
580
+ })
581
+ if n_predict is not None: payload_params['n_predict'] = n_predict # Server uses n_predict
582
+ if seed is not None: payload_params['seed'] = seed
583
+
584
+ # Filter None values, as server might not like them
585
+ payload_params = {k: v for k, v in payload_params.items() if v is not None}
586
+ payload_params.update(extra_params) # Add any other specific params for this call
587
+
588
+ if use_chat_format: # generate_text already folds the binding's default format into this flag
589
+ # Use /v1/chat/completions format
590
+ messages = []
591
+ if system_prompt and system_prompt.strip():
592
+ messages.append({"role": "system", "content": system_prompt})
593
+
594
+ user_content: Union[str, List[Dict[str, Any]]] = prompt
595
+ if images and self.clip_model_path: # Check if it's a LLaVA setup
596
+ image_parts = []
597
+ for img_path in images:
598
+ try:
599
+ with open(img_path, "rb") as image_file:
600
+ encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
601
+ image_type = Path(img_path).suffix[1:].lower() or "png"
602
+ if image_type == "jpg": image_type = "jpeg"
603
+ # Llama.cpp server expects image data directly for LLaVA with /completion
604
+ # For /v1/chat/completions, it expects OpenAI's format for multimodal
605
+ image_parts.append({
606
+ "type": "image_url",
607
+ "image_url": {"url": f"data:image/{image_type};base64,{encoded_string}"}
608
+ })
609
+ except Exception as ex:
610
+ trace_exception(ex)
611
+ user_content = [{"type": "text", "text": prompt}] + image_parts # type: ignore
612
+
613
+ messages.append({"role": "user", "content": user_content})
614
+
615
+ final_payload = {"messages": messages, "stream": stream, **payload_params}
616
+ # n_predict is max_tokens for OpenAI API
617
+ if 'n_predict' in final_payload:
618
+ final_payload['max_tokens'] = final_payload.pop('n_predict')
619
+
620
+ return final_payload
621
+ else:
622
+ # Use /completion format (legacy or for raw text)
623
+ # For LLaVA with /completion, images are typically passed in a special way in the prompt
624
+ # or via an 'image_data' field if the server supports it.
625
+ # The example class uses tokenized prompt for /completion.
626
+ # For simplicity here, we'll send text prompt, server tokenizes.
627
+ # Llama.cpp server's /completion often expects 'prompt' as string or tokens.
628
+ # If images are involved with /completion, it needs specific handling.
629
+ # Example: 'prompt': "USER: <image>\nWhat is this?\nASSISTANT:", 'image_data': [{'data': base64_image, 'id': 10}]
630
+
631
+ full_prompt = prompt
632
+ if system_prompt and system_prompt.strip():
633
+ # Heuristic for instruct models, actual formatting depends on model/template
634
+ full_prompt = f"{system_prompt}\n\nUSER: {prompt}\nASSISTANT:"
635
+
636
+ final_payload = {"prompt": full_prompt, "stream": stream, **payload_params}
637
+
638
+ if images and self.clip_model_path:
639
+ image_data_list = []
640
+ for i, img_path in enumerate(images):
641
+ try:
642
+ with open(img_path, "rb") as image_file:
643
+ encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
644
+ image_data_list.append({"data": encoded_string, "id": i + 10}) # ID needs to be > 9 for llama.cpp server
645
+ except Exception as e_img:
646
+ ASCIIColors.error(f"Could not encode image {img_path} for /completion: {e_img}")
647
+ if image_data_list:
648
+ final_payload["image_data"] = image_data_list
649
+ # The prompt needs to contain placeholder like USER: <image 1>\n<prompt>\nASSISTANT:
650
+ # This part is tricky and model-dependent. For now, we assume user's prompt is already formatted.
651
+ # Or, the server (if new enough) might handle it with chat_template even for /completion.
652
+
653
+ return final_payload
654
+
655
+
656
+ def generate_text(self,
657
+ prompt: str,
658
+ images: Optional[List[str]] = None,
659
+ system_prompt: str = "",
660
+ n_predict: Optional[int] = None,
661
+ stream: bool = False,
662
+ temperature: Optional[float] = None, # Use binding's default if None
663
+ top_k: Optional[int] = None,
664
+ top_p: Optional[float] = None,
665
+ repeat_penalty: Optional[float] = None,
666
+ repeat_last_n: Optional[int] = None,
667
+ seed: Optional[int] = None,
668
+ streaming_callback: Optional[Callable[[str, int], bool]] = None,
669
+ use_chat_format_override: Optional[bool] = None, # Allow overriding binding's default format
670
+ **generation_kwargs
671
+ ) -> Union[str, Dict[str, Any]]:
672
+
673
+ if not self.server_process or not self.server_process.is_healthy:
674
+ return {"status": False, "error": "Llama.cpp server is not running or not healthy."}
675
+
676
+ _use_chat_format = use_chat_format_override if use_chat_format_override is not None \
677
+ else (self.default_completion_format == ELF_COMPLETION_FORMAT.Chat)
678
+
679
+ payload = self._prepare_generation_payload(
680
+ prompt=prompt, system_prompt=system_prompt, n_predict=n_predict,
681
+ temperature=temperature if temperature is not None else self.server_args.get("temperature",0.7),
682
+ top_k=top_k if top_k is not None else self.server_args.get("top_k",40),
683
+ top_p=top_p if top_p is not None else self.server_args.get("top_p",0.9),
684
+ repeat_penalty=repeat_penalty if repeat_penalty is not None else self.server_args.get("repeat_penalty",1.1),
685
+ repeat_last_n=repeat_last_n if repeat_last_n is not None else self.server_args.get("repeat_last_n",64),
686
+ seed=seed if seed is not None else self.server_args.get("seed", -1), # Use server's default seed if not provided
687
+ stream=stream, use_chat_format=_use_chat_format, images=images,
688
+ **generation_kwargs
689
+ )
690
+
691
+ endpoint = "/v1/chat/completions" if _use_chat_format else "/completion"
692
+ request_url = self._get_request_url(endpoint)
693
+
694
+ # For debugging, print payload (excluding potentially large image data)
695
+ debug_payload = {k:v for k,v in payload.items() if k not in ["image_data"]}
696
+ if "messages" in debug_payload:
697
+ debug_payload["messages"] = [{k:v for k,v in msg.items() if k !="content" or not isinstance(v,list) or not any("image_url" in part for part in v)} for msg in debug_payload["messages"]]
698
+ ASCIIColors.debug(f"Request to {request_url} with payload: {json.dumps(debug_payload, indent=2)[:500]}...")
699
+
700
+ full_response_text = ""
701
+ try:
702
+ response = self.server_process.session.post(request_url, json=payload, stream=stream, timeout=self.server_args.get("generation_timeout", 300))
703
+ response.raise_for_status()
704
+
705
+ if stream:
706
+ for line in response.iter_lines():
707
+ if not line: continue
708
+ line_str = line.decode('utf-8').strip()
709
+ if line_str.startswith('data: '): line_str = line_str[6:]
710
+ if line_str == '[DONE]': break # OpenAI stream end
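+ # Expected stream line shapes (per the parsing below): chat endpoint -> {"choices":[{"delta":{"content":"..."}}]},
+ # /completion endpoint -> {"content": "...", "stop": false, ...}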
711
+
712
+ try:
713
+ chunk_data = json.loads(line_str)
714
+ chunk_content = ""
715
+ if _use_chat_format: # OpenAI /v1/chat/completions format
716
+ delta = chunk_data.get('choices', [{}])[0].get('delta', {})
717
+ chunk_content = delta.get('content', '')
718
+ else: # /completion format
719
+ chunk_content = chunk_data.get('content', '')
720
+
721
+ if chunk_content:
722
+ full_response_text += chunk_content
723
+ if streaming_callback and not streaming_callback(chunk_content, MSG_TYPE.MSG_TYPE_CHUNK):
724
+ # If callback returns False, we should try to stop generation.
725
+ # Llama.cpp server's /completion doesn't have a direct way to stop mid-stream via API.
726
+ # Closing the connection might be the only way if server supports it.
727
+ ASCIIColors.info("Streaming callback requested stop.")
728
+ response.close() # Attempt to signal server by closing connection
729
+ break
730
+ if chunk_data.get('stop', False) or chunk_data.get('stopped_eos',False) or chunk_data.get('stopped_limit',False): # /completion specific stop flags
731
+ break
732
+ except json.JSONDecodeError:
733
+ ASCIIColors.warning(f"Failed to decode JSON stream chunk: {line_str}")
734
+ continue # Or handle error
735
+ return full_response_text
736
+ else: # Not streaming
737
+ response_data = response.json()
738
+ if _use_chat_format:
+ return response_data.get('choices', [{}])[0].get('message', {}).get('content', '')
+ else: # /completion returns the generated text in a top-level "content" field
+ return response_data.get('content', '')
739
+
740
+ except requests.exceptions.RequestException as e:
741
+ error_message = f"Llama.cpp server request error: {e}"
742
+ if e.response is not None:
743
+ try:
744
+ error_details = e.response.json()
745
+ error_message += f" - Details: {error_details.get('error', e.response.text)}"
746
+ except json.JSONDecodeError:
747
+ error_message += f" - Response: {e.response.text[:200]}"
748
+ ASCIIColors.error(error_message)
749
+ return {"status": False, "error": error_message, "details": str(e.response.text if e.response else "No response text")}
750
+ except Exception as ex:
751
+ error_message = f"Llama.cpp generation error: {str(ex)}"
752
+ trace_exception(ex)
753
+ return {"status": False, "error": error_message}
754
+
755
+ def tokenize(self, text: str) -> List[int]:
756
+ if not self.server_process or not self.server_process.is_healthy:
757
+ raise ConnectionError("Llama.cpp server is not running.")
758
+ try:
759
+ response = self.server_process.session.post(self._get_request_url("/tokenize"), json={"content": text})
760
+ response.raise_for_status()
761
+ return response.json().get("tokens", [])
762
+ except Exception as e:
763
+ ASCIIColors.error(f"Tokenization error: {e}"); trace_exception(e)
764
+ return [] # Or raise
765
+
766
+ def detokenize(self, tokens: List[int]) -> str:
767
+ if not self.server_process or not self.server_process.is_healthy:
768
+ raise ConnectionError("Llama.cpp server is not running.")
769
+ try:
770
+ response = self.server_process.session.post(self._get_request_url("/detokenize"), json={"tokens": tokens})
771
+ response.raise_for_status()
772
+ return response.json().get("content", "")
773
+ except Exception as e:
774
+ ASCIIColors.error(f"Detokenization error: {e}"); trace_exception(e)
775
+ return "" # Or raise
776
+
777
+ def count_tokens(self, text: str) -> int:
778
+ return len(self.tokenize(text))
779
+
780
+ def embed(self, text: str, **kwargs) -> List[float]:
781
+ if not self.server_process or not self.server_process.is_healthy:
782
+ raise Exception("Llama.cpp server is not running.")
783
+ if not self.server_args.get("embedding"):
784
+ raise Exception("Embedding support was not enabled in server_args (set 'embedding: true').")
785
+
786
+ try:
787
+ # llama.cpp server has /embedding endpoint (non-OpenAI) and /v1/embeddings (OpenAI-compatible)
788
+ # Let's try /v1/embeddings first for compatibility
789
+ payload = {"input": text}
790
+ if "model" in kwargs: payload["model"] = kwargs["model"] # Can specify model if server handles multiple embedding models (unlikely for llama.cpp server)
791
+
792
+ request_url = self._get_request_url("/v1/embeddings")
793
+ response = self.server_process.session.post(request_url, json=payload)
794
+
795
+ if response.status_code == 404: # Fallback to /embedding if /v1/embeddings not found
796
+ ASCIIColors.debug("Trying /embedding endpoint as /v1/embeddings was not found.")
797
+ request_url = self._get_request_url("/embedding")
798
+ response = self.server_process.session.post(request_url, json={"content": text}) # /embedding uses "content"
799
+
800
+ response.raise_for_status()
801
+ data = response.json()
802
+
803
+ if "data" in data and isinstance(data["data"], list) and "embedding" in data["data"][0]: # /v1/embeddings format
804
+ return data["data"][0]["embedding"]
805
+ elif "embedding" in data and isinstance(data["embedding"], list): # /embedding format
806
+ return data["embedding"]
807
+ else:
808
+ raise ValueError(f"Unexpected embedding response format: {data}")
809
+
810
+ except requests.exceptions.RequestException as e:
811
+ err_msg = f"Llama.cpp server embedding request error: {e}"
812
+ if e.response: err_msg += f" - {e.response.text[:200]}"
813
+ raise Exception(err_msg) from e
814
+ except Exception as ex:
815
+ trace_exception(ex); raise Exception(f"Llama.cpp embedding failed: {str(ex)}") from ex
816
+
817
+ def get_model_info(self) -> dict:
818
+ info = {
819
+ "name": self.binding_name,
820
+ "model_name": self.model_name, # User-provided name
821
+ "model_path": str(self.current_model_path) if self.current_model_path else "Not loaded",
822
+ "loaded": self.server_process is not None and self.server_process.is_healthy,
823
+ "server_args": self.server_args,
824
+ "port": self.port if self.port else "N/A"
825
+ }
826
+ if info["loaded"]:
827
+ # Try to get more info from server's /props or /v1/models
828
+ try:
829
+ props_url = self._get_request_url("/props") # llama.cpp specific
830
+ props_resp = self.server_process.session.get(props_url, timeout=5).json()
831
+ info.update({
832
+ "server_n_ctx": props_resp.get("default_generation_settings",{}).get("n_ctx"), # Example path
833
+ "server_chat_format": props_resp.get("chat_format"),
834
+ "server_clip_model": props_resp.get("mmproj"),
835
+ })
836
+ except Exception: pass # Ignore if /props fails or data missing
837
+
838
+ is_llava = ("llava" in self.model_name.lower() or "bakllava" in self.model_name.lower()) or \
839
+ (self.server_args.get("clip_model_path") is not None) or \
840
+ (info.get("server_clip_model") is not None)
841
+
842
+ info["supports_vision"] = is_llava
843
+ info["supports_structured_output"] = self.server_args.get("grammar_string") is not None
844
+ return info
845
+
846
+ def listModels(self) -> List[Dict[str, str]]:
847
+ # This binding manages one GGUF model at a time by starting a server for it.
848
+ # To "list models", we could scan the Lollms model directories for .gguf files.
849
+ models_found = []
850
+ gguf_pattern = "*.gguf"
851
+
852
+ search_paths = []
853
+ binding_specific_folder_name = self.binding_name
854
+
855
+ search_paths.append(self.models_path)
856
+
857
+ unique_models = set()
858
+ for spath in search_paths:
859
+ if spath.exists() and spath.is_dir():
860
+ for model_file in spath.rglob(gguf_pattern): # rglob for recursive
861
+ if model_file.is_file() and model_file.name not in unique_models:
862
+ models_found.append({
863
+ 'model_name': model_file.name,
864
+ # Path relative to one of the main model roots for display/selection
865
+ 'path_hint': str(model_file.relative_to(spath.parent) if model_file.is_relative_to(spath.parent) else model_file),
866
+ 'size_gb': f"{model_file.stat().st_size / (1024**3):.2f} GB"
867
+ })
868
+ unique_models.add(model_file.name)
869
+ return models_found
870
+
871
+ def __del__(self):
872
+ self.unload_model() # Ensure server is stopped when binding is deleted
873
+
874
+
875
+ if __name__ == '__main__':
876
+ global full_streamed_text
877
+ ASCIIColors.yellow("Testing LlamaCppServerBinding...")
878
+
879
+ # --- Configuration ---
880
+ # This should be the NAME of your GGUF model file. The binding will search for it.
881
+ # e.g., "Mistral-7B-Instruct-v0.2-Q4_K_M.gguf"
882
+ # Ensure this model is placed in one of the Lollms model directories.
883
+ # For testing, you can put a small GGUF model in the same directory as this script
884
+ # and set personal_models_path to "."
885
+
886
+ # Adjust current_directory if your models are elsewhere for testing
887
+ current_directory = Path(__file__).parent
888
+ models_path = "E:\lollms\models\gguf\Mistral-Nemo-Instruct-2407-GGUF" #replace with your own model path
889
+ model_name = "Mistral-Nemo-Instruct-2407-Q2_K.gguf"
890
+
891
+ # Binding config (passed to server_args)
892
+ binding_config = {
893
+ "n_gpu_layers": 0, # Set to -1 or a number for GPU offload
894
+ "n_ctx": 512, # Short context for testing
895
+ "embedding": True, # Enable for embedding tests
896
+ "verbose": False, # llama.cpp server verbose logs
897
+ # "extra_cli_flags": ["--cont-batching"] # Example of extra flags
898
+ "server_startup_timeout": 180 # Give more time for server to start, esp. with large models
899
+ }
900
+
901
+ active_binding = None
902
+ try:
903
+ ASCIIColors.cyan("\n--- Initializing LlamaCppServerBinding ---")
904
+ active_binding = LlamaCppServerBinding(
905
+ model_name=model_name,
906
+ models_path=models_path,
907
+ config=binding_config
908
+ )
909
+ if not active_binding.server_process or not active_binding.server_process.is_healthy:
910
+ raise RuntimeError("Server process failed to start or become healthy.")
911
+
912
+ ASCIIColors.green(f"Binding initialized. Server for '{active_binding.model_name}' running on port {active_binding.port}.")
913
+ ASCIIColors.info(f"Model Info: {json.dumps(active_binding.get_model_info(), indent=2)}")
914
+
915
+
916
+ # --- List Models (scans configured directories) ---
917
+ ASCIIColors.cyan("\n--- Listing Models (from search paths) ---")
918
+ listed_models = active_binding.listModels()
919
+ if listed_models:
920
+ ASCIIColors.green(f"Found {len(listed_models)} GGUF files. First 5:")
921
+ for m in listed_models[:5]: print(m)
922
+ else: ASCIIColors.warning("No GGUF models found in search paths.")
923
+
924
+ # --- Tokenize/Detokenize ---
925
+ ASCIIColors.cyan("\n--- Tokenize/Detokenize ---")
926
+ sample_text = "Hello, Llama.cpp server world!"
927
+ tokens = active_binding.tokenize(sample_text)
928
+ ASCIIColors.green(f"Tokens for '{sample_text}': {tokens[:10]}...")
929
+ token_count = active_binding.count_tokens(sample_text)
930
+ ASCIIColors.green(f"Token count: {token_count}")
931
+ if tokens: # Only detokenize if tokenization worked
932
+ detokenized_text = active_binding.detokenize(tokens)
933
+ ASCIIColors.green(f"Detokenized text: {detokenized_text}")
934
+ # Note: exact match might depend on BOS/EOS handling by server's tokenizer
935
+ # assert detokenized_text.strip() == sample_text.strip(), "Tokenization/Detokenization mismatch!"
936
+ else: ASCIIColors.warning("Tokenization returned empty list, skipping detokenization.")
937
+
938
+ # --- Text Generation (Non-Streaming, Chat Format using /v1/chat/completions) ---
939
+ ASCIIColors.cyan("\n--- Text Generation (Non-Streaming, Chat API) ---")
940
+ prompt_text = "What is the capital of Germany?"
941
+ system_prompt_text = "You are a concise geography expert."
942
+ generated_text = active_binding.generate_text(
943
+ prompt_text, system_prompt=system_prompt_text, n_predict=20, stream=False,
944
+ use_chat_format_override=True # Force /v1/chat/completions
945
+ )
946
+ if isinstance(generated_text, str): ASCIIColors.green(f"Generated text: {generated_text}")
947
+ else: ASCIIColors.error(f"Generation failed: {generated_text}")
948
+
949
+ # --- Text Generation (Streaming, /completion API) ---
950
+ ASCIIColors.cyan("\n--- Text Generation (Streaming, Completion API) ---")
951
+ full_streamed_text = ""
952
+ def stream_callback(chunk: str, msg_type: int):
953
+ global full_streamed_text; ASCIIColors.green(f"{chunk}", end="", flush=True)
954
+ full_streamed_text += chunk; return True
955
+
956
+ result = active_binding.generate_text(
957
+ prompt_text, system_prompt=system_prompt_text, n_predict=30, stream=True,
958
+ streaming_callback=stream_callback, use_chat_format_override=False # Force /completion
959
+ )
960
+ print("\n--- End of Stream ---")
961
+ if isinstance(result, str): ASCIIColors.green(f"Full streamed text: {result}")
962
+ else: ASCIIColors.error(f"Streaming generation failed: {result}")
963
+
964
+ # --- Embeddings ---
965
+ if binding_config.get("embedding"):
966
+ ASCIIColors.cyan("\n--- Embeddings ---")
967
+ embedding_text = "Test sentence for server-based embeddings."
968
+ try:
969
+ embedding_vector = active_binding.embed(embedding_text)
970
+ ASCIIColors.green(f"Embedding for '{embedding_text}' (first 3 dims): {embedding_vector[:3]}...")
971
+ ASCIIColors.info(f"Embedding vector dimension: {len(embedding_vector)}")
972
+ except Exception as e_emb: ASCIIColors.warning(f"Could not get embedding: {e_emb}")
973
+ else: ASCIIColors.yellow("\n--- Embeddings Skipped (embedding: false in config) ---")
974
+
975
+ # --- LLaVA Test (Conceptual - requires a LLaVA model and mmproj) ---
976
+ # To test LLaVA, point models_path/model_name at a LLaVA GGUF (with its .mmproj alongside):
977
+ models_path = r"E:\drumber" # replace with your own model path
978
+ model_name = "llava-v1.6-mistral-7b.Q3_K_XS.gguf"
979
+ model_path = Path(models_path)/model_name
980
+ ASCIIColors.cyan("\n--- LLaVA Vision Test ---")
981
+ dummy_image_path = Path("E:\\drumber\\drumber.png")
982
+ try:
983
+ from PIL import Image, ImageDraw
984
+ img = Image.new('RGB', (150, 70), color = ('magenta'))
985
+ d = ImageDraw.Draw(img); d.text((10,10), "Server LLaVA", fill=('white'))
986
+ img.save(dummy_image_path)
987
+ ASCIIColors.info(f"Created dummy image for LLaVA: {dummy_image_path}")
988
+
989
+ llava_prompt = "Describe this image."
990
+ # For /v1/chat/completions with LLaVA, images are passed in messages.
991
+ # For /completion with LLaVA, prompt needs <image> placeholder and image_data field.
992
+ llava_response = active_binding.generate_text(
993
+ prompt=llava_prompt, images=[str(dummy_image_path)], n_predict=40, stream=False,
994
+ use_chat_format_override=True # Use /v1/chat/completions for easier multimodal
995
+ )
996
+ if isinstance(llava_response, str): ASCIIColors.green(f"LLaVA response: {llava_response}")
997
+ else: ASCIIColors.error(f"LLaVA generation failed: {llava_response}")
998
+ except ImportError: ASCIIColors.warning("Pillow not found. Cannot create dummy image for LLaVA.")
999
+ except Exception as e_llava: ASCIIColors.error(f"LLaVA test error: {e_llava}"); trace_exception(e_llava)
1000
+ finally:
1001
+ if dummy_image_path.exists(): dummy_image_path.unlink()
1002
+
1003
+ # --- Test changing model ---
1004
+ # This part is conceptual. You'd need another GGUF model file for a real test.
1005
+ # For now, we'll just call load_model with the same model to test the logic.
1006
+
1007
+ ASCIIColors.cyan("\n--- Testing Model Change (reloading same model) ---")
1008
+ reload_success = active_binding.load_model(str(active_binding.model_path))
1009
+ if reload_success and active_binding.server_process and active_binding.server_process.is_healthy:
1010
+ ASCIIColors.green(f"Model reloaded/re-confirmed successfully. Server on port {active_binding.port}.")
1011
+ # Quick generation test after reload
1012
+ reloaded_gen = active_binding.generate_text("Ping", n_predict=5, stream=False)
1013
+ if isinstance(reloaded_gen, str): ASCIIColors.green(f"Post-reload ping response: {reloaded_gen.strip()}")
1014
+ else: ASCIIColors.error(f"Post-reload generation failed: {reloaded_gen}")
1015
+ else:
1016
+ ASCIIColors.error("Failed to reload model or server not healthy after reload attempt.")
1017
+
1018
+
1019
+ except ImportError as e_imp:
1020
+ ASCIIColors.error(f"Import error: {e_imp}. Ensure llama-cpp-binaries is installed.")
1021
+ except FileNotFoundError as e_fnf:
1022
+ ASCIIColors.error(f"File not found error: {e_fnf}. Check model or server binary paths.")
1023
+ except ConnectionError as e_conn:
1024
+ ASCIIColors.error(f"Connection error (server might have failed to start or is unresponsive): {e_conn}")
1025
+ except RuntimeError as e_rt:
1026
+ ASCIIColors.error(f"Runtime error (often server process issue): {e_rt}")
1027
+ if active_binding and active_binding.server_process:
1028
+ ASCIIColors.error("Last stderr lines from server:")
1029
+ for line in active_binding.server_process._stderr_lines[-20:]: print(line) # Print last 20
1030
+ except Exception as e_main:
1031
+ ASCIIColors.error(f"An unexpected error occurred: {e_main}")
1032
+ trace_exception(e_main)
1033
+ finally:
1034
+ if active_binding:
1035
+ ASCIIColors.cyan("\n--- Unloading Model and Stopping Server ---")
1036
+ active_binding.unload_model()
1037
+ ASCIIColors.green("Server stopped and model unloaded.")
1038
+
1039
+
1040
+
1041
+ ASCIIColors.yellow("\nLlamaCppServerBinding test finished.")