lollms-client 1.7.10__py3-none-any.whl → 1.8.3__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
@@ -1,1155 +0,0 @@
1
- # bindings/llamacpp_server/binding.py
2
- import json
3
- import os
4
- import pprint
5
- import re
6
- import socket
7
- import subprocess
8
- import sys
9
- import threading
10
- import time
11
- import tempfile
12
- from pathlib import Path
13
- from typing import Optional, Callable, List, Union, Dict, Any, Set
14
- import base64
15
- from lollms_client.lollms_discussion import LollmsDiscussion
16
- import requests # For HTTP client
17
- from lollms_client.lollms_llm_binding import LollmsLLMBinding
18
- from lollms_client.lollms_types import MSG_TYPE, ELF_COMPLETION_FORMAT
19
-
20
- from ascii_colors import ASCIIColors, trace_exception
21
- import pipmaster as pm
22
- import platform
23
-
24
- # --- Multi-process locking for registry ---
25
- # On Windows, we need msvcrt; on POSIX, fcntl.
26
- try:
27
- if platform.system() == "Windows":
28
- import msvcrt
29
- else:
30
- import fcntl
31
- except ImportError:
32
- # This might happen in some restricted environments.
33
- # The binding will fall back to thread-safety only.
34
- msvcrt = fcntl = None
35
-
36
-
37
- class FileLock:
38
- def __init__(self, lock_file_path):
39
- self.lock_file_path = lock_file_path
40
- self.lock_file = None
41
- self._is_windows = platform.system() == "Windows"
42
-
43
- def __enter__(self):
44
- self.lock_file = open(self.lock_file_path, 'w')
45
- if self._is_windows and msvcrt:
46
- msvcrt.locking(self.lock_file.fileno(), msvcrt.LK_LOCK, 1)
47
- elif not self._is_windows and fcntl:
48
- fcntl.flock(self.lock_file.fileno(), fcntl.LOCK_EX)
49
- return self
50
-
51
- def __exit__(self, exc_type, exc_val, exc_tb):
52
- if self.lock_file:
53
- if self._is_windows and msvcrt:
54
- self.lock_file.seek(0)
55
- msvcrt.locking(self.lock_file.fileno(), msvcrt.LK_UNLCK, 1)
56
- elif not self._is_windows and fcntl:
57
- fcntl.flock(self.lock_file.fileno(), fcntl.LOCK_UN)
58
- self.lock_file.close()
59
- self.lock_file = None
60
-
61
- # --- End multi-process locking ---
62
-
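The `FileLock` class above serializes access to the shared registry file across independent processes (byte locking via `msvcrt` on Windows, `flock` via `fcntl` elsewhere). A minimal usage sketch, assuming the `FileLock` class defined above is in scope; the paths mirror the ones `ServerRegistry` uses further down, and the entry values are illustrative:

```python
import json
import tempfile
from pathlib import Path

registry_dir = Path(tempfile.gettempdir()) / "lollms_llamacpp_servers"
registry_dir.mkdir(parents=True, exist_ok=True)
lock_path = registry_dir / "registry.lock"
registry_path = registry_dir / "registry.json"

# Only one process at a time may read-modify-write the registry.
with FileLock(lock_path):  # FileLock is the class defined above
    data = json.loads(registry_path.read_text()) if registry_path.exists() else {}
    data["example_key"] = {"pid": 12345, "port": 9000, "ref_count": 1, "client_pids": [12345]}
    registry_path.write_text(json.dumps(data, indent=2))
# The lock is released when the with-block exits.
```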
63
-
64
- # Ensure llama-cpp-binaries, requests, pillow, and psutil are installed
65
- pm.ensure_packages(["requests", "pillow", "psutil"]) # pillow for dummy image in test, psutil for multi-process management
66
- if not pm.is_installed("llama-cpp-binaries"):
67
- def install_llama_cpp():
68
- system = platform.system()
69
- python_version_simple = f"py{sys.version_info.major}{sys.version_info.minor}" # e.g. py310 for 3.10
70
-
71
- version_tag = "v0.56.0"
72
- cuda_suffix = "+cu124"
73
-
74
- if system == "Windows":
75
- # Try version-specific URL first
76
- url = f"https://github.com/oobabooga/llama-cpp-binaries/releases/download/{version_tag}/llama_cpp_binaries-{version_tag.lstrip('v')}{cuda_suffix}-{python_version_simple}-none-win_amd64.whl"
77
- # Fallback to generic py3 if version-specific doesn't exist
78
- fallback_url = f"https://github.com/oobabooga/llama-cpp-binaries/releases/download/{version_tag}/llama_cpp_binaries-{version_tag.lstrip('v')}{cuda_suffix}-py3-none-win_amd64.whl"
79
- elif system == "Linux":
80
- # Try version-specific URL first
81
- url = f"https://github.com/oobabooga/llama-cpp-binaries/releases/download/{version_tag}/llama_cpp_binaries-{version_tag.lstrip('v')}{cuda_suffix}-{python_version_simple}-none-linux_x86_64.whl"
82
- # Fallback to generic py3 if version-specific doesn't exist
83
- fallback_url = f"https://github.com/oobabooga/llama-cpp-binaries/releases/download/{version_tag}/llama_cpp_binaries-{version_tag.lstrip('v')}{cuda_suffix}-py3-none-linux_x86_64.whl"
84
- else:
85
- ASCIIColors.error(f"Unsupported OS for precompiled llama-cpp-binaries: {system}. "
86
- "You might need to set 'llama_server_binary_path' in the binding config "
87
- "to point to a manually compiled llama.cpp server binary.")
88
- return False
89
-
90
-
91
- ASCIIColors.info(f"Attempting to install llama-cpp-binaries from: {url}")
92
- try:
93
- pm.install(url)
94
- except Exception as e:
95
- ASCIIColors.warning(f"Failed to install specific version from {url}: {e}")
96
- ASCIIColors.info(f"Attempting fallback URL: {fallback_url}")
97
- try:
98
- pm.install(fallback_url)
99
- except Exception as e_fallback:
100
- ASCIIColors.error(f"Failed to install from fallback URL {fallback_url}: {e_fallback}")
101
- ASCIIColors.error("Please try installing llama-cpp-binaries manually, e.g., 'pip install llama-cpp-python[server]' or from a wheel.")
102
-
103
- install_llama_cpp()
104
-
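For concreteness, the installer above would build the following primary and fallback wheel URLs for Python 3.10 on Windows (whether a matching wheel actually exists on that release page is not verified here):

```python
# Reproduces the f-strings above for Python 3.10 on Windows (illustrative only).
version_tag = "v0.56.0"
cuda_suffix = "+cu124"
python_version_simple = "py310"  # f"py{sys.version_info.major}{sys.version_info.minor}"

base = f"https://github.com/oobabooga/llama-cpp-binaries/releases/download/{version_tag}"
url = f"{base}/llama_cpp_binaries-{version_tag.lstrip('v')}{cuda_suffix}-{python_version_simple}-none-win_amd64.whl"
fallback_url = f"{base}/llama_cpp_binaries-{version_tag.lstrip('v')}{cuda_suffix}-py3-none-win_amd64.whl"

print(url)           # .../v0.56.0/llama_cpp_binaries-0.56.0+cu124-py310-none-win_amd64.whl
print(fallback_url)  # .../v0.56.0/llama_cpp_binaries-0.56.0+cu124-py3-none-win_amd64.whl
```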
105
- try:
106
- import llama_cpp_binaries
107
- import psutil
108
- except ImportError:
109
- ASCIIColors.error("llama-cpp-binaries or psutil package not found. Please ensure they are installed.")
110
- ASCIIColors.error("You can try: pip install llama-cpp-python[server] psutil")
111
- llama_cpp_binaries = None
112
- psutil = None
113
-
114
-
115
- # --- Predefined patterns ---
116
- _QUANT_COMPONENTS_SET: Set[str] = {
117
- "Q2_K", "Q3_K", "Q4_K", "Q5_K", "Q6_K", "Q2_K_S", "Q3_K_S", "Q4_K_S", "Q5_K_S",
118
- "Q3_K_M", "Q4_K_M", "Q5_K_M", "Q3_K_L", "Q2_K_XS", "Q3_K_XS", "Q4_K_XS", "Q5_K_XS", "Q6_K_XS",
119
- "Q2_K_XXS", "Q3_K_XXS", "Q4_K_XXS", "Q5_K_XXS", "Q6_K_XXS", "Q4_0", "Q4_1", "Q5_0", "Q5_1", "Q8_0",
120
- "F16", "FP16", "F32", "FP32", "BF16", "IQ1_S", "IQ1_M", "IQ2_XXS", "IQ2_XS", "IQ2_S", "IQ2_M",
121
- "IQ3_XXS", "IQ3_S", "IQ3_M", "IQ4_NL", "IQ4_XS", "IQ3_M_K", "IQ3_S_K", "IQ4_XS_K", "IQ4_NL_K",
122
- "I8", "I16", "I32", "ALL_F32", "MOSTLY_F16", "MOSTLY_Q4_0", "MOSTLY_Q4_1", "MOSTLY_Q5_0", "MOSTLY_Q5_1",
123
- "MOSTLY_Q8_0", "MOSTLY_Q2_K", "MOSTLY_Q3_K_S", "MOSTLY_Q3_K_M", "MOSTLY_Q3_K_L",
124
- "MOSTLY_Q4_K_S", "MOSTLY_Q4_K_M", "MOSTLY_Q5_K_S", "MOSTLY_Q5_K_M", "MOSTLY_Q6_K",
125
- "MOSTLY_IQ1_S", "MOSTLY_IQ1_M", "MOSTLY_IQ2_XXS", "MOSTLY_IQ2_XS", "MOSTLY_IQ2_S", "MOSTLY_IQ2_M",
126
- "MOSTLY_IQ3_XXS", "MOSTLY_IQ3_S", "MOSTLY_IQ3_M", "MOSTLY_IQ4_NL", "MOSTLY_IQ4_XS"
127
- }
128
- _MODEL_NAME_SUFFIX_COMPONENTS_SET: Set[str] = {
129
- "instruct", "chat", "GGUF", "HF", "ggml", "pytorch", "AWQ", "GPTQ", "EXL2",
130
- "base", "cont", "continue", "ft", "v0.1", "v0.2", "v1.0", "v1.1", "v1.5", "v1.6", "v2.0"
131
- }
132
- _ALL_REMOVABLE_COMPONENTS: List[str] = sorted(
133
- list(_QUANT_COMPONENTS_SET.union(_MODEL_NAME_SUFFIX_COMPONENTS_SET)), key=len, reverse=True
134
- )
135
-
136
- def get_gguf_model_base_name(file_path_or_name: Union[str, Path]) -> str:
137
- if isinstance(file_path_or_name, str): p = Path(file_path_or_name)
138
- elif isinstance(file_path_or_name, Path): p = file_path_or_name
139
- else: raise TypeError(f"Input must be a string or Path object. Got: {type(file_path_or_name)}")
140
- name_part = p.stem if p.suffix.lower() == ".gguf" else p.name
141
- if name_part.lower().endswith(".gguf"): name_part = name_part[:-5]
142
- while True:
143
- original_name_part_len = len(name_part)
144
- stripped_in_this_iteration = False
145
- for component in _ALL_REMOVABLE_COMPONENTS:
146
- component_lower = component.lower()
147
- for separator in [".", "-", "_"]:
148
- pattern_to_check = f"{separator}{component_lower}"
149
- if name_part.lower().endswith(pattern_to_check):
150
- name_part = name_part[:-(len(pattern_to_check))]
151
- stripped_in_this_iteration = True; break
152
- if stripped_in_this_iteration: break
153
- if not stripped_in_this_iteration or not name_part: break
154
- while name_part and name_part[-1] in ['.', '-', '_']: name_part = name_part[:-1]
155
- return name_part
156
-
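A quick illustration of the suffix-stripping loop above (assuming `get_gguf_model_base_name` is in scope): quantization tags and common name suffixes are peeled off the right end until nothing in `_ALL_REMOVABLE_COMPONENTS` matches, then trailing separators are trimmed.

```python
# Expected results based on the stripping rules above (illustrative, not a test suite).
print(get_gguf_model_base_name("Mistral-7B-Instruct-v0.2.Q4_K_M.gguf"))  # Mistral-7B
print(get_gguf_model_base_name("tinyllama-1.1b-chat-v1.0.Q2_K.gguf"))    # tinyllama-1.1b
```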
157
- # --- Global Server Registry (File-based for multi-process support) ---
158
-
159
- class ServerRegistry:
160
- def __init__(self):
161
- self.registry_dir = Path(tempfile.gettempdir()) / "lollms_llamacpp_servers"
162
- self.registry_dir.mkdir(parents=True, exist_ok=True)
163
- self.registry_file = self.registry_dir / "registry.json"
164
- self.lock_file = self.registry_dir / "registry.lock"
165
- self.my_pid = os.getpid()
166
-
167
- def _is_pid_running(self, pid: int) -> bool:
168
- if psutil is None: return True # Conservative default if psutil is missing
169
- return psutil.pid_exists(pid)
170
-
171
- def _read_registry(self) -> Dict[str, Any]:
172
- if not self.registry_file.exists():
173
- return {}
174
- try:
175
- with open(self.registry_file, 'r') as f:
176
- return json.load(f)
177
- except (json.JSONDecodeError, FileNotFoundError):
178
- return {}
179
-
180
- def _write_registry(self, data: Dict[str, Any]):
181
- with open(self.registry_file, 'w') as f:
182
- json.dump(data, f, indent=2)
183
-
184
- def _clean_stale_entries(self, registry_data: Dict[str, Any]) -> bool:
185
- """Cleans stale servers and clients. Returns True if changes were made."""
186
- changed = False
187
- # Clean dead servers
188
- dead_servers = [k for k, v in registry_data.items() if not self._is_pid_running(v['pid'])]
189
- for key in dead_servers:
190
- ASCIIColors.warning(f"Registry Cleaner: Found dead server process (PID: {registry_data[key]['pid']}). Removing entry {key}.")
191
- del registry_data[key]
192
- changed = True
193
-
194
- # Clean dead clients from living servers
195
- for key, server_info in list(registry_data.items()):
196
- dead_clients = [pid for pid in server_info.get('client_pids', []) if not self._is_pid_running(pid)]
197
- if dead_clients:
198
- ASCIIColors.warning(f"Registry Cleaner: Found dead client PIDs {dead_clients} for server {key}. Cleaning up.")
199
- server_info['client_pids'] = [pid for pid in server_info['client_pids'] if pid not in dead_clients]
200
- server_info['ref_count'] = len(server_info['client_pids'])
201
- changed = True
202
-
203
- # If a server has no clients left after cleanup, it's an orphan. Remove it.
204
- if server_info['ref_count'] <= 0:
205
- ASCIIColors.warning(f"Registry Cleaner: Server {key} (PID: {server_info['pid']}) has no clients left. Shutting it down.")
206
- try:
207
- p = psutil.Process(server_info['pid'])
208
- p.terminate()
209
- p.wait(timeout=5)
210
- except psutil.NoSuchProcess: pass
211
- except Exception as e: ASCIIColors.error(f"Error terminating orphaned server PID {server_info['pid']}: {e}")
212
- del registry_data[key]
213
- changed = True
214
-
215
- return changed
216
-
217
- def get_server(self, server_key: str) -> Optional[Dict[str, Any]]:
218
- with FileLock(self.lock_file):
219
- registry = self._read_registry()
220
- self._clean_stale_entries(registry) # Always clean before read
221
- server_info = registry.get(server_key)
222
- if server_info:
223
- self._write_registry(registry) # Write back changes from cleaning
224
- return server_info
225
-
226
- def register_new_server(self, server_key: str, pid: int, port: int):
227
- with FileLock(self.lock_file):
228
- registry = self._read_registry()
229
- # Clean just in case something happened between server start and registration
230
- self._clean_stale_entries(registry)
231
-
232
- registry[server_key] = {
233
- "pid": pid, "port": port,
234
- "ref_count": 1, "client_pids": [self.my_pid]
235
- }
236
- self._write_registry(registry)
237
- ASCIIColors.info(f"Process {self.my_pid} registered new server {server_key} (PID: {pid}, Port: {port})")
238
-
239
- def increment_ref_count(self, server_key: str):
240
- with FileLock(self.lock_file):
241
- registry = self._read_registry()
242
- self._clean_stale_entries(registry)
243
-
244
- server_info = registry.get(server_key)
245
- if server_info:
246
- if self.my_pid not in server_info['client_pids']:
247
- server_info['client_pids'].append(self.my_pid)
248
- server_info['ref_count'] = len(server_info['client_pids'])
249
- self._write_registry(registry)
250
- ASCIIColors.info(f"Process {self.my_pid} attached to server {server_key}. New ref_count: {server_info['ref_count']}")
251
- else:
252
- ASCIIColors.warning(f"Process {self.my_pid} tried to attach to non-existent server {server_key}.")
253
-
254
- def decrement_ref_count(self, server_key: str):
255
- with FileLock(self.lock_file):
256
- registry = self._read_registry()
257
- made_changes = self._clean_stale_entries(registry)
258
-
259
- server_info = registry.get(server_key)
260
- if server_info:
261
- if self.my_pid in server_info['client_pids']:
262
- server_info['client_pids'].remove(self.my_pid)
263
- server_info['ref_count'] = len(server_info['client_pids'])
264
- made_changes = True
265
- ASCIIColors.info(f"Process {self.my_pid} detached from server {server_key}. New ref_count: {server_info['ref_count']}")
266
-
267
- if server_info['ref_count'] <= 0:
268
- ASCIIColors.info(f"Last client (PID: {self.my_pid}) detached. Shutting down server {server_key} (PID: {server_info['pid']}).")
269
- try:
270
- p = psutil.Process(server_info['pid'])
271
- p.terminate()
272
- p.wait(timeout=10)
273
- except psutil.NoSuchProcess:
274
- ASCIIColors.warning(f"Server process {server_info['pid']} was already gone.")
275
- except Exception as e:
276
- ASCIIColors.error(f"Error terminating server process {server_info['pid']}: {e}")
277
- del registry[server_key]
278
-
279
- if made_changes:
280
- self._write_registry(registry)
281
-
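The registry the class above maintains is a plain JSON file keyed by `"<model_path>|<clip_model_path>"`. Below is a sketch of one entry and of the reference-counting lifecycle, with made-up PIDs and ports; it assumes `ServerRegistry` as defined above, the attach/detach steps would normally happen in different processes, and it is conceptual only (do not run it against real PIDs):

```python
# Illustrative registry.json content:
# {
#   "/path/to/model.gguf|None": {
#     "pid": 43210,                 # PID of the llama.cpp server process
#     "port": 45817,                # port the server listens on
#     "ref_count": 2,               # number of live client processes
#     "client_pids": [1111, 2222]   # PIDs of attached clients
#   }
# }

registry = ServerRegistry()
key = "/path/to/model.gguf|None"                           # hypothetical server key
registry.register_new_server(key, pid=43210, port=45817)   # first client starts and registers the server
registry.increment_ref_count(key)                          # another client process attaches
registry.decrement_ref_count(key)                          # one client detaches
registry.decrement_ref_count(key)                          # last client: server terminated, entry removed
```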
282
- BindingName = "LlamaCppServerBinding"
283
- DEFAULT_LLAMACPP_SERVER_HOST = "127.0.0.1"
284
-
285
- class LlamaCppServerProcess:
286
- def __init__(self,
287
- model_path: Union[str, Path],
288
- clip_model_path: Optional[Union[str, Path]] = None,
289
- server_binary_path: Optional[Union[str, Path]]=None,
290
- server_args: Dict[str, Any]={},
291
- process_pid: Optional[int]=None, # PID if we are attaching to existing process
292
- port: Optional[int]=None,
293
- ):
294
- """Initialize the Llama.cpp server process wrapper.
295
- Can either start a new process or wrap an existing one.
296
- """
297
- self.model_path = Path(model_path)
298
- self.clip_model_path = Path(clip_model_path) if clip_model_path else None
299
-
300
- if server_binary_path:
301
- self.server_binary_path = Path(server_binary_path)
302
- elif llama_cpp_binaries:
303
- self.server_binary_path = Path(llama_cpp_binaries.get_binary_path())
304
- else:
305
- raise FileNotFoundError("llama_cpp_binaries not found and no server_binary_path provided.")
306
-
307
- self.port: Optional[int] = port
308
- self.pid: Optional[int] = process_pid
309
- self.server_args = server_args
310
- # The actual subprocess.Popen object. Will be None if this instance is just a client to a server started by another process.
311
- self.process: Optional[subprocess.Popen] = None
312
- self.session = requests.Session()
313
- self.host = self.server_args.get("host",DEFAULT_LLAMACPP_SERVER_HOST)
314
- self.base_url: Optional[str] = f"http://{self.host}:{self.port}" if self.port else None
315
- self.is_healthy = False
316
- self._stderr_lines: List[str] = []
317
- self._stderr_thread: Optional[threading.Thread] = None
318
-
319
- if not self.model_path.exists():
320
- raise FileNotFoundError(f"Model file not found: {self.model_path}")
321
- if self.clip_model_path and not self.clip_model_path.exists():
322
- ASCIIColors.warning(f"Clip model file '{self.clip_model_path}' not found. Vision features may not work or may use a different auto-detected clip model.")
323
- if not self.server_binary_path.exists():
324
- raise FileNotFoundError(f"Llama.cpp server binary not found: {self.server_binary_path}")
325
-
326
- def attach(self):
327
- """Attaches to an already running process by checking its health."""
328
- if not self.pid or not self.port:
329
- raise ValueError("Cannot attach without PID and port.")
330
- self.base_url = f"http://{self.host}:{self.port}"
331
- health_url = f"{self.base_url}/health"
332
- try:
333
- response = self.session.get(health_url, timeout=5)
334
- if response.status_code == 200 and response.json().get("status") == "ok":
335
- self.is_healthy = True
336
- ASCIIColors.green(f"Successfully attached to Llama.cpp server on port {self.port} (PID: {self.pid}).")
337
- return
338
- except requests.exceptions.RequestException as e:
339
- ASCIIColors.warning(f"Failed to attach to server on port {self.port}: {e}")
340
- self.is_healthy = False
341
- raise ConnectionError(f"Could not connect to existing server at {health_url}")
342
-
343
- def _filter_stderr(self, stderr_pipe):
344
- try:
345
- for line in iter(stderr_pipe.readline, ''):
346
- if line:
347
- self._stderr_lines.append(line.strip())
348
- if len(self._stderr_lines) > 50: self._stderr_lines.pop(0)
349
- if "llama_model_loaded" in line or "error" in line.lower() or "failed" in line.lower():
350
- ASCIIColors.debug(f"[LLAMA_SERVER_STDERR:{self.port}] {line.strip()}")
351
- elif "running on port" in line: # Server startup message
352
- ASCIIColors.info(f"[LLAMA_SERVER_STDERR:{self.port}] {line.strip()}")
353
- except ValueError: pass
354
- except Exception as e: ASCIIColors.warning(f"Exception in stderr filter thread for port {self.port}: {e}")
355
-
356
- def start(self, port_to_use: int):
357
- self.port = port_to_use
358
- self.base_url = f"http://{self.host}:{self.port}"
359
-
360
- cmd = [
361
- str(self.server_binary_path),
362
- "--model", str(self.model_path),
363
- "--host", self.host,
364
- "--port", str(self.port),
365
- ]
366
-
367
- arg_map = {
368
- "n_ctx": "--ctx-size", "n_gpu_layers": "--gpu-layers", "main_gpu": "--main-gpu",
369
- "tensor_split": "--tensor-split", "use_mmap": (lambda v: ["--no-mmap"] if not v else []),
370
- "use_mlock": (lambda v: ["--mlock"] if v else []), "seed": "--seed",
371
- "n_batch": "--batch-size", "n_threads": "--threads", "n_threads_batch": "--threads-batch",
372
- "rope_scaling_type": "--rope-scaling", "rope_freq_base": "--rope-freq-base",
373
- "rope_freq_scale": "--rope-freq-scale",
374
- "embedding": (lambda v: ["--embedding"] if v else []),
375
- "verbose": (lambda v: ["--verbose"] if v else []),
376
- "chat_template": "--chat-template",
377
- "parallel_slots": "--parallel", # Number of parallel processing slots
378
- }
379
-
380
- if self.clip_model_path: # This should be the actual path resolved by the binding
381
- cmd.extend(["--mmproj", str(self.clip_model_path)])
382
-
383
- for key, cli_arg in arg_map.items():
384
- val = self.server_args.get(key)
385
- if val is not None:
386
- if callable(cli_arg): cmd.extend(cli_arg(val))
387
- else: cmd.extend([cli_arg, str(val)])
388
-
389
- extra_cli_flags = self.server_args.get("extra_cli_flags", [])
390
- if isinstance(extra_cli_flags, str): extra_cli_flags = extra_cli_flags.split()
391
- cmd.extend(extra_cli_flags)
392
-
393
- ASCIIColors.info(f"Starting Llama.cpp server ({' '.join(cmd)})")
394
-
395
- env = os.environ.copy()
396
- if os.name == 'posix' and self.server_binary_path.parent != Path('.'):
397
- lib_path_str = str(self.server_binary_path.parent.resolve())
398
- current_ld_path = env.get('LD_LIBRARY_PATH', '')
399
- env['LD_LIBRARY_PATH'] = f"{lib_path_str}:{current_ld_path}" if current_ld_path else lib_path_str
400
-
401
- try:
402
- self.process = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE, text=True, bufsize=1, env=env)
403
- self.pid = self.process.pid
404
- except Exception as e:
405
- ASCIIColors.error(f"Failed to start llama.cpp server process on port {self.port}: {e}"); trace_exception(e); raise
406
-
407
- self._stderr_thread = threading.Thread(target=self._filter_stderr, args=(self.process.stderr,), daemon=True)
408
- self._stderr_thread.start()
409
-
410
- health_url = f"{self.base_url}/health"
411
- max_wait_time = self.server_args.get("server_startup_timeout", 60)
412
- start_time = time.time()
413
-
414
- while time.time() - start_time < max_wait_time:
415
- if self.process.poll() is not None:
416
- stderr_output = "\n".join(self._stderr_lines[-10:])
417
- raise RuntimeError(f"Llama.cpp server (port {self.port}) terminated unexpectedly (exit code {self.process.poll()}) during startup. Stderr:\n{stderr_output}")
418
- try:
419
- response = self.session.get(health_url, timeout=2)
420
- if response.status_code == 200 and response.json().get("status") == "ok":
421
- self.is_healthy = True
422
- ASCIIColors.green(f"Llama.cpp server started successfully on port {self.port} (PID: {self.pid}).")
423
- return
424
- except requests.exceptions.ConnectionError: time.sleep(1)
425
- except Exception as e: ASCIIColors.warning(f"Health check for port {self.port} failed: {e}"); time.sleep(1)
426
-
427
- self.is_healthy = False
428
- self.shutdown()
429
- stderr_output = "\n".join(self._stderr_lines[-10:])
430
- raise TimeoutError(f"Llama.cpp server failed to become healthy on port {self.port} within {max_wait_time}s. Stderr:\n{stderr_output}")
431
-
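To make the `arg_map` translation in `start()` concrete, here is the kind of command line it assembles for a hypothetical configuration (binary and model paths are placeholders; boolean options map to bare flags, and flag order follows the `arg_map` iteration):

```python
server_args = {"n_ctx": 4096, "n_gpu_layers": 33, "use_mmap": False,
               "use_mlock": True, "n_batch": 512, "embedding": True, "parallel_slots": 4}

# Roughly equivalent invocation built by start():
cmd = [
    "/path/to/llama-server", "--model", "/path/to/model.gguf",
    "--host", "127.0.0.1", "--port", "45817",
    "--ctx-size", "4096", "--gpu-layers", "33",
    "--no-mmap", "--mlock",
    "--batch-size", "512",
    "--embedding", "--parallel", "4",
]
```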
432
- def shutdown(self):
433
- """ This method only shuts down a server if this instance owns the Popen object.
434
- The actual termination for multi-process is handled by the ServerRegistry. """
435
- self.is_healthy = False
436
- if self.process:
437
- ASCIIColors.info(f"Shutting down owned Llama.cpp server process (PID: {self.process.pid} on port {self.port})...")
438
- try:
439
- self.process.terminate()
440
- self.process.wait(timeout=10)
441
- except subprocess.TimeoutExpired:
442
- ASCIIColors.warning(f"Llama.cpp server (port {self.port}) did not terminate gracefully, killing...")
443
- self.process.kill()
444
- try: self.process.wait(timeout=5)
445
- except subprocess.TimeoutExpired: ASCIIColors.error(f"Failed to kill llama.cpp server process (port {self.port}).")
446
- except Exception as e: ASCIIColors.error(f"Error during server shutdown (port {self.port}): {e}")
447
- finally:
448
- self.process = None
449
- if self._stderr_thread and self._stderr_thread.is_alive(): self._stderr_thread.join(timeout=1)
450
- ASCIIColors.info(f"Llama.cpp server on port {self.port} shut down.")
451
-
452
-
453
- class LlamaCppServerBinding(LollmsLLMBinding):
454
- DEFAULT_SERVER_ARGS = {
455
- "n_gpu_layers": 0, "n_ctx": 128000, "n_batch": 512,
456
- "embedding": False, "verbose": False, "server_startup_timeout": 120,
457
- "parallel_slots": 4, # Default parallel slots for server
458
- "stop_sequences": ["<|im_start|>"], # Default stop sequences
459
- }
460
-
461
- def __init__(self, **kwargs):
462
- super().__init__(BindingName, **kwargs)
463
- if llama_cpp_binaries is None or psutil is None:
464
- raise ImportError("llama-cpp-binaries and psutil packages are required.")
465
-
466
- self.registry = ServerRegistry()
467
- models_path = kwargs.get("models_path", Path(__file__).parent/"models")
468
- self.models_path = Path(models_path)
469
- self.initial_model_name_preference: Optional[str] = kwargs.get("model_name")
470
- self.user_provided_model_name: Optional[str] = kwargs.get("model_name")
471
- self.initial_clip_model_name_preference: Optional[str] = kwargs.get("clip_model_name")
472
- self._model_path_map: Dict[str, Path] = {}
473
- self._scan_models()
474
- self.default_completion_format = kwargs.get("default_completion_format", ELF_COMPLETION_FORMAT.Chat)
475
- self.server_args = {**self.DEFAULT_SERVER_ARGS, **(kwargs.get("config") or {}), **kwargs}
476
- self.server_binary_path = self._get_server_binary_path()
477
-
478
- self.current_model_path: Optional[Path] = None
479
- self.clip_model_path: Optional[Path] = None
480
- self.server_process: Optional[LlamaCppServerProcess] = None
481
- self.port: Optional[int] = None
482
- self.server_key: Optional[str] = None
483
-
484
- ASCIIColors.info("LlamaCppServerBinding initialized. Server will start on-demand with first generation call.")
485
-
486
- def _get_server_binary_path(self) -> Path:
487
- custom_path_str = self.server_args.get("llama_server_binary_path")
488
- if custom_path_str:
489
- custom_path = Path(custom_path_str)
490
- if custom_path.exists() and custom_path.is_file():
491
- ASCIIColors.info(f"Using custom llama.cpp server binary: {custom_path}"); return custom_path
492
- else: ASCIIColors.warning(f"Custom binary '{custom_path_str}' not found. Falling back.")
493
- if llama_cpp_binaries:
494
- bin_path_str = llama_cpp_binaries.get_binary_path()
495
- if bin_path_str:
496
- bin_path = Path(bin_path_str)
497
- if bin_path.exists() and bin_path.is_file():
498
- ASCIIColors.info(f"Using binary from llama-cpp-binaries: {bin_path}"); return bin_path
499
- raise FileNotFoundError("Llama.cpp server binary not found.")
500
-
501
- def _resolve_model_path(self, model_name_or_path: str) -> Path:
502
- """
503
- Resolves a model name or path to a full Path object.
504
- It prioritizes the internal map, then checks for absolute/relative paths,
505
- and rescans the models directory as a fallback.
506
- """
507
- if model_name_or_path in self._model_path_map:
508
- return self._model_path_map[model_name_or_path]
509
- model_p = Path(model_name_or_path)
510
- if model_p.is_absolute() and model_p.exists(): return model_p
511
- path_in_models_dir = self.models_path / model_name_or_path
512
- if path_in_models_dir.exists(): return path_in_models_dir
513
- self._scan_models()
514
- if model_name_or_path in self._model_path_map:
515
- return self._model_path_map[model_name_or_path]
516
- raise FileNotFoundError(f"Model '{model_name_or_path}' not found.")
517
-
518
- def _find_available_port(self) -> int:
519
- with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
520
- s.bind(('', 0)); return s.getsockname()[1]
521
-
522
- def _release_server_instance(self):
523
- if self.server_process and self.server_key:
524
- self.registry.decrement_ref_count(self.server_key)
525
- self.server_process = None
526
- self.port = None
527
- self.server_key = None
528
- self.current_model_path = None
529
- self.clip_model_path = None
530
-
531
- def load_model(self, model_name_or_path: str) -> bool:
532
- self.user_provided_model_name = model_name_or_path
533
- try:
534
- resolved_model_path = self._resolve_model_path(model_name_or_path)
535
- except Exception as ex:
536
- trace_exception(ex); return False
537
-
538
- final_clip_model_path: Optional[Path] = None
539
- if self.initial_clip_model_name_preference:
540
- p_clip_pref = Path(self.initial_clip_model_name_preference)
541
- if p_clip_pref.is_absolute() and p_clip_pref.exists(): final_clip_model_path = p_clip_pref
542
- elif (self.models_path / p_clip_pref).exists(): final_clip_model_path = self.models_path / p_clip_pref
543
- else: ASCIIColors.warning(f"Specified clip model '{self.initial_clip_model_name_preference}' not found.")
544
-
545
- if not final_clip_model_path:
546
- base_name = get_gguf_model_base_name(resolved_model_path.stem)
547
- potential_paths = [
548
- resolved_model_path.parent / f"{base_name}.mmproj",
549
- resolved_model_path.parent / f"mmproj-{base_name}.gguf",
550
- self.models_path / f"{base_name}.mmproj",
551
- self.models_path / f"mmproj-{base_name}.gguf",
552
- ]
553
- for p_clip in potential_paths:
554
- if p_clip.exists(): final_clip_model_path = p_clip; break
555
-
556
- final_clip_model_path_str = str(final_clip_model_path) if final_clip_model_path else "None"
557
- new_server_key = f"{resolved_model_path}|{final_clip_model_path_str}"
558
-
559
- if self.server_process and self.server_key == new_server_key and self.server_process.is_healthy:
560
- ASCIIColors.info(f"Model '{model_name_or_path}' is already loaded. No change.")
561
- return True
562
-
563
- if self.server_process and self.server_key != new_server_key:
564
- self._release_server_instance()
565
-
566
- # Check registry for an existing server
567
- existing_server_info = self.registry.get_server(new_server_key)
568
- if existing_server_info:
569
- ASCIIColors.info(f"Found existing server for {new_server_key} in registry (PID: {existing_server_info['pid']}, Port: {existing_server_info['port']}). Attaching...")
570
- try:
571
- self.server_process = LlamaCppServerProcess(
572
- model_path=resolved_model_path, clip_model_path=final_clip_model_path,
573
- process_pid=existing_server_info['pid'], port=existing_server_info['port'],
574
- server_args=self.server_args
575
- )
576
- self.server_process.attach() # This verifies health
577
- self.port = self.server_process.port
578
- self.current_model_path = resolved_model_path
579
- self.clip_model_path = final_clip_model_path
580
- self.server_key = new_server_key
581
- self.registry.increment_ref_count(new_server_key)
582
- return True
583
- except Exception as e:
584
- ASCIIColors.error(f"Failed to attach to existing server: {e}. It might be stale. Will attempt to start a new one.")
585
- self.registry.decrement_ref_count(new_server_key) # Clean up failed attach
586
-
587
- # Start a new server
588
- ASCIIColors.info(f"No existing server found for {new_server_key}. Starting a new one.")
589
- self.current_model_path = resolved_model_path
590
- self.clip_model_path = final_clip_model_path
591
- self.server_key = new_server_key
592
-
593
- try:
594
- new_port = self._find_available_port()
595
- current_server_args = self.server_args.copy()
596
- if "parallel_slots" not in current_server_args or current_server_args["parallel_slots"] <=0:
597
- current_server_args["parallel_slots"] = self.DEFAULT_SERVER_ARGS["parallel_slots"]
598
-
599
- new_server = LlamaCppServerProcess(
600
- model_path=self.current_model_path, clip_model_path=self.clip_model_path,
601
- server_binary_path=self.server_binary_path, server_args=current_server_args
602
- )
603
- new_server.start(port_to_use=new_port)
604
-
605
- if new_server.is_healthy:
606
- self.server_process = new_server
607
- self.port = new_port
608
- self.registry.register_new_server(self.server_key, new_server.pid, new_port)
609
- ASCIIColors.green(f"New server {self.server_key} started and registered.")
610
- return True
611
- else:
612
- return False
613
- except Exception as e:
614
- ASCIIColors.error(f"Failed to start new server for '{model_name_or_path}': {e}"); trace_exception(e)
615
- self._release_server_instance()
616
- return False
617
-
618
- def unload_model(self):
619
- if self.server_process:
620
- self._release_server_instance()
621
- else:
622
- ASCIIColors.info("Unload called, but no server was active for this binding instance.")
623
-
624
- def _ensure_server_is_running(self) -> bool:
625
- """
626
- Checks if the server is healthy. If not, it attempts to load the configured model.
627
- Returns True if the server is healthy and ready, False otherwise.
628
- """
629
- if self.server_process and self.server_process.is_healthy:
630
- return True
631
-
632
- ASCIIColors.info("Server is not running. Attempting to start on-demand...")
633
-
634
- model_to_load = self.user_provided_model_name or self.initial_model_name_preference
635
-
636
- if not model_to_load:
637
- self._scan_models()
638
- available_models = self.list_models()
639
- if not available_models:
640
- ASCIIColors.error("No model specified and no GGUF models found in models path.")
641
- return False
642
-
643
- model_to_load = available_models[0]['name']
644
- ASCIIColors.info(f"No model was specified. Automatically selecting the first available model: '{model_to_load}'")
645
-
646
- if self.load_model(model_to_load):
647
- return True
648
- else:
649
- ASCIIColors.error(f"Automatic model load for '{model_to_load}' failed.")
650
- return False
651
-
652
- def _get_request_url(self, endpoint: str) -> str:
653
- return f"{self.server_process.base_url}{endpoint}"
654
-
655
- def _prepare_generation_payload(self, prompt: str, system_prompt: str = "", n_predict: Optional[int] = None,
656
- temperature: float = 0.7, top_k: int = 40, top_p: float = 0.9,
657
- repeat_penalty: float = 1.1, repeat_last_n: Optional[int] = 64,
658
- seed: Optional[int] = None, stream: bool = False, use_chat_format: bool = True,
659
- images: Optional[List[str]] = None,
660
- stop_sequences: Optional[List[str]] = None,
661
- split:Optional[bool]=False,
662
- user_keyword:Optional[str]="!@>user:",
663
- ai_keyword:Optional[str]="!@>assistant:",
664
- **extra_params) -> Dict:
665
- payload_params = {
666
- "temperature": self.server_args.get("temperature", 0.7), "top_k": self.server_args.get("top_k", 40),
667
- "top_p": self.server_args.get("top_p", 0.9), "repeat_penalty": self.server_args.get("repeat_penalty", 1.1),
668
- "repeat_last_n": self.server_args.get("repeat_last_n", 64), "mirostat": self.server_args.get("mirostat_mode", 0),
669
- "mirostat_tau": self.server_args.get("mirostat_tau", 5.0), "mirostat_eta": self.server_args.get("mirostat_eta", 0.1),
670
- }
671
- if "grammar_string" in self.server_args and self.server_args["grammar_string"]:
672
- payload_params["grammar"] = self.server_args["grammar_string"]
673
-
674
- payload_params.update({"temperature": temperature, "top_k": top_k, "top_p": top_p, "repeat_penalty": repeat_penalty, "repeat_last_n": repeat_last_n})
675
- if n_predict is not None: payload_params['n_predict'] = n_predict
676
- if seed is not None: payload_params['seed'] = seed
677
-
678
- # --- Handle stop sequences ---
679
- all_stop_sequences = set(self.server_args.get("stop_sequences", []))
680
- if stop_sequences:
681
- all_stop_sequences.update(stop_sequences)
682
- if all_stop_sequences:
683
- payload_params['stop'] = list(all_stop_sequences)
684
- # --- End stop sequences ---
685
-
686
- payload_params = {k: v for k, v in payload_params.items() if v is not None}
687
- payload_params.update(extra_params)
688
-
689
- if use_chat_format and self.default_completion_format == ELF_COMPLETION_FORMAT.Chat:
690
- messages = []
691
- if system_prompt and system_prompt.strip(): messages.append({"role": "system", "content": system_prompt})
692
- user_content: Union[str, List[Dict[str, Any]]] = prompt
693
- if split:
694
- messages += self.split_discussion(user_content,user_keyword=user_keyword, ai_keyword=ai_keyword)
695
- else:
696
- messages.append({"role": "user", "content": user_content})
697
- if images and self.clip_model_path:
698
- image_parts = []
699
- for img_path in images:
700
- try:
701
- with open(img_path, "rb") as image_file: encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
702
- image_type = Path(img_path).suffix[1:].lower() or "png"; image_type = "jpeg" if image_type == "jpg" else image_type
703
- image_parts.append({"type": "image_url", "image_url": {"url": f"data:image/{image_type};base64,{encoded_string}"}})
704
- except Exception as ex: trace_exception(ex)
705
- messages[-1]["content"] =[{"type": "text", "text": messages[-1]["content"]}] + image_parts # type: ignore
706
- final_payload = {"messages": messages, "stream": stream, **payload_params}
707
- if 'n_predict' in final_payload: final_payload['max_tokens'] = final_payload.pop('n_predict')
708
- return final_payload
709
- else:
710
- full_prompt = f"{system_prompt}\n\nUSER: {prompt}\nASSISTANT:" if system_prompt and system_prompt.strip() else prompt
711
- final_payload = {"prompt": full_prompt, "stream": stream, **payload_params}
712
- if images and self.clip_model_path:
713
- image_data_list = []
714
- for i, img_path in enumerate(images):
715
- try:
716
- with open(img_path, "rb") as image_file: encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
717
- image_data_list.append({"data": encoded_string, "id": i + 10})
718
- except Exception as e_img: ASCIIColors.error(f"Could not encode image {img_path}: {e_img}")
719
- if image_data_list: final_payload["image_data"] = image_data_list
720
- return final_payload
721
-
722
-
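For reference, the chat branch of `_prepare_generation_payload` above produces an OpenAI-style body for the server's `/v1/chat/completions` endpoint. A sketch of the resulting payload for a simple text-only call (values illustrative; note that `n_predict` is renamed to `max_tokens` in this branch, and default and per-call stop sequences are merged into `stop`):

```python
payload = {
    "messages": [
        {"role": "system", "content": "Concise expert."},
        {"role": "user", "content": "What is the capital of France?"},
    ],
    "stream": False,
    "temperature": 0.7, "top_k": 40, "top_p": 0.9,
    "repeat_penalty": 1.1, "repeat_last_n": 64,
    "mirostat": 0, "mirostat_tau": 5.0, "mirostat_eta": 0.1,
    "stop": ["<|im_start|>"],  # merged stop sequences
    "max_tokens": 128,         # renamed from n_predict
}
# generate_text() POSTs this to http://127.0.0.1:<port>/v1/chat/completions
```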
723
- def generate_text(self,
724
- prompt: str,
725
- images: Optional[List[str]] = None,
726
- system_prompt: str = "",
727
- n_predict: Optional[int] = None,
728
- stream: Optional[bool] = None,
729
- temperature: float = 0.7,
730
- top_k: int = 40,
731
- top_p: float = 0.9,
732
- repeat_penalty: float = 1.1,
733
- repeat_last_n: int = 64,
734
- seed: Optional[int] = None,
735
- n_threads: Optional[int] = None,
736
- ctx_size: int | None = None,
737
- streaming_callback: Optional[Callable[[str, MSG_TYPE], None]] = None,
738
- stop_sequences: Optional[List[str]] = None,
739
- split:Optional[bool]=False,
740
- user_keyword:Optional[str]="!@>user:",
741
- ai_keyword:Optional[str]="!@>assistant:",
742
- **generation_kwargs
743
- ) -> Union[str, dict]:
744
-
745
- if not self._ensure_server_is_running():
746
- return {"status": False, "error": "Llama.cpp server could not be started. Please check model configuration and logs."}
747
-
748
- _use_chat_format = True
749
- payload = self._prepare_generation_payload(
750
- prompt=prompt, system_prompt=system_prompt, n_predict=n_predict,
751
- temperature=temperature if temperature is not None else self.server_args.get("temperature",0.7),
752
- top_k=top_k if top_k is not None else self.server_args.get("top_k",40),
753
- top_p=top_p if top_p is not None else self.server_args.get("top_p",0.9),
754
- repeat_penalty=repeat_penalty if repeat_penalty is not None else self.server_args.get("repeat_penalty",1.1),
755
- repeat_last_n=repeat_last_n if repeat_last_n is not None else self.server_args.get("repeat_last_n",64),
756
- seed=seed if seed is not None else self.server_args.get("seed", -1), stream=stream,
757
- use_chat_format=_use_chat_format, images=images,
758
- stop_sequences=stop_sequences,
759
- split= split, user_keyword=user_keyword, ai_keyword=ai_keyword, **generation_kwargs
760
- )
761
- endpoint = "/v1/chat/completions" if _use_chat_format else "/completion"
762
- request_url = self._get_request_url(endpoint)
763
-
764
- full_response_text = ""
765
- try:
766
- response = self.server_process.session.post(request_url, json=payload, stream=stream, timeout=self.server_args.get("generation_timeout", 300))
767
- response.raise_for_status()
768
- if stream:
769
- for line in response.iter_lines():
770
- if not line: continue
771
- line_str = line.decode('utf-8').strip()
772
- if line_str.startswith('data: '): line_str = line_str[6:]
773
- if line_str == '[DONE]': break
774
- try:
775
- chunk_data = json.loads(line_str)
776
- chunk_content = (chunk_data.get('choices', [{}])[0].get('delta', {}).get('content', '') if _use_chat_format
777
- else chunk_data.get('content', ''))
778
- if chunk_content:
779
- full_response_text += chunk_content
780
- if streaming_callback and not streaming_callback(chunk_content, MSG_TYPE.MSG_TYPE_CHUNK):
781
- ASCIIColors.info("Streaming callback requested stop."); response.close(); break
782
- if chunk_data.get('stop', False) or chunk_data.get('stopped_eos',False) or chunk_data.get('stopped_limit',False): break
783
- except json.JSONDecodeError: ASCIIColors.warning(f"Failed to decode JSON stream chunk: {line_str}"); continue
784
- return full_response_text
785
- else:
786
- response_data = response.json()
787
- return response_data.get('choices', [{}])[0].get('message', {}).get('content', '') if _use_chat_format \
788
- else response_data.get('content','')
789
- except requests.exceptions.RequestException as e:
790
- error_message = f"Llama.cpp server request error: {e}"
791
- if e.response is not None:
792
- try: error_details = e.response.json(); error_message += f" - Details: {error_details.get('error', e.response.text)}"
793
- except json.JSONDecodeError: error_message += f" - Response: {e.response.text[:200]}"
794
- ASCIIColors.error(error_message)
795
- return {"status": False, "error": error_message, "details": str(e.response.text if e.response else "No response text")}
796
- except Exception as ex:
797
- error_message = f"Llama.cpp generation error: {str(ex)}"; trace_exception(ex)
798
- return {"status": False, "error": error_message}
799
-
800
- def chat(self,
801
- discussion: LollmsDiscussion,
802
- branch_tip_id: Optional[str] = None,
803
- n_predict: Optional[int] = None,
804
- stream: Optional[bool] = None,
805
- temperature: float = 0.7,
806
- top_k: int = 40,
807
- top_p: float = 0.9,
808
- repeat_penalty: float = 1.1,
809
- repeat_last_n: int = 64,
810
- seed: Optional[int] = None,
811
- n_threads: Optional[int] = None,
812
- ctx_size: Optional[int] = None,
813
- streaming_callback: Optional[Callable[[str, MSG_TYPE], None]] = None,
814
- stop_sequences: Optional[List[str]] = None,
815
- **generation_kwargs
816
- ) -> Union[str, dict]:
817
-
818
- if not self._ensure_server_is_running():
819
- return {"status": "error", "message": "Llama.cpp server could not be started. Please check model configuration and logs."}
820
-
821
- messages = discussion.export("openai_chat", branch_tip_id)
822
- payload = {
823
- "messages": messages, "max_tokens": n_predict, "temperature": temperature,
824
- "top_k": top_k, "top_p": top_p, "repeat_penalty": repeat_penalty,
825
- "seed": seed, "stream": stream, **generation_kwargs
826
- }
827
-
828
- all_stop_sequences = set(self.server_args.get("stop_sequences", []))
829
- if stop_sequences:
830
- all_stop_sequences.update(stop_sequences)
831
- if all_stop_sequences:
832
- payload['stop'] = list(all_stop_sequences)
833
-
834
- payload = {k: v for k, v in payload.items() if v is not None}
835
-
836
- endpoint = "/v1/chat/completions"
837
- request_url = self._get_request_url(endpoint)
838
- full_response_text = ""
839
-
840
- try:
841
- response = self.server_process.session.post(request_url, json=payload, stream=stream, timeout=self.server_args.get("generation_timeout", 300))
842
- response.raise_for_status()
843
-
844
- if stream:
845
- for line in response.iter_lines():
846
- if not line: continue
847
- line_str = line.decode('utf-8').strip()
848
- if line_str.startswith('data: '): line_str = line_str[6:]
849
- if line_str == '[DONE]': break
850
- try:
851
- chunk_data = json.loads(line_str)
852
- choices = chunk_data.get('choices', [{}])
853
- if choices and len(choices)>0:
854
- chunk_content = choices[0].get('delta', {}).get('content', '')
855
- if chunk_content:
856
- full_response_text += chunk_content
857
- if streaming_callback and not streaming_callback(chunk_content, MSG_TYPE.MSG_TYPE_CHUNK):
858
- ASCIIColors.info("Streaming callback requested stop.")
859
- response.close()
860
- break
861
- except json.JSONDecodeError:
862
- ASCIIColors.warning(f"Failed to decode JSON stream chunk: {line_str}")
863
- continue
864
- return full_response_text
865
- else:
866
- response_data = response.json()
867
- return response_data.get('choices', [{}])[0].get('message', {}).get('content', '')
868
-
869
- except requests.exceptions.RequestException as e:
870
- error_message = f"Llama.cpp server request error: {e}"
871
- if e.response is not None:
872
- try:
873
- error_details = e.response.json()
874
- error_message += f" - Details: {error_details.get('error', e.response.text)}"
875
- except json.JSONDecodeError:
876
- error_message += f" - Response: {e.response.text[:200]}"
877
- ASCIIColors.error(error_message)
878
- return {"status": "error", "message": error_message}
879
- except Exception as ex:
880
- error_message = f"Llama.cpp generation error: {str(ex)}"
881
- trace_exception(ex)
882
- return {"status": "error", "message": error_message}
883
-
884
- def tokenize(self, text: str) -> List[int]:
885
- if not self._ensure_server_is_running(): return []
886
- try:
887
- response = self.server_process.session.post(self._get_request_url("/tokenize"), json={"content": text})
888
- response.raise_for_status(); return response.json().get("tokens", [])
889
- except Exception as e: ASCIIColors.error(f"Tokenization error: {e}"); trace_exception(e); return []
890
-
891
- def detokenize(self, tokens: List[int]) -> str:
892
- if not self._ensure_server_is_running(): return ""
893
- try:
894
- response = self.server_process.session.post(self._get_request_url("/detokenize"), json={"tokens": tokens})
895
- response.raise_for_status(); return response.json().get("content", "")
896
- except Exception as e: ASCIIColors.error(f"Detokenization error: {e}"); trace_exception(e); return ""
897
-
898
- def count_tokens(self, text: str) -> int: return len(self.tokenize(text))
899
-
900
- def embed(self, text: str, **kwargs) -> List[float]:
901
- if not self._ensure_server_is_running(): return []
902
- if not self.server_args.get("embedding"):
903
- ASCIIColors.warning("Embedding not enabled in server_args. Please set 'embedding' to True in config."); return []
904
- try:
905
- payload = {"input": text}; request_url = self._get_request_url("/v1/embeddings")
906
- response = self.server_process.session.post(request_url, json=payload)
907
- if response.status_code == 404: # Fallback
908
- request_url = self._get_request_url("/embedding")
909
- response = self.server_process.session.post(request_url, json={"content": text})
910
- response.raise_for_status(); data = response.json()
911
- if "data" in data and isinstance(data["data"], list) and "embedding" in data["data"][0]: return data["data"][0]["embedding"]
912
- elif "embedding" in data and isinstance(data["embedding"], list): return data["embedding"]
913
- else: raise ValueError(f"Unexpected embedding response: {data}")
914
- except requests.exceptions.RequestException as e:
915
- err_msg = f"Embedding request error: {e}";
916
- if e.response: err_msg += f" - {e.response.text[:200]}"
917
- ASCIIColors.error(err_msg)
918
- return []
919
- except Exception as ex:
920
- trace_exception(ex); ASCIIColors.error(f"Embedding failed: {str(ex)}")
921
- return []
922
-
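The `embed` method above first tries the OpenAI-compatible `/v1/embeddings` route, falls back to the older `/embedding` route on a 404, and then accepts either response shape. A standalone sketch of the same fallback with `requests`, assuming a llama.cpp server started with embeddings enabled on a hypothetical port:

```python
import requests

base_url = "http://127.0.0.1:45817"  # hypothetical port
text = "hello world"

resp = requests.post(f"{base_url}/v1/embeddings", json={"input": text})
if resp.status_code == 404:  # older servers only expose /embedding
    resp = requests.post(f"{base_url}/embedding", json={"content": text})
resp.raise_for_status()
data = resp.json()
# OpenAI-style {"data": [{"embedding": [...]}]} or raw {"embedding": [...]}
vector = data["data"][0]["embedding"] if "data" in data else data["embedding"]
print(len(vector))
```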
923
- def get_model_info(self) -> dict:
924
- is_loaded = self.server_process is not None and self.server_process.is_healthy
925
- info = {
926
- "name": self.binding_name,
927
- "user_provided_model_name": self.user_provided_model_name,
928
- "model_path": str(self.current_model_path) if self.current_model_path else "Not loaded",
929
- "clip_model_path": str(self.clip_model_path) if self.clip_model_path else "N/A",
930
- "loaded": is_loaded,
931
- "server_args": self.server_args, "port": self.port if self.port else "N/A",
932
- "server_key": str(self.server_key) if self.server_key else "N/A",
933
- }
934
- if is_loaded:
935
- try:
936
- props_resp = self.server_process.session.get(self._get_request_url("/props"), timeout=5).json()
937
- info.update({
938
- "server_n_ctx": props_resp.get("default_generation_settings",{}).get("n_ctx"),
939
- "server_chat_format": props_resp.get("chat_format"),
940
- "server_clip_model_from_props": props_resp.get("mmproj"),
941
- })
942
- except Exception: pass
943
-
944
- is_llava = self.clip_model_path is not None or \
945
- (info.get("server_clip_model_from_props") is not None) or \
946
- ("llava" in self.current_model_path.name.lower() if self.current_model_path else False)
947
- info["supports_vision"] = is_llava
948
- info["supports_structured_output"] = self.server_args.get("grammar_string") is not None
949
- return info
950
-
951
- def _scan_models(self):
952
- self._model_path_map = {}
953
- if not self.models_path.exists() or not self.models_path.is_dir():
954
- ASCIIColors.warning(f"Models path does not exist or is not a directory: {self.models_path}")
955
- return
956
-
957
- all_paths = list(self.models_path.rglob("*.gguf"))
958
- filenames_count = {}
959
- for path in all_paths:
960
- if path.is_file():
961
- filenames_count[path.name] = filenames_count.get(path.name, 0) + 1
962
-
963
- for model_file in all_paths:
964
- if model_file.is_file():
965
- relative_path_str = str(model_file.relative_to(self.models_path).as_posix())
966
- if filenames_count[model_file.name] > 1:
967
- unique_name = relative_path_str
968
- else:
969
- unique_name = model_file.name
970
- self._model_path_map[unique_name] = model_file
971
-
972
- ASCIIColors.info(f"Scanned {len(self._model_path_map)} models from {self.models_path}.")
973
-
974
- def list_models(self) -> List[Dict[str, Any]]:
975
- self._scan_models()
976
- models_found = []
977
- for unique_name, model_path in self._model_path_map.items():
978
- models_found.append({
979
- 'name': unique_name, 'model_name': model_path.name,
980
- 'path': str(model_path), 'size': model_path.stat().st_size
981
- })
982
- return sorted(models_found, key=lambda x: x['name'])
983
-
984
- def __del__(self):
985
- self.unload_model()
986
-
987
- def get_ctx_size(self, model_name: Optional[str] = None) -> Optional[int]:
988
- if model_name is None:
989
- model_name = self.user_provided_model_name or self.initial_model_name_preference
990
- if not model_name and self.current_model_path:
991
- model_name = self.current_model_path.name
992
-
993
- if model_name is None:
994
- ASCIIColors.warning("Cannot determine context size without a model name.")
995
- return None
996
-
997
- known_contexts = {
998
- 'llama3.1': 131072, 'llama3.2': 131072, 'llama3.3': 131072, 'llama3': 8192,
999
- 'llama2': 4096, 'mixtral8x22b': 65536, 'mixtral': 32768, 'mistral': 32768,
1000
- 'gemma3': 131072, 'gemma2': 8192, 'gemma': 8192, 'phi3': 131072, 'phi2': 2048,
1001
- 'phi': 2048, 'qwen2.5': 131072, 'qwen2': 32768, 'qwen': 8192,
1002
- 'codellama': 16384, 'codegemma': 8192, 'deepseek-coder-v2': 131072,
1003
- 'deepseek-coder': 16384, 'deepseek-v2': 131072, 'deepseek-llm': 4096,
1004
- 'yi1.5': 32768, 'yi': 4096, 'command-r': 131072, 'wizardlm2': 32768,
1005
- 'wizardlm': 16384, 'zephyr': 65536, 'vicuna': 2048, 'falcon': 2048,
1006
- 'starcoder': 8192, 'stablelm': 4096, 'orca2': 4096, 'orca': 4096,
1007
- 'dolphin': 32768, 'openhermes': 8192,
1008
- }
1009
- normalized_model_name = model_name.lower().strip()
1010
- sorted_base_models = sorted(known_contexts.keys(), key=len, reverse=True)
1011
-
1012
- for base_name in sorted_base_models:
1013
- if base_name in normalized_model_name:
1014
- context_size = known_contexts[base_name]
1015
- ASCIIColors.info(f"Using hardcoded context size for '{model_name}' based on '{base_name}': {context_size}")
1016
- return context_size
1017
-
1018
- ASCIIColors.warning(f"Context size not found for model '{model_name}' in the hardcoded list.")
1019
- return None
1020
-
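Because `get_ctx_size` checks the longest known base names first and matches by substring, more specific entries win over generic ones (e.g. `llama3.1` before `llama3`). A standalone sketch of the same lookup with a trimmed-down table:

```python
from typing import Optional

known_contexts = {"llama3.1": 131072, "llama3": 8192, "qwen2.5": 131072, "qwen2": 32768}

def lookup(model_name: str) -> Optional[int]:
    # Longest keys first, so 'llama3.1' is tested before 'llama3'.
    name = model_name.lower().strip()
    for base in sorted(known_contexts, key=len, reverse=True):
        if base in name:
            return known_contexts[base]
    return None

print(lookup("Meta-Llama3.1-8B-Instruct.Q4_K_M.gguf"))  # 131072
print(lookup("qwen2-7b-instruct.gguf"))                 # 32768
```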
1021
- if __name__ == '__main__':
1022
- # NOTE: This test block is designed for a single-process scenario to verify basic functionality.
1023
- # Testing the multi-process capabilities requires a separate script that launches multiple
1024
- # instances of a test program using this binding. The logic here, however, will now use the
1025
- # new file-based registry system.
1026
- full_streamed_text = ""
1027
- ASCIIColors.yellow("Testing LlamaCppServerBinding...")
1028
-
1029
- try:
1030
- models_path_str = os.environ.get("LOLLMS_MODELS_PATH", str(Path(__file__).parent / "test_models"))
1031
- model_name_str = os.environ.get("LOLLMS_TEST_MODEL_GGUF", "tinyllama-1.1b-chat-v1.0.Q2_K.gguf")
1032
-
1033
- models_path = Path(models_path_str)
1034
- models_path.mkdir(parents=True, exist_ok=True)
1035
- test_model_path = models_path / model_name_str
1036
-
1037
- primary_model_available = test_model_path.exists()
1038
- if not primary_model_available:
1039
- ASCIIColors.warning(f"Test model {test_model_path} not found. Please place a GGUF model there or set env vars.")
1040
- ASCIIColors.warning("Some tests will be skipped.")
1041
-
1042
- except Exception as e:
1043
- ASCIIColors.error(f"Error setting up test paths: {e}"); trace_exception(e)
1044
- sys.exit(1)
1045
-
1046
- binding_config = {
1047
- "n_gpu_layers": 0, "n_ctx": 512, "embedding": True,
1048
- "verbose": False, "server_startup_timeout": 180, "parallel_slots": 2,
1049
- "stop_sequences": ["<|user|>", "\nUSER:"], # Example default stop sequences
1050
- }
1051
-
1052
- active_binding1: Optional[LlamaCppServerBinding] = None
1053
- active_binding2: Optional[LlamaCppServerBinding] = None
1054
-
1055
- try:
1056
- if primary_model_available:
1057
- # --- Test 1: Auto-start server on first generation call ---
1058
- ASCIIColors.cyan("\n--- Test 1: Auto-start server with specified model name ---")
1059
- active_binding1 = LlamaCppServerBinding(
1060
- model_name=model_name_str, models_path=str(models_path), config=binding_config
1061
- )
1062
- ASCIIColors.info("Binding1 initialized. No server should be running yet.")
1063
- ASCIIColors.info(f"Initial model info: {json.dumps(active_binding1.get_model_info(), indent=2)}")
1064
-
1065
- prompt_text = "What is the capital of France?"
1066
- generated_text = active_binding1.generate_text(
1067
- prompt_text,
1068
- system_prompt="Concise expert.",
1069
- n_predict=20,
1070
- stream=False,
1071
- stop_sequences=["Paris"] # Test per-call stop sequence
1072
- )
1073
-
1074
- if isinstance(generated_text, str) and "Paris" not in generated_text: # Should stop *before* generating Paris
1075
- ASCIIColors.green(f"SUCCESS: Auto-start generation with stop sequence successful. Response: '{generated_text}'")
1076
- else:
1077
- ASCIIColors.error(f"FAILURE: Auto-start generation failed or stop sequence ignored. Response: {generated_text}")
1078
-
1079
- ASCIIColors.info(f"Model info after auto-start: {json.dumps(active_binding1.get_model_info(), indent=2)}")
1080
- if not active_binding1.server_process or not active_binding1.server_process.is_healthy:
1081
- raise RuntimeError("Server for binding1 did not seem to start correctly.")
1082
-
1083
- # --- Test 2: Server reuse with a second binding ---
1084
- ASCIIColors.cyan("\n--- Test 2: Server reuse with a second binding ---")
1085
- active_binding2 = LlamaCppServerBinding(
1086
- model_name=model_name_str, models_path=str(models_path), config=binding_config
1087
- )
1088
- generated_text_b2 = active_binding2.generate_text("Ping", n_predict=5, stream=False)
1089
- if isinstance(generated_text_b2, str):
1090
- ASCIIColors.green(f"SUCCESS: Binding2 generation successful. Response: {generated_text_b2}")
1091
- else:
1092
- ASCIIColors.error(f"FAILURE: Binding2 generation failed. Response: {generated_text_b2}")
1093
-
1094
- if active_binding1.port != active_binding2.port:
1095
- ASCIIColors.error("FAILURE: Bindings for the same model are using different ports! Server sharing failed.")
1096
- else:
1097
- ASCIIColors.green("SUCCESS: Both bindings use the same server port. Server sharing works.")
1098
-
1099
- # --- Test 3: Unload and auto-reload ---
1100
- ASCIIColors.cyan("\n--- Test 3: Unload and auto-reload ---")
1101
- active_binding1.unload_model()
1102
- ASCIIColors.info("Binding1 unloaded. Ref count should be 1, server still up for binding2.")
1103
-
1104
- generated_text_reloaded = active_binding1.generate_text("Test reload", n_predict=5, stream=False)
1105
- if isinstance(generated_text_reloaded, str):
1106
- ASCIIColors.green(f"SUCCESS: Generation after reload successful. Response: {generated_text_reloaded}")
1107
- else:
1108
- ASCIIColors.error(f"FAILURE: Generation after reload failed. Response: {generated_text_reloaded}")
1109
-
1110
- if active_binding1.port != active_binding2.port:
1111
- ASCIIColors.error("FAILURE: Port mismatch after reload.")
1112
- else:
1113
- ASCIIColors.green("SUCCESS: Correctly re-used same server after reload.")
1114
-
1115
- else:
1116
- ASCIIColors.warning("\n--- Primary model not available, skipping most tests ---")
1117
-
1118
- # --- Test 4: Initialize with model_name=None and auto-find ---
1119
- ASCIIColors.cyan("\n--- Test 4: Initialize with model_name=None and auto-find ---")
1120
- unspecified_binding = LlamaCppServerBinding(model_name=None, models_path=str(models_path), config=binding_config)
1121
- gen_unspec = unspecified_binding.generate_text("Ping", n_predict=5, stream=False)
1122
- if primary_model_available:
1123
- if isinstance(gen_unspec, str):
1124
- ASCIIColors.green(f"SUCCESS: Auto-find generation successful. Response: {gen_unspec}")
1125
- ASCIIColors.info(f"Model auto-selected: {unspecified_binding.user_provided_model_name}")
1126
- else:
1127
- ASCIIColors.error(f"FAILURE: Auto-find generation failed. Response: {gen_unspec}")
1128
- else: # If no models, this should fail gracefully
1129
- if isinstance(gen_unspec, dict) and 'error' in gen_unspec:
1130
- ASCIIColors.green("SUCCESS: Correctly failed to generate when no models are available.")
1131
- else:
1132
- ASCIIColors.error(f"FAILURE: Incorrect behavior when no models are available. Response: {gen_unspec}")
1133
-
1134
- except Exception as e_main:
1135
- ASCIIColors.error(f"An unexpected error occurred during testing: {e_main}")
1136
- trace_exception(e_main)
1137
- finally:
1138
- ASCIIColors.cyan("\n--- Unloading Models and Stopping Servers ---")
1139
- if active_binding1: active_binding1.unload_model(); ASCIIColors.info("Binding1 unloaded.")
1140
- if active_binding2: active_binding2.unload_model(); ASCIIColors.info("Binding2 unloaded.")
1141
- # Any other bindings will be cleaned up by __del__ on exit
1142
-
1143
- registry = ServerRegistry()
1144
- with FileLock(registry.lock_file):
1145
- final_state = registry._read_registry()
1146
- if not final_state or not any(c for s in final_state.values() for c in s.get('client_pids',[])):
1147
- ASCIIColors.green("All servers shut down correctly and registry is empty or has no clients.")
1148
- if final_state: registry._write_registry({}) # Clean up for next run
1149
- else:
1150
- ASCIIColors.warning(f"Warning: Registry is not empty after tests: {final_state}")
1151
- registry._clean_stale_entries(final_state)
1152
- registry._write_registry(final_state)
1153
- ASCIIColors.info("Forced a final registry cleanup.")
1154
-
1155
- ASCIIColors.yellow("\nLlamaCppServerBinding test finished.")