lollms-client 1.3.4__py3-none-any.whl → 1.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lollms-client might be problematic.
- lollms_client/__init__.py +1 -1
- lollms_client/llm_bindings/llamacpp/__init__.py +354 -233
- lollms_client/llm_bindings/lollms/__init__.py +152 -153
- lollms_client/lollms_core.py +162 -76
- lollms_client/lollms_discussion.py +2 -2
- lollms_client/lollms_llm_binding.py +3 -3
- lollms_client/lollms_tts_binding.py +80 -67
- lollms_client/tts_bindings/bark/__init__.py +110 -329
- lollms_client/tts_bindings/bark/server/install_bark.py +64 -0
- lollms_client/tts_bindings/bark/server/main.py +311 -0
- lollms_client/tts_bindings/piper_tts/__init__.py +115 -335
- lollms_client/tts_bindings/piper_tts/server/install_piper.py +92 -0
- lollms_client/tts_bindings/piper_tts/server/main.py +425 -0
- lollms_client/tts_bindings/piper_tts/server/setup_voices.py +67 -0
- lollms_client/tts_bindings/xtts/__init__.py +99 -305
- lollms_client/tts_bindings/xtts/server/main.py +314 -0
- lollms_client/tts_bindings/xtts/server/setup_voices.py +67 -0
- {lollms_client-1.3.4.dist-info → lollms_client-1.3.7.dist-info}/METADATA +1 -1
- {lollms_client-1.3.4.dist-info → lollms_client-1.3.7.dist-info}/RECORD +22 -15
- {lollms_client-1.3.4.dist-info → lollms_client-1.3.7.dist-info}/WHEEL +0 -0
- {lollms_client-1.3.4.dist-info → lollms_client-1.3.7.dist-info}/licenses/LICENSE +0 -0
- {lollms_client-1.3.4.dist-info → lollms_client-1.3.7.dist-info}/top_level.txt +0 -0
@@ -8,6 +8,7 @@ import subprocess
 import sys
 import threading
 import time
+import tempfile
 from pathlib import Path
 from typing import Optional, Callable, List, Union, Dict, Any, Set
 import base64
@@ -20,29 +21,62 @@ from ascii_colors import ASCIIColors, trace_exception
 import pipmaster as pm
 import platform

-#
-
+# --- Multi-process locking for registry ---
+# On Windows, we need msvcrt, on POSIX, fcntl
+try:
+    if platform.system() == "Windows":
+        import msvcrt
+    else:
+        import fcntl
+except ImportError:
+    # This might happen in some restricted environments.
+    # The binding will fall back to thread-safety only.
+    msvcrt = fcntl = None
+
+
+class FileLock:
+    def __init__(self, lock_file_path):
+        self.lock_file_path = lock_file_path
+        self.lock_file = None
+        self._is_windows = platform.system() == "Windows"
+
+    def __enter__(self):
+        self.lock_file = open(self.lock_file_path, 'w')
+        if self._is_windows and msvcrt:
+            msvcrt.locking(self.lock_file.fileno(), msvcrt.LK_LOCK, 1)
+        elif not self._is_windows and fcntl:
+            fcntl.flock(self.lock_file.fileno(), fcntl.LOCK_EX)
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if self.lock_file:
+            if self._is_windows and msvcrt:
+                self.lock_file.seek(0)
+                msvcrt.locking(self.lock_file.fileno(), msvcrt.LK_UNLCK, 1)
+            elif not self._is_windows and fcntl:
+                fcntl.flock(self.lock_file.fileno(), fcntl.LOCK_UN)
+            self.lock_file.close()
+            self.lock_file = None
+
+# --- End multi-process locking ---
+
+
+# Ensure llama-cpp-binaries, requests, pillow, and psutil are installed
+pm.ensure_packages(["requests", "pillow", "psutil"]) # pillow for dummy image in test, psutil for multi-process management
 if not pm.is_installed("llama-cpp-binaries"):
     def install_llama_cpp():
         system = platform.system()
         python_version_simple = f"py{sys.version_info.major}" # e.g. py310 for 3.10

-        # Determine CUDA suffix based on common recent versions. Adjust if needed.
-        # For simplicity, we'll target a common recent CUDA version.
-        # Users with specific needs might need to install manually.
-        # As of late 2023/early 2024, cu121 or cu118 are common.
-        # The oobabooga binaries often use +cu124 for recent builds. Let's try that.
         cuda_suffix = "+cu124"


         if system == "Windows":
-
-
-            fallback_url = "https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-win_amd64.whl" # Generic py3
+            url = f"https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0{cuda_suffix}-{python_version_simple}-none-win_amd64.whl"
+            fallback_url = "https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cu124-py3-none-win_amd64.whl" # Generic py3
         elif system == "Linux":
-
-
-            fallback_url = "https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-linux_x86_64.whl" # Generic py3
+            url = f"https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0{cuda_suffix}-{python_version_simple}-none-linux_x86_64.whl"
+            fallback_url = "https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cu124-py3-none-linux_x86_64.whl" # Generic py3
         else:
             ASCIIColors.warning(f"Unsupported OS for prebuilt llama-cpp-binaries: {system}. Please install manually.")
             return
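The FileLock helper added above is a small cross-platform advisory lock built on msvcrt.locking on Windows and fcntl.flock on POSIX; every registry operation later in this file is wrapped in it. Below is a minimal usage sketch, not part of the package. It assumes lollms-client 1.3.7 is installed and that importing lollms_client.llm_bindings.llamacpp (which runs the package checks at the top of this module) is acceptable in your environment; the directory and file names mirror the ones used by ServerRegistry further down.

    import json
    import tempfile
    from pathlib import Path

    # Illustration only: reuse the FileLock context manager introduced in this diff.
    from lollms_client.llm_bindings.llamacpp import FileLock

    shared_dir = Path(tempfile.gettempdir()) / "lollms_llamacpp_servers"
    shared_dir.mkdir(parents=True, exist_ok=True)
    registry_file = shared_dir / "registry.json"

    # Only one process at a time holds the lock on registry.lock while this block runs.
    with FileLock(shared_dir / "registry.lock"):
        data = json.loads(registry_file.read_text()) if registry_file.exists() else {}
        data["example_entry"] = {"pid": 0, "port": 0, "ref_count": 0, "client_pids": []}
        registry_file.write_text(json.dumps(data, indent=2))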
@@ -63,11 +97,12 @@ if not pm.is_installed("llama-cpp-binaries"):

 try:
     import llama_cpp_binaries
+    import psutil
 except ImportError:
-    ASCIIColors.error("llama-cpp-binaries package not found. Please
-    ASCIIColors.error("You can try: pip install llama-cpp-python[server]
-    ASCIIColors.error("Or download a wheel from: https://github.com/oobabooga/llama-cpp-binaries/releases or https://pypi.org/project/llama-cpp-python/#files")
+    ASCIIColors.error("llama-cpp-binaries or psutil package not found. Please ensure they are installed.")
+    ASCIIColors.error("You can try: pip install llama-cpp-python[server] psutil")
     llama_cpp_binaries = None
+    psutil = None


 # --- Predefined patterns ---
@@ -112,10 +147,130 @@ def get_gguf_model_base_name(file_path_or_name: Union[str, Path]) -> str:
     while name_part and name_part[-1] in ['.', '-', '_']: name_part = name_part[:-1]
     return name_part

-# --- Global Server Registry ---
-
-
-
+# --- Global Server Registry (File-based for multi-process support) ---
+
+class ServerRegistry:
+    def __init__(self):
+        self.registry_dir = Path(tempfile.gettempdir()) / "lollms_llamacpp_servers"
+        self.registry_dir.mkdir(parents=True, exist_ok=True)
+        self.registry_file = self.registry_dir / "registry.json"
+        self.lock_file = self.registry_dir / "registry.lock"
+        self.my_pid = os.getpid()
+
+    def _is_pid_running(self, pid: int) -> bool:
+        if psutil is None: return True # Conservative default if psutil is missing
+        return psutil.pid_exists(pid)
+
+    def _read_registry(self) -> Dict[str, Any]:
+        if not self.registry_file.exists():
+            return {}
+        try:
+            with open(self.registry_file, 'r') as f:
+                return json.load(f)
+        except (json.JSONDecodeError, FileNotFoundError):
+            return {}
+
+    def _write_registry(self, data: Dict[str, Any]):
+        with open(self.registry_file, 'w') as f:
+            json.dump(data, f, indent=2)
+
+    def _clean_stale_entries(self, registry_data: Dict[str, Any]) -> bool:
+        """Cleans stale servers and clients. Returns True if changes were made."""
+        changed = False
+        # Clean dead servers
+        dead_servers = [k for k, v in registry_data.items() if not self._is_pid_running(v['pid'])]
+        for key in dead_servers:
+            ASCIIColors.warning(f"Registry Cleaner: Found dead server process (PID: {registry_data[key]['pid']}). Removing entry {key}.")
+            del registry_data[key]
+            changed = True
+
+        # Clean dead clients from living servers
+        for key, server_info in list(registry_data.items()):
+            dead_clients = [pid for pid in server_info.get('client_pids', []) if not self._is_pid_running(pid)]
+            if dead_clients:
+                ASCIIColors.warning(f"Registry Cleaner: Found dead client PIDs {dead_clients} for server {key}. Cleaning up.")
+                server_info['client_pids'] = [pid for pid in server_info['client_pids'] if pid not in dead_clients]
+                server_info['ref_count'] = len(server_info['client_pids'])
+                changed = True
+
+            # If a server has no clients left after cleanup, it's an orphan. Remove it.
+            if server_info['ref_count'] <= 0:
+                ASCIIColors.warning(f"Registry Cleaner: Server {key} (PID: {server_info['pid']}) has no clients left. Shutting it down.")
+                try:
+                    p = psutil.Process(server_info['pid'])
+                    p.terminate()
+                    p.wait(timeout=5)
+                except psutil.NoSuchProcess: pass
+                except Exception as e: ASCIIColors.error(f"Error terminating orphaned server PID {server_info['pid']}: {e}")
+                del registry_data[key]
+                changed = True
+
+        return changed
+
+    def get_server(self, server_key: str) -> Optional[Dict[str, Any]]:
+        with FileLock(self.lock_file):
+            registry = self._read_registry()
+            self._clean_stale_entries(registry) # Always clean before read
+            server_info = registry.get(server_key)
+            if server_info:
+                self._write_registry(registry) # Write back changes from cleaning
+            return server_info
+
+    def register_new_server(self, server_key: str, pid: int, port: int):
+        with FileLock(self.lock_file):
+            registry = self._read_registry()
+            # Clean just in case something happened between server start and registration
+            self._clean_stale_entries(registry)
+
+            registry[server_key] = {
+                "pid": pid, "port": port,
+                "ref_count": 1, "client_pids": [self.my_pid]
+            }
+            self._write_registry(registry)
+            ASCIIColors.info(f"Process {self.my_pid} registered new server {server_key} (PID: {pid}, Port: {port})")
+
+    def increment_ref_count(self, server_key: str):
+        with FileLock(self.lock_file):
+            registry = self._read_registry()
+            self._clean_stale_entries(registry)
+
+            server_info = registry.get(server_key)
+            if server_info:
+                if self.my_pid not in server_info['client_pids']:
+                    server_info['client_pids'].append(self.my_pid)
+                server_info['ref_count'] = len(server_info['client_pids'])
+                self._write_registry(registry)
+                ASCIIColors.info(f"Process {self.my_pid} attached to server {server_key}. New ref_count: {server_info['ref_count']}")
+            else:
+                ASCIIColors.warning(f"Process {self.my_pid} tried to attach to non-existent server {server_key}.")
+
+    def decrement_ref_count(self, server_key: str):
+        with FileLock(self.lock_file):
+            registry = self._read_registry()
+            made_changes = self._clean_stale_entries(registry)
+
+            server_info = registry.get(server_key)
+            if server_info:
+                if self.my_pid in server_info['client_pids']:
+                    server_info['client_pids'].remove(self.my_pid)
+                    server_info['ref_count'] = len(server_info['client_pids'])
+                    made_changes = True
+                    ASCIIColors.info(f"Process {self.my_pid} detached from server {server_key}. New ref_count: {server_info['ref_count']}")
+
+                if server_info['ref_count'] <= 0:
+                    ASCIIColors.info(f"Last client (PID: {self.my_pid}) detached. Shutting down server {server_key} (PID: {server_info['pid']}).")
+                    try:
+                        p = psutil.Process(server_info['pid'])
+                        p.terminate()
+                        p.wait(timeout=10)
+                    except psutil.NoSuchProcess:
+                        ASCIIColors.warning(f"Server process {server_info['pid']} was already gone.")
+                    except Exception as e:
+                        ASCIIColors.error(f"Error terminating server process {server_info['pid']}: {e}")
+                    del registry[server_key]
+
+            if made_changes:
+                self._write_registry(registry)

 BindingName = "LlamaCppServerBinding"
 DEFAULT_LLAMACPP_SERVER_HOST = "127.0.0.1"
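Taken together, FileLock and ServerRegistry are what let several independent Python processes share a single llama.cpp server per model/clip-model pair: the first process starts and registers the server, later processes find the entry under the lock and attach, and the server is terminated only when the last registered client (or the stale-entry cleaner) lets go. A minimal sketch of that behaviour from the client side, assuming lollms-client 1.3.7 is installed and that the model name and models folder below (both hypothetical) point at a real GGUF file:

    # Run this same script from two terminals; both processes should end up
    # talking to one shared llama.cpp server.
    from lollms_client.llm_bindings.llamacpp import LlamaCppServerBinding

    binding = LlamaCppServerBinding(
        model_name="my_model.Q4_K_M.gguf",     # hypothetical GGUF file name
        models_path="/path/to/gguf/models",    # hypothetical models folder
        config={"n_gpu_layers": 0, "n_ctx": 4096},
    )

    # The first process to get here starts the server and registers it in the
    # file-based registry; the second finds the entry and attaches, which only
    # increments the ref count instead of spawning a second server.
    print(binding.generate_text("Say hello in one word.", n_predict=8, stream=False))

    # The shared server is shut down when the last registered client detaches.
    binding.unload_model()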
@@ -125,9 +280,12 @@ class LlamaCppServerProcess:
                 model_path: Union[str, Path],
                 clip_model_path: Optional[Union[str, Path]] = None,
                 server_binary_path: Optional[Union[str, Path]]=None,
-                server_args: Dict[str, Any]={}
+                server_args: Dict[str, Any]={},
+                process_pid: Optional[int]=None, # PID if we are attaching to existing process
+                port: Optional[int]=None,
                 ):
-        """Initialize the Llama.cpp server process.
+        """Initialize the Llama.cpp server process wrapper.
+        Can either start a new process or wrap an existing one.
         """
         self.model_path = Path(model_path)
         self.clip_model_path = Path(clip_model_path) if clip_model_path else None
@@ -139,12 +297,14 @@ class LlamaCppServerProcess:
         else:
             raise FileNotFoundError("llama_cpp_binaries not found and no server_binary_path provided.")

-        self.port: Optional[int] =
+        self.port: Optional[int] = port
+        self.pid: Optional[int] = process_pid
         self.server_args = server_args
-
+        # The actual subprocess.Popen object. Will be None if this instance is just a client to a server started by another process.
+        self.process: Optional[subprocess.Popen] = None
         self.session = requests.Session()
         self.host = self.server_args.get("host",DEFAULT_LLAMACPP_SERVER_HOST)
-        self.base_url: Optional[str] =
+        self.base_url: Optional[str] = f"http://{self.host}:{self.port}" if self.port else None
         self.is_healthy = False
         self._stderr_lines: List[str] = []
         self._stderr_thread: Optional[threading.Thread] = None
@@ -156,6 +316,23 @@ class LlamaCppServerProcess:
         if not self.server_binary_path.exists():
             raise FileNotFoundError(f"Llama.cpp server binary not found: {self.server_binary_path}")

+    def attach(self):
+        """Attaches to an already running process by checking its health."""
+        if not self.pid or not self.port:
+            raise ValueError("Cannot attach without PID and port.")
+        self.base_url = f"http://{self.host}:{self.port}"
+        health_url = f"{self.base_url}/health"
+        try:
+            response = self.session.get(health_url, timeout=5)
+            if response.status_code == 200 and response.json().get("status") == "ok":
+                self.is_healthy = True
+                ASCIIColors.green(f"Successfully attached to Llama.cpp server on port {self.port} (PID: {self.pid}).")
+                return
+        except requests.exceptions.RequestException as e:
+            ASCIIColors.warning(f"Failed to attach to server on port {self.port}: {e}")
+        self.is_healthy = False
+        raise ConnectionError(f"Could not connect to existing server at {health_url}")
+
     def _filter_stderr(self, stderr_pipe):
         try:
             for line in iter(stderr_pipe.readline, ''):
@@ -216,6 +393,7 @@ class LlamaCppServerProcess:

         try:
             self.process = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE, text=True, bufsize=1, env=env)
+            self.pid = self.process.pid
         except Exception as e:
             ASCIIColors.error(f"Failed to start llama.cpp server process on port {self.port}: {e}"); trace_exception(e); raise

@@ -234,7 +412,7 @@ class LlamaCppServerProcess:
                 response = self.session.get(health_url, timeout=2)
                 if response.status_code == 200 and response.json().get("status") == "ok":
                     self.is_healthy = True
-                    ASCIIColors.green(f"Llama.cpp server started successfully on port {self.port}.")
+                    ASCIIColors.green(f"Llama.cpp server started successfully on port {self.port} (PID: {self.pid}).")
                     return
             except requests.exceptions.ConnectionError: time.sleep(1)
             except Exception as e: ASCIIColors.warning(f"Health check for port {self.port} failed: {e}"); time.sleep(1)
@@ -245,12 +423,13 @@ class LlamaCppServerProcess:
         raise TimeoutError(f"Llama.cpp server failed to become healthy on port {self.port} within {max_wait_time}s. Stderr:\n{stderr_output}")

     def shutdown(self):
+        """ This method only shuts down a server if this instance owns the Popen object.
+        The actual termination for multi-process is handled by the ServerRegistry. """
         self.is_healthy = False
         if self.process:
-            ASCIIColors.info(f"Shutting down Llama.cpp server (PID: {self.process.pid} on port {self.port})...")
+            ASCIIColors.info(f"Shutting down owned Llama.cpp server process (PID: {self.process.pid} on port {self.port})...")
             try:
-
-                else: self.process.terminate()
+                self.process.terminate()
                 self.process.wait(timeout=10)
             except subprocess.TimeoutExpired:
                 ASCIIColors.warning(f"Llama.cpp server (port {self.port}) did not terminate gracefully, killing...")
@@ -269,45 +448,31 @@ class LlamaCppServerBinding(LollmsLLMBinding):
         "n_gpu_layers": 0, "n_ctx": 128000, "n_batch": 512,
         "embedding": False, "verbose": False, "server_startup_timeout": 120,
         "parallel_slots": 4, # Default parallel slots for server
+        "stop_sequences": ["<|im_start|>"], # Default stop sequences
     }

-    def __init__(self,
-                 **kwargs
-                 ):
-        """Initialize the Llama.cpp server binding.
-        Args:
-            model_name (str): Name of the model to load. If None, will use initial_model_name_preference.
-            models_path (str): Path to the directory containing model files.
-            clip_model_name (str): Optional name of the clip model to use. If None, will try to auto-detect based on the main model.
-            config (dict): Additional configuration options for the server.
-            default_completion_format (ELF_COMPLETION_FORMAT): Default format for completions.
-
-        """
+    def __init__(self, **kwargs):
         super().__init__(BindingName, **kwargs)
-        if llama_cpp_binaries is None
+        if llama_cpp_binaries is None or psutil is None:
+            raise ImportError("llama-cpp-binaries and psutil packages are required.")

+        self.registry = ServerRegistry()
         models_path = kwargs.get("models_path", Path(__file__).parent/"models")
         self.models_path = Path(models_path)
-        # Store initial preferences, but do not load/start server yet.
         self.initial_model_name_preference: Optional[str] = kwargs.get("model_name")
-        self.user_provided_model_name: Optional[str] = kwargs.get("model_name")
+        self.user_provided_model_name: Optional[str] = kwargs.get("model_name")
         self.initial_clip_model_name_preference: Optional[str] = kwargs.get("clip_model_name")
-
-        self._model_path_map: Dict[str, Path] = {} # Maps unique name to full Path
-
-        # Initial scan for available models (to populate listModels)
+        self._model_path_map: Dict[str, Path] = {}
         self._scan_models()
-
         self.default_completion_format = kwargs.get("default_completion_format", ELF_COMPLETION_FORMAT.Chat)
         self.server_args = {**self.DEFAULT_SERVER_ARGS, **(kwargs.get("config") or {}), **kwargs}
         self.server_binary_path = self._get_server_binary_path()

-        # Current state of the loaded model and server
         self.current_model_path: Optional[Path] = None
-        self.clip_model_path: Optional[Path] = None
+        self.clip_model_path: Optional[Path] = None
         self.server_process: Optional[LlamaCppServerProcess] = None
         self.port: Optional[int] = None
-        self.server_key: Optional[
+        self.server_key: Optional[str] = None

         ASCIIColors.info("LlamaCppServerBinding initialized. Server will start on-demand with first generation call.")

@@ -324,7 +489,7 @@ class LlamaCppServerBinding(LollmsLLMBinding):
             bin_path = Path(bin_path_str)
             if bin_path.exists() and bin_path.is_file():
                 ASCIIColors.info(f"Using binary from llama-cpp-binaries: {bin_path}"); return bin_path
-        raise FileNotFoundError("Llama.cpp server binary not found.
+        raise FileNotFoundError("Llama.cpp server binary not found.")

     def _resolve_model_path(self, model_name_or_path: str) -> Path:
         """
@@ -332,36 +497,16 @@ class LlamaCppServerBinding(LollmsLLMBinding):
         It prioritizes the internal map, then checks for absolute/relative paths,
         and rescans the models directory as a fallback.
         """
-        # 1. Check if the provided name is a key in our map
         if model_name_or_path in self._model_path_map:
-
-            ASCIIColors.info(f"Resolved model name '{model_name_or_path}' to path: {resolved_path}")
-            return resolved_path
-
-        # 2. If not in map, treat it as a potential path (absolute or relative to models_path)
+            return self._model_path_map[model_name_or_path]
         model_p = Path(model_name_or_path)
-        if model_p.is_absolute():
-            if model_p.exists() and model_p.is_file():
-                return model_p
-
+        if model_p.is_absolute() and model_p.exists(): return model_p
         path_in_models_dir = self.models_path / model_name_or_path
-        if path_in_models_dir.exists()
-            ASCIIColors.info(f"Found model at relative path: {path_in_models_dir}")
-            return path_in_models_dir
-
-        # 3. As a fallback, rescan the models directory in case the file was just added
-        ASCIIColors.info("Model not found in cache, rescanning directory...")
+        if path_in_models_dir.exists(): return path_in_models_dir
         self._scan_models()
         if model_name_or_path in self._model_path_map:
-
-
-            return resolved_path
-
-        # Final check for absolute path after rescan
-        if model_p.is_absolute() and model_p.exists() and model_p.is_file():
-            return model_p
-
-        raise FileNotFoundError(f"Model '{model_name_or_path}' not found in the map, as an absolute path, or within '{self.models_path}'.")
+            return self._model_path_map[model_name_or_path]
+        raise FileNotFoundError(f"Model '{model_name_or_path}' not found.")

     def _find_available_port(self) -> int:
         with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
@@ -369,147 +514,105 @@ class LlamaCppServerBinding(LollmsLLMBinding):

     def _release_server_instance(self):
         if self.server_process and self.server_key:
-
-            if self.server_key in _server_ref_counts:
-                _server_ref_counts[self.server_key] -= 1
-                ASCIIColors.info(f"Decremented ref count for server {self.server_key}. New count: {_server_ref_counts[self.server_key]}")
-                if _server_ref_counts[self.server_key] <= 0:
-                    ASCIIColors.info(f"Ref count for server {self.server_key} is zero. Shutting it down.")
-                    server_to_stop = _active_servers.pop(self.server_key, None)
-                    _server_ref_counts.pop(self.server_key, None)
-                    if server_to_stop:
-                        try: server_to_stop.shutdown()
-                        except Exception as e: ASCIIColors.error(f"Error shutting down server {self.server_key}: {e}")
-            else:
-                ASCIIColors.warning(f"Server key {self.server_key} not in ref counts during release. Might have been shut down already.")
-                _active_servers.pop(self.server_key, None) # Ensure removal
-
+            self.registry.decrement_ref_count(self.server_key)
         self.server_process = None
         self.port = None
         self.server_key = None
-        self.current_model_path = None
-        self.clip_model_path = None
+        self.current_model_path = None
+        self.clip_model_path = None

     def load_model(self, model_name_or_path: str) -> bool:
-        self.user_provided_model_name = model_name_or_path
+        self.user_provided_model_name = model_name_or_path
         try:
             resolved_model_path = self._resolve_model_path(model_name_or_path)
         except Exception as ex:
-            trace_exception(ex)
-            return False
+            trace_exception(ex); return False

-        # Determine the final clip_model_path for this server instance
-        # Priority: 1. Explicit `initial_clip_model_name_preference` from __init__ (if valid path)
-        #           2. Auto-detection based on the resolved main model.
         final_clip_model_path: Optional[Path] = None
         if self.initial_clip_model_name_preference:
             p_clip_pref = Path(self.initial_clip_model_name_preference)
-            if p_clip_pref.is_absolute() and p_clip_pref.exists():
-
-
-            elif (self.models_path / self.initial_clip_model_name_preference).exists():
-                final_clip_model_path = self.models_path / self.initial_clip_model_name_preference
-                ASCIIColors.info(f"Using explicitly configured LLaVA clip model: {final_clip_model_path} (relative to models path)")
-            else:
-                ASCIIColors.warning(f"Specified initial clip_model_name '{self.initial_clip_model_name_preference}' not found. Attempting auto-detection.")
+            if p_clip_pref.is_absolute() and p_clip_pref.exists(): final_clip_model_path = p_clip_pref
+            elif (self.models_path / p_clip_pref).exists(): final_clip_model_path = self.models_path / p_clip_pref
+            else: ASCIIColors.warning(f"Specified clip model '{self.initial_clip_model_name_preference}' not found.")

-        if not final_clip_model_path:
+        if not final_clip_model_path:
             base_name = get_gguf_model_base_name(resolved_model_path.stem)
             potential_paths = [
                 resolved_model_path.parent / f"{base_name}.mmproj",
                 resolved_model_path.parent / f"mmproj-{base_name}.gguf",
-
-                self.models_path / f"{base_name}.mmproj", # Check in general models dir too
+                self.models_path / f"{base_name}.mmproj",
                 self.models_path / f"mmproj-{base_name}.gguf",
             ]
             for p_clip in potential_paths:
-                if p_clip.exists():
-                    final_clip_model_path = p_clip
-                    ASCIIColors.info(f"Auto-detected LLaVA clip model: {final_clip_model_path}")
-                    break
+                if p_clip.exists(): final_clip_model_path = p_clip; break

-        final_clip_model_path_str = str(final_clip_model_path) if final_clip_model_path else None
-
-        # Server key based on model and essential server configurations (like clip model)
-        new_server_key = (str(resolved_model_path), final_clip_model_path_str)
+        final_clip_model_path_str = str(final_clip_model_path) if final_clip_model_path else "None"
+        new_server_key = f"{resolved_model_path}|{final_clip_model_path_str}"

-
-
-
-            ASCIIColors.info(f"Model '{model_name_or_path}' with clip '{final_clip_model_path_str}' is already loaded and server is healthy on port {self.port}. No change.")
-            return True
-
-        # If this binding was using a *different* server, release it first
-        if self.server_process and self.server_key != new_server_key:
-            ASCIIColors.info(f"Switching models. Releasing previous server: {self.server_key}")
-            self._release_server_instance() # This clears self.server_process, self.port, self.server_key
-
-        # Check if a suitable server already exists in the global registry
-        if new_server_key in _active_servers:
-            existing_server = _active_servers[new_server_key]
-            if existing_server.is_healthy:
-                ASCIIColors.info(f"Reusing existing healthy server for {new_server_key} on port {existing_server.port}.")
-                self.server_process = existing_server
-                self.port = existing_server.port
-                _server_ref_counts[new_server_key] += 1
-                self.current_model_path = resolved_model_path
-                self.clip_model_path = final_clip_model_path # Update binding's clip path
-                self.server_key = new_server_key
-                return True
-            else: # Found existing but unhealthy server
-                ASCIIColors.warning(f"Found unhealthy server for {new_server_key}. Attempting to remove and restart.")
-                try: existing_server.shutdown()
-                except Exception as e: ASCIIColors.error(f"Error shutting down unhealthy server {new_server_key}: {e}")
-                _active_servers.pop(new_server_key, None)
-                _server_ref_counts.pop(new_server_key, None)
-
-        # No suitable server found or existing was unhealthy: start a new one
-        ASCIIColors.info(f"Starting new server for {new_server_key}.")
-        self.current_model_path = resolved_model_path
-        self.clip_model_path = final_clip_model_path # Update binding's clip path for the new server
-        self.server_key = new_server_key # Set before potential failure to allow cleanup by _release_server_instance
-
-        new_port_for_server = self._find_available_port()
-
-        current_server_args_for_new_server = self.server_args.copy()
-        # Ensure parallel_slots is set; it's crucial for shared servers
-        if "parallel_slots" not in current_server_args_for_new_server or not isinstance(current_server_args_for_new_server["parallel_slots"], int) or current_server_args_for_new_server["parallel_slots"] <=0:
-            current_server_args_for_new_server["parallel_slots"] = self.DEFAULT_SERVER_ARGS["parallel_slots"]
-
-        ASCIIColors.info(f"New Llama.cpp server: model={self.current_model_path}, clip={self.clip_model_path}, port={new_port_for_server}, slots={current_server_args_for_new_server['parallel_slots']}")
+        if self.server_process and self.server_key == new_server_key and self.server_process.is_healthy:
+            ASCIIColors.info(f"Model '{model_name_or_path}' is already loaded. No change.")
+            return True

+        if self.server_process and self.server_key != new_server_key:
+            self._release_server_instance()
+
+        # Check registry for an existing server
+        existing_server_info = self.registry.get_server(new_server_key)
+        if existing_server_info:
+            ASCIIColors.info(f"Found existing server for {new_server_key} in registry (PID: {existing_server_info['pid']}, Port: {existing_server_info['port']}). Attaching...")
             try:
-
-                model_path=
-
-
-                server_args=current_server_args_for_new_server,
+                self.server_process = LlamaCppServerProcess(
+                    model_path=resolved_model_path, clip_model_path=final_clip_model_path,
+                    process_pid=existing_server_info['pid'], port=existing_server_info['port'],
+                    server_args=self.server_args
                 )
-
-
-
-
-
-
-
-                ASCIIColors.green(f"New server {self.server_key} started on port {self.port}.")
-                return True
-            else: # Should have been caught by new_server.start() raising an error
-                ASCIIColors.error(f"New server {self.server_key} failed to become healthy (this state should be rare).")
-                self._release_server_instance() # Clean up registry if something went very wrong
-                return False
+                self.server_process.attach() # This verifies health
+                self.port = self.server_process.port
+                self.current_model_path = resolved_model_path
+                self.clip_model_path = final_clip_model_path
+                self.server_key = new_server_key
+                self.registry.increment_ref_count(new_server_key)
+                return True
             except Exception as e:
-                ASCIIColors.error(f"Failed to
-
-
+                ASCIIColors.error(f"Failed to attach to existing server: {e}. It might be stale. Will attempt to start a new one.")
+                self.registry.decrement_ref_count(new_server_key) # Clean up failed attach
+
+        # Start a new server
+        ASCIIColors.info(f"No existing server found for {new_server_key}. Starting a new one.")
+        self.current_model_path = resolved_model_path
+        self.clip_model_path = final_clip_model_path
+        self.server_key = new_server_key
+
+        try:
+            new_port = self._find_available_port()
+            current_server_args = self.server_args.copy()
+            if "parallel_slots" not in current_server_args or current_server_args["parallel_slots"] <=0:
+                current_server_args["parallel_slots"] = self.DEFAULT_SERVER_ARGS["parallel_slots"]
+
+            new_server = LlamaCppServerProcess(
+                model_path=self.current_model_path, clip_model_path=self.clip_model_path,
+                server_binary_path=self.server_binary_path, server_args=current_server_args
+            )
+            new_server.start(port_to_use=new_port)
+
+            if new_server.is_healthy:
+                self.server_process = new_server
+                self.port = new_port
+                self.registry.register_new_server(self.server_key, new_server.pid, new_port)
+                ASCIIColors.green(f"New server {self.server_key} started and registered.")
+                return True
+            else:
                 return False
+        except Exception as e:
+            ASCIIColors.error(f"Failed to start new server for '{model_name_or_path}': {e}"); trace_exception(e)
+            self._release_server_instance()
+            return False

     def unload_model(self):
         if self.server_process:
-
-            self._release_server_instance() # Handles ref counting and actual shutdown if needed
+            self._release_server_instance()
         else:
-            ASCIIColors.info("
+            ASCIIColors.info("Unload called, but no server was active for this binding instance.")

     def _ensure_server_is_running(self) -> bool:
         """
@@ -521,21 +624,18 @@ class LlamaCppServerBinding(LollmsLLMBinding):

         ASCIIColors.info("Server is not running. Attempting to start on-demand...")

-        # Determine which model to load
         model_to_load = self.user_provided_model_name or self.initial_model_name_preference

         if not model_to_load:
-            # No model specified, try to find one automatically
             self._scan_models()
             available_models = self.listModels()
             if not available_models:
                 ASCIIColors.error("No model specified and no GGUF models found in models path.")
                 return False

-            model_to_load = available_models[0]['name']
+            model_to_load = available_models[0]['name']
             ASCIIColors.info(f"No model was specified. Automatically selecting the first available model: '{model_to_load}'")

-        # Now, attempt to load the selected model
         if self.load_model(model_to_load):
             return True
         else:
@@ -543,7 +643,6 @@ class LlamaCppServerBinding(LollmsLLMBinding):
             return False

     def _get_request_url(self, endpoint: str) -> str:
-        # This function now assumes _ensure_server_is_running has been called.
         return f"{self.server_process.base_url}{endpoint}"

     def _prepare_generation_payload(self, prompt: str, system_prompt: str = "", n_predict: Optional[int] = None,
@@ -551,10 +650,10 @@ class LlamaCppServerBinding(LollmsLLMBinding):
                                     repeat_penalty: float = 1.1, repeat_last_n: Optional[int] = 64,
                                     seed: Optional[int] = None, stream: bool = False, use_chat_format: bool = True,
                                     images: Optional[List[str]] = None,
-
-
-
-
+                                    stop_sequences: Optional[List[str]] = None,
+                                    split:Optional[bool]=False,
+                                    user_keyword:Optional[str]="!@>user:",
+                                    ai_keyword:Optional[str]="!@>assistant:",
                                     **extra_params) -> Dict:
         payload_params = {
             "temperature": self.server_args.get("temperature", 0.7), "top_k": self.server_args.get("top_k", 40),
@@ -568,6 +667,15 @@ class LlamaCppServerBinding(LollmsLLMBinding):
         payload_params.update({"temperature": temperature, "top_k": top_k, "top_p": top_p, "repeat_penalty": repeat_penalty, "repeat_last_n": repeat_last_n})
         if n_predict is not None: payload_params['n_predict'] = n_predict
         if seed is not None: payload_params['seed'] = seed
+
+        # --- Handle stop sequences ---
+        all_stop_sequences = set(self.server_args.get("stop_sequences", []))
+        if stop_sequences:
+            all_stop_sequences.update(stop_sequences)
+        if all_stop_sequences:
+            payload_params['stop'] = list(all_stop_sequences)
+        # --- End stop sequences ---
+
         payload_params = {k: v for k, v in payload_params.items() if v is not None}
         payload_params.update(extra_params)

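The block above merges the binding-level defaults from server_args["stop_sequences"] with any per-call list into a single stop field sent to the server; because a set is used, duplicates collapse and the order of the resulting list is not guaranteed. A tiny standalone illustration of that merge (the variable names are only for demonstration):

    # Defaults configured on the binding plus a per-call list; the payload gets the union.
    server_defaults = ["<|im_start|>"]          # e.g. from DEFAULT_SERVER_ARGS / config
    per_call = ["\nUSER:", "<|im_start|>"]      # e.g. generate_text(stop_sequences=...)

    merged = set(server_defaults)
    merged.update(per_call)
    payload_stop = list(merged)                 # duplicates removed, order not guaranteed
    print(payload_stop)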
@@ -579,7 +687,7 @@ class LlamaCppServerBinding(LollmsLLMBinding):
                 messages += self.split_discussion(user_content,user_keyword=user_keyword, ai_keyword=ai_keyword)
             else:
                 messages.append({"role": "user", "content": user_content})
-            if images and self.clip_model_path:
+            if images and self.clip_model_path:
                 image_parts = []
                 for img_path in images:
                     try:
@@ -594,7 +702,7 @@ class LlamaCppServerBinding(LollmsLLMBinding):
         else:
             full_prompt = f"{system_prompt}\n\nUSER: {prompt}\nASSISTANT:" if system_prompt and system_prompt.strip() else prompt
             final_payload = {"prompt": full_prompt, "stream": stream, **payload_params}
-            if images and self.clip_model_path:
+            if images and self.clip_model_path:
                 image_data_list = []
                 for i, img_path in enumerate(images):
                     try:
@@ -620,6 +728,7 @@ class LlamaCppServerBinding(LollmsLLMBinding):
                       n_threads: Optional[int] = None,
                       ctx_size: int | None = None,
                       streaming_callback: Optional[Callable[[str, MSG_TYPE], None]] = None,
+                      stop_sequences: Optional[List[str]] = None,
                       split:Optional[bool]=False,
                       user_keyword:Optional[str]="!@>user:",
                       ai_keyword:Optional[str]="!@>assistant:",
@@ -639,6 +748,7 @@ class LlamaCppServerBinding(LollmsLLMBinding):
             repeat_last_n=repeat_last_n if repeat_last_n is not None else self.server_args.get("repeat_last_n",64),
             seed=seed if seed is not None else self.server_args.get("seed", -1), stream=stream,
             use_chat_format=_use_chat_format, images=images,
+            stop_sequences=stop_sequences,
             split= split, user_keyword=user_keyword, ai_keyword=ai_keyword, **generation_kwargs
         )
         endpoint = "/v1/chat/completions" if _use_chat_format else "/completion"
@@ -668,7 +778,7 @@ class LlamaCppServerBinding(LollmsLLMBinding):
             else:
                 response_data = response.json()
                 return response_data.get('choices', [{}])[0].get('message', {}).get('content', '') if _use_chat_format \
-                       else response_data.get('content','')
+                       else response_data.get('content','')
         except requests.exceptions.RequestException as e:
             error_message = f"Llama.cpp server request error: {e}"
             if e.response is not None:
@@ -694,6 +804,7 @@ class LlamaCppServerBinding(LollmsLLMBinding):
                                 n_threads: Optional[int] = None,
                                 ctx_size: Optional[int] = None,
                                 streaming_callback: Optional[Callable[[str, MSG_TYPE], None]] = None,
+                                stop_sequences: Optional[List[str]] = None,
                                 **generation_kwargs
                                 ) -> Union[str, dict]:

@@ -706,6 +817,13 @@ class LlamaCppServerBinding(LollmsLLMBinding):
             "top_k": top_k, "top_p": top_p, "repeat_penalty": repeat_penalty,
             "seed": seed, "stream": stream, **generation_kwargs
         }
+
+        all_stop_sequences = set(self.server_args.get("stop_sequences", []))
+        if stop_sequences:
+            all_stop_sequences.update(stop_sequences)
+        if all_stop_sequences:
+            payload['stop'] = list(all_stop_sequences)
+
         payload = {k: v for k, v in payload.items() if v is not None}

         endpoint = "/v1/chat/completions"
@@ -724,18 +842,20 @@ class LlamaCppServerBinding(LollmsLLMBinding):
                         if line_str == '[DONE]': break
                         try:
                             chunk_data = json.loads(line_str)
-
-                            if
-
-                            if
-
-
-
+                            choices = chunk_data.get('choices', [{}])
+                            if choices and len(choices)>0:
+                                chunk_content = choices[0].get('delta', {}).get('content', '')
+                                if chunk_content:
+                                    full_response_text += chunk_content
+                                    if streaming_callback and not streaming_callback(chunk_content, MSG_TYPE.MSG_TYPE_CHUNK):
+                                        ASCIIColors.info("Streaming callback requested stop.")
+                                        response.close()
+                                        break
                         except json.JSONDecodeError:
                             ASCIIColors.warning(f"Failed to decode JSON stream chunk: {line_str}")
                             continue
                 return full_response_text
-            else:
+            else:
                 response_data = response.json()
                 return response_data.get('choices', [{}])[0].get('message', {}).get('content', '')

@@ -794,7 +914,6 @@ class LlamaCppServerBinding(LollmsLLMBinding):
             return []

     def get_model_info(self) -> dict:
-        # This method reports the current state without triggering a server start
         is_loaded = self.server_process is not None and self.server_process.is_healthy
         info = {
             "name": self.binding_name,
@@ -893,7 +1012,10 @@ class LlamaCppServerBinding(LollmsLLMBinding):
         return None

 if __name__ == '__main__':
-
+    # NOTE: This test block is designed for a single-process scenario to verify basic functionality.
+    # Testing the multi-process capabilities requires a separate script that launches multiple
+    # instances of a test program using this binding. The logic here, however, will now use the
+    # new file-based registry system.
     full_streamed_text = ""
     ASCIIColors.yellow("Testing LlamaCppServerBinding...")

@@ -917,6 +1039,7 @@ if __name__ == '__main__':
     binding_config = {
         "n_gpu_layers": 0, "n_ctx": 512, "embedding": True,
         "verbose": False, "server_startup_timeout": 180, "parallel_slots": 2,
+        "stop_sequences": ["<|user|>", "\nUSER:"], # Example default stop sequences
     }

     active_binding1: Optional[LlamaCppServerBinding] = None
@@ -933,12 +1056,18 @@ if __name__ == '__main__':
         ASCIIColors.info(f"Initial model info: {json.dumps(active_binding1.get_model_info(), indent=2)}")

         prompt_text = "What is the capital of France?"
-        generated_text = active_binding1.generate_text(
+        generated_text = active_binding1.generate_text(
+            prompt_text,
+            system_prompt="Concise expert.",
+            n_predict=20,
+            stream=False,
+            stop_sequences=["Paris"] # Test per-call stop sequence
+        )

-        if isinstance(generated_text, str) and "Paris" in generated_text:
-            ASCIIColors.green(f"SUCCESS: Auto-start generation successful. Response: {generated_text}")
+        if isinstance(generated_text, str) and "Paris" not in generated_text: # Should stop *before* generating Paris
+            ASCIIColors.green(f"SUCCESS: Auto-start generation with stop sequence successful. Response: '{generated_text}'")
         else:
-            ASCIIColors.error(f"FAILURE: Auto-start generation failed. Response: {generated_text}")
+            ASCIIColors.error(f"FAILURE: Auto-start generation failed or stop sequence ignored. Response: {generated_text}")

         ASCIIColors.info(f"Model info after auto-start: {json.dumps(active_binding1.get_model_info(), indent=2)}")
         if not active_binding1.server_process or not active_binding1.server_process.is_healthy:
@@ -949,7 +1078,6 @@ if __name__ == '__main__':
         active_binding2 = LlamaCppServerBinding(
             model_name=model_name_str, models_path=str(models_path), config=binding_config
         )
-        # This call should reuse the server from binding1
         generated_text_b2 = active_binding2.generate_text("Ping", n_predict=5, stream=False)
         if isinstance(generated_text_b2, str):
             ASCIIColors.green(f"SUCCESS: Binding2 generation successful. Response: {generated_text_b2}")
@@ -966,14 +1094,6 @@ if __name__ == '__main__':
         active_binding1.unload_model()
         ASCIIColors.info("Binding1 unloaded. Ref count should be 1, server still up for binding2.")

-        # The server should still be up because binding2 holds a reference
-        with _server_registry_lock:
-            if not _active_servers:
-                ASCIIColors.error("FAILURE: Server shut down prematurely while still referenced by binding2.")
-            else:
-                ASCIIColors.green("SUCCESS: Server correctly remained active for binding2.")
-
-        # This call should re-acquire a reference to the same server for binding1
         generated_text_reloaded = active_binding1.generate_text("Test reload", n_predict=5, stream=False)
         if isinstance(generated_text_reloaded, str):
             ASCIIColors.green(f"SUCCESS: Generation after reload successful. Response: {generated_text_reloaded}")
@@ -1011,17 +1131,18 @@ if __name__ == '__main__':
         ASCIIColors.cyan("\n--- Unloading Models and Stopping Servers ---")
         if active_binding1: active_binding1.unload_model(); ASCIIColors.info("Binding1 unloaded.")
         if active_binding2: active_binding2.unload_model(); ASCIIColors.info("Binding2 unloaded.")
+        # Any other bindings will be cleaned up by __del__ on exit

-
-
-
-
-
-
-            except Exception as e_shutdown: ASCIIColors.error(f"Error shutting down stray server {key}: {e_shutdown}")
-            _active_servers.pop(key, None)
-            _server_ref_counts.pop(key, None)
+        registry = ServerRegistry()
+        with FileLock(registry.lock_file):
+            final_state = registry._read_registry()
+            if not final_state or not any(c for s in final_state.values() for c in s.get('client_pids',[])):
+                ASCIIColors.green("All servers shut down correctly and registry is empty or has no clients.")
+                if final_state: registry._write_registry({}) # Clean up for next run
             else:
-                ASCIIColors.
+                ASCIIColors.warning(f"Warning: Registry is not empty after tests: {final_state}")
+                registry._clean_stale_entries(final_state)
+                registry._write_registry(final_state)
+                ASCIIColors.info("Forced a final registry cleanup.")

     ASCIIColors.yellow("\nLlamaCppServerBinding test finished.")
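As the NOTE added to the __main__ block points out, this single-process test cannot exercise the new multi-process path; that needs a driver that launches several instances of a worker built on this binding. A minimal sketch of such a driver, assuming lollms-client 1.3.7 is installed; worker.py is a hypothetical script that creates a LlamaCppServerBinding for the same model, calls generate_text once, and then calls unload_model():

    # multiprocess_smoke_test.py -- hypothetical driver, not part of the package
    import subprocess
    import sys

    WORKER = "worker.py"  # hypothetical worker script using LlamaCppServerBinding

    # Launch several workers concurrently; with the file-based registry they
    # should all attach to a single llama.cpp server for the same model.
    procs = [subprocess.Popen([sys.executable, WORKER]) for _ in range(3)]
    exit_codes = [p.wait() for p in procs]
    print("worker exit codes:", exit_codes)

    # After the last worker exits, its decrement_ref_count call (or the stale
    # entry cleaner) should remove the registry entry and stop the server.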