lollms-client 1.7.10__py3-none-any.whl → 1.8.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lollms_client/__init__.py +1 -1
- lollms_client/llm_bindings/claude/__init__.py +0 -1
- lollms_client/llm_bindings/grok/__init__.py +0 -1
- lollms_client/llm_bindings/llama_cpp_server/__init__.py +726 -0
- lollms_client/llm_bindings/ollama/__init__.py +40 -2
- lollms_client/lollms_discussion.py +209 -65
- lollms_client/lollms_llm_binding.py +15 -1
- lollms_client/lollms_mcp_binding.py +15 -3
- lollms_client/lollms_stt_binding.py +16 -2
- lollms_client/lollms_tti_binding.py +16 -2
- lollms_client/lollms_ttm_binding.py +16 -2
- lollms_client/lollms_tts_binding.py +16 -2
- lollms_client/lollms_ttv_binding.py +16 -2
- lollms_client/tti_bindings/diffusers/__init__.py +132 -79
- lollms_client/tti_bindings/diffusers/server/main.py +76 -65
- lollms_client/tti_bindings/open_router/__init__.py +341 -0
- lollms_client/tts_bindings/xtts/__init__.py +1 -1
- {lollms_client-1.7.10.dist-info → lollms_client-1.8.3.dist-info}/METADATA +1 -1
- {lollms_client-1.7.10.dist-info → lollms_client-1.8.3.dist-info}/RECORD +22 -21
- lollms_client/llm_bindings/llamacpp/__init__.py +0 -1155
- {lollms_client-1.7.10.dist-info → lollms_client-1.8.3.dist-info}/WHEEL +0 -0
- {lollms_client-1.7.10.dist-info → lollms_client-1.8.3.dist-info}/licenses/LICENSE +0 -0
- {lollms_client-1.7.10.dist-info → lollms_client-1.8.3.dist-info}/top_level.txt +0 -0
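
The headline change in this release is the move from the in-process `llamacpp` binding to the new `llama_cpp_server` binding; the removed module is reproduced below. As a rough orientation only, here is a minimal sketch based on the constructor and `generate_text` signature of the removed binding shown below — the import path and the new module's exact interface are assumptions and may differ in 1.8.3:

```python
# Illustrative sketch only -- mirrors the __main__ test block of the removed binding below.
# The import path and the new llama_cpp_server binding's defaults are assumptions.
from pathlib import Path

from lollms_client.llm_bindings.llama_cpp_server import LlamaCppServerBinding  # assumed location

binding = LlamaCppServerBinding(
    model_name="tinyllama-1.1b-chat-v1.0.Q2_K.gguf",   # any GGUF file under models_path
    models_path=str(Path("./models")),
    config={"n_ctx": 4096, "n_gpu_layers": 0, "embedding": True},
)

# The llama.cpp server process is started lazily on the first generation call.
answer = binding.generate_text(
    "What is the capital of France?",
    system_prompt="Concise expert.",
    n_predict=64,
    stream=False,
    stop_sequences=["\nUSER:"],
)
print(answer)
binding.unload_model()  # detaches from (and, if last client, stops) the shared server
```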
|
@@ -1,1155 +0,0 @@
|
|
|
1
|
-
# bindings/llamacpp_server/binding.py
|
|
2
|
-
import json
|
|
3
|
-
import os
|
|
4
|
-
import pprint
|
|
5
|
-
import re
|
|
6
|
-
import socket
|
|
7
|
-
import subprocess
|
|
8
|
-
import sys
|
|
9
|
-
import threading
|
|
10
|
-
import time
|
|
11
|
-
import tempfile
|
|
12
|
-
from pathlib import Path
|
|
13
|
-
from typing import Optional, Callable, List, Union, Dict, Any, Set
|
|
14
|
-
import base64
|
|
15
|
-
from lollms_client.lollms_discussion import LollmsDiscussion
|
|
16
|
-
import requests # For HTTP client
|
|
17
|
-
from lollms_client.lollms_llm_binding import LollmsLLMBinding
|
|
18
|
-
from lollms_client.lollms_types import MSG_TYPE, ELF_COMPLETION_FORMAT
|
|
19
|
-
|
|
20
|
-
from ascii_colors import ASCIIColors, trace_exception
|
|
21
|
-
import pipmaster as pm
|
|
22
|
-
import platform
|
|
23
|
-
|
|
24
|
-
# --- Multi-process locking for registry ---
|
|
25
|
-
# On Windows, we need msvcrt, on POSIX, fcntl
|
|
26
|
-
try:
|
|
27
|
-
if platform.system() == "Windows":
|
|
28
|
-
import msvcrt
|
|
29
|
-
else:
|
|
30
|
-
import fcntl
|
|
31
|
-
except ImportError:
|
|
32
|
-
# This might happen in some restricted environments.
|
|
33
|
-
# The binding will fall back to thread-safety only.
|
|
34
|
-
msvcrt = fcntl = None
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
class FileLock:
|
|
38
|
-
def __init__(self, lock_file_path):
|
|
39
|
-
self.lock_file_path = lock_file_path
|
|
40
|
-
self.lock_file = None
|
|
41
|
-
self._is_windows = platform.system() == "Windows"
|
|
42
|
-
|
|
43
|
-
def __enter__(self):
|
|
44
|
-
self.lock_file = open(self.lock_file_path, 'w')
|
|
45
|
-
if self._is_windows and msvcrt:
|
|
46
|
-
msvcrt.locking(self.lock_file.fileno(), msvcrt.LK_LOCK, 1)
|
|
47
|
-
elif not self._is_windows and fcntl:
|
|
48
|
-
fcntl.flock(self.lock_file.fileno(), fcntl.LOCK_EX)
|
|
49
|
-
return self
|
|
50
|
-
|
|
51
|
-
def __exit__(self, exc_type, exc_val, exc_tb):
|
|
52
|
-
if self.lock_file:
|
|
53
|
-
if self._is_windows and msvcrt:
|
|
54
|
-
self.lock_file.seek(0)
|
|
55
|
-
msvcrt.locking(self.lock_file.fileno(), msvcrt.LK_UNLCK, 1)
|
|
56
|
-
elif not self._is_windows and fcntl:
|
|
57
|
-
fcntl.flock(self.lock_file.fileno(), fcntl.LOCK_UN)
|
|
58
|
-
self.lock_file.close()
|
|
59
|
-
self.lock_file = None
|
|
60
|
-
|
|
61
|
-
# --- End multi-process locking ---
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
# Ensure llama-cpp-binaries, requests, pillow, and psutil are installed
|
|
65
|
-
pm.ensure_packages(["requests", "pillow", "psutil"]) # pillow for dummy image in test, psutil for multi-process management
|
|
66
|
-
if not pm.is_installed("llama-cpp-binaries"):
|
|
67
|
-
def install_llama_cpp():
|
|
68
|
-
system = platform.system()
|
|
69
|
-
python_version_simple = f"py{sys.version_info.major}{sys.version_info.minor}" # e.g. py310 for 3.10
|
|
70
|
-
|
|
71
|
-
version_tag = "v0.56.0"
|
|
72
|
-
cuda_suffix = "+cu124"
|
|
73
|
-
|
|
74
|
-
if system == "Windows":
|
|
75
|
-
# Try version-specific URL first
|
|
76
|
-
url = f"https://github.com/oobabooga/llama-cpp-binaries/releases/download/{version_tag}/llama_cpp_binaries-{version_tag.lstrip('v')}{cuda_suffix}-{python_version_simple}-none-win_amd64.whl"
|
|
77
|
-
# Fallback to generic py3 if version-specific doesn't exist
|
|
78
|
-
fallback_url = f"https://github.com/oobabooga/llama-cpp-binaries/releases/download/{version_tag}/llama_cpp_binaries-{version_tag.lstrip('v')}{cuda_suffix}-py3-none-win_amd64.whl"
|
|
79
|
-
elif system == "Linux":
|
|
80
|
-
# Try version-specific URL first
|
|
81
|
-
url = f"https://github.com/oobabooga/llama-cpp-binaries/releases/download/{version_tag}/llama_cpp_binaries-{version_tag.lstrip('v')}{cuda_suffix}-{python_version_simple}-none-linux_x86_64.whl"
|
|
82
|
-
# Fallback to generic py3 if version-specific doesn't exist
|
|
83
|
-
fallback_url = f"https://github.com/oobabooga/llama-cpp-binaries/releases/download/{version_tag}/llama_cpp_binaries-{version_tag.lstrip('v')}{cuda_suffix}-py3-none-linux_x86_64.whl"
|
|
84
|
-
else:
|
|
85
|
-
ASCIIColors.error(f"Unsupported OS for precompiled llama-cpp-binaries: {system}. "
|
|
86
|
-
"You might need to set 'llama_server_binary_path' in the binding config "
|
|
87
|
-
"to point to a manually compiled llama.cpp server binary.")
|
|
88
|
-
return False
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
ASCIIColors.info(f"Attempting to install llama-cpp-binaries from: {url}")
|
|
92
|
-
try:
|
|
93
|
-
pm.install(url)
|
|
94
|
-
except Exception as e:
|
|
95
|
-
ASCIIColors.warning(f"Failed to install specific version from {url}: {e}")
|
|
96
|
-
ASCIIColors.info(f"Attempting fallback URL: {fallback_url}")
|
|
97
|
-
try:
|
|
98
|
-
pm.install(fallback_url)
|
|
99
|
-
except Exception as e_fallback:
|
|
100
|
-
ASCIIColors.error(f"Failed to install from fallback URL {fallback_url}: {e_fallback}")
|
|
101
|
-
ASCIIColors.error("Please try installing llama-cpp-binaries manually, e.g., 'pip install llama-cpp-python[server]' or from a wheel.")
|
|
102
|
-
|
|
103
|
-
install_llama_cpp()
|
|
104
|
-
|
|
105
|
-
try:
|
|
106
|
-
import llama_cpp_binaries
|
|
107
|
-
import psutil
|
|
108
|
-
except ImportError:
|
|
109
|
-
ASCIIColors.error("llama-cpp-binaries or psutil package not found. Please ensure they are installed.")
|
|
110
|
-
ASCIIColors.error("You can try: pip install llama-cpp-python[server] psutil")
|
|
111
|
-
llama_cpp_binaries = None
|
|
112
|
-
psutil = None
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
# --- Predefined patterns ---
|
|
116
|
-
_QUANT_COMPONENTS_SET: Set[str] = {
|
|
117
|
-
"Q2_K", "Q3_K", "Q4_K", "Q5_K", "Q6_K", "Q2_K_S", "Q3_K_S", "Q4_K_S", "Q5_K_S",
|
|
118
|
-
"Q3_K_M", "Q4_K_M", "Q5_K_M", "Q3_K_L", "Q2_K_XS", "Q3_K_XS", "Q4_K_XS", "Q5_K_XS", "Q6_K_XS",
|
|
119
|
-
"Q2_K_XXS", "Q3_K_XXS", "Q4_K_XXS", "Q5_K_XXS", "Q6_K_XXS", "Q4_0", "Q4_1", "Q5_0", "Q5_1", "Q8_0",
|
|
120
|
-
"F16", "FP16", "F32", "FP32", "BF16", "IQ1_S", "IQ1_M", "IQ2_XXS", "IQ2_XS", "IQ2_S", "IQ2_M",
|
|
121
|
-
"IQ3_XXS", "IQ3_S", "IQ3_M", "IQ4_NL", "IQ4_XS", "IQ3_M_K", "IQ3_S_K", "IQ4_XS_K", "IQ4_NL_K",
|
|
122
|
-
"I8", "I16", "I32", "ALL_F32", "MOSTLY_F16", "MOSTLY_Q4_0", "MOSTLY_Q4_1", "MOSTLY_Q5_0", "MOSTLY_Q5_1",
|
|
123
|
-
"MOSTLY_Q8_0", "MOSTLY_Q2_K", "MOSTLY_Q3_K_S", "MOSTLY_Q3_K_M", "MOSTLY_Q3_K_L",
|
|
124
|
-
"MOSTLY_Q4_K_S", "MOSTLY_Q4_K_M", "MOSTLY_Q5_K_S", "MOSTLY_Q5_K_M", "MOSTLY_Q6_K",
|
|
125
|
-
"MOSTLY_IQ1_S", "MOSTLY_IQ1_M", "MOSTLY_IQ2_XXS", "MOSTLY_IQ2_XS", "MOSTLY_IQ2_S", "MOSTLY_IQ2_M",
|
|
126
|
-
"MOSTLY_IQ3_XXS", "MOSTLY_IQ3_S", "MOSTLY_IQ3_M", "MOSTLY_IQ4_NL", "MOSTLY_IQ4_XS"
|
|
127
|
-
}
|
|
128
|
-
_MODEL_NAME_SUFFIX_COMPONENTS_SET: Set[str] = {
|
|
129
|
-
"instruct", "chat", "GGUF", "HF", "ggml", "pytorch", "AWQ", "GPTQ", "EXL2",
|
|
130
|
-
"base", "cont", "continue", "ft", "v0.1", "v0.2", "v1.0", "v1.1", "v1.5", "v1.6", "v2.0"
|
|
131
|
-
}
|
|
132
|
-
_ALL_REMOVABLE_COMPONENTS: List[str] = sorted(
|
|
133
|
-
list(_QUANT_COMPONENTS_SET.union(_MODEL_NAME_SUFFIX_COMPONENTS_SET)), key=len, reverse=True
|
|
134
|
-
)
|
|
135
|
-
|
|
136
|
-
def get_gguf_model_base_name(file_path_or_name: Union[str, Path]) -> str:
|
|
137
|
-
if isinstance(file_path_or_name, str): p = Path(file_path_or_name)
|
|
138
|
-
elif isinstance(file_path_or_name, Path): p = file_path_or_name
|
|
139
|
-
else: raise TypeError(f"Input must be a string or Path object. Got: {type(file_path_or_name)}")
|
|
140
|
-
name_part = p.stem if p.suffix.lower() == ".gguf" else p.name
|
|
141
|
-
if name_part.lower().endswith(".gguf"): name_part = name_part[:-5]
|
|
142
|
-
while True:
|
|
143
|
-
original_name_part_len = len(name_part)
|
|
144
|
-
stripped_in_this_iteration = False
|
|
145
|
-
for component in _ALL_REMOVABLE_COMPONENTS:
|
|
146
|
-
component_lower = component.lower()
|
|
147
|
-
for separator in [".", "-", "_"]:
|
|
148
|
-
pattern_to_check = f"{separator}{component_lower}"
|
|
149
|
-
if name_part.lower().endswith(pattern_to_check):
|
|
150
|
-
name_part = name_part[:-(len(pattern_to_check))]
|
|
151
|
-
stripped_in_this_iteration = True; break
|
|
152
|
-
if stripped_in_this_iteration: break
|
|
153
|
-
if not stripped_in_this_iteration or not name_part: break
|
|
154
|
-
while name_part and name_part[-1] in ['.', '-', '_']: name_part = name_part[:-1]
|
|
155
|
-
return name_part
|
|
156
|
-
|
|
157
|
-
# --- Global Server Registry (File-based for multi-process support) ---
|
|
158
|
-
|
|
159
|
-
class ServerRegistry:
|
|
160
|
-
def __init__(self):
|
|
161
|
-
self.registry_dir = Path(tempfile.gettempdir()) / "lollms_llamacpp_servers"
|
|
162
|
-
self.registry_dir.mkdir(parents=True, exist_ok=True)
|
|
163
|
-
self.registry_file = self.registry_dir / "registry.json"
|
|
164
|
-
self.lock_file = self.registry_dir / "registry.lock"
|
|
165
|
-
self.my_pid = os.getpid()
|
|
166
|
-
|
|
167
|
-
def _is_pid_running(self, pid: int) -> bool:
|
|
168
|
-
if psutil is None: return True # Conservative default if psutil is missing
|
|
169
|
-
return psutil.pid_exists(pid)
|
|
170
|
-
|
|
171
|
-
def _read_registry(self) -> Dict[str, Any]:
|
|
172
|
-
if not self.registry_file.exists():
|
|
173
|
-
return {}
|
|
174
|
-
try:
|
|
175
|
-
with open(self.registry_file, 'r') as f:
|
|
176
|
-
return json.load(f)
|
|
177
|
-
except (json.JSONDecodeError, FileNotFoundError):
|
|
178
|
-
return {}
|
|
179
|
-
|
|
180
|
-
def _write_registry(self, data: Dict[str, Any]):
|
|
181
|
-
with open(self.registry_file, 'w') as f:
|
|
182
|
-
json.dump(data, f, indent=2)
|
|
183
|
-
|
|
184
|
-
def _clean_stale_entries(self, registry_data: Dict[str, Any]) -> bool:
|
|
185
|
-
"""Cleans stale servers and clients. Returns True if changes were made."""
|
|
186
|
-
changed = False
|
|
187
|
-
# Clean dead servers
|
|
188
|
-
dead_servers = [k for k, v in registry_data.items() if not self._is_pid_running(v['pid'])]
|
|
189
|
-
for key in dead_servers:
|
|
190
|
-
ASCIIColors.warning(f"Registry Cleaner: Found dead server process (PID: {registry_data[key]['pid']}). Removing entry {key}.")
|
|
191
|
-
del registry_data[key]
|
|
192
|
-
changed = True
|
|
193
|
-
|
|
194
|
-
# Clean dead clients from living servers
|
|
195
|
-
for key, server_info in list(registry_data.items()):
|
|
196
|
-
dead_clients = [pid for pid in server_info.get('client_pids', []) if not self._is_pid_running(pid)]
|
|
197
|
-
if dead_clients:
|
|
198
|
-
ASCIIColors.warning(f"Registry Cleaner: Found dead client PIDs {dead_clients} for server {key}. Cleaning up.")
|
|
199
|
-
server_info['client_pids'] = [pid for pid in server_info['client_pids'] if pid not in dead_clients]
|
|
200
|
-
server_info['ref_count'] = len(server_info['client_pids'])
|
|
201
|
-
changed = True
|
|
202
|
-
|
|
203
|
-
# If a server has no clients left after cleanup, it's an orphan. Remove it.
|
|
204
|
-
if server_info['ref_count'] <= 0:
|
|
205
|
-
ASCIIColors.warning(f"Registry Cleaner: Server {key} (PID: {server_info['pid']}) has no clients left. Shutting it down.")
|
|
206
|
-
try:
|
|
207
|
-
p = psutil.Process(server_info['pid'])
|
|
208
|
-
p.terminate()
|
|
209
|
-
p.wait(timeout=5)
|
|
210
|
-
except psutil.NoSuchProcess: pass
|
|
211
|
-
except Exception as e: ASCIIColors.error(f"Error terminating orphaned server PID {server_info['pid']}: {e}")
|
|
212
|
-
del registry_data[key]
|
|
213
|
-
changed = True
|
|
214
|
-
|
|
215
|
-
return changed
|
|
216
|
-
|
|
217
|
-
def get_server(self, server_key: str) -> Optional[Dict[str, Any]]:
|
|
218
|
-
with FileLock(self.lock_file):
|
|
219
|
-
registry = self._read_registry()
|
|
220
|
-
self._clean_stale_entries(registry) # Always clean before read
|
|
221
|
-
server_info = registry.get(server_key)
|
|
222
|
-
if server_info:
|
|
223
|
-
self._write_registry(registry) # Write back changes from cleaning
|
|
224
|
-
return server_info
|
|
225
|
-
|
|
226
|
-
def register_new_server(self, server_key: str, pid: int, port: int):
|
|
227
|
-
with FileLock(self.lock_file):
|
|
228
|
-
registry = self._read_registry()
|
|
229
|
-
# Clean just in case something happened between server start and registration
|
|
230
|
-
self._clean_stale_entries(registry)
|
|
231
|
-
|
|
232
|
-
registry[server_key] = {
|
|
233
|
-
"pid": pid, "port": port,
|
|
234
|
-
"ref_count": 1, "client_pids": [self.my_pid]
|
|
235
|
-
}
|
|
236
|
-
self._write_registry(registry)
|
|
237
|
-
ASCIIColors.info(f"Process {self.my_pid} registered new server {server_key} (PID: {pid}, Port: {port})")
|
|
238
|
-
|
|
239
|
-
def increment_ref_count(self, server_key: str):
|
|
240
|
-
with FileLock(self.lock_file):
|
|
241
|
-
registry = self._read_registry()
|
|
242
|
-
self._clean_stale_entries(registry)
|
|
243
|
-
|
|
244
|
-
server_info = registry.get(server_key)
|
|
245
|
-
if server_info:
|
|
246
|
-
if self.my_pid not in server_info['client_pids']:
|
|
247
|
-
server_info['client_pids'].append(self.my_pid)
|
|
248
|
-
server_info['ref_count'] = len(server_info['client_pids'])
|
|
249
|
-
self._write_registry(registry)
|
|
250
|
-
ASCIIColors.info(f"Process {self.my_pid} attached to server {server_key}. New ref_count: {server_info['ref_count']}")
|
|
251
|
-
else:
|
|
252
|
-
ASCIIColors.warning(f"Process {self.my_pid} tried to attach to non-existent server {server_key}.")
|
|
253
|
-
|
|
254
|
-
def decrement_ref_count(self, server_key: str):
|
|
255
|
-
with FileLock(self.lock_file):
|
|
256
|
-
registry = self._read_registry()
|
|
257
|
-
made_changes = self._clean_stale_entries(registry)
|
|
258
|
-
|
|
259
|
-
server_info = registry.get(server_key)
|
|
260
|
-
if server_info:
|
|
261
|
-
if self.my_pid in server_info['client_pids']:
|
|
262
|
-
server_info['client_pids'].remove(self.my_pid)
|
|
263
|
-
server_info['ref_count'] = len(server_info['client_pids'])
|
|
264
|
-
made_changes = True
|
|
265
|
-
ASCIIColors.info(f"Process {self.my_pid} detached from server {server_key}. New ref_count: {server_info['ref_count']}")
|
|
266
|
-
|
|
267
|
-
if server_info['ref_count'] <= 0:
|
|
268
|
-
ASCIIColors.info(f"Last client (PID: {self.my_pid}) detached. Shutting down server {server_key} (PID: {server_info['pid']}).")
|
|
269
|
-
try:
|
|
270
|
-
p = psutil.Process(server_info['pid'])
|
|
271
|
-
p.terminate()
|
|
272
|
-
p.wait(timeout=10)
|
|
273
|
-
except psutil.NoSuchProcess:
|
|
274
|
-
ASCIIColors.warning(f"Server process {server_info['pid']} was already gone.")
|
|
275
|
-
except Exception as e:
|
|
276
|
-
ASCIIColors.error(f"Error terminating server process {server_info['pid']}: {e}")
|
|
277
|
-
del registry[server_key]
|
|
278
|
-
|
|
279
|
-
if made_changes:
|
|
280
|
-
self._write_registry(registry)
|
|
281
|
-
|
|
282
|
-
BindingName = "LlamaCppServerBinding"
|
|
283
|
-
DEFAULT_LLAMACPP_SERVER_HOST = "127.0.0.1"
|
|
284
|
-
|
|
285
|
-
class LlamaCppServerProcess:
|
|
286
|
-
def __init__(self,
|
|
287
|
-
model_path: Union[str, Path],
|
|
288
|
-
clip_model_path: Optional[Union[str, Path]] = None,
|
|
289
|
-
server_binary_path: Optional[Union[str, Path]]=None,
|
|
290
|
-
server_args: Dict[str, Any]={},
|
|
291
|
-
process_pid: Optional[int]=None, # PID if we are attaching to existing process
|
|
292
|
-
port: Optional[int]=None,
|
|
293
|
-
):
|
|
294
|
-
"""Initialize the Llama.cpp server process wrapper.
|
|
295
|
-
Can either start a new process or wrap an existing one.
|
|
296
|
-
"""
|
|
297
|
-
self.model_path = Path(model_path)
|
|
298
|
-
self.clip_model_path = Path(clip_model_path) if clip_model_path else None
|
|
299
|
-
|
|
300
|
-
if server_binary_path:
|
|
301
|
-
self.server_binary_path = Path(server_binary_path)
|
|
302
|
-
elif llama_cpp_binaries:
|
|
303
|
-
self.server_binary_path = Path(llama_cpp_binaries.get_binary_path())
|
|
304
|
-
else:
|
|
305
|
-
raise FileNotFoundError("llama_cpp_binaries not found and no server_binary_path provided.")
|
|
306
|
-
|
|
307
|
-
self.port: Optional[int] = port
|
|
308
|
-
self.pid: Optional[int] = process_pid
|
|
309
|
-
self.server_args = server_args
|
|
310
|
-
# The actual subprocess.Popen object. Will be None if this instance is just a client to a server started by another process.
|
|
311
|
-
self.process: Optional[subprocess.Popen] = None
|
|
312
|
-
self.session = requests.Session()
|
|
313
|
-
self.host = self.server_args.get("host",DEFAULT_LLAMACPP_SERVER_HOST)
|
|
314
|
-
self.base_url: Optional[str] = f"http://{self.host}:{self.port}" if self.port else None
|
|
315
|
-
self.is_healthy = False
|
|
316
|
-
self._stderr_lines: List[str] = []
|
|
317
|
-
self._stderr_thread: Optional[threading.Thread] = None
|
|
318
|
-
|
|
319
|
-
if not self.model_path.exists():
|
|
320
|
-
raise FileNotFoundError(f"Model file not found: {self.model_path}")
|
|
321
|
-
if self.clip_model_path and not self.clip_model_path.exists():
|
|
322
|
-
ASCIIColors.warning(f"Clip model file '{self.clip_model_path}' not found. Vision features may not work or may use a different auto-detected clip model.")
|
|
323
|
-
if not self.server_binary_path.exists():
|
|
324
|
-
raise FileNotFoundError(f"Llama.cpp server binary not found: {self.server_binary_path}")
|
|
325
|
-
|
|
326
|
-
def attach(self):
|
|
327
|
-
"""Attaches to an already running process by checking its health."""
|
|
328
|
-
if not self.pid or not self.port:
|
|
329
|
-
raise ValueError("Cannot attach without PID and port.")
|
|
330
|
-
self.base_url = f"http://{self.host}:{self.port}"
|
|
331
|
-
health_url = f"{self.base_url}/health"
|
|
332
|
-
try:
|
|
333
|
-
response = self.session.get(health_url, timeout=5)
|
|
334
|
-
if response.status_code == 200 and response.json().get("status") == "ok":
|
|
335
|
-
self.is_healthy = True
|
|
336
|
-
ASCIIColors.green(f"Successfully attached to Llama.cpp server on port {self.port} (PID: {self.pid}).")
|
|
337
|
-
return
|
|
338
|
-
except requests.exceptions.RequestException as e:
|
|
339
|
-
ASCIIColors.warning(f"Failed to attach to server on port {self.port}: {e}")
|
|
340
|
-
self.is_healthy = False
|
|
341
|
-
raise ConnectionError(f"Could not connect to existing server at {health_url}")
|
|
342
|
-
|
|
343
|
-
def _filter_stderr(self, stderr_pipe):
|
|
344
|
-
try:
|
|
345
|
-
for line in iter(stderr_pipe.readline, ''):
|
|
346
|
-
if line:
|
|
347
|
-
self._stderr_lines.append(line.strip())
|
|
348
|
-
if len(self._stderr_lines) > 50: self._stderr_lines.pop(0)
|
|
349
|
-
if "llama_model_loaded" in line or "error" in line.lower() or "failed" in line.lower():
|
|
350
|
-
ASCIIColors.debug(f"[LLAMA_SERVER_STDERR:{self.port}] {line.strip()}")
|
|
351
|
-
elif "running on port" in line: # Server startup message
|
|
352
|
-
ASCIIColors.info(f"[LLAMA_SERVER_STDERR:{self.port}] {line.strip()}")
|
|
353
|
-
except ValueError: pass
|
|
354
|
-
except Exception as e: ASCIIColors.warning(f"Exception in stderr filter thread for port {self.port}: {e}")
|
|
355
|
-
|
|
356
|
-
def start(self, port_to_use: int):
|
|
357
|
-
self.port = port_to_use
|
|
358
|
-
self.base_url = f"http://{self.host}:{self.port}"
|
|
359
|
-
|
|
360
|
-
cmd = [
|
|
361
|
-
str(self.server_binary_path),
|
|
362
|
-
"--model", str(self.model_path),
|
|
363
|
-
"--host", self.host,
|
|
364
|
-
"--port", str(self.port),
|
|
365
|
-
]
|
|
366
|
-
|
|
367
|
-
arg_map = {
|
|
368
|
-
"n_ctx": "--ctx-size", "n_gpu_layers": "--gpu-layers", "main_gpu": "--main-gpu",
|
|
369
|
-
"tensor_split": "--tensor-split", "use_mmap": (lambda v: ["--no-mmap"] if not v else []),
|
|
370
|
-
"use_mlock": (lambda v: ["--mlock"] if v else []), "seed": "--seed",
|
|
371
|
-
"n_batch": "--batch-size", "n_threads": "--threads", "n_threads_batch": "--threads-batch",
|
|
372
|
-
"rope_scaling_type": "--rope-scaling", "rope_freq_base": "--rope-freq-base",
|
|
373
|
-
"rope_freq_scale": "--rope-freq-scale",
|
|
374
|
-
"embedding": (lambda v: ["--embedding"] if v else []),
|
|
375
|
-
"verbose": (lambda v: ["--verbose"] if v else []),
|
|
376
|
-
"chat_template": "--chat-template",
|
|
377
|
-
"parallel_slots": "--parallel", # Number of parallel processing slots
|
|
378
|
-
}
|
|
379
|
-
|
|
380
|
-
if self.clip_model_path: # This should be the actual path resolved by the binding
|
|
381
|
-
cmd.extend(["--mmproj", str(self.clip_model_path)])
|
|
382
|
-
|
|
383
|
-
for key, cli_arg in arg_map.items():
|
|
384
|
-
val = self.server_args.get(key)
|
|
385
|
-
if val is not None:
|
|
386
|
-
if callable(cli_arg): cmd.extend(cli_arg(val))
|
|
387
|
-
else: cmd.extend([cli_arg, str(val)])
|
|
388
|
-
|
|
389
|
-
extra_cli_flags = self.server_args.get("extra_cli_flags", [])
|
|
390
|
-
if isinstance(extra_cli_flags, str): extra_cli_flags = extra_cli_flags.split()
|
|
391
|
-
cmd.extend(extra_cli_flags)
|
|
392
|
-
|
|
393
|
-
ASCIIColors.info(f"Starting Llama.cpp server ({' '.join(cmd)})")
|
|
394
|
-
|
|
395
|
-
env = os.environ.copy()
|
|
396
|
-
if os.name == 'posix' and self.server_binary_path.parent != Path('.'):
|
|
397
|
-
lib_path_str = str(self.server_binary_path.parent.resolve())
|
|
398
|
-
current_ld_path = env.get('LD_LIBRARY_PATH', '')
|
|
399
|
-
env['LD_LIBRARY_PATH'] = f"{lib_path_str}:{current_ld_path}" if current_ld_path else lib_path_str
|
|
400
|
-
|
|
401
|
-
try:
|
|
402
|
-
self.process = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE, text=True, bufsize=1, env=env)
|
|
403
|
-
self.pid = self.process.pid
|
|
404
|
-
except Exception as e:
|
|
405
|
-
ASCIIColors.error(f"Failed to start llama.cpp server process on port {self.port}: {e}"); trace_exception(e); raise
|
|
406
|
-
|
|
407
|
-
self._stderr_thread = threading.Thread(target=self._filter_stderr, args=(self.process.stderr,), daemon=True)
|
|
408
|
-
self._stderr_thread.start()
|
|
409
|
-
|
|
410
|
-
health_url = f"{self.base_url}/health"
|
|
411
|
-
max_wait_time = self.server_args.get("server_startup_timeout", 60)
|
|
412
|
-
start_time = time.time()
|
|
413
|
-
|
|
414
|
-
while time.time() - start_time < max_wait_time:
|
|
415
|
-
if self.process.poll() is not None:
|
|
416
|
-
stderr_output = "\n".join(self._stderr_lines[-10:])
|
|
417
|
-
raise RuntimeError(f"Llama.cpp server (port {self.port}) terminated unexpectedly (exit code {self.process.poll()}) during startup. Stderr:\n{stderr_output}")
|
|
418
|
-
try:
|
|
419
|
-
response = self.session.get(health_url, timeout=2)
|
|
420
|
-
if response.status_code == 200 and response.json().get("status") == "ok":
|
|
421
|
-
self.is_healthy = True
|
|
422
|
-
ASCIIColors.green(f"Llama.cpp server started successfully on port {self.port} (PID: {self.pid}).")
|
|
423
|
-
return
|
|
424
|
-
except requests.exceptions.ConnectionError: time.sleep(1)
|
|
425
|
-
except Exception as e: ASCIIColors.warning(f"Health check for port {self.port} failed: {e}"); time.sleep(1)
|
|
426
|
-
|
|
427
|
-
self.is_healthy = False
|
|
428
|
-
self.shutdown()
|
|
429
|
-
stderr_output = "\n".join(self._stderr_lines[-10:])
|
|
430
|
-
raise TimeoutError(f"Llama.cpp server failed to become healthy on port {self.port} within {max_wait_time}s. Stderr:\n{stderr_output}")
|
|
431
|
-
|
|
432
|
-
def shutdown(self):
|
|
433
|
-
""" This method only shuts down a server if this instance owns the Popen object.
|
|
434
|
-
The actual termination for multi-process is handled by the ServerRegistry. """
|
|
435
|
-
self.is_healthy = False
|
|
436
|
-
if self.process:
|
|
437
|
-
ASCIIColors.info(f"Shutting down owned Llama.cpp server process (PID: {self.process.pid} on port {self.port})...")
|
|
438
|
-
try:
|
|
439
|
-
self.process.terminate()
|
|
440
|
-
self.process.wait(timeout=10)
|
|
441
|
-
except subprocess.TimeoutExpired:
|
|
442
|
-
ASCIIColors.warning(f"Llama.cpp server (port {self.port}) did not terminate gracefully, killing...")
|
|
443
|
-
self.process.kill()
|
|
444
|
-
try: self.process.wait(timeout=5)
|
|
445
|
-
except subprocess.TimeoutExpired: ASCIIColors.error(f"Failed to kill llama.cpp server process (port {self.port}).")
|
|
446
|
-
except Exception as e: ASCIIColors.error(f"Error during server shutdown (port {self.port}): {e}")
|
|
447
|
-
finally:
|
|
448
|
-
self.process = None
|
|
449
|
-
if self._stderr_thread and self._stderr_thread.is_alive(): self._stderr_thread.join(timeout=1)
|
|
450
|
-
ASCIIColors.info(f"Llama.cpp server on port {self.port} shut down.")
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
class LlamaCppServerBinding(LollmsLLMBinding):
|
|
454
|
-
DEFAULT_SERVER_ARGS = {
|
|
455
|
-
"n_gpu_layers": 0, "n_ctx": 128000, "n_batch": 512,
|
|
456
|
-
"embedding": False, "verbose": False, "server_startup_timeout": 120,
|
|
457
|
-
"parallel_slots": 4, # Default parallel slots for server
|
|
458
|
-
"stop_sequences": ["<|im_start|>"], # Default stop sequences
|
|
459
|
-
}
|
|
460
|
-
|
|
461
|
-
def __init__(self, **kwargs):
|
|
462
|
-
super().__init__(BindingName, **kwargs)
|
|
463
|
-
if llama_cpp_binaries is None or psutil is None:
|
|
464
|
-
raise ImportError("llama-cpp-binaries and psutil packages are required.")
|
|
465
|
-
|
|
466
|
-
self.registry = ServerRegistry()
|
|
467
|
-
models_path = kwargs.get("models_path", Path(__file__).parent/"models")
|
|
468
|
-
self.models_path = Path(models_path)
|
|
469
|
-
self.initial_model_name_preference: Optional[str] = kwargs.get("model_name")
|
|
470
|
-
self.user_provided_model_name: Optional[str] = kwargs.get("model_name")
|
|
471
|
-
self.initial_clip_model_name_preference: Optional[str] = kwargs.get("clip_model_name")
|
|
472
|
-
self._model_path_map: Dict[str, Path] = {}
|
|
473
|
-
self._scan_models()
|
|
474
|
-
self.default_completion_format = kwargs.get("default_completion_format", ELF_COMPLETION_FORMAT.Chat)
|
|
475
|
-
self.server_args = {**self.DEFAULT_SERVER_ARGS, **(kwargs.get("config") or {}), **kwargs}
|
|
476
|
-
self.server_binary_path = self._get_server_binary_path()
|
|
477
|
-
|
|
478
|
-
self.current_model_path: Optional[Path] = None
|
|
479
|
-
self.clip_model_path: Optional[Path] = None
|
|
480
|
-
self.server_process: Optional[LlamaCppServerProcess] = None
|
|
481
|
-
self.port: Optional[int] = None
|
|
482
|
-
self.server_key: Optional[str] = None
|
|
483
|
-
|
|
484
|
-
ASCIIColors.info("LlamaCppServerBinding initialized. Server will start on-demand with first generation call.")
|
|
485
|
-
|
|
486
|
-
def _get_server_binary_path(self) -> Path:
|
|
487
|
-
custom_path_str = self.server_args.get("llama_server_binary_path")
|
|
488
|
-
if custom_path_str:
|
|
489
|
-
custom_path = Path(custom_path_str)
|
|
490
|
-
if custom_path.exists() and custom_path.is_file():
|
|
491
|
-
ASCIIColors.info(f"Using custom llama.cpp server binary: {custom_path}"); return custom_path
|
|
492
|
-
else: ASCIIColors.warning(f"Custom binary '{custom_path_str}' not found. Falling back.")
|
|
493
|
-
if llama_cpp_binaries:
|
|
494
|
-
bin_path_str = llama_cpp_binaries.get_binary_path()
|
|
495
|
-
if bin_path_str:
|
|
496
|
-
bin_path = Path(bin_path_str)
|
|
497
|
-
if bin_path.exists() and bin_path.is_file():
|
|
498
|
-
ASCIIColors.info(f"Using binary from llama-cpp-binaries: {bin_path}"); return bin_path
|
|
499
|
-
raise FileNotFoundError("Llama.cpp server binary not found.")
|
|
500
|
-
|
|
501
|
-
def _resolve_model_path(self, model_name_or_path: str) -> Path:
|
|
502
|
-
"""
|
|
503
|
-
Resolves a model name or path to a full Path object.
|
|
504
|
-
It prioritizes the internal map, then checks for absolute/relative paths,
|
|
505
|
-
and rescans the models directory as a fallback.
|
|
506
|
-
"""
|
|
507
|
-
if model_name_or_path in self._model_path_map:
|
|
508
|
-
return self._model_path_map[model_name_or_path]
|
|
509
|
-
model_p = Path(model_name_or_path)
|
|
510
|
-
if model_p.is_absolute() and model_p.exists(): return model_p
|
|
511
|
-
path_in_models_dir = self.models_path / model_name_or_path
|
|
512
|
-
if path_in_models_dir.exists(): return path_in_models_dir
|
|
513
|
-
self._scan_models()
|
|
514
|
-
if model_name_or_path in self._model_path_map:
|
|
515
|
-
return self._model_path_map[model_name_or_path]
|
|
516
|
-
raise FileNotFoundError(f"Model '{model_name_or_path}' not found.")
|
|
517
|
-
|
|
518
|
-
def _find_available_port(self) -> int:
|
|
519
|
-
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
|
520
|
-
s.bind(('', 0)); return s.getsockname()[1]
|
|
521
|
-
|
|
522
|
-
def _release_server_instance(self):
|
|
523
|
-
if self.server_process and self.server_key:
|
|
524
|
-
self.registry.decrement_ref_count(self.server_key)
|
|
525
|
-
self.server_process = None
|
|
526
|
-
self.port = None
|
|
527
|
-
self.server_key = None
|
|
528
|
-
self.current_model_path = None
|
|
529
|
-
self.clip_model_path = None
|
|
530
|
-
|
|
531
|
-
def load_model(self, model_name_or_path: str) -> bool:
|
|
532
|
-
self.user_provided_model_name = model_name_or_path
|
|
533
|
-
try:
|
|
534
|
-
resolved_model_path = self._resolve_model_path(model_name_or_path)
|
|
535
|
-
except Exception as ex:
|
|
536
|
-
trace_exception(ex); return False
|
|
537
|
-
|
|
538
|
-
final_clip_model_path: Optional[Path] = None
|
|
539
|
-
if self.initial_clip_model_name_preference:
|
|
540
|
-
p_clip_pref = Path(self.initial_clip_model_name_preference)
|
|
541
|
-
if p_clip_pref.is_absolute() and p_clip_pref.exists(): final_clip_model_path = p_clip_pref
|
|
542
|
-
elif (self.models_path / p_clip_pref).exists(): final_clip_model_path = self.models_path / p_clip_pref
|
|
543
|
-
else: ASCIIColors.warning(f"Specified clip model '{self.initial_clip_model_name_preference}' not found.")
|
|
544
|
-
|
|
545
|
-
if not final_clip_model_path:
|
|
546
|
-
base_name = get_gguf_model_base_name(resolved_model_path.stem)
|
|
547
|
-
potential_paths = [
|
|
548
|
-
resolved_model_path.parent / f"{base_name}.mmproj",
|
|
549
|
-
resolved_model_path.parent / f"mmproj-{base_name}.gguf",
|
|
550
|
-
self.models_path / f"{base_name}.mmproj",
|
|
551
|
-
self.models_path / f"mmproj-{base_name}.gguf",
|
|
552
|
-
]
|
|
553
|
-
for p_clip in potential_paths:
|
|
554
|
-
if p_clip.exists(): final_clip_model_path = p_clip; break
|
|
555
|
-
|
|
556
|
-
final_clip_model_path_str = str(final_clip_model_path) if final_clip_model_path else "None"
|
|
557
|
-
new_server_key = f"{resolved_model_path}|{final_clip_model_path_str}"
|
|
558
|
-
|
|
559
|
-
if self.server_process and self.server_key == new_server_key and self.server_process.is_healthy:
|
|
560
|
-
ASCIIColors.info(f"Model '{model_name_or_path}' is already loaded. No change.")
|
|
561
|
-
return True
|
|
562
|
-
|
|
563
|
-
if self.server_process and self.server_key != new_server_key:
|
|
564
|
-
self._release_server_instance()
|
|
565
|
-
|
|
566
|
-
# Check registry for an existing server
|
|
567
|
-
existing_server_info = self.registry.get_server(new_server_key)
|
|
568
|
-
if existing_server_info:
|
|
569
|
-
ASCIIColors.info(f"Found existing server for {new_server_key} in registry (PID: {existing_server_info['pid']}, Port: {existing_server_info['port']}). Attaching...")
|
|
570
|
-
try:
|
|
571
|
-
self.server_process = LlamaCppServerProcess(
|
|
572
|
-
model_path=resolved_model_path, clip_model_path=final_clip_model_path,
|
|
573
|
-
process_pid=existing_server_info['pid'], port=existing_server_info['port'],
|
|
574
|
-
server_args=self.server_args
|
|
575
|
-
)
|
|
576
|
-
self.server_process.attach() # This verifies health
|
|
577
|
-
self.port = self.server_process.port
|
|
578
|
-
self.current_model_path = resolved_model_path
|
|
579
|
-
self.clip_model_path = final_clip_model_path
|
|
580
|
-
self.server_key = new_server_key
|
|
581
|
-
self.registry.increment_ref_count(new_server_key)
|
|
582
|
-
return True
|
|
583
|
-
except Exception as e:
|
|
584
|
-
ASCIIColors.error(f"Failed to attach to existing server: {e}. It might be stale. Will attempt to start a new one.")
|
|
585
|
-
self.registry.decrement_ref_count(new_server_key) # Clean up failed attach
|
|
586
|
-
|
|
587
|
-
# Start a new server
|
|
588
|
-
ASCIIColors.info(f"No existing server found for {new_server_key}. Starting a new one.")
|
|
589
|
-
self.current_model_path = resolved_model_path
|
|
590
|
-
self.clip_model_path = final_clip_model_path
|
|
591
|
-
self.server_key = new_server_key
|
|
592
|
-
|
|
593
|
-
try:
|
|
594
|
-
new_port = self._find_available_port()
|
|
595
|
-
current_server_args = self.server_args.copy()
|
|
596
|
-
if "parallel_slots" not in current_server_args or current_server_args["parallel_slots"] <=0:
|
|
597
|
-
current_server_args["parallel_slots"] = self.DEFAULT_SERVER_ARGS["parallel_slots"]
|
|
598
|
-
|
|
599
|
-
new_server = LlamaCppServerProcess(
|
|
600
|
-
model_path=self.current_model_path, clip_model_path=self.clip_model_path,
|
|
601
|
-
server_binary_path=self.server_binary_path, server_args=current_server_args
|
|
602
|
-
)
|
|
603
|
-
new_server.start(port_to_use=new_port)
|
|
604
|
-
|
|
605
|
-
if new_server.is_healthy:
|
|
606
|
-
self.server_process = new_server
|
|
607
|
-
self.port = new_port
|
|
608
|
-
self.registry.register_new_server(self.server_key, new_server.pid, new_port)
|
|
609
|
-
ASCIIColors.green(f"New server {self.server_key} started and registered.")
|
|
610
|
-
return True
|
|
611
|
-
else:
|
|
612
|
-
return False
|
|
613
|
-
except Exception as e:
|
|
614
|
-
ASCIIColors.error(f"Failed to start new server for '{model_name_or_path}': {e}"); trace_exception(e)
|
|
615
|
-
self._release_server_instance()
|
|
616
|
-
return False
|
|
617
|
-
|
|
618
|
-
def unload_model(self):
|
|
619
|
-
if self.server_process:
|
|
620
|
-
self._release_server_instance()
|
|
621
|
-
else:
|
|
622
|
-
ASCIIColors.info("Unload called, but no server was active for this binding instance.")
|
|
623
|
-
|
|
624
|
-
def _ensure_server_is_running(self) -> bool:
|
|
625
|
-
"""
|
|
626
|
-
Checks if the server is healthy. If not, it attempts to load the configured model.
|
|
627
|
-
Returns True if the server is healthy and ready, False otherwise.
|
|
628
|
-
"""
|
|
629
|
-
if self.server_process and self.server_process.is_healthy:
|
|
630
|
-
return True
|
|
631
|
-
|
|
632
|
-
ASCIIColors.info("Server is not running. Attempting to start on-demand...")
|
|
633
|
-
|
|
634
|
-
model_to_load = self.user_provided_model_name or self.initial_model_name_preference
|
|
635
|
-
|
|
636
|
-
if not model_to_load:
|
|
637
|
-
self._scan_models()
|
|
638
|
-
available_models = self.list_models()
|
|
639
|
-
if not available_models:
|
|
640
|
-
ASCIIColors.error("No model specified and no GGUF models found in models path.")
|
|
641
|
-
return False
|
|
642
|
-
|
|
643
|
-
model_to_load = available_models[0]['name']
|
|
644
|
-
ASCIIColors.info(f"No model was specified. Automatically selecting the first available model: '{model_to_load}'")
|
|
645
|
-
|
|
646
|
-
if self.load_model(model_to_load):
|
|
647
|
-
return True
|
|
648
|
-
else:
|
|
649
|
-
ASCIIColors.error(f"Automatic model load for '{model_to_load}' failed.")
|
|
650
|
-
return False
|
|
651
|
-
|
|
652
|
-
def _get_request_url(self, endpoint: str) -> str:
|
|
653
|
-
return f"{self.server_process.base_url}{endpoint}"
|
|
654
|
-
|
|
655
|
-
def _prepare_generation_payload(self, prompt: str, system_prompt: str = "", n_predict: Optional[int] = None,
|
|
656
|
-
temperature: float = 0.7, top_k: int = 40, top_p: float = 0.9,
|
|
657
|
-
repeat_penalty: float = 1.1, repeat_last_n: Optional[int] = 64,
|
|
658
|
-
seed: Optional[int] = None, stream: bool = False, use_chat_format: bool = True,
|
|
659
|
-
images: Optional[List[str]] = None,
|
|
660
|
-
stop_sequences: Optional[List[str]] = None,
|
|
661
|
-
split:Optional[bool]=False,
|
|
662
|
-
user_keyword:Optional[str]="!@>user:",
|
|
663
|
-
ai_keyword:Optional[str]="!@>assistant:",
|
|
664
|
-
**extra_params) -> Dict:
|
|
665
|
-
payload_params = {
|
|
666
|
-
"temperature": self.server_args.get("temperature", 0.7), "top_k": self.server_args.get("top_k", 40),
|
|
667
|
-
"top_p": self.server_args.get("top_p", 0.9), "repeat_penalty": self.server_args.get("repeat_penalty", 1.1),
|
|
668
|
-
"repeat_last_n": self.server_args.get("repeat_last_n", 64), "mirostat": self.server_args.get("mirostat_mode", 0),
|
|
669
|
-
"mirostat_tau": self.server_args.get("mirostat_tau", 5.0), "mirostat_eta": self.server_args.get("mirostat_eta", 0.1),
|
|
670
|
-
}
|
|
671
|
-
if "grammar_string" in self.server_args and self.server_args["grammar_string"]:
|
|
672
|
-
payload_params["grammar"] = self.server_args["grammar_string"]
|
|
673
|
-
|
|
674
|
-
payload_params.update({"temperature": temperature, "top_k": top_k, "top_p": top_p, "repeat_penalty": repeat_penalty, "repeat_last_n": repeat_last_n})
|
|
675
|
-
if n_predict is not None: payload_params['n_predict'] = n_predict
|
|
676
|
-
if seed is not None: payload_params['seed'] = seed
|
|
677
|
-
|
|
678
|
-
# --- Handle stop sequences ---
|
|
679
|
-
all_stop_sequences = set(self.server_args.get("stop_sequences", []))
|
|
680
|
-
if stop_sequences:
|
|
681
|
-
all_stop_sequences.update(stop_sequences)
|
|
682
|
-
if all_stop_sequences:
|
|
683
|
-
payload_params['stop'] = list(all_stop_sequences)
|
|
684
|
-
# --- End stop sequences ---
|
|
685
|
-
|
|
686
|
-
payload_params = {k: v for k, v in payload_params.items() if v is not None}
|
|
687
|
-
payload_params.update(extra_params)
|
|
688
|
-
|
|
689
|
-
if use_chat_format and self.default_completion_format == ELF_COMPLETION_FORMAT.Chat:
|
|
690
|
-
messages = []
|
|
691
|
-
if system_prompt and system_prompt.strip(): messages.append({"role": "system", "content": system_prompt})
|
|
692
|
-
user_content: Union[str, List[Dict[str, Any]]] = prompt
|
|
693
|
-
if split:
|
|
694
|
-
messages += self.split_discussion(user_content,user_keyword=user_keyword, ai_keyword=ai_keyword)
|
|
695
|
-
else:
|
|
696
|
-
messages.append({"role": "user", "content": user_content})
|
|
697
|
-
if images and self.clip_model_path:
|
|
698
|
-
image_parts = []
|
|
699
|
-
for img_path in images:
|
|
700
|
-
try:
|
|
701
|
-
with open(img_path, "rb") as image_file: encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
|
|
702
|
-
image_type = Path(img_path).suffix[1:].lower() or "png"; image_type = "jpeg" if image_type == "jpg" else image_type
|
|
703
|
-
image_parts.append({"type": "image_url", "image_url": {"url": f"data:image/{image_type};base64,{encoded_string}"}})
|
|
704
|
-
except Exception as ex: trace_exception(ex)
|
|
705
|
-
messages[-1]["content"] =[{"type": "text", "text": messages[-1]["content"]}] + image_parts # type: ignore
|
|
706
|
-
final_payload = {"messages": messages, "stream": stream, **payload_params}
|
|
707
|
-
if 'n_predict' in final_payload: final_payload['max_tokens'] = final_payload.pop('n_predict')
|
|
708
|
-
return final_payload
|
|
709
|
-
else:
|
|
710
|
-
full_prompt = f"{system_prompt}\n\nUSER: {prompt}\nASSISTANT:" if system_prompt and system_prompt.strip() else prompt
|
|
711
|
-
final_payload = {"prompt": full_prompt, "stream": stream, **payload_params}
|
|
712
|
-
if images and self.clip_model_path:
|
|
713
|
-
image_data_list = []
|
|
714
|
-
for i, img_path in enumerate(images):
|
|
715
|
-
try:
|
|
716
|
-
with open(img_path, "rb") as image_file: encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
|
|
717
|
-
image_data_list.append({"data": encoded_string, "id": i + 10})
|
|
718
|
-
except Exception as e_img: ASCIIColors.error(f"Could not encode image {img_path}: {e_img}")
|
|
719
|
-
if image_data_list: final_payload["image_data"] = image_data_list
|
|
720
|
-
return final_payload
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
def generate_text(self,
|
|
724
|
-
prompt: str,
|
|
725
|
-
images: Optional[List[str]] = None,
|
|
726
|
-
system_prompt: str = "",
|
|
727
|
-
n_predict: Optional[int] = None,
|
|
728
|
-
stream: Optional[bool] = None,
|
|
729
|
-
temperature: float = 0.7,
|
|
730
|
-
top_k: int = 40,
|
|
731
|
-
top_p: float = 0.9,
|
|
732
|
-
repeat_penalty: float = 1.1,
|
|
733
|
-
repeat_last_n: int = 64,
|
|
734
|
-
seed: Optional[int] = None,
|
|
735
|
-
n_threads: Optional[int] = None,
|
|
736
|
-
ctx_size: int | None = None,
|
|
737
|
-
streaming_callback: Optional[Callable[[str, MSG_TYPE], None]] = None,
|
|
738
|
-
stop_sequences: Optional[List[str]] = None,
|
|
739
|
-
split:Optional[bool]=False,
|
|
740
|
-
user_keyword:Optional[str]="!@>user:",
|
|
741
|
-
ai_keyword:Optional[str]="!@>assistant:",
|
|
742
|
-
**generation_kwargs
|
|
743
|
-
) -> Union[str, dict]:
|
|
744
|
-
|
|
745
|
-
if not self._ensure_server_is_running():
|
|
746
|
-
return {"status": False, "error": "Llama.cpp server could not be started. Please check model configuration and logs."}
|
|
747
|
-
|
|
748
|
-
_use_chat_format = True
|
|
749
|
-
payload = self._prepare_generation_payload(
|
|
750
|
-
prompt=prompt, system_prompt=system_prompt, n_predict=n_predict,
|
|
751
|
-
temperature=temperature if temperature is not None else self.server_args.get("temperature",0.7),
|
|
752
|
-
top_k=top_k if top_k is not None else self.server_args.get("top_k",40),
|
|
753
|
-
top_p=top_p if top_p is not None else self.server_args.get("top_p",0.9),
|
|
754
|
-
repeat_penalty=repeat_penalty if repeat_penalty is not None else self.server_args.get("repeat_penalty",1.1),
|
|
755
|
-
repeat_last_n=repeat_last_n if repeat_last_n is not None else self.server_args.get("repeat_last_n",64),
|
|
756
|
-
seed=seed if seed is not None else self.server_args.get("seed", -1), stream=stream,
|
|
757
|
-
use_chat_format=_use_chat_format, images=images,
|
|
758
|
-
stop_sequences=stop_sequences,
|
|
759
|
-
split= split, user_keyword=user_keyword, ai_keyword=ai_keyword, **generation_kwargs
|
|
760
|
-
)
|
|
761
|
-
endpoint = "/v1/chat/completions" if _use_chat_format else "/completion"
|
|
762
|
-
request_url = self._get_request_url(endpoint)
|
|
763
|
-
|
|
764
|
-
full_response_text = ""
|
|
765
|
-
try:
|
|
766
|
-
response = self.server_process.session.post(request_url, json=payload, stream=stream, timeout=self.server_args.get("generation_timeout", 300))
|
|
767
|
-
response.raise_for_status()
|
|
768
|
-
if stream:
|
|
769
|
-
for line in response.iter_lines():
|
|
770
|
-
if not line: continue
|
|
771
|
-
line_str = line.decode('utf-8').strip()
|
|
772
|
-
if line_str.startswith('data: '): line_str = line_str[6:]
|
|
773
|
-
if line_str == '[DONE]': break
|
|
774
|
-
try:
|
|
775
|
-
chunk_data = json.loads(line_str)
|
|
776
|
-
chunk_content = (chunk_data.get('choices', [{}])[0].get('delta', {}).get('content', '') if _use_chat_format
|
|
777
|
-
else chunk_data.get('content', ''))
|
|
778
|
-
if chunk_content:
|
|
779
|
-
full_response_text += chunk_content
|
|
780
|
-
if streaming_callback and not streaming_callback(chunk_content, MSG_TYPE.MSG_TYPE_CHUNK):
|
|
781
|
-
ASCIIColors.info("Streaming callback requested stop."); response.close(); break
|
|
782
|
-
if chunk_data.get('stop', False) or chunk_data.get('stopped_eos',False) or chunk_data.get('stopped_limit',False): break
|
|
783
|
-
except json.JSONDecodeError: ASCIIColors.warning(f"Failed to decode JSON stream chunk: {line_str}"); continue
|
|
784
|
-
return full_response_text
|
|
785
|
-
else:
|
|
786
|
-
response_data = response.json()
|
|
787
|
-
return response_data.get('choices', [{}])[0].get('message', {}).get('content', '') if _use_chat_format \
|
|
788
|
-
else response_data.get('content','')
|
|
789
|
-
except requests.exceptions.RequestException as e:
|
|
790
|
-
error_message = f"Llama.cpp server request error: {e}"
|
|
791
|
-
if e.response is not None:
|
|
792
|
-
try: error_details = e.response.json(); error_message += f" - Details: {error_details.get('error', e.response.text)}"
|
|
793
|
-
except json.JSONDecodeError: error_message += f" - Response: {e.response.text[:200]}"
|
|
794
|
-
ASCIIColors.error(error_message)
|
|
795
|
-
return {"status": False, "error": error_message, "details": str(e.response.text if e.response else "No response text")}
|
|
796
|
-
except Exception as ex:
|
|
797
|
-
error_message = f"Llama.cpp generation error: {str(ex)}"; trace_exception(ex)
|
|
798
|
-
return {"status": False, "error": error_message}
|
|
799
|
-
|
|
800
|
-
def chat(self,
|
|
801
|
-
discussion: LollmsDiscussion,
|
|
802
|
-
branch_tip_id: Optional[str] = None,
|
|
803
|
-
n_predict: Optional[int] = None,
|
|
804
|
-
stream: Optional[bool] = None,
|
|
805
|
-
temperature: float = 0.7,
|
|
806
|
-
top_k: int = 40,
|
|
807
|
-
top_p: float = 0.9,
|
|
808
|
-
repeat_penalty: float = 1.1,
|
|
809
|
-
repeat_last_n: int = 64,
|
|
810
|
-
seed: Optional[int] = None,
|
|
811
|
-
n_threads: Optional[int] = None,
|
|
812
|
-
ctx_size: Optional[int] = None,
|
|
813
|
-
streaming_callback: Optional[Callable[[str, MSG_TYPE], None]] = None,
|
|
814
|
-
stop_sequences: Optional[List[str]] = None,
|
|
815
|
-
**generation_kwargs
|
|
816
|
-
) -> Union[str, dict]:
|
|
817
|
-
|
|
818
|
-
if not self._ensure_server_is_running():
|
|
819
|
-
return {"status": "error", "message": "Llama.cpp server could not be started. Please check model configuration and logs."}
|
|
820
|
-
|
|
821
|
-
messages = discussion.export("openai_chat", branch_tip_id)
|
|
822
|
-
payload = {
|
|
823
|
-
"messages": messages, "max_tokens": n_predict, "temperature": temperature,
|
|
824
|
-
"top_k": top_k, "top_p": top_p, "repeat_penalty": repeat_penalty,
|
|
825
|
-
"seed": seed, "stream": stream, **generation_kwargs
|
|
826
|
-
}
|
|
827
|
-
|
|
828
|
-
all_stop_sequences = set(self.server_args.get("stop_sequences", []))
|
|
829
|
-
if stop_sequences:
|
|
830
|
-
all_stop_sequences.update(stop_sequences)
|
|
831
|
-
if all_stop_sequences:
|
|
832
|
-
payload['stop'] = list(all_stop_sequences)
|
|
833
|
-
|
|
834
|
-
payload = {k: v for k, v in payload.items() if v is not None}
|
|
835
|
-
|
|
836
|
-
endpoint = "/v1/chat/completions"
|
|
837
|
-
request_url = self._get_request_url(endpoint)
|
|
838
|
-
full_response_text = ""
|
|
839
|
-
|
|
840
|
-
try:
|
|
841
|
-
response = self.server_process.session.post(request_url, json=payload, stream=stream, timeout=self.server_args.get("generation_timeout", 300))
|
|
842
|
-
response.raise_for_status()
|
|
843
|
-
|
|
844
|
-
if stream:
|
|
845
|
-
for line in response.iter_lines():
|
|
846
|
-
if not line: continue
|
|
847
|
-
line_str = line.decode('utf-8').strip()
|
|
848
|
-
if line_str.startswith('data: '): line_str = line_str[6:]
|
|
849
|
-
if line_str == '[DONE]': break
|
|
850
|
-
try:
|
|
851
|
-
chunk_data = json.loads(line_str)
|
|
852
|
-
choices = chunk_data.get('choices', [{}])
|
|
853
|
-
if choices and len(choices)>0:
|
|
854
|
-
chunk_content = choices[0].get('delta', {}).get('content', '')
|
|
855
|
-
if chunk_content:
|
|
856
|
-
full_response_text += chunk_content
|
|
857
|
-
if streaming_callback and not streaming_callback(chunk_content, MSG_TYPE.MSG_TYPE_CHUNK):
|
|
858
|
-
ASCIIColors.info("Streaming callback requested stop.")
|
|
859
|
-
response.close()
|
|
860
|
-
break
|
|
861
|
-
except json.JSONDecodeError:
|
|
862
|
-
ASCIIColors.warning(f"Failed to decode JSON stream chunk: {line_str}")
|
|
863
|
-
continue
|
|
864
|
-
return full_response_text
|
|
865
|
-
else:
|
|
866
|
-
response_data = response.json()
|
|
867
|
-
return response_data.get('choices', [{}])[0].get('message', {}).get('content', '')
|
|
868
|
-
|
|
869
|
-
except requests.exceptions.RequestException as e:
|
|
870
|
-
error_message = f"Llama.cpp server request error: {e}"
|
|
871
|
-
if e.response is not None:
|
|
872
|
-
try:
|
|
873
|
-
error_details = e.response.json()
|
|
874
|
-
error_message += f" - Details: {error_details.get('error', e.response.text)}"
|
|
875
|
-
except json.JSONDecodeError:
|
|
876
|
-
error_message += f" - Response: {e.response.text[:200]}"
|
|
877
|
-
ASCIIColors.error(error_message)
|
|
878
|
-
return {"status": "error", "message": error_message}
|
|
879
|
-
except Exception as ex:
|
|
880
|
-
error_message = f"Llama.cpp generation error: {str(ex)}"
|
|
881
|
-
trace_exception(ex)
|
|
882
|
-
return {"status": "error", "message": error_message}
|
|
883
|
-
|
|
884
|
-
def tokenize(self, text: str) -> List[int]:
|
|
885
|
-
if not self._ensure_server_is_running(): return []
|
|
886
|
-
try:
|
|
887
|
-
response = self.server_process.session.post(self._get_request_url("/tokenize"), json={"content": text})
|
|
888
|
-
response.raise_for_status(); return response.json().get("tokens", [])
|
|
889
|
-
except Exception as e: ASCIIColors.error(f"Tokenization error: {e}"); trace_exception(e); return []
|
|
890
|
-
|
|
891
|
-
def detokenize(self, tokens: List[int]) -> str:
|
|
892
|
-
if not self._ensure_server_is_running(): return ""
|
|
893
|
-
try:
|
|
894
|
-
response = self.server_process.session.post(self._get_request_url("/detokenize"), json={"tokens": tokens})
|
|
895
|
-
response.raise_for_status(); return response.json().get("content", "")
|
|
896
|
-
except Exception as e: ASCIIColors.error(f"Detokenization error: {e}"); trace_exception(e); return ""
|
|
897
|
-
|
|
898
|
-
def count_tokens(self, text: str) -> int: return len(self.tokenize(text))
|
|
899
|
-
|
|
900
|
-
def embed(self, text: str, **kwargs) -> List[float]:
|
|
901
|
-
if not self._ensure_server_is_running(): return []
|
|
902
|
-
if not self.server_args.get("embedding"):
|
|
903
|
-
ASCIIColors.warning("Embedding not enabled in server_args. Please set 'embedding' to True in config."); return []
|
|
904
|
-
try:
|
|
905
|
-
payload = {"input": text}; request_url = self._get_request_url("/v1/embeddings")
|
|
906
|
-
response = self.server_process.session.post(request_url, json=payload)
|
|
907
|
-
if response.status_code == 404: # Fallback
|
|
908
|
-
request_url = self._get_request_url("/embedding")
|
|
909
|
-
response = self.server_process.session.post(request_url, json={"content": text})
|
|
910
|
-
response.raise_for_status(); data = response.json()
|
|
911
|
-
if "data" in data and isinstance(data["data"], list) and "embedding" in data["data"][0]: return data["data"][0]["embedding"]
|
|
912
|
-
elif "embedding" in data and isinstance(data["embedding"], list): return data["embedding"]
|
|
913
|
-
else: raise ValueError(f"Unexpected embedding response: {data}")
|
|
914
|
-
except requests.exceptions.RequestException as e:
|
|
915
|
-
err_msg = f"Embedding request error: {e}";
|
|
916
|
-
if e.response: err_msg += f" - {e.response.text[:200]}"
|
|
917
|
-
ASCIIColors.error(err_msg)
|
|
918
|
-
return []
|
|
919
|
-
except Exception as ex:
|
|
920
|
-
trace_exception(ex); ASCIIColors.error(f"Embedding failed: {str(ex)}")
|
|
921
|
-
return []
|
|
922
|
-
|
|
923
|
-
def get_model_info(self) -> dict:
|
|
924
|
-
is_loaded = self.server_process is not None and self.server_process.is_healthy
|
|
925
|
-
info = {
|
|
926
|
-
"name": self.binding_name,
|
|
927
|
-
"user_provided_model_name": self.user_provided_model_name,
|
|
928
|
-
"model_path": str(self.current_model_path) if self.current_model_path else "Not loaded",
|
|
929
|
-
"clip_model_path": str(self.clip_model_path) if self.clip_model_path else "N/A",
|
|
930
|
-
"loaded": is_loaded,
|
|
931
|
-
"server_args": self.server_args, "port": self.port if self.port else "N/A",
|
|
932
|
-
"server_key": str(self.server_key) if self.server_key else "N/A",
|
|
933
|
-
}
|
|
934
|
-
if is_loaded:
|
|
935
|
-
try:
|
|
936
|
-
props_resp = self.server_process.session.get(self._get_request_url("/props"), timeout=5).json()
|
|
937
|
-
info.update({
|
|
938
|
-
"server_n_ctx": props_resp.get("default_generation_settings",{}).get("n_ctx"),
|
|
939
|
-
"server_chat_format": props_resp.get("chat_format"),
|
|
940
|
-
"server_clip_model_from_props": props_resp.get("mmproj"),
|
|
941
|
-
})
|
|
942
|
-
except Exception: pass
|
|
943
|
-
|
|
944
|
-
is_llava = self.clip_model_path is not None or \
|
|
945
|
-
(info.get("server_clip_model_from_props") is not None) or \
|
|
946
|
-
("llava" in self.current_model_path.name.lower() if self.current_model_path else False)
|
|
947
|
-
info["supports_vision"] = is_llava
|
|
948
|
-
info["supports_structured_output"] = self.server_args.get("grammar_string") is not None
|
|
949
|
-
return info
|
|
950
|
-
|
|
951
|
-
def _scan_models(self):
|
|
952
|
-
self._model_path_map = {}
|
|
953
|
-
if not self.models_path.exists() or not self.models_path.is_dir():
|
|
954
|
-
ASCIIColors.warning(f"Models path does not exist or is not a directory: {self.models_path}")
|
|
955
|
-
return
|
|
956
|
-
|
|
957
|
-
all_paths = list(self.models_path.rglob("*.gguf"))
|
|
958
|
-
filenames_count = {}
|
|
959
|
-
for path in all_paths:
|
|
960
|
-
if path.is_file():
|
|
961
|
-
filenames_count[path.name] = filenames_count.get(path.name, 0) + 1
|
|
962
|
-
|
|
963
|
-
for model_file in all_paths:
|
|
964
|
-
if model_file.is_file():
|
|
965
|
-
relative_path_str = str(model_file.relative_to(self.models_path).as_posix())
|
|
966
|
-
if filenames_count[model_file.name] > 1:
|
|
967
|
-
unique_name = relative_path_str
|
|
968
|
-
else:
|
|
969
|
-
unique_name = model_file.name
|
|
970
|
-
self._model_path_map[unique_name] = model_file
|
|
971
|
-
|
|
972
|
-
ASCIIColors.info(f"Scanned {len(self._model_path_map)} models from {self.models_path}.")
|
|
973
|
-
|
|
974
|
-
def list_models(self) -> List[Dict[str, Any]]:
|
|
975
|
-
self._scan_models()
|
|
976
|
-
models_found = []
|
|
977
|
-
for unique_name, model_path in self._model_path_map.items():
|
|
978
|
-
models_found.append({
|
|
979
|
-
'name': unique_name, 'model_name': model_path.name,
|
|
980
|
-
'path': str(model_path), 'size': model_path.stat().st_size
|
|
981
|
-
})
|
|
982
|
-
return sorted(models_found, key=lambda x: x['name'])
|
|
983
|
-
|
|
984
|
-
def __del__(self):
|
|
985
|
-
self.unload_model()
|
|
986
|
-
|
|
987
|
-
-    def get_ctx_size(self, model_name: Optional[str] = None) -> Optional[int]:
-        if model_name is None:
-            model_name = self.user_provided_model_name or self.initial_model_name_preference
-            if not model_name and self.current_model_path:
-                model_name = self.current_model_path.name
-
-        if model_name is None:
-            ASCIIColors.warning("Cannot determine context size without a model name.")
-            return None
-
-        known_contexts = {
-            'llama3.1': 131072, 'llama3.2': 131072, 'llama3.3': 131072, 'llama3': 8192,
-            'llama2': 4096, 'mixtral8x22b': 65536, 'mixtral': 32768, 'mistral': 32768,
-            'gemma3': 131072, 'gemma2': 8192, 'gemma': 8192, 'phi3': 131072, 'phi2': 2048,
-            'phi': 2048, 'qwen2.5': 131072, 'qwen2': 32768, 'qwen': 8192,
-            'codellama': 16384, 'codegemma': 8192, 'deepseek-coder-v2': 131072,
-            'deepseek-coder': 16384, 'deepseek-v2': 131072, 'deepseek-llm': 4096,
-            'yi1.5': 32768, 'yi': 4096, 'command-r': 131072, 'wizardlm2': 32768,
-            'wizardlm': 16384, 'zephyr': 65536, 'vicuna': 2048, 'falcon': 2048,
-            'starcoder': 8192, 'stablelm': 4096, 'orca2': 4096, 'orca': 4096,
-            'dolphin': 32768, 'openhermes': 8192,
-        }
-        normalized_model_name = model_name.lower().strip()
-        sorted_base_models = sorted(known_contexts.keys(), key=len, reverse=True)
-
-        for base_name in sorted_base_models:
-            if base_name in normalized_model_name:
-                context_size = known_contexts[base_name]
-                ASCIIColors.info(f"Using hardcoded context size for '{model_name}' based on '{base_name}': {context_size}")
-                return context_size
-
-        ASCIIColors.warning(f"Context size not found for model '{model_name}' in the hardcoded list.")
-        return None
-
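Note (annotation, not part of the diff): the lookup is a case-insensitive substring match against the hardcoded table, with longer base names tried first so that, for example, 'llama3.1' is tested before 'llama3'. Illustrative calls (the file names are hypothetical, the return values follow from the table above):

    binding.get_ctx_size("mistral-7b-instruct-v0.2.Q4_K_M.gguf")  # -> 32768  ('mistral' matches)
    binding.get_ctx_size("llama3.1-8b-instruct.Q4_0.gguf")        # -> 131072 ('llama3.1' matches)
    binding.get_ctx_size("some-unknown-arch.gguf")                # -> None, with a warning logged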
-if __name__ == '__main__':
-    # NOTE: This test block is designed for a single-process scenario to verify basic functionality.
-    # Testing the multi-process capabilities requires a separate script that launches multiple
-    # instances of a test program using this binding. The logic here, however, will now use the
-    # new file-based registry system.
-    full_streamed_text = ""
-    ASCIIColors.yellow("Testing LlamaCppServerBinding...")
-
-    try:
-        models_path_str = os.environ.get("LOLLMS_MODELS_PATH", str(Path(__file__).parent / "test_models"))
-        model_name_str = os.environ.get("LOLLMS_TEST_MODEL_GGUF", "tinyllama-1.1b-chat-v1.0.Q2_K.gguf")
-
-        models_path = Path(models_path_str)
-        models_path.mkdir(parents=True, exist_ok=True)
-        test_model_path = models_path / model_name_str
-
-        primary_model_available = test_model_path.exists()
-        if not primary_model_available:
-            ASCIIColors.warning(f"Test model {test_model_path} not found. Please place a GGUF model there or set env vars.")
-            ASCIIColors.warning("Some tests will be skipped.")
-
-    except Exception as e:
-        ASCIIColors.error(f"Error setting up test paths: {e}"); trace_exception(e)
-        sys.exit(1)
-
-    binding_config = {
-        "n_gpu_layers": 0, "n_ctx": 512, "embedding": True,
-        "verbose": False, "server_startup_timeout": 180, "parallel_slots": 2,
-        "stop_sequences": ["<|user|>", "\nUSER:"], # Example default stop sequences
-    }
-
-    active_binding1: Optional[LlamaCppServerBinding] = None
-    active_binding2: Optional[LlamaCppServerBinding] = None
-
-    try:
-        if primary_model_available:
-            # --- Test 1: Auto-start server on first generation call ---
-            ASCIIColors.cyan("\n--- Test 1: Auto-start server with specified model name ---")
-            active_binding1 = LlamaCppServerBinding(
-                model_name=model_name_str, models_path=str(models_path), config=binding_config
-            )
-            ASCIIColors.info("Binding1 initialized. No server should be running yet.")
-            ASCIIColors.info(f"Initial model info: {json.dumps(active_binding1.get_model_info(), indent=2)}")
-
-            prompt_text = "What is the capital of France?"
-            generated_text = active_binding1.generate_text(
-                prompt_text,
-                system_prompt="Concise expert.",
-                n_predict=20,
-                stream=False,
-                stop_sequences=["Paris"] # Test per-call stop sequence
-            )
-
-            if isinstance(generated_text, str) and "Paris" not in generated_text: # Should stop *before* generating Paris
-                ASCIIColors.green(f"SUCCESS: Auto-start generation with stop sequence successful. Response: '{generated_text}'")
-            else:
-                ASCIIColors.error(f"FAILURE: Auto-start generation failed or stop sequence ignored. Response: {generated_text}")
-
-            ASCIIColors.info(f"Model info after auto-start: {json.dumps(active_binding1.get_model_info(), indent=2)}")
-            if not active_binding1.server_process or not active_binding1.server_process.is_healthy:
-                raise RuntimeError("Server for binding1 did not seem to start correctly.")
-
-            # --- Test 2: Server reuse with a second binding ---
-            ASCIIColors.cyan("\n--- Test 2: Server reuse with a second binding ---")
-            active_binding2 = LlamaCppServerBinding(
-                model_name=model_name_str, models_path=str(models_path), config=binding_config
-            )
-            generated_text_b2 = active_binding2.generate_text("Ping", n_predict=5, stream=False)
-            if isinstance(generated_text_b2, str):
-                ASCIIColors.green(f"SUCCESS: Binding2 generation successful. Response: {generated_text_b2}")
-            else:
-                ASCIIColors.error(f"FAILURE: Binding2 generation failed. Response: {generated_text_b2}")
-
-            if active_binding1.port != active_binding2.port:
-                ASCIIColors.error("FAILURE: Bindings for the same model are using different ports! Server sharing failed.")
-            else:
-                ASCIIColors.green("SUCCESS: Both bindings use the same server port. Server sharing works.")
-
-            # --- Test 3: Unload and auto-reload ---
-            ASCIIColors.cyan("\n--- Test 3: Unload and auto-reload ---")
-            active_binding1.unload_model()
-            ASCIIColors.info("Binding1 unloaded. Ref count should be 1, server still up for binding2.")
-
-            generated_text_reloaded = active_binding1.generate_text("Test reload", n_predict=5, stream=False)
-            if isinstance(generated_text_reloaded, str):
-                ASCIIColors.green(f"SUCCESS: Generation after reload successful. Response: {generated_text_reloaded}")
-            else:
-                ASCIIColors.error(f"FAILURE: Generation after reload failed. Response: {generated_text_reloaded}")
-
-            if active_binding1.port != active_binding2.port:
-                ASCIIColors.error("FAILURE: Port mismatch after reload.")
-            else:
-                ASCIIColors.green("SUCCESS: Correctly re-used same server after reload.")
-
-        else:
-            ASCIIColors.warning("\n--- Primary model not available, skipping most tests ---")
-
-        # --- Test 4: Initialize with model_name=None and auto-find ---
-        ASCIIColors.cyan("\n--- Test 4: Initialize with model_name=None and auto-find ---")
-        unspecified_binding = LlamaCppServerBinding(model_name=None, models_path=str(models_path), config=binding_config)
-        gen_unspec = unspecified_binding.generate_text("Ping", n_predict=5, stream=False)
-        if primary_model_available:
-            if isinstance(gen_unspec, str):
-                ASCIIColors.green(f"SUCCESS: Auto-find generation successful. Response: {gen_unspec}")
-                ASCIIColors.info(f"Model auto-selected: {unspecified_binding.user_provided_model_name}")
-            else:
-                ASCIIColors.error(f"FAILURE: Auto-find generation failed. Response: {gen_unspec}")
-        else: # If no models, this should fail gracefully
-            if isinstance(gen_unspec, dict) and 'error' in gen_unspec:
-                ASCIIColors.green("SUCCESS: Correctly failed to generate when no models are available.")
-            else:
-                ASCIIColors.error(f"FAILURE: Incorrect behavior when no models are available. Response: {gen_unspec}")
-
-    except Exception as e_main:
-        ASCIIColors.error(f"An unexpected error occurred during testing: {e_main}")
-        trace_exception(e_main)
-    finally:
-        ASCIIColors.cyan("\n--- Unloading Models and Stopping Servers ---")
-        if active_binding1: active_binding1.unload_model(); ASCIIColors.info("Binding1 unloaded.")
-        if active_binding2: active_binding2.unload_model(); ASCIIColors.info("Binding2 unloaded.")
-        # Any other bindings will be cleaned up by __del__ on exit
-
-        registry = ServerRegistry()
-        with FileLock(registry.lock_file):
-            final_state = registry._read_registry()
-            if not final_state or not any(c for s in final_state.values() for c in s.get('client_pids',[])):
-                ASCIIColors.green("All servers shut down correctly and registry is empty or has no clients.")
-                if final_state: registry._write_registry({}) # Clean up for next run
-            else:
-                ASCIIColors.warning(f"Warning: Registry is not empty after tests: {final_state}")
-                registry._clean_stale_entries(final_state)
-                registry._write_registry(final_state)
-                ASCIIColors.info("Forced a final registry cleanup.")
-
-    ASCIIColors.yellow("\nLlamaCppServerBinding test finished.")
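Note (annotation, not part of the diff): the self-test above also serves as a usage reference for this now-removed binding: construct the binding, let the first generate_text call start or join the shared llama.cpp server, then unload to release it. A condensed, hedged sketch of that flow, using the same environment variables as the test block and assuming the class is still importable from the 1.7.x module path:

    import os
    from pathlib import Path
    # assumption: importing from the pre-1.8 module where this class was defined
    from lollms_client.llm_bindings.llamacpp import LlamaCppServerBinding

    models_path = os.environ.get("LOLLMS_MODELS_PATH", str(Path.home() / "models"))  # placeholder default
    model_name = os.environ.get("LOLLMS_TEST_MODEL_GGUF", "tinyllama-1.1b-chat-v1.0.Q2_K.gguf")

    binding = LlamaCppServerBinding(model_name=model_name, models_path=models_path,
                                    config={"n_gpu_layers": 0, "n_ctx": 512})
    try:
        answer = binding.generate_text("What is the capital of France?",
                                       system_prompt="Concise expert.", n_predict=20, stream=False)
        print(answer)
    finally:
        binding.unload_model()  # drop this client's reference; the shared server can shut down once unused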