lollms-client 0.15.0__py3-none-any.whl → 0.15.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lollms-client might be problematic.
- lollms_client/__init__.py +1 -1
- lollms_client/llm_bindings/llamacpp/__init__.py +561 -734
- {lollms_client-0.15.0.dist-info → lollms_client-0.15.2.dist-info}/METADATA +1 -1
- {lollms_client-0.15.0.dist-info → lollms_client-0.15.2.dist-info}/RECORD +7 -7
- {lollms_client-0.15.0.dist-info → lollms_client-0.15.2.dist-info}/WHEEL +0 -0
- {lollms_client-0.15.0.dist-info → lollms_client-0.15.2.dist-info}/licenses/LICENSE +0 -0
- {lollms_client-0.15.0.dist-info → lollms_client-0.15.2.dist-info}/top_level.txt +0 -0
lollms_client/llm_bindings/llamacpp/__init__.py
@@ -24,211 +24,156 @@ pm.ensure_packages(["requests", "pillow"]) # pillow for dummy image in test
 if not pm.is_installed("llama-cpp-binaries"):
 def install_llama_cpp():
 system = platform.system()
+python_version_simple = f"py{sys.version_info.major}{sys.version_info.minor}" # e.g. py310 for 3.10
+
+# Determine CUDA suffix based on common recent versions. Adjust if needed.
+# For simplicity, we'll target a common recent CUDA version.
+# Users with specific needs might need to install manually.
+# As of late 2023/early 2024, cu121 or cu118 are common.
+# The oobabooga binaries often use +cu124 for recent builds. Let's try that.
+cuda_suffix = "+cu124"
+

 if system == "Windows":
-
+# llama_cpp_binaries-0.12.0+cu124-py3-none-win_amd64.whl
+url = f"https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0{cuda_suffix}-{python_version_simple}-none-win_amd64.whl"
+fallback_url = "https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124-py3-none-win_amd64.whl" # Generic py3
 elif system == "Linux":
-
+# llama_cpp_binaries-0.12.0+cu124-py3-none-linux_x86_64.whl
+url = f"https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0{cuda_suffix}-{python_version_simple}-none-linux_x86_64.whl"
+fallback_url = "https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124-py3-none-linux_x86_64.whl" # Generic py3
 else:
-
+ASCIIColors.warning(f"Unsupported OS for prebuilt llama-cpp-binaries: {system}. Please install manually.")
 return
-
+
+ASCIIColors.info(f"Attempting to install llama-cpp-binaries from: {url}")
+try:
+pm.install(url)
+except Exception as e:
+ASCIIColors.warning(f"Failed to install specific version from {url}: {e}")
+ASCIIColors.info(f"Attempting fallback URL: {fallback_url}")
+try:
+pm.install(fallback_url)
+except Exception as e_fallback:
+ASCIIColors.error(f"Failed to install from fallback URL {fallback_url}: {e_fallback}")
+ASCIIColors.error("Please try installing llama-cpp-binaries manually, e.g., 'pip install llama-cpp-python[server]' or from a wheel.")
+
 install_llama_cpp()

 try:
 import llama_cpp_binaries
 except ImportError:
 ASCIIColors.error("llama-cpp-binaries package not found. Please install it.")
-ASCIIColors.error("You can try: pip install llama-cpp-
-ASCIIColors.error("Or download a wheel from: https://github.com/oobabooga/llama-cpp-binaries/releases")
+ASCIIColors.error("You can try: pip install llama-cpp-python[server] (for server support)")
+ASCIIColors.error("Or download a wheel from: https://github.com/oobabooga/llama-cpp-binaries/releases or https://pypi.org/project/llama-cpp-python/#files")
 llama_cpp_binaries = None

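For illustration only (an editor's note, not part of the diff): the installer above derives a wheel name from the running interpreter plus a pinned CUDA suffix, then falls back to a generic py3 wheel. Under the assumption of Python 3.10 on Linux, the formatted URL would be:

    # Editor's illustration; assumes Python 3.10 on Linux.
    cuda_suffix, python_version_simple = "+cu124", "py310"
    url = (
        "https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/"
        f"llama_cpp_binaries-0.12.0{cuda_suffix}-{python_version_simple}-none-linux_x86_64.whl"
    )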
 # --- Predefined patterns ---
-
-# Quantization type strings (derived from ggml.h, llama.cpp, and common usage)
-# These are the "core component" strings, without separators like '.', '-', or '_'
 _QUANT_COMPONENTS_SET: Set[str] = {
-
-"
-"
-"
-"
-
-"
-"Q2_K_XXS", "Q3_K_XXS", "Q4_K_XXS", "Q5_K_XXS", "Q6_K_XXS",
-
-# Non-K-quant legacy types
-"Q4_0", "Q4_1", "Q5_0", "Q5_1", "Q8_0",
-
-# Floating point types
-"F16", "FP16", "F32", "FP32", "BF16",
-
-# IQ (Innovative Quantization) types
-"IQ1_S", "IQ1_M",
-"IQ2_XXS", "IQ2_XS", "IQ2_S", "IQ2_M",
-"IQ3_XXS", "IQ3_S", "IQ3_M",
-"IQ4_NL", "IQ4_XS",
-
-# Newer IQ K-Quant variants (IQ types using K-quant style super-blocks)
-"IQ3_M_K", "IQ3_S_K", # Adding IQ3_S_K as it's plausible
-"IQ4_XS_K", "IQ4_NL_K", # Adding IQ4_NL_K as it's plausible
-
-# Basic integer types (less common in user-facing LLM filenames as primary quantizer)
-"I8", "I16", "I32",
-
-# Special GGUF type names that might appear (from ggml.c `ggml_type_name`)
-"ALL_F32", "MOSTLY_F16", "MOSTLY_Q4_0", "MOSTLY_Q4_1", "MOSTLY_Q5_0", "MOSTLY_Q5_1",
-"MOSTLY_Q8_0",
-"MOSTLY_Q2_K", "MOSTLY_Q3_K_S", "MOSTLY_Q3_K_M", "MOSTLY_Q3_K_L",
+"Q2_K", "Q3_K", "Q4_K", "Q5_K", "Q6_K", "Q2_K_S", "Q3_K_S", "Q4_K_S", "Q5_K_S",
+"Q3_K_M", "Q4_K_M", "Q5_K_M", "Q3_K_L", "Q2_K_XS", "Q3_K_XS", "Q4_K_XS", "Q5_K_XS", "Q6_K_XS",
+"Q2_K_XXS", "Q3_K_XXS", "Q4_K_XXS", "Q5_K_XXS", "Q6_K_XXS", "Q4_0", "Q4_1", "Q5_0", "Q5_1", "Q8_0",
+"F16", "FP16", "F32", "FP32", "BF16", "IQ1_S", "IQ1_M", "IQ2_XXS", "IQ2_XS", "IQ2_S", "IQ2_M",
+"IQ3_XXS", "IQ3_S", "IQ3_M", "IQ4_NL", "IQ4_XS", "IQ3_M_K", "IQ3_S_K", "IQ4_XS_K", "IQ4_NL_K",
+"I8", "I16", "I32", "ALL_F32", "MOSTLY_F16", "MOSTLY_Q4_0", "MOSTLY_Q4_1", "MOSTLY_Q5_0", "MOSTLY_Q5_1",
+"MOSTLY_Q8_0", "MOSTLY_Q2_K", "MOSTLY_Q3_K_S", "MOSTLY_Q3_K_M", "MOSTLY_Q3_K_L",
 "MOSTLY_Q4_K_S", "MOSTLY_Q4_K_M", "MOSTLY_Q5_K_S", "MOSTLY_Q5_K_M", "MOSTLY_Q6_K",
-"MOSTLY_IQ1_S", "MOSTLY_IQ1_M",
-"
-"MOSTLY_IQ3_XXS", "MOSTLY_IQ3_S", "MOSTLY_IQ3_M", # Adding IQ3_M, IQ3_S
-"MOSTLY_IQ4_NL", "MOSTLY_IQ4_XS"
+"MOSTLY_IQ1_S", "MOSTLY_IQ1_M", "MOSTLY_IQ2_XXS", "MOSTLY_IQ2_XS", "MOSTLY_IQ2_S", "MOSTLY_IQ2_M",
+"MOSTLY_IQ3_XXS", "MOSTLY_IQ3_S", "MOSTLY_IQ3_M", "MOSTLY_IQ4_NL", "MOSTLY_IQ4_XS"
 }
-
-# Common descriptive suffixes for model names
 _MODEL_NAME_SUFFIX_COMPONENTS_SET: Set[str] = {
 "instruct", "chat", "GGUF", "HF", "ggml", "pytorch", "AWQ", "GPTQ", "EXL2",
-"base", "cont", "continue", "ft",
-"v0.1", "v0.2", "v1.0", "v1.1", "v1.5", "v1.6", "v2.0", # Common version tags if they are truly suffixes
-# Be cautious with general version numbers (e.g., "v1", "v2") or model sizes (e.g., "7b")
-# as they are often integral parts of the base name. Only add if they are
-# *always* extraneous suffixes in your context.
-# The ones above are more specific and often appear as full suffix components.
+"base", "cont", "continue", "ft", "v0.1", "v0.2", "v1.0", "v1.1", "v1.5", "v1.6", "v2.0"
 }
-
-# Combine, ensure uniqueness by using sets, then sort by length descending.
-# Sorting ensures longer patterns (e.g., "Q4_K_M") are checked before
-# shorter sub-patterns (e.g., "Q4_K" or "K_M").
 _ALL_REMOVABLE_COMPONENTS: List[str] = sorted(
-list(_QUANT_COMPONENTS_SET.union(_MODEL_NAME_SUFFIX_COMPONENTS_SET)),
-key=len,
-reverse=True
+list(_QUANT_COMPONENTS_SET.union(_MODEL_NAME_SUFFIX_COMPONENTS_SET)), key=len, reverse=True
 )

 def get_gguf_model_base_name(file_path_or_name: Union[str, Path]) -> str:
-
-
-
-
-
-The stripping is case-insensitive and checks for patterns preceded
-by '.', '-', or '_'.
-
-Args:
-file_path_or_name: The file path (as a string or Path object)
-or just the filename string.
-
-Returns:
-The derived base model name string.
-"""
-if isinstance(file_path_or_name, str):
-p = Path(file_path_or_name)
-elif isinstance(file_path_or_name, Path):
-p = file_path_or_name
-else:
-raise TypeError(
-"Input must be a string or Path object. "
-f"Got: {type(file_path_or_name)}"
-)
-
-name_part = p.name # Full filename, e.g., "MyModel-7B-chat.Q4_K_M.gguf"
-
-# 1. Remove .gguf extension (case-insensitive)
-if name_part.lower().endswith(".gguf"):
-name_part = name_part[:-5] # Remove last 5 chars: ".gguf"
-
-# 2. Iteratively strip known components (quantization, common suffixes)
-# These components are usually preceded by '.', '-', or '_'
+if isinstance(file_path_or_name, str): p = Path(file_path_or_name)
+elif isinstance(file_path_or_name, Path): p = file_path_or_name
+else: raise TypeError(f"Input must be a string or Path object. Got: {type(file_path_or_name)}")
+name_part = p.stem if p.suffix.lower() == ".gguf" else p.name
+if name_part.lower().endswith(".gguf"): name_part = name_part[:-5]
 while True:
 original_name_part_len = len(name_part)
 stripped_in_this_iteration = False
-
 for component in _ALL_REMOVABLE_COMPONENTS:
 component_lower = component.lower()
-# Check for patterns like ".component", "-component", or "_component"
 for separator in [".", "-", "_"]:
 pattern_to_check = f"{separator}{component_lower}"
 if name_part.lower().endswith(pattern_to_check):
-# Remove from the original-case name_part
 name_part = name_part[:-(len(pattern_to_check))]
-stripped_in_this_iteration = True
-
-
-
-
-# If no component was stripped in a full pass through _ALL_REMOVABLE_COMPONENTS,
-# or if name_part became empty, we're done.
-if not stripped_in_this_iteration or not name_part:
-break
-
-# 3. Final cleanup: remove trailing separators if any are left after stripping
-while name_part and name_part[-1] in ['.', '-', '_']:
-name_part = name_part[:-1]
-
+stripped_in_this_iteration = True; break
+if stripped_in_this_iteration: break
+if not stripped_in_this_iteration or not name_part: break
+while name_part and name_part[-1] in ['.', '-', '_']: name_part = name_part[:-1]
 return name_part

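For illustration only (not part of the diff): the condensed helper still strips the extension and any trailing quantization or suffix components, so a call such as the following (hypothetical filename) should behave as before:

    # Editor's illustration; filename is hypothetical.
    base = get_gguf_model_base_name("MyModel-7B-chat.Q4_K_M.gguf")
    # ".gguf" is dropped, then "Q4_K_M" and "chat" are stripped as removable
    # components, so base == "MyModel-7B"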
+# --- Global Server Registry ---
+_active_servers: Dict[tuple, 'LlamaCppServerProcess'] = {}
+_server_ref_counts: Dict[tuple, int] = {}
+_server_registry_lock = threading.Lock()

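For illustration only (not part of the diff): these new module-level globals let several binding instances share one server per (model, clip model) pair. A minimal sketch of the acquire/release pattern they support:

    # Editor's sketch; load_model()/_release_server_instance() below implement the real bookkeeping.
    key = ("/models/MyModel.gguf", None)          # hypothetical (model_path, clip_path) key
    with _server_registry_lock:
        if key in _active_servers:                # reuse the already running server
            _server_ref_counts[key] += 1
        # otherwise a new LlamaCppServerProcess is started, registered under key,
        # and its ref count set to 1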
 BindingName = "LlamaCppServerBinding"
 DEFAULT_LLAMACPP_SERVER_HOST = "127.0.0.1"
-
-#
+# Port is now dynamic, this constant is less critical for direct use but good for reference.
+# DEFAULT_LLAMACPP_SERVER_PORT = 9641
+
 class LlamaCppServerProcess:
-def __init__(self, model_path: str
+def __init__(self, model_path: Union[str, Path], clip_model_path: Optional[Union[str, Path]] = None, server_binary_path: Optional[Union[str, Path]]=None, server_args: Dict[str, Any]={}):
 self.model_path = Path(model_path)
-self.clip_model_path = clip_model_path
-
-if
-self.server_binary_path =
-
+self.clip_model_path = Path(clip_model_path) if clip_model_path else None
+
+if server_binary_path:
+self.server_binary_path = Path(server_binary_path)
+elif llama_cpp_binaries:
+self.server_binary_path = Path(llama_cpp_binaries.get_binary_path())
+else:
+raise FileNotFoundError("llama_cpp_binaries not found and no server_binary_path provided.")
+
+self.port: Optional[int] = None # Set by start() method
 self.server_args = server_args
 self.process: Optional[subprocess.Popen] = None
 self.session = requests.Session()
-self.host = DEFAULT_LLAMACPP_SERVER_HOST
-self.base_url =
+self.host = self.server_args.get("host",DEFAULT_LLAMACPP_SERVER_HOST)
+self.base_url: Optional[str] = None # Set by start() method
 self.is_healthy = False
-self._stderr_lines = []
-self._stderr_thread = None
+self._stderr_lines: List[str] = []
+self._stderr_thread: Optional[threading.Thread] = None

 if not self.model_path.exists():
 raise FileNotFoundError(f"Model file not found: {self.model_path}")
+if self.clip_model_path and not self.clip_model_path.exists():
+ASCIIColors.warning(f"Clip model file '{self.clip_model_path}' not found. Vision features may not work or may use a different auto-detected clip model.")
 if not self.server_binary_path.exists():
 raise FileNotFoundError(f"Llama.cpp server binary not found: {self.server_binary_path}")

-self._start_server()
-
 def _filter_stderr(self, stderr_pipe):
 try:
 for line in iter(stderr_pipe.readline, ''):
 if line:
 self._stderr_lines.append(line.strip())
-if len(self._stderr_lines) > 50:
-self._stderr_lines.pop(0)
-# Simple progress or key info logging
+if len(self._stderr_lines) > 50: self._stderr_lines.pop(0)
 if "llama_model_loaded" in line or "error" in line.lower() or "failed" in line.lower():
-ASCIIColors.debug(f"[LLAMA_SERVER_STDERR] {line.strip()}")
-elif "running
-ASCIIColors.info(f"[LLAMA_SERVER_STDERR] {line.strip()}")
-
-except
-
-
-
-
-
-def _start_server(self, is_embedding=False):
+ASCIIColors.debug(f"[LLAMA_SERVER_STDERR:{self.port}] {line.strip()}")
+elif "running on port" in line: # Server startup message
+ASCIIColors.info(f"[LLAMA_SERVER_STDERR:{self.port}] {line.strip()}")
+except ValueError: pass
+except Exception as e: ASCIIColors.warning(f"Exception in stderr filter thread for port {self.port}: {e}")
+
+def start(self, port_to_use: int):
+self.port = port_to_use
+self.base_url = f"http://{self.host}:{self.port}"
+
 cmd = [
 str(self.server_binary_path),
 "--model", str(self.model_path),
 "--host", self.host,
 "--port", str(self.port),
-# Add other common defaults or arguments from self.server_args
 ]

-# Common arguments mapping from LlamaCppBinding to server CLI args
-# (This needs to be kept in sync with llama.cpp server's CLI)
 arg_map = {
 "n_ctx": "--ctx-size", "n_gpu_layers": "--gpu-layers", "main_gpu": "--main-gpu",
 "tensor_split": "--tensor-split", "use_mmap": (lambda v: ["--no-mmap"] if not v else []),
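For illustration only (not part of the diff): the arg_map mechanism continued in the next hunk expands server_args entries into CLI flags roughly as follows (values are hypothetical):

    # Editor's sketch of how arg_map entries expand into command-line flags.
    server_args = {"n_ctx": 4096, "n_gpu_layers": 20, "use_mmap": False}
    arg_map = {"n_ctx": "--ctx-size", "n_gpu_layers": "--gpu-layers",
               "use_mmap": (lambda v: ["--no-mmap"] if not v else [])}
    cmd = []
    for key, cli_arg in arg_map.items():
        val = server_args.get(key)
        if val is not None:
            cmd.extend(cli_arg(val) if callable(cli_arg) else [cli_arg, str(val)])
    # cmd == ["--ctx-size", "4096", "--gpu-layers", "20", "--no-mmap"]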
@@ -236,446 +181,356 @@ class LlamaCppServerProcess:
 "n_batch": "--batch-size", "n_threads": "--threads", "n_threads_batch": "--threads-batch",
 "rope_scaling_type": "--rope-scaling", "rope_freq_base": "--rope-freq-base",
 "rope_freq_scale": "--rope-freq-scale",
-"embedding": (lambda v: ["--embedding"] if
+"embedding": (lambda v: ["--embedding"] if v else []),
 "verbose": (lambda v: ["--verbose"] if v else []),
-"chat_template": "--chat-template",
-
+"chat_template": "--chat-template",
+"parallel_slots": "--parallel", # Number of parallel processing slots
 }

-#
-if self.clip_model_path:
+if self.clip_model_path: # This should be the actual path resolved by the binding
 cmd.extend(["--mmproj", str(self.clip_model_path)])
-# The server might automatically detect LLaVA chat format or need a specific flag
-# e.g., --chat-template llava-1.5 (if server supports templates)
-# For older servers, a specific chat format flag like --chatml with LLaVA prompt structure was used.
-# The server from llama-cpp-binaries is usually quite up-to-date.

 for key, cli_arg in arg_map.items():
 val = self.server_args.get(key)
 if val is not None:
-if callable(cli_arg):
-
-else:
-cmd.extend([cli_arg, str(val)])
+if callable(cli_arg): cmd.extend(cli_arg(val))
+else: cmd.extend([cli_arg, str(val)])

-# Add any extra CLI flags directly
 extra_cli_flags = self.server_args.get("extra_cli_flags", [])
-if isinstance(extra_cli_flags, str):
-extra_cli_flags = extra_cli_flags.split()
+if isinstance(extra_cli_flags, str): extra_cli_flags = extra_cli_flags.split()
 cmd.extend(extra_cli_flags)

-
-ASCIIColors.info(f"Starting Llama.cpp server with command: {' '.join(cmd)}")
+ASCIIColors.info(f"Starting Llama.cpp server ({' '.join(cmd)})")

-# Prevent paths with spaces from breaking the command on some OS, though Popen usually handles this.
-# For safety, ensure paths are quoted if necessary, or rely on Popen's list-based command.
-
 env = os.environ.copy()
-# On Linux, it might be necessary to set LD_LIBRARY_PATH if server binary has shared lib dependencies in its folder
 if os.name == 'posix' and self.server_binary_path.parent != Path('.'):
 lib_path_str = str(self.server_binary_path.parent.resolve())
 current_ld_path = env.get('LD_LIBRARY_PATH', '')
-if current_ld_path
-env['LD_LIBRARY_PATH'] = f"{lib_path_str}:{current_ld_path}"
-else:
-env['LD_LIBRARY_PATH'] = lib_path_str
+env['LD_LIBRARY_PATH'] = f"{lib_path_str}:{current_ld_path}" if current_ld_path else lib_path_str

 try:
-
-self.process = subprocess.Popen(
-cmd,
-stderr=subprocess.PIPE,
-stdout=subprocess.PIPE, # Capture stdout as well for debugging
-text=True,
-bufsize=1, # Line buffered
-env=env
-)
+self.process = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE, text=True, bufsize=1, env=env)
 except Exception as e:
-ASCIIColors.error(f"Failed to start llama.cpp server process: {e}")
-trace_exception(e)
-raise
+ASCIIColors.error(f"Failed to start llama.cpp server process on port {self.port}: {e}"); trace_exception(e); raise

-# Start stderr/stdout reading threads
 self._stderr_thread = threading.Thread(target=self._filter_stderr, args=(self.process.stderr,), daemon=True)
 self._stderr_thread.start()
-# self._stdout_thread = threading.Thread(target=self._filter_stderr, args=(self.process.stdout,), daemon=True) # can use same filter
-# self._stdout_thread.start()
-

-# Wait for server to be healthy
 health_url = f"{self.base_url}/health"
-max_wait_time = self.server_args.get("server_startup_timeout", 60)
+max_wait_time = self.server_args.get("server_startup_timeout", 60)
 start_time = time.time()

 while time.time() - start_time < max_wait_time:
 if self.process.poll() is not None:
-
-
-raise RuntimeError(f"Llama.cpp server process terminated unexpectedly with exit code {exit_code} during startup. Stderr:\n{stderr_output}")
+stderr_output = "\n".join(self._stderr_lines[-10:])
+raise RuntimeError(f"Llama.cpp server (port {self.port}) terminated unexpectedly (exit code {self.process.poll()}) during startup. Stderr:\n{stderr_output}")
 try:
 response = self.session.get(health_url, timeout=2)
 if response.status_code == 200 and response.json().get("status") == "ok":
 self.is_healthy = True
 ASCIIColors.green(f"Llama.cpp server started successfully on port {self.port}.")
 return
-except requests.exceptions.ConnectionError:
-
-except Exception as e:
-ASCIIColors.warning(f"Health check failed: {e}")
-time.sleep(1)
+except requests.exceptions.ConnectionError: time.sleep(1)
+except Exception as e: ASCIIColors.warning(f"Health check for port {self.port} failed: {e}"); time.sleep(1)

 self.is_healthy = False
-self.
+self.shutdown()
 stderr_output = "\n".join(self._stderr_lines[-10:])
 raise TimeoutError(f"Llama.cpp server failed to become healthy on port {self.port} within {max_wait_time}s. Stderr:\n{stderr_output}")

-def
+def shutdown(self):
 self.is_healthy = False
 if self.process:
-ASCIIColors.info(f"
+ASCIIColors.info(f"Shutting down Llama.cpp server (PID: {self.process.pid} on port {self.port})...")
 try:
-
-
-
-# self.process.send_signal(signal.CTRL_C_EVENT) # Requires creationflags=subprocess.CREATE_NEW_PROCESS_GROUP
-self.process.terminate() # For Windows, terminate is often like kill
-else: # POSIX
-self.process.terminate() # Sends SIGTERM
-
-self.process.wait(timeout=10) # Wait for graceful shutdown
+if os.name == 'nt': self.process.terminate()
+else: self.process.terminate()
+self.process.wait(timeout=10)
 except subprocess.TimeoutExpired:
-ASCIIColors.warning("Llama.cpp server did not terminate gracefully, killing...")
-self.process.kill()
-try:
-
-
-ASCIIColors.error("Failed to kill llama.cpp server process.")
-except Exception as e:
-ASCIIColors.error(f"Error during server stop: {e}")
+ASCIIColors.warning(f"Llama.cpp server (port {self.port}) did not terminate gracefully, killing...")
+self.process.kill()
+try: self.process.wait(timeout=5)
+except subprocess.TimeoutExpired: ASCIIColors.error(f"Failed to kill llama.cpp server process (port {self.port}).")
+except Exception as e: ASCIIColors.error(f"Error during server shutdown (port {self.port}): {e}")
 finally:
 self.process = None
-if self._stderr_thread and self._stderr_thread.is_alive():
-
-ASCIIColors.info("Llama.cpp server stopped.")
+if self._stderr_thread and self._stderr_thread.is_alive(): self._stderr_thread.join(timeout=1)
+ASCIIColors.info(f"Llama.cpp server on port {self.port} shut down.")

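For illustration only (not part of the diff): with this change the process wrapper no longer starts itself from __init__; the owning binding picks a free port and drives the lifecycle explicitly, roughly like this (path and port are hypothetical):

    # Editor's sketch; LlamaCppServerBinding.load_model() below does this with registry bookkeeping.
    proc = LlamaCppServerProcess(model_path="/models/MyModel.gguf",
                                 server_args={"n_ctx": 4096})
    proc.start(port_to_use=9641)      # builds cmd, spawns the server, waits on /health
    # ... send requests via proc.session to proc.base_url ...
    proc.shutdown()                   # terminate (or kill) the subprocess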
 class LlamaCppServerBinding(LollmsLLMBinding):
-"""
-Binding for llama.cpp server using pre-compiled binaries.
-Manages a local llama.cpp server subprocess and communicates via HTTP.
-"""
-# Default parameters for the llama.cpp server
 DEFAULT_SERVER_ARGS = {
-"n_gpu_layers": 0,
-"
-"
-"embedding": False, # Enable if embeddings are needed via /embedding or /v1/embeddings
-"verbose": False,
-"server_startup_timeout": 120, # seconds
-# "chat_format": "chatml", # Deprecated in favor of --chat-template, but some old servers might need it
-# For LLaVA
-# "clip_model_path": None,
-# "chat_template": "llava-1.5" # if server supports it. Or specific prompt structure.
+"n_gpu_layers": 0, "n_ctx": 128000, "n_batch": 512,
+"embedding": False, "verbose": False, "server_startup_timeout": 120,
+"parallel_slots": 4, # Default parallel slots for server
 }

-def __init__(self,
-
-models_path: str,
-clip_model_name: str = None,
-config: Optional[Dict[str, Any]] = None, # Binding specific config from global_config.yaml
-default_completion_format: ELF_COMPLETION_FORMAT = ELF_COMPLETION_FORMAT.Chat,
-**kwargs # Overrides for server_args
-):
-
+def __init__(self, model_name: str, models_path: str, clip_model_name: Optional[str] = None,
+config: Optional[Dict[str, Any]] = None, default_completion_format: ELF_COMPLETION_FORMAT = ELF_COMPLETION_FORMAT.Chat, **kwargs):
 super().__init__(binding_name=BindingName)
-
-if llama_cpp_binaries is None:
-raise ImportError("llama-cpp-binaries package is required but not found.")
+if llama_cpp_binaries is None: raise ImportError("llama-cpp-binaries package is required but not found.")

 self.models_path = Path(models_path)
-self.
-self.model_path = self.models_path/self.model_name
-self.clip_model_path = self.models_path/clip_model_name if clip_model_name else None
-self.default_completion_format = default_completion_format
+self.user_provided_model_name = model_name # Store the name/path user gave

-
-self.
-
+# Initial hint for clip_model_path, resolved fully in load_model
+self.clip_model_path: Optional[Path] = None
+if clip_model_name:
+p_clip = Path(clip_model_name)
+if p_clip.is_absolute() and p_clip.exists():
+self.clip_model_path = p_clip
+elif (self.models_path / clip_model_name).exists(): # Relative to models_path
+self.clip_model_path = self.models_path / clip_model_name
+else:
+ASCIIColors.warning(f"Specified clip_model_name '{clip_model_name}' not found. Will rely on auto-detection if applicable.")
+
+self.default_completion_format = default_completion_format
+self.server_args = {**self.DEFAULT_SERVER_ARGS, **(config or {}), **kwargs}
 self.server_binary_path = self._get_server_binary_path()
-
+
+self.current_model_path: Optional[Path] = None # Actual resolved path of loaded model
 self.server_process: Optional[LlamaCppServerProcess] = None
 self.port: Optional[int] = None
+self.server_key: Optional[tuple] = None

-
-
+if not self.load_model(self.user_provided_model_name):
+ASCIIColors.error(f"Initial model load for '{self.user_provided_model_name}' failed. Binding may not be functional.")

 def _get_server_binary_path(self) -> Path:
-
-
-
-if
-custom_path
-
-
-
-else:
-ASCIIColors.warning(f"Custom llama.cpp server binary path '{custom_path_str}' not found or not a file. Falling back.")
-
-# Default to using llama_cpp_binaries
-bin_path_str = llama_cpp_binaries.get_binary_path() # specify "server"
+custom_path_str = self.server_args.get("llama_server_binary_path")
+if custom_path_str:
+custom_path = Path(custom_path_str)
+if custom_path.exists() and custom_path.is_file():
+ASCIIColors.info(f"Using custom llama.cpp server binary: {custom_path}"); return custom_path
+else: ASCIIColors.warning(f"Custom binary '{custom_path_str}' not found. Falling back.")
+if llama_cpp_binaries:
+bin_path_str = llama_cpp_binaries.get_binary_path()
 if bin_path_str:
 bin_path = Path(bin_path_str)
 if bin_path.exists() and bin_path.is_file():
-ASCIIColors.info(f"Using
-
-
-
-
-
-
-
-# As a last resort, try a common name in system PATH or a known location if Lollms ships one
-# For now, rely on llama-cpp-binaries or explicit config.
-raise FileNotFoundError(
-"Llama.cpp server binary not found. Ensure 'llama-cpp-binaries' is installed "
-"or provide 'llama_server_binary_path' in the binding's configuration."
-) from e
-
-def _resolve_model_path(self, model_path: str) -> Path:
-# Search order:
-# 1. Absolute path
-# 2. Relative to binding-specific models path (e.g., personal_models_path/LlamaCppServerBinding/)
-# 3. Relative to personal_models_path
-# 4. Relative to models_zoo_path
+ASCIIColors.info(f"Using binary from llama-cpp-binaries: {bin_path}"); return bin_path
+raise FileNotFoundError("Llama.cpp server binary not found. Ensure 'llama-cpp-binaries' or 'llama-cpp-python[server]' is installed or provide 'llama_server_binary_path'.")
+
+def _resolve_model_path(self, model_name_or_path: str) -> Path:
+model_p = Path(model_name_or_path)
+if model_p.is_absolute():
+if model_p.exists(): return model_p
+else: raise FileNotFoundError(f"Absolute model path specified but not found: {model_p}")

-
-if
-return
-
-paths_to_check = []
-binding_specific_folder_name = self.binding_name # "LlamaCppServerBinding"
-paths_to_check.append(self.models_path)
-
-for p in paths_to_check:
-if p.exists() and p.is_file():
-ASCIIColors.info(f"Found model at: {p}")
-return p
+path_in_models_dir = self.models_path / model_name_or_path
+if path_in_models_dir.exists() and path_in_models_dir.is_file():
+ASCIIColors.info(f"Found model at: {path_in_models_dir}"); return path_in_models_dir

-raise FileNotFoundError(f"Model '{
+raise FileNotFoundError(f"Model '{model_name_or_path}' not found as absolute path or within '{self.models_path}'.")

 def _find_available_port(self) -> int:
 with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
-s.bind(('', 0))
-
-
-
-
+s.bind(('', 0)); return s.getsockname()[1]
+
+def _release_server_instance(self):
+if self.server_process and self.server_key:
+with _server_registry_lock:
+if self.server_key in _server_ref_counts:
+_server_ref_counts[self.server_key] -= 1
+ASCIIColors.info(f"Decremented ref count for server {self.server_key}. New count: {_server_ref_counts[self.server_key]}")
+if _server_ref_counts[self.server_key] <= 0:
+ASCIIColors.info(f"Ref count for server {self.server_key} is zero. Shutting it down.")
+server_to_stop = _active_servers.pop(self.server_key, None)
+_server_ref_counts.pop(self.server_key, None)
+if server_to_stop:
+try: server_to_stop.shutdown()
+except Exception as e: ASCIIColors.error(f"Error shutting down server {self.server_key}: {e}")
+# else: ASCIIColors.warning(f"Attempted to stop server {self.server_key} but it was not in _active_servers.") # Can be noisy
+else:
+ASCIIColors.warning(f"Server key {self.server_key} not in ref counts during release. Might have been shut down already.")
+_active_servers.pop(self.server_key, None) # Ensure removal

-
-
-
+self.server_process = None
+self.port = None
+self.server_key = None

-if self.server_process:
-self.unload_model() # Stop existing server

-
-
-self.port = self._find_available_port()
+def load_model(self, model_name_or_path: str) -> bool:
+resolved_model_path = self._resolve_model_path(model_name_or_path)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
+# Determine the clip_model_path for this server instance
+# Priority: 1. Explicit `clip_model_path` from init (if exists) 2. Auto-detection
+final_clip_model_path: Optional[Path] = None
+if self.clip_model_path and self.clip_model_path.exists(): # From __init__
+final_clip_model_path = self.clip_model_path
+ASCIIColors.info(f"Using explicitly configured LLaVA clip model: {final_clip_model_path}")
+elif not self.clip_model_path or (self.clip_model_path and not self.clip_model_path.exists()): # if init path was bad or not given
+if self.clip_model_path and not self.clip_model_path.exists():
+ASCIIColors.warning(f"Initial clip model path '{self.clip_model_path}' not found. Attempting auto-detection.")
+base_name = get_gguf_model_base_name(resolved_model_path.stem)
+potential_paths = [
+resolved_model_path.parent / f"{base_name}.mmproj",
+resolved_model_path.parent / f"mmproj-{base_name}.gguf",
+resolved_model_path.with_suffix(".mmproj"),
+self.models_path / f"{base_name}.mmproj", # Check in general models dir too
+self.models_path / f"mmproj-{base_name}.gguf",
 ]
-
-for p_clip in potential_clip_paths:
+for p_clip in potential_paths:
 if p_clip.exists():
-
-ASCIIColors.info(f"Auto-detected LLaVA clip model: {
+final_clip_model_path = p_clip
+ASCIIColors.info(f"Auto-detected LLaVA clip model: {final_clip_model_path}")
 break
-
-
-
-
-
-
-
-
+
+final_clip_model_path_str = str(final_clip_model_path) if final_clip_model_path else None
+
+# Server key based on model and essential server configurations (like clip model)
+# More server_args could be added to the key if they necessitate separate server instances
+# For example, different n_gpu_layers might require a server restart.
+# For now, model and clip model are the main differentiators for distinct servers.
+new_server_key = (str(resolved_model_path), final_clip_model_path_str)
+
+with _server_registry_lock:
+# If this binding instance is already using the exact same server, do nothing
+if self.server_process and self.server_key == new_server_key and self.server_process.is_healthy:
+ASCIIColors.info(f"Model '{model_name_or_path}' with clip '{final_clip_model_path_str}' is already loaded and server is healthy on port {self.port}. No change.")
+return True
+
+# If this binding was using a *different* server, release it first
+if self.server_process and self.server_key != new_server_key:
+ASCIIColors.info(f"Switching models. Releasing previous server: {self.server_key}")
+self._release_server_instance() # This clears self.server_process, self.port, self.server_key
+
+# Check if a suitable server already exists in the global registry
+if new_server_key in _active_servers:
+existing_server = _active_servers[new_server_key]
+if existing_server.is_healthy:
+ASCIIColors.info(f"Reusing existing healthy server for {new_server_key} on port {existing_server.port}.")
+self.server_process = existing_server
+self.port = existing_server.port
+_server_ref_counts[new_server_key] += 1
+self.current_model_path = resolved_model_path
+self.clip_model_path = final_clip_model_path # Update binding's clip path
+self.server_key = new_server_key
+return True
+else: # Found existing but unhealthy server
+ASCIIColors.warning(f"Found unhealthy server for {new_server_key}. Attempting to remove and restart.")
+try: existing_server.shutdown()
+except Exception as e: ASCIIColors.error(f"Error shutting down unhealthy server {new_server_key}: {e}")
+_active_servers.pop(new_server_key, None)
+_server_ref_counts.pop(new_server_key, None)
+
+# No suitable server found or existing was unhealthy: start a new one
+ASCIIColors.info(f"Starting new server for {new_server_key}.")
+self.current_model_path = resolved_model_path
+self.clip_model_path = final_clip_model_path # Update binding's clip path for the new server
+self.server_key = new_server_key # Set before potential failure to allow cleanup by _release_server_instance

+new_port_for_server = self._find_available_port()
+
+current_server_args_for_new_server = self.server_args.copy()
+# Ensure parallel_slots is set; it's crucial for shared servers
+if "parallel_slots" not in current_server_args_for_new_server or not isinstance(current_server_args_for_new_server["parallel_slots"], int) or current_server_args_for_new_server["parallel_slots"] <=0:
+current_server_args_for_new_server["parallel_slots"] = self.DEFAULT_SERVER_ARGS["parallel_slots"]
+
+ASCIIColors.info(f"New Llama.cpp server: model={self.current_model_path}, clip={self.clip_model_path}, port={new_port_for_server}, slots={current_server_args_for_new_server['parallel_slots']}")
+
+try:
+new_server = LlamaCppServerProcess(
+model_path=str(self.current_model_path),
+clip_model_path=str(self.clip_model_path) if self.clip_model_path else None,
+server_binary_path=str(self.server_binary_path),
+server_args=current_server_args_for_new_server,
+)
+new_server.start(port_to_use=new_port_for_server) # Actual server start
+
+if new_server.is_healthy:
+self.server_process = new_server
+self.port = new_port_for_server
+_active_servers[self.server_key] = new_server
+_server_ref_counts[self.server_key] = 1
+ASCIIColors.green(f"New server {self.server_key} started on port {self.port}.")
+return True
+else: # Should have been caught by new_server.start() raising an error
+ASCIIColors.error(f"New server {self.server_key} failed to become healthy (this state should be rare).")
+self._release_server_instance() # Clean up registry if something went very wrong
+return False
+except Exception as e:
+ASCIIColors.error(f"Failed to load model '{model_name_or_path}' and start server: {e}")
+trace_exception(e)
+self._release_server_instance() # Ensure cleanup if start failed
+return False

-try:
-self.server_process = LlamaCppServerProcess(
-model_path=str(self.current_model_path),
-clip_model_path = str(self.clip_model_path),
-server_binary_path=str(self.server_binary_path),
-port=self.port,
-server_args=current_server_args,
-)
-return self.server_process.is_healthy
-except Exception as e:
-ASCIIColors.error(f"Failed to load model '{model_name}' and start server: {e}")
-trace_exception(e)
-self.server_process = None
-self.current_model_path = None
-return False

 def unload_model(self):
 if self.server_process:
-self.
-self.
+ASCIIColors.info(f"Unloading model for binding. Current server: {self.server_key}, port: {self.port}")
+self._release_server_instance() # Handles ref counting and actual shutdown if needed
+else:
+ASCIIColors.info("Unload_model called, but no server process was active for this binding instance.")
 self.current_model_path = None
-self.
-
-
+self.clip_model_path = None # Also clear the instance's clip path idea
+# self.port and self.server_key are cleared by _release_server_instance
+
 def _get_request_url(self, endpoint: str) -> str:
 if not self.server_process or not self.server_process.is_healthy:
 raise ConnectionError("Llama.cpp server is not running or not healthy.")
 return f"{self.server_process.base_url}{endpoint}"

-def _prepare_generation_payload(self,
-
-
-
-
-top_k: int = 40,
-top_p: float = 0.9,
-repeat_penalty: float = 1.1,
-repeat_last_n: Optional[int] = 64, # Server calls this repeat_last_n or penalty_last_n
-seed: Optional[int] = None,
-stream: bool = False,
-use_chat_format: bool = True, # True for /v1/chat/completions, False for /completion
-images: Optional[List[str]] = None,
-**extra_params # For things like grammar, mirostat, etc from server_args
-) -> Dict:
-
-# Start with defaults from server_args, then override with call params
+def _prepare_generation_payload(self, prompt: str, system_prompt: str = "", n_predict: Optional[int] = None,
+temperature: float = 0.7, top_k: int = 40, top_p: float = 0.9,
+repeat_penalty: float = 1.1, repeat_last_n: Optional[int] = 64,
+seed: Optional[int] = None, stream: bool = False, use_chat_format: bool = True,
+images: Optional[List[str]] = None, **extra_params) -> Dict:
 payload_params = {
-"temperature": self.server_args.get("temperature", 0.7),
-"
-"
-"
-"repeat_last_n": self.server_args.get("repeat_last_n", 64),
-"mirostat": self.server_args.get("mirostat_mode", 0), # llama.cpp server uses mirostat (0=disabled, 1=v1, 2=v2)
-"mirostat_tau": self.server_args.get("mirostat_tau", 5.0),
-"mirostat_eta": self.server_args.get("mirostat_eta", 0.1),
-# Add other mappable params from self.server_args like min_p, typical_p, grammar etc.
+"temperature": self.server_args.get("temperature", 0.7), "top_k": self.server_args.get("top_k", 40),
+"top_p": self.server_args.get("top_p", 0.9), "repeat_penalty": self.server_args.get("repeat_penalty", 1.1),
+"repeat_last_n": self.server_args.get("repeat_last_n", 64), "mirostat": self.server_args.get("mirostat_mode", 0),
+"mirostat_tau": self.server_args.get("mirostat_tau", 5.0), "mirostat_eta": self.server_args.get("mirostat_eta", 0.1),
 }
-if "grammar_string" in self.server_args and self.server_args["grammar_string"]:
+if "grammar_string" in self.server_args and self.server_args["grammar_string"]:
 payload_params["grammar"] = self.server_args["grammar_string"]

-
-payload_params
-"temperature": temperature, "top_k": top_k, "top_p": top_p,
-"repeat_penalty": repeat_penalty, "repeat_last_n": repeat_last_n,
-})
-if n_predict is not None: payload_params['n_predict'] = n_predict # Server uses n_predict
+payload_params.update({"temperature": temperature, "top_k": top_k, "top_p": top_p, "repeat_penalty": repeat_penalty, "repeat_last_n": repeat_last_n})
+if n_predict is not None: payload_params['n_predict'] = n_predict
 if seed is not None: payload_params['seed'] = seed
-
-# Filter None values, as server might not like them
 payload_params = {k: v for k, v in payload_params.items() if v is not None}
-payload_params.update(extra_params)
+payload_params.update(extra_params)

 if use_chat_format and self.default_completion_format == ELF_COMPLETION_FORMAT.Chat:
-# Use /v1/chat/completions format
 messages = []
-if system_prompt and system_prompt.strip():
-messages.append({"role": "system", "content": system_prompt})
-
+if system_prompt and system_prompt.strip(): messages.append({"role": "system", "content": system_prompt})
 user_content: Union[str, List[Dict[str, Any]]] = prompt
-if images and self.clip_model_path: #
+if images and self.clip_model_path: # Use the binding's current clip_model_path
 image_parts = []
 for img_path in images:
 try:
-with open(img_path, "rb") as image_file:
-
-
-
-# Llama.cpp server expects image data directly for LLaVA with /completion
-# For /v1/chat/completions, it expects OpenAI's format for multimodal
-image_parts.append({
-"type": "image_url",
-"image_url": {"url": f"data:image/{image_type};base64,{encoded_string}"}
-})
-except Exception as ex:
-trace_exception(ex)
+with open(img_path, "rb") as image_file: encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
+image_type = Path(img_path).suffix[1:].lower() or "png"; image_type = "jpeg" if image_type == "jpg" else image_type
+image_parts.append({"type": "image_url", "image_url": {"url": f"data:image/{image_type};base64,{encoded_string}"}})
+except Exception as ex: trace_exception(ex)
 user_content = [{"type": "text", "text": prompt}] + image_parts # type: ignore
-
 messages.append({"role": "user", "content": user_content})
-
 final_payload = {"messages": messages, "stream": stream, **payload_params}
-
-if 'n_predict' in final_payload:
-final_payload['max_tokens'] = final_payload.pop('n_predict')
-
+if 'n_predict' in final_payload: final_payload['max_tokens'] = final_payload.pop('n_predict')
 return final_payload
 else:
-
-# For LLaVA with /completion, images are typically passed in a special way in the prompt
-# or via an 'image_data' field if the server supports it.
-# The example class uses tokenized prompt for /completion.
-# For simplicity here, we'll send text prompt, server tokenizes.
-# Llama.cpp server's /completion often expects 'prompt' as string or tokens.
-# If images are involved with /completion, it needs specific handling.
-# Example: 'prompt': "USER: <image>\nWhat is this?\nASSISTANT:", 'image_data': [{'data': base64_image, 'id': 10}]
-
-full_prompt = prompt
-if system_prompt and system_prompt.strip():
-# Heuristic for instruct models, actual formatting depends on model/template
-full_prompt = f"{system_prompt}\n\nUSER: {prompt}\nASSISTANT:"
-
+full_prompt = f"{system_prompt}\n\nUSER: {prompt}\nASSISTANT:" if system_prompt and system_prompt.strip() else prompt
 final_payload = {"prompt": full_prompt, "stream": stream, **payload_params}
-
-if images and self.server_args.get("clip_model_path"):
+if images and self.clip_model_path: # Use binding's clip_model_path
 image_data_list = []
 for i, img_path in enumerate(images):
 try:
-with open(img_path, "rb") as image_file:
-
-
-
-ASCIIColors.error(f"Could not encode image {img_path} for /completion: {e_img}")
-if image_data_list:
-final_payload["image_data"] = image_data_list
-# The prompt needs to contain placeholder like USER: <image 1>\n<prompt>\nASSISTANT:
-# This part is tricky and model-dependent. For now, we assume user's prompt is already formatted.
-# Or, the server (if new enough) might handle it with chat_template even for /completion.
-
+with open(img_path, "rb") as image_file: encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
+image_data_list.append({"data": encoded_string, "id": i + 10})
+except Exception as e_img: ASCIIColors.error(f"Could not encode image {img_path}: {e_img}")
+if image_data_list: final_payload["image_data"] = image_data_list
 return final_payload

-
-
-
-
-system_prompt: str = "",
-n_predict: Optional[int] = None,
-stream: bool = False,
-temperature: float = None, # Use binding's default if None
-top_k: int = None,
-top_p: float = None,
-repeat_penalty: float = None,
-repeat_last_n: Optional[int] = None,
-seed: Optional[int] = None,
+def generate_text(self, prompt: str, images: Optional[List[str]] = None, system_prompt: str = "",
+n_predict: Optional[int] = None, stream: bool = False, temperature: float = None,
+top_k: int = None, top_p: float = None, repeat_penalty: float = None,
+repeat_last_n: Optional[int] = None, seed: Optional[int] = None,
 streaming_callback: Optional[Callable[[str, int], bool]] = None,
-use_chat_format_override: Optional[bool] = None,
-**generation_kwargs
-) -> Union[str, Dict[str, any]]:
-
+use_chat_format_override: Optional[bool] = None, **generation_kwargs) -> Union[str, Dict[str, any]]:
 if not self.server_process or not self.server_process.is_healthy:
 return {"status": False, "error": "Llama.cpp server is not running or not healthy."}

-_use_chat_format = use_chat_format_override if use_chat_format_override is not None
-else (self.default_completion_format == ELF_COMPLETION_FORMAT.Chat)
-
+_use_chat_format = use_chat_format_override if use_chat_format_override is not None else (self.default_completion_format == ELF_COMPLETION_FORMAT.Chat)
 payload = self._prepare_generation_payload(
 prompt=prompt, system_prompt=system_prompt, n_predict=n_predict,
 temperature=temperature if temperature is not None else self.server_args.get("temperature",0.7),

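For illustration only (not part of the diff): for the chat path, _prepare_generation_payload from the previous hunk produces an OpenAI-style /v1/chat/completions body, with n_predict renamed to max_tokens. A hypothetical result, assuming the defaults shown above:

    # Editor's illustration; prompt text and n_predict=128 are hypothetical.
    payload = {
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Hello!"},
        ],
        "stream": False,
        "temperature": 0.7, "top_k": 40, "top_p": 0.9,
        "repeat_penalty": 1.1, "repeat_last_n": 64,
        "mirostat": 0, "mirostat_tau": 5.0, "mirostat_eta": 0.1,
        "max_tokens": 128,   # renamed from n_predict for /v1/chat/completions
    }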
@@ -683,359 +538,331 @@ class LlamaCppServerBinding(LollmsLLMBinding):
|
|
|
683
538
|
top_p=top_p if top_p is not None else self.server_args.get("top_p",0.9),
|
|
684
539
|
repeat_penalty=repeat_penalty if repeat_penalty is not None else self.server_args.get("repeat_penalty",1.1),
|
|
685
540
|
repeat_last_n=repeat_last_n if repeat_last_n is not None else self.server_args.get("repeat_last_n",64),
|
|
686
|
-
seed=seed if seed is not None else self.server_args.get("seed", -1),
|
|
687
|
-
|
|
688
|
-
**generation_kwargs
|
|
541
|
+
seed=seed if seed is not None else self.server_args.get("seed", -1), stream=stream,
|
|
542
|
+
use_chat_format=_use_chat_format, images=images, **generation_kwargs
|
|
689
543
|
)
|
|
690
|
-
|
|
691
544
|
endpoint = "/v1/chat/completions" if _use_chat_format else "/completion"
|
|
692
545
|
request_url = self._get_request_url(endpoint)
|
|
693
546
|
|
|
694
|
-
#
|
|
695
|
-
debug_payload = {k:v for k,v in payload.items() if k not in ["image_data"]}
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
ASCIIColors.debug(f"Request to {request_url} with payload: {json.dumps(debug_payload, indent=2)[:500]}...")
|
|
547
|
+
# Debug payload (simplified)
|
|
548
|
+
# debug_payload = {k:v for k,v in payload.items() if k not in ["image_data","messages"] or (k=="messages" and not any("image_url" in part for item in v for part in (item.get("content") if isinstance(item.get("content"),list) else [])))} # Complex filter for brevity
|
|
549
|
+
# ASCIIColors.debug(f"Request to {request_url} with payload (simplified): {json.dumps(debug_payload, indent=2)[:500]}...")
|
|
550
|
+
|
|
699
551
|
|
|
700
552
|
full_response_text = ""
|
|
701
553
|
try:
|
|
702
554
|
response = self.server_process.session.post(request_url, json=payload, stream=stream, timeout=self.server_args.get("generation_timeout", 300))
|
|
703
555
|
response.raise_for_status()
|
|
704
|
-
|
|
705
556
|
if stream:
|
|
706
557
|
for line in response.iter_lines():
|
|
707
558
|
if not line: continue
|
|
708
559
|
line_str = line.decode('utf-8').strip()
|
|
709
560
|
if line_str.startswith('data: '): line_str = line_str[6:]
|
|
710
|
-
if line_str == '[DONE]': break
|
|
711
|
-
|
|
561
|
+
if line_str == '[DONE]': break
|
|
712
562
|
try:
|
|
713
563
|
chunk_data = json.loads(line_str)
|
|
714
|
-
chunk_content =
|
|
715
|
-
|
|
716
|
-
delta = chunk_data.get('choices', [{}])[0].get('delta', {})
|
|
717
|
-
chunk_content = delta.get('content', '')
|
|
718
|
-
else: # /completion format
|
|
719
|
-
chunk_content = chunk_data.get('content', '')
|
|
720
|
-
|
|
564
|
+
chunk_content = (chunk_data.get('choices', [{}])[0].get('delta', {}).get('content', '') if _use_chat_format
|
|
565
|
+
else chunk_data.get('content', ''))
|
|
721
566
|
if chunk_content:
|
|
722
567
|
full_response_text += chunk_content
|
|
723
568
|
if streaming_callback and not streaming_callback(chunk_content, MSG_TYPE.MSG_TYPE_CHUNK):
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
ASCIIColors.info("Streaming callback requested stop.")
|
|
728
|
-
response.close() # Attempt to signal server by closing connection
|
|
729
|
-
break
|
|
730
|
-
if chunk_data.get('stop', False) or chunk_data.get('stopped_eos',False) or chunk_data.get('stopped_limit',False): # /completion specific stop flags
|
|
731
|
-
break
|
|
732
|
-
except json.JSONDecodeError:
|
|
733
|
-
ASCIIColors.warning(f"Failed to decode JSON stream chunk: {line_str}")
|
|
734
|
-
continue # Or handle error
|
|
569
|
+
ASCIIColors.info("Streaming callback requested stop."); response.close(); break
|
|
570
|
+
if chunk_data.get('stop', False) or chunk_data.get('stopped_eos',False) or chunk_data.get('stopped_limit',False): break
|
|
571
|
+
except json.JSONDecodeError: ASCIIColors.warning(f"Failed to decode JSON stream chunk: {line_str}"); continue
|
|
735
572
|
return full_response_text
|
|
736
|
-
else:
|
|
573
|
+
else:
|
|
737
574
|
response_data = response.json()
|
|
738
|
-
return response_data.get('choices', [{}])[0].get('message', {}).get('content', '')
|
|
739
|
-
|
|
575
|
+
return response_data.get('choices', [{}])[0].get('message', {}).get('content', '') if _use_chat_format \
|
|
576
|
+
else response_data.get('content','') # /completion has 'content' at top level for non-stream
|
|
740
577
|
except requests.exceptions.RequestException as e:
|
|
741
578
|
error_message = f"Llama.cpp server request error: {e}"
|
|
742
579
|
if e.response is not None:
|
|
743
|
-
try:
|
|
744
|
-
|
|
745
|
-
error_message += f" - Details: {error_details.get('error', e.response.text)}"
|
|
746
|
-
except json.JSONDecodeError:
|
|
747
|
-
error_message += f" - Response: {e.response.text[:200]}"
|
|
580
|
+
try: error_details = e.response.json(); error_message += f" - Details: {error_details.get('error', e.response.text)}"
|
|
581
|
+
except json.JSONDecodeError: error_message += f" - Response: {e.response.text[:200]}"
|
|
748
582
|
ASCIIColors.error(error_message)
|
|
749
583
|
return {"status": False, "error": error_message, "details": str(e.response.text if e.response else "No response text")}
|
|
750
584
|
except Exception as ex:
|
|
751
|
-
error_message = f"Llama.cpp generation error: {str(ex)}"
|
|
752
|
-
trace_exception(ex)
|
|
585
|
+
error_message = f"Llama.cpp generation error: {str(ex)}"; trace_exception(ex)
|
|
753
586
|
return {"status": False, "error": error_message}
|
|
754
587
|
|
|
755
588
|
def tokenize(self, text: str) -> List[int]:
|
|
756
|
-
if not self.server_process or not self.server_process.is_healthy:
|
|
757
|
-
raise ConnectionError("Llama.cpp server is not running.")
|
|
589
|
+
if not self.server_process or not self.server_process.is_healthy: raise ConnectionError("Server not running.")
|
|
758
590
|
try:
|
|
759
591
|
response = self.server_process.session.post(self._get_request_url("/tokenize"), json={"content": text})
|
|
760
|
-
response.raise_for_status()
|
|
761
|
-
|
|
762
|
-
except Exception as e:
|
|
763
|
-
ASCIIColors.error(f"Tokenization error: {e}"); trace_exception(e)
|
|
764
|
-
return [] # Or raise
|
|
592
|
+
response.raise_for_status(); return response.json().get("tokens", [])
|
|
593
|
+
except Exception as e: ASCIIColors.error(f"Tokenization error: {e}"); trace_exception(e); return []
|
|
765
594
|
|
|
766
595
|
def detokenize(self, tokens: List[int]) -> str:
|
|
767
|
-
if not self.server_process or not self.server_process.is_healthy:
|
|
768
|
-
raise ConnectionError("Llama.cpp server is not running.")
|
|
596
|
+
if not self.server_process or not self.server_process.is_healthy: raise ConnectionError("Server not running.")
|
|
769
597
|
try:
|
|
770
598
|
response = self.server_process.session.post(self._get_request_url("/detokenize"), json={"tokens": tokens})
|
|
771
|
-
response.raise_for_status()
|
|
772
|
-
|
|
773
|
-
except Exception as e:
|
|
774
|
-
ASCIIColors.error(f"Detokenization error: {e}"); trace_exception(e)
|
|
775
|
-
return "" # Or raise
|
|
599
|
+
response.raise_for_status(); return response.json().get("content", "")
|
|
600
|
+
except Exception as e: ASCIIColors.error(f"Detokenization error: {e}"); trace_exception(e); return ""
|
|
776
601
|
|
|
777
|
-
def count_tokens(self, text: str) -> int:
|
|
778
|
-
return len(self.tokenize(text))
|
|
602
|
+
def count_tokens(self, text: str) -> int: return len(self.tokenize(text))
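The tokenize/detokenize/count_tokens trio above is a thin wrapper over two server endpoints. A self-contained sketch of the same round trip with plain requests, assuming a llama.cpp server already listening on a hypothetical http://localhost:8080 (the binding resolves the real port at runtime):

import requests

BASE = "http://localhost:8080"  # hypothetical address

def tokenize(text: str) -> list[int]:
    # /tokenize answers {"tokens": [...]}
    r = requests.post(f"{BASE}/tokenize", json={"content": text}, timeout=30)
    r.raise_for_status()
    return r.json().get("tokens", [])

def detokenize(tokens: list[int]) -> str:
    # /detokenize answers {"content": "..."}
    r = requests.post(f"{BASE}/detokenize", json={"tokens": tokens}, timeout=30)
    r.raise_for_status()
    return r.json().get("content", "")

def count_tokens(text: str) -> int:
    return len(tokenize(text))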
|
|
779
603
|
|
|
780
604
|
def embed(self, text: str, **kwargs) -> List[float]:
|
|
781
|
-
if not self.server_process or not self.server_process.is_healthy:
|
|
782
|
-
|
|
783
|
-
if not self.server_args.get("embedding"):
|
|
784
|
-
raise Exception("Embedding support was not enabled in server_args (set 'embedding: true').")
|
|
785
|
-
|
|
605
|
+
if not self.server_process or not self.server_process.is_healthy: raise Exception("Server not running.")
|
|
606
|
+
if not self.server_args.get("embedding"): raise Exception("Embedding not enabled in server_args.")
|
|
786
607
|
try:
|
|
787
|
-
|
|
788
|
-
# Let's try /v1/embeddings first for compatibility
|
|
789
|
-
payload = {"input": text}
|
|
790
|
-
if "model" in kwargs: payload["model"] = kwargs["model"] # Can specify model if server handles multiple embedding models (unlikely for llama.cpp server)
|
|
791
|
-
|
|
792
|
-
request_url = self._get_request_url("/v1/embeddings")
|
|
608
|
+
payload = {"input": text}; request_url = self._get_request_url("/v1/embeddings")
|
|
793
609
|
response = self.server_process.session.post(request_url, json=payload)
|
|
794
|
-
|
|
795
|
-
if response.status_code == 404: # Fallback to /embedding if /v1/embeddings not found
|
|
796
|
-
ASCIIColors.debug("Trying /embedding endpoint as /v1/embeddings was not found.")
|
|
610
|
+
if response.status_code == 404: # Fallback
|
|
797
611
|
request_url = self._get_request_url("/embedding")
|
|
798
|
-
response = self.server_process.session.post(request_url, json={"content": text})
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
data
|
|
802
|
-
|
|
803
|
-
if "data" in data and isinstance(data["data"], list) and "embedding" in data["data"][0]: # /v1/embeddings format
|
|
804
|
-
return data["data"][0]["embedding"]
|
|
805
|
-
elif "embedding" in data and isinstance(data["embedding"], list): # /embedding format
|
|
806
|
-
return data["embedding"]
|
|
807
|
-
else:
|
|
808
|
-
raise ValueError(f"Unexpected embedding response format: {data}")
|
|
809
|
-
|
|
612
|
+
response = self.server_process.session.post(request_url, json={"content": text})
|
|
613
|
+
response.raise_for_status(); data = response.json()
|
|
614
|
+
if "data" in data and isinstance(data["data"], list) and "embedding" in data["data"][0]: return data["data"][0]["embedding"]
|
|
615
|
+
elif "embedding" in data and isinstance(data["embedding"], list): return data["embedding"]
|
|
616
|
+
else: raise ValueError(f"Unexpected embedding response: {data}")
|
|
810
617
|
except requests.exceptions.RequestException as e:
|
|
811
|
-
err_msg = f"
|
|
618
|
+
err_msg = f"Embedding request error: {e}";
|
|
812
619
|
if e.response: err_msg += f" - {e.response.text[:200]}"
|
|
813
620
|
raise Exception(err_msg) from e
|
|
814
|
-
except Exception as ex:
|
|
815
|
-
trace_exception(ex); raise Exception(f"Llama.cpp embedding failed: {str(ex)}") from ex
|
|
621
|
+
except Exception as ex: trace_exception(ex); raise Exception(f"Embedding failed: {str(ex)}") from ex
|
|
816
622
|
|
|
817
623
|
def get_model_info(self) -> dict:
|
|
818
624
|
info = {
|
|
819
625
|
"name": self.binding_name,
|
|
820
|
-
"
|
|
626
|
+
"user_provided_model_name": self.user_provided_model_name,
|
|
821
627
|
"model_path": str(self.current_model_path) if self.current_model_path else "Not loaded",
|
|
628
|
+
"clip_model_path": str(self.clip_model_path) if self.clip_model_path else "N/A",
|
|
822
629
|
"loaded": self.server_process is not None and self.server_process.is_healthy,
|
|
823
|
-
"server_args": self.server_args,
|
|
824
|
-
"
|
|
630
|
+
"server_args": self.server_args, "port": self.port if self.port else "N/A",
|
|
631
|
+
"server_key": str(self.server_key) if self.server_key else "N/A",
|
|
825
632
|
}
|
|
826
|
-
if info["loaded"]:
|
|
827
|
-
# Try to get more info from server's /props or /v1/models
|
|
633
|
+
if info["loaded"] and self.server_process:
|
|
828
634
|
try:
|
|
829
|
-
|
|
830
|
-
props_resp = self.server_process.session.get(props_url, timeout=5).json()
|
|
635
|
+
props_resp = self.server_process.session.get(self._get_request_url("/props"), timeout=5).json()
|
|
831
636
|
info.update({
|
|
832
|
-
"server_n_ctx": props_resp.get("default_generation_settings",{}).get("n_ctx"),
|
|
637
|
+
"server_n_ctx": props_resp.get("default_generation_settings",{}).get("n_ctx"),
|
|
833
638
|
"server_chat_format": props_resp.get("chat_format"),
|
|
834
|
-
"
|
|
639
|
+
"server_clip_model_from_props": props_resp.get("mmproj"), # Server's view of clip model
|
|
835
640
|
})
|
|
836
|
-
except Exception: pass
|
|
837
|
-
|
|
838
|
-
is_llava = ("llava" in self.model_name.lower() or "bakllava" in self.model_name.lower()) or \
|
|
839
|
-
(self.server_args.get("clip_model_path") is not None) or \
|
|
840
|
-
(info.get("server_clip_model") is not None)
|
|
641
|
+
except Exception: pass
|
|
841
642
|
|
|
643
|
+
is_llava = self.clip_model_path is not None or \
|
|
644
|
+
(info.get("server_clip_model_from_props") is not None) or \
|
|
645
|
+
("llava" in self.current_model_path.name.lower() if self.current_model_path else False)
|
|
842
646
|
info["supports_vision"] = is_llava
|
|
843
647
|
info["supports_structured_output"] = self.server_args.get("grammar_string") is not None
|
|
844
648
|
return info
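get_model_info() enriches its static fields with whatever the running server reports on /props (context size, chat format, and the mmproj/clip model it loaded). A small probe showing just that lookup; the base URL is a hypothetical stand-in for the session and port the binding manages internally:

import requests

def probe_server_props(base_url: str = "http://localhost:8080") -> dict:
    # Same keys get_model_info() pulls from the /props response.
    props = requests.get(f"{base_url}/props", timeout=5).json()
    return {
        "server_n_ctx": props.get("default_generation_settings", {}).get("n_ctx"),
        "server_chat_format": props.get("chat_format"),
        "server_clip_model_from_props": props.get("mmproj"),
    }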
|
|
845
649
|
|
|
846
650
|
def listModels(self) -> List[Dict[str, str]]:
|
|
847
|
-
# This binding manages one GGUF model at a time by starting a server for it.
|
|
848
|
-
# To "list models", we could scan the Lollms model directories for .gguf files.
|
|
849
651
|
models_found = []
|
|
850
|
-
gguf_pattern = "*.gguf"
|
|
851
|
-
|
|
852
|
-
search_paths = []
|
|
853
|
-
binding_specific_folder_name = self.binding_name
|
|
854
|
-
|
|
855
|
-
search_paths.append(self.models_path)
|
|
856
|
-
|
|
857
652
|
unique_models = set()
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
|
|
867
|
-
})
|
|
868
|
-
unique_models.add(model_file.name)
|
|
653
|
+
if self.models_path.exists() and self.models_path.is_dir():
|
|
654
|
+
for model_file in self.models_path.rglob("*.gguf"):
|
|
655
|
+
if model_file.is_file() and model_file.name not in unique_models:
|
|
656
|
+
models_found.append({
|
|
657
|
+
'model_name': model_file.name,
|
|
658
|
+
'path_hint': str(model_file.relative_to(self.models_path.parent) if model_file.is_relative_to(self.models_path.parent) else model_file),
|
|
659
|
+
'size_gb': f"{model_file.stat().st_size / (1024**3):.2f} GB"
|
|
660
|
+
})
|
|
661
|
+
unique_models.add(model_file.name)
|
|
869
662
|
return models_found
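listModels() now simply walks models_path recursively for *.gguf files and de-duplicates by file name. An equivalent standalone sketch (path_hint omitted for brevity):

from pathlib import Path

def list_gguf_models(models_path: Path) -> list[dict]:
    found, seen = [], set()
    if models_path.is_dir():
        for f in models_path.rglob("*.gguf"):
            if f.is_file() and f.name not in seen:
                found.append({
                    "model_name": f.name,
                    "size_gb": f"{f.stat().st_size / (1024**3):.2f} GB",
                })
                seen.add(f.name)
    return found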
|
|
870
663
|
|
|
871
664
|
def __del__(self):
|
|
872
|
-
self.unload_model()
|
|
665
|
+
self.unload_model()
|
|
873
666
|
|
|
874
667
|
|
|
875
668
|
if __name__ == '__main__':
|
|
876
|
-
global full_streamed_text
|
|
669
|
+
global full_streamed_text # Define for the callback
|
|
670
|
+
full_streamed_text = ""
|
|
877
671
|
ASCIIColors.yellow("Testing LlamaCppServerBinding...")
|
|
878
672
|
|
|
879
673
|
# --- Configuration ---
|
|
880
|
-
# This should be the NAME of your GGUF model file.
|
|
881
|
-
#
|
|
882
|
-
#
|
|
883
|
-
#
|
|
884
|
-
|
|
885
|
-
|
|
886
|
-
#
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
674
|
+
# This should be the NAME of your GGUF model file.
|
|
675
|
+
# Ensure this model is placed in your models_path directory.
|
|
676
|
+
# Example: models_path = "E:\\lollms\\models\\gguf" (Windows)
|
|
677
|
+
# model_name = "Mistral-Nemo-Instruct-2407-Q2_K.gguf"
|
|
678
|
+
|
|
679
|
+
# For CI/local testing without specific paths, you might download a tiny model
|
|
680
|
+
# or require user to set environment variables for these.
|
|
681
|
+
# For this example, replace with your actual paths/model.
|
|
682
|
+
try:
|
|
683
|
+
models_path_str = os.environ.get("LOLLMS_MODELS_PATH", str(Path(__file__).parent / "test_models"))
|
|
684
|
+
model_name_str = os.environ.get("LOLLMS_TEST_MODEL_GGUF", "tinyllama-1.1b-chat-v1.0.Q2_K.gguf") # A small model
|
|
685
|
+
llava_model_name_str = os.environ.get("LOLLMS_TEST_LLAVA_MODEL_GGUF", "llava-v1.5-7b.Q2_K.gguf") # Placeholder
|
|
686
|
+
llava_clip_name_str = os.environ.get("LOLLMS_TEST_LLAVA_CLIP", "mmproj-model2-q4_0.gguf") # Placeholder
|
|
687
|
+
|
|
688
|
+
models_path = Path(models_path_str)
|
|
689
|
+
models_path.mkdir(parents=True, exist_ok=True) # Ensure test_models dir exists
|
|
690
|
+
|
|
691
|
+
# Verify model exists, or skip tests gracefully
|
|
692
|
+
test_model_path = models_path / model_name_str
|
|
693
|
+
if not test_model_path.exists():
|
|
694
|
+
ASCIIColors.warning(f"Test model {test_model_path} not found. Please place a GGUF model there or set LOLLMS_TEST_MODEL_GGUF and LOLLMS_MODELS_PATH env vars.")
|
|
695
|
+
ASCIIColors.warning("Some tests will be skipped.")
|
|
696
|
+
# sys.exit(1) # Or allow to continue with skips
|
|
697
|
+
primary_model_available = False
|
|
698
|
+
else:
|
|
699
|
+
primary_model_available = True
|
|
700
|
+
|
|
701
|
+
except Exception as e:
|
|
702
|
+
ASCIIColors.error(f"Error setting up test paths: {e}"); trace_exception(e)
|
|
703
|
+
sys.exit(1)
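The reworked test harness reads its model locations from environment variables instead of hard-coded paths. One way to point it at local files before the module runs; all values below are placeholders:

import os

os.environ["LOLLMS_MODELS_PATH"] = "/path/to/gguf/models"                      # directory scanned for models
os.environ["LOLLMS_TEST_MODEL_GGUF"] = "tinyllama-1.1b-chat-v1.0.Q2_K.gguf"    # small text model
os.environ["LOLLMS_TEST_LLAVA_MODEL_GGUF"] = "llava-v1.5-7b.Q2_K.gguf"         # optional vision model
os.environ["LOLLMS_TEST_LLAVA_CLIP"] = "mmproj-model2-q4_0.gguf"               # optional clip/mmproj file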
|
|
704
|
+
|
|
892
705
|
binding_config = {
|
|
893
|
-
"n_gpu_layers": 0,
|
|
894
|
-
"
|
|
895
|
-
"embedding": True, # Enable for embedding tests
|
|
896
|
-
"verbose": False, # llama.cpp server verbose logs
|
|
897
|
-
# "extra_cli_flags": ["--cont-batching"] # Example of extra flags
|
|
898
|
-
"server_startup_timeout": 180 # Give more time for server to start, esp. with large models
|
|
706
|
+
"n_gpu_layers": 0, "n_ctx": 512, "embedding": True,
|
|
707
|
+
"verbose": False, "server_startup_timeout": 180, "parallel_slots": 2,
|
|
899
708
|
}
|
|
900
709
|
|
|
901
|
-
|
|
710
|
+
active_binding1: Optional[LlamaCppServerBinding] = None
|
|
711
|
+
active_binding2: Optional[LlamaCppServerBinding] = None
|
|
712
|
+
active_binding_llava: Optional[LlamaCppServerBinding] = None
|
|
713
|
+
|
|
902
714
|
try:
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
|
|
907
|
-
|
|
908
|
-
|
|
909
|
-
|
|
910
|
-
|
|
715
|
+
if primary_model_available:
|
|
716
|
+
ASCIIColors.cyan("\n--- Initializing First LlamaCppServerBinding Instance ---")
|
|
717
|
+
active_binding1 = LlamaCppServerBinding(
|
|
718
|
+
model_name=model_name_str, models_path=str(models_path), config=binding_config
|
|
719
|
+
)
|
|
720
|
+
if not active_binding1.server_process or not active_binding1.server_process.is_healthy:
|
|
721
|
+
raise RuntimeError("Server for binding1 failed to start or become healthy.")
|
|
722
|
+
ASCIIColors.green(f"Binding1 initialized. Server for '{active_binding1.current_model_path.name}' running on port {active_binding1.port}.")
|
|
723
|
+
ASCIIColors.info(f"Binding1 Model Info: {json.dumps(active_binding1.get_model_info(), indent=2)}")
|
|
724
|
+
|
|
725
|
+
ASCIIColors.cyan("\n--- Initializing Second LlamaCppServerBinding Instance (Same Model) ---")
|
|
726
|
+
active_binding2 = LlamaCppServerBinding(
|
|
727
|
+
model_name=model_name_str, models_path=str(models_path), config=binding_config # Same model and config
|
|
728
|
+
)
|
|
729
|
+
if not active_binding2.server_process or not active_binding2.server_process.is_healthy:
|
|
730
|
+
raise RuntimeError("Server for binding2 failed to start or become healthy (should reuse).")
|
|
731
|
+
ASCIIColors.green(f"Binding2 initialized. Server for '{active_binding2.current_model_path.name}' running on port {active_binding2.port}.")
|
|
732
|
+
ASCIIColors.info(f"Binding2 Model Info: {json.dumps(active_binding2.get_model_info(), indent=2)}")
|
|
911
733
|
|
|
912
|
-
|
|
913
|
-
|
|
734
|
+
if active_binding1.port != active_binding2.port:
|
|
735
|
+
ASCIIColors.error("ERROR: Bindings for the same model are using different ports! Server sharing failed.")
|
|
736
|
+
else:
|
|
737
|
+
ASCIIColors.green("SUCCESS: Both bindings use the same server port. Server sharing appears to work.")
|
|
738
|
+
|
|
739
|
+
# --- List Models (scans configured directories) ---
|
|
740
|
+
ASCIIColors.cyan("\n--- Listing Models (from search paths, using binding1) ---")
|
|
741
|
+
listed_models = active_binding1.listModels()
|
|
742
|
+
if listed_models: ASCIIColors.green(f"Found {len(listed_models)} GGUF files. First 5: {listed_models[:5]}")
|
|
743
|
+
else: ASCIIColors.warning("No GGUF models found in search paths.")
|
|
744
|
+
|
|
745
|
+
# --- Tokenize/Detokenize ---
|
|
746
|
+
ASCIIColors.cyan("\n--- Tokenize/Detokenize (using binding1) ---")
|
|
747
|
+
sample_text = "Hello, Llama.cpp server world!"
|
|
748
|
+
tokens = active_binding1.tokenize(sample_text)
|
|
749
|
+
ASCIIColors.green(f"Tokens for '{sample_text}': {tokens[:10]}...")
|
|
750
|
+
if tokens:
|
|
751
|
+
detokenized_text = active_binding1.detokenize(tokens)
|
|
752
|
+
ASCIIColors.green(f"Detokenized text: {detokenized_text}")
|
|
753
|
+
else: ASCIIColors.warning("Tokenization returned empty list.")
|
|
754
|
+
|
|
755
|
+
# --- Text Generation (Non-Streaming, Chat API, binding1) ---
|
|
756
|
+
ASCIIColors.cyan("\n--- Text Generation (Non-Streaming, Chat API, binding1) ---")
|
|
757
|
+
prompt_text = "What is the capital of Germany?"
|
|
758
|
+
generated_text = active_binding1.generate_text(prompt_text, system_prompt="Concise expert.", n_predict=20, stream=False, use_chat_format_override=True)
|
|
759
|
+
if isinstance(generated_text, str): ASCIIColors.green(f"Generated text (binding1): {generated_text}")
|
|
760
|
+
else: ASCIIColors.error(f"Generation failed (binding1): {generated_text}")
|
|
761
|
+
|
|
762
|
+
# --- Text Generation (Streaming, Completion API, binding2) ---
|
|
763
|
+
ASCIIColors.cyan("\n--- Text Generation (Streaming, Completion API, binding2) ---")
|
|
764
|
+
full_streamed_text = "" # Reset global
|
|
765
|
+
def stream_callback(chunk: str, msg_type: int): global full_streamed_text; ASCIIColors.green(f"{chunk}", end="", flush=True); full_streamed_text += chunk; return True
|
|
766
|
+
|
|
767
|
+
result_b2 = active_binding2.generate_text(prompt_text, system_prompt="Concise expert.", n_predict=30, stream=True, streaming_callback=stream_callback, use_chat_format_override=False)
|
|
768
|
+
print("\n--- End of Stream (binding2) ---")
|
|
769
|
+
if isinstance(result_b2, str): ASCIIColors.green(f"Full streamed text (binding2): {result_b2}")
|
|
770
|
+
else: ASCIIColors.error(f"Streaming generation failed (binding2): {result_b2}")
|
|
771
|
+
|
|
772
|
+
# --- Embeddings (binding1) ---
|
|
773
|
+
if binding_config.get("embedding"):
|
|
774
|
+
ASCIIColors.cyan("\n--- Embeddings (binding1) ---")
|
|
775
|
+
try:
|
|
776
|
+
embedding_vector = active_binding1.embed("Test embedding.")
|
|
777
|
+
ASCIIColors.green(f"Embedding (first 3 dims): {embedding_vector[:3]}... Dim: {len(embedding_vector)}")
|
|
778
|
+
except Exception as e_emb: ASCIIColors.warning(f"Could not get embedding: {e_emb}")
|
|
779
|
+
else: ASCIIColors.yellow("\n--- Embeddings Skipped (embedding: false) ---")
|
|
780
|
+
|
|
781
|
+
else: # primary_model_available is False
|
|
782
|
+
ASCIIColors.warning("Primary test model not available. Skipping most tests.")
|
|
914
783
|
|
|
915
|
-
|
|
916
|
-
# --- List Models (scans configured directories) ---
|
|
917
|
-
ASCIIColors.cyan("\n--- Listing Models (from search paths) ---")
|
|
918
|
-
listed_models = active_binding.listModels()
|
|
919
|
-
if listed_models:
|
|
920
|
-
ASCIIColors.green(f"Found {len(listed_models)} GGUF files. First 5:")
|
|
921
|
-
for m in listed_models[:5]: print(m)
|
|
922
|
-
else: ASCIIColors.warning("No GGUF models found in search paths.")
|
|
923
|
-
|
|
924
|
-
# --- Tokenize/Detokenize ---
|
|
925
|
-
ASCIIColors.cyan("\n--- Tokenize/Detokenize ---")
|
|
926
|
-
sample_text = "Hello, Llama.cpp server world!"
|
|
927
|
-
tokens = active_binding.tokenize(sample_text)
|
|
928
|
-
ASCIIColors.green(f"Tokens for '{sample_text}': {tokens[:10]}...")
|
|
929
|
-
token_count = active_binding.count_tokens(sample_text)
|
|
930
|
-
ASCIIColors.green(f"Token count: {token_count}")
|
|
931
|
-
if tokens: # Only detokenize if tokenization worked
|
|
932
|
-
detokenized_text = active_binding.detokenize(tokens)
|
|
933
|
-
ASCIIColors.green(f"Detokenized text: {detokenized_text}")
|
|
934
|
-
# Note: exact match might depend on BOS/EOS handling by server's tokenizer
|
|
935
|
-
# assert detokenized_text.strip() == sample_text.strip(), "Tokenization/Detokenization mismatch!"
|
|
936
|
-
else: ASCIIColors.warning("Tokenization returned empty list, skipping detokenization.")
|
|
937
|
-
|
|
938
|
-
# --- Text Generation (Non-Streaming, Chat Format using /v1/chat/completions) ---
|
|
939
|
-
ASCIIColors.cyan("\n--- Text Generation (Non-Streaming, Chat API) ---")
|
|
940
|
-
prompt_text = "What is the capital of Germany?"
|
|
941
|
-
system_prompt_text = "You are a concise geography expert."
|
|
942
|
-
generated_text = active_binding.generate_text(
|
|
943
|
-
prompt_text, system_prompt=system_prompt_text, n_predict=20, stream=False,
|
|
944
|
-
use_chat_format_override=True # Force /v1/chat/completions
|
|
945
|
-
)
|
|
946
|
-
if isinstance(generated_text, str): ASCIIColors.green(f"Generated text: {generated_text}")
|
|
947
|
-
else: ASCIIColors.error(f"Generation failed: {generated_text}")
|
|
948
|
-
|
|
949
|
-
# --- Text Generation (Streaming, /completion API) ---
|
|
950
|
-
ASCIIColors.cyan("\n--- Text Generation (Streaming, Completion API) ---")
|
|
951
|
-
full_streamed_text = ""
|
|
952
|
-
def stream_callback(chunk: str, msg_type: int):
|
|
953
|
-
global full_streamed_text; ASCIIColors.green(f"{chunk}", end="", flush=True)
|
|
954
|
-
full_streamed_text += chunk; return True
|
|
955
|
-
|
|
956
|
-
result = active_binding.generate_text(
|
|
957
|
-
prompt_text, system_prompt=system_prompt_text, n_predict=30, stream=True,
|
|
958
|
-
streaming_callback=stream_callback, use_chat_format_override=False # Force /completion
|
|
959
|
-
)
|
|
960
|
-
print("\n--- End of Stream ---")
|
|
961
|
-
if isinstance(result, str): ASCIIColors.green(f"Full streamed text: {result}")
|
|
962
|
-
else: ASCIIColors.error(f"Streaming generation failed: {result}")
|
|
963
|
-
|
|
964
|
-
# --- Embeddings ---
|
|
965
|
-
if binding_config.get("embedding"):
|
|
966
|
-
ASCIIColors.cyan("\n--- Embeddings ---")
|
|
967
|
-
embedding_text = "Test sentence for server-based embeddings."
|
|
968
|
-
try:
|
|
969
|
-
embedding_vector = active_binding.embed(embedding_text)
|
|
970
|
-
ASCIIColors.green(f"Embedding for '{embedding_text}' (first 3 dims): {embedding_vector[:3]}...")
|
|
971
|
-
ASCIIColors.info(f"Embedding vector dimension: {len(embedding_vector)}")
|
|
972
|
-
except Exception as e_emb: ASCIIColors.warning(f"Could not get embedding: {e_emb}")
|
|
973
|
-
else: ASCIIColors.yellow("\n--- Embeddings Skipped (embedding: false in config) ---")
|
|
974
784
|
|
|
975
785
|
# --- LLaVA Test (Conceptual - requires a LLaVA model and mmproj) ---
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
model_path = Path(models_path)/model_name
|
|
980
|
-
ASCIIColors.cyan("\n--- LLaVA Vision Test ---")
|
|
981
|
-
dummy_image_path = Path("E:\\drumber\\drumber.png")
|
|
982
|
-
try:
|
|
983
|
-
from PIL import Image, ImageDraw
|
|
984
|
-
img = Image.new('RGB', (150, 70), color = ('magenta'))
|
|
985
|
-
d = ImageDraw.Draw(img); d.text((10,10), "Server LLaVA", fill=('white'))
|
|
986
|
-
img.save(dummy_image_path)
|
|
987
|
-
ASCIIColors.info(f"Created dummy image for LLaVA: {dummy_image_path}")
|
|
988
|
-
|
|
989
|
-
llava_prompt = "Describe this image."
|
|
990
|
-
# For /v1/chat/completions with LLaVA, images are passed in messages.
|
|
991
|
-
# For /completion with LLaVA, prompt needs <image> placeholder and image_data field.
|
|
992
|
-
llava_response = active_binding.generate_text(
|
|
993
|
-
prompt=llava_prompt, images=[str(dummy_image_path)], n_predict=40, stream=False,
|
|
994
|
-
use_chat_format_override=True # Use /v1/chat/completions for easier multimodal
|
|
995
|
-
)
|
|
996
|
-
if isinstance(llava_response, str): ASCIIColors.green(f"LLaVA response: {llava_response}")
|
|
997
|
-
else: ASCIIColors.error(f"LLaVA generation failed: {llava_response}")
|
|
998
|
-
except ImportError: ASCIIColors.warning("Pillow not found. Cannot create dummy image for LLaVA.")
|
|
999
|
-
except Exception as e_llava: ASCIIColors.error(f"LLaVA test error: {e_llava}"); trace_exception(e_llava)
|
|
1000
|
-
finally:
|
|
1001
|
-
if dummy_image_path.exists(): dummy_image_path.unlink()
|
|
1002
|
-
|
|
1003
|
-
# --- Test changing model ---
|
|
1004
|
-
# This part is conceptual. You'd need another GGUF model file for a real test.
|
|
1005
|
-
# For now, we'll just call load_model with the same model to test the logic.
|
|
1006
|
-
|
|
1007
|
-
ASCIIColors.cyan("\n--- Testing Model Change (reloading same model) ---")
|
|
1008
|
-
reload_success = active_binding.load_model(str(model_path))
|
|
1009
|
-
if reload_success and active_binding.server_process and active_binding.server_process.is_healthy:
|
|
1010
|
-
ASCIIColors.green(f"Model reloaded/re-confirmed successfully. Server on port {active_binding.port}.")
|
|
1011
|
-
# Quick generation test after reload
|
|
1012
|
-
reloaded_gen = active_binding.generate_text("Ping", n_predict=5, stream=False)
|
|
1013
|
-
if isinstance(reloaded_gen, str): ASCIIColors.green(f"Post-reload ping response: {reloaded_gen.strip()}")
|
|
1014
|
-
else: ASCIIColors.error(f"Post-reload generation failed: {reloaded_gen}")
|
|
1015
|
-
else:
|
|
1016
|
-
ASCIIColors.error("Failed to reload model or server not healthy after reload attempt.")
|
|
786
|
+
ASCIIColors.cyan("\n--- LLaVA Vision Test (if model available) ---")
|
|
787
|
+
llava_model_path = models_path / llava_model_name_str
|
|
788
|
+
llava_clip_path_actual = models_path / llava_clip_name_str # Assuming clip is in models_path too
|
|
1017
789
|
|
|
790
|
+
if llava_model_path.exists() and llava_clip_path_actual.exists():
|
|
791
|
+
dummy_image_path = models_path / "dummy_llava_image.png"
|
|
792
|
+
try:
|
|
793
|
+
from PIL import Image, ImageDraw
|
|
794
|
+
img = Image.new('RGB', (150, 70), color = ('magenta')); d = ImageDraw.Draw(img); d.text((10,10), "LLaVA Test", fill=('white')); img.save(dummy_image_path)
|
|
795
|
+
ASCIIColors.info(f"Created dummy image for LLaVA: {dummy_image_path}")
|
|
1018
796
|
|
|
1019
|
-
|
|
1020
|
-
|
|
1021
|
-
|
|
1022
|
-
ASCIIColors.error(f"File not found error: {e_fnf}. Check model or server binary paths.")
|
|
1023
|
-
except ConnectionError as e_conn:
|
|
1024
|
-
ASCIIColors.error(f"Connection error (server might have failed to start or is unresponsive): {e_conn}")
|
|
1025
|
-
except RuntimeError as e_rt:
|
|
1026
|
-
ASCIIColors.error(f"Runtime error (often server process issue): {e_rt}")
|
|
1027
|
-
if active_binding and active_binding.server_process:
|
|
1028
|
-
ASCIIColors.error("Last stderr lines from server:")
|
|
1029
|
-
for line in active_binding.server_process._stderr_lines[-20:]: print(line) # Print last 20
|
|
1030
|
-
except Exception as e_main:
|
|
1031
|
-
ASCIIColors.error(f"An unexpected error occurred: {e_main}")
|
|
1032
|
-
trace_exception(e_main)
|
|
1033
|
-
finally:
|
|
1034
|
-
if active_binding:
|
|
1035
|
-
ASCIIColors.cyan("\n--- Unloading Model and Stopping Server ---")
|
|
1036
|
-
active_binding.unload_model()
|
|
1037
|
-
ASCIIColors.green("Server stopped and model unloaded.")
|
|
797
|
+
llava_binding_config = binding_config.copy()
|
|
798
|
+
# LLaVA might need specific chat template if server doesn't auto-detect well.
|
|
799
|
+
# llava_binding_config["chat_template"] = "llava-1.5"
|
|
1038
800
|
|
|
801
|
+
active_binding_llava = LlamaCppServerBinding(
|
|
802
|
+
model_name=str(llava_model_path), # Pass full path for clarity in test
|
|
803
|
+
models_path=str(models_path),
|
|
804
|
+
clip_model_name=str(llava_clip_path_actual), # Pass full path for clip
|
|
805
|
+
config=llava_binding_config
|
|
806
|
+
)
|
|
807
|
+
if not active_binding_llava.server_process or not active_binding_llava.server_process.is_healthy:
|
|
808
|
+
raise RuntimeError("LLaVA server failed to start or become healthy.")
|
|
809
|
+
ASCIIColors.green(f"LLaVA Binding initialized. Server for '{active_binding_llava.current_model_path.name}' running on port {active_binding_llava.port}.")
|
|
810
|
+
ASCIIColors.info(f"LLaVA Binding Model Info: {json.dumps(active_binding_llava.get_model_info(), indent=2)}")
|
|
811
|
+
|
|
812
|
+
|
|
813
|
+
llava_prompt = "Describe this image."
|
|
814
|
+
llava_response = active_binding_llava.generate_text(
|
|
815
|
+
prompt=llava_prompt, images=[str(dummy_image_path)], n_predict=40, stream=False, use_chat_format_override=True
|
|
816
|
+
)
|
|
817
|
+
if isinstance(llava_response, str): ASCIIColors.green(f"LLaVA response: {llava_response}")
|
|
818
|
+
else: ASCIIColors.error(f"LLaVA generation failed: {llava_response}")
|
|
819
|
+
|
|
820
|
+
except ImportError: ASCIIColors.warning("Pillow not found. Cannot create dummy image for LLaVA.")
|
|
821
|
+
except Exception as e_llava: ASCIIColors.error(f"LLaVA test error: {e_llava}"); trace_exception(e_llava)
|
|
822
|
+
finally:
|
|
823
|
+
if dummy_image_path.exists(): dummy_image_path.unlink()
|
|
824
|
+
else:
|
|
825
|
+
ASCIIColors.warning(f"LLaVA model '{llava_model_path.name}' or clip model '{llava_clip_path_actual.name}' not found in '{models_path}'. Skipping LLaVA test.")
|
|
826
|
+
|
|
827
|
+
if primary_model_available and active_binding1:
|
|
828
|
+
# --- Test changing model (using binding1 to load a different or same model) ---
|
|
829
|
+
ASCIIColors.cyan("\n--- Testing Model Change (binding1 reloads its model) ---")
|
|
830
|
+
# For a real change, use a different model name if available. Here, we reload the same.
|
|
831
|
+
reload_success = active_binding1.load_model(model_name_str) # Reload original model
|
|
832
|
+
if reload_success and active_binding1.server_process and active_binding1.server_process.is_healthy:
|
|
833
|
+
ASCIIColors.green(f"Model reloaded/re-confirmed successfully by binding1. Server on port {active_binding1.port}.")
|
|
834
|
+
reloaded_gen = active_binding1.generate_text("Ping", n_predict=5, stream=False)
|
|
835
|
+
if isinstance(reloaded_gen, str): ASCIIColors.green(f"Post-reload ping (binding1): {reloaded_gen.strip()}")
|
|
836
|
+
else: ASCIIColors.error(f"Post-reload generation failed (binding1): {reloaded_gen}")
|
|
837
|
+
else:
|
|
838
|
+
ASCIIColors.error("Failed to reload model or server not healthy after reload attempt by binding1.")
|
|
1039
839
|
|
|
840
|
+
except ImportError as e_imp: ASCIIColors.error(f"Import error: {e_imp}.")
|
|
841
|
+
except FileNotFoundError as e_fnf: ASCIIColors.error(f"File not found error: {e_fnf}.")
|
|
842
|
+
except ConnectionError as e_conn: ASCIIColors.error(f"Connection error: {e_conn}")
|
|
843
|
+
except RuntimeError as e_rt:
|
|
844
|
+
ASCIIColors.error(f"Runtime error: {e_rt}")
|
|
845
|
+
if active_binding1 and active_binding1.server_process: ASCIIColors.error(f"Binding1 stderr:\n{active_binding1.server_process._stderr_lines[-20:]}")
|
|
846
|
+
if active_binding2 and active_binding2.server_process: ASCIIColors.error(f"Binding2 stderr:\n{active_binding2.server_process._stderr_lines[-20:]}")
|
|
847
|
+
if active_binding_llava and active_binding_llava.server_process: ASCIIColors.error(f"LLaVA Binding stderr:\n{active_binding_llava.server_process._stderr_lines[-20:]}")
|
|
848
|
+
except Exception as e_main: ASCIIColors.error(f"An unexpected error occurred: {e_main}"); trace_exception(e_main)
|
|
849
|
+
finally:
|
|
850
|
+
ASCIIColors.cyan("\n--- Unloading Models and Stopping Servers ---")
|
|
851
|
+
if active_binding1: active_binding1.unload_model(); ASCIIColors.info("Binding1 unloaded.")
|
|
852
|
+
if active_binding2: active_binding2.unload_model(); ASCIIColors.info("Binding2 unloaded.")
|
|
853
|
+
if active_binding_llava: active_binding_llava.unload_model(); ASCIIColors.info("LLaVA Binding unloaded.")
|
|
854
|
+
|
|
855
|
+
# Check if any servers remain (should be none if all bindings unloaded)
|
|
856
|
+
with _server_registry_lock:
|
|
857
|
+
if _active_servers:
|
|
858
|
+
ASCIIColors.warning(f"Warning: {_active_servers.keys()} servers still in registry after all known bindings unloaded.")
|
|
859
|
+
for key, server_proc in list(_active_servers.items()): # list() for safe iteration if modifying
|
|
860
|
+
ASCIIColors.info(f"Force shutting down stray server: {key}")
|
|
861
|
+
try: server_proc.shutdown()
|
|
862
|
+
except Exception as e_shutdown: ASCIIColors.error(f"Error shutting down stray server {key}: {e_shutdown}")
|
|
863
|
+
_active_servers.pop(key,None)
|
|
864
|
+
_server_ref_counts.pop(key,None)
|
|
865
|
+
else:
|
|
866
|
+
ASCIIColors.green("All servers shut down correctly.")
|
|
1040
867
|
|
|
1041
868
|
ASCIIColors.yellow("\nLlamaCppServerBinding test finished.")
|