lollms-client 0.14.1__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of lollms-client might be problematic.
- examples/simple_text_gen_with_image_test.py +21 -9
- examples/text_gen.py +3 -1
- examples/text_gen_system_prompt.py +2 -1
- lollms_client/__init__.py +1 -1
- lollms_client/llm_bindings/llamacpp/__init__.py +1041 -0
- lollms_client/llm_bindings/ollama/__init__.py +3 -3
- lollms_client/llm_bindings/openllm/__init__.py +547 -0
- lollms_client/llm_bindings/pythonllamacpp/__init__.py +591 -0
- lollms_client/llm_bindings/transformers/__init__.py +660 -251
- lollms_client/lollms_core.py +5 -3
- lollms_client/lollms_llm_binding.py +1 -5
- {lollms_client-0.14.1.dist-info → lollms_client-0.15.0.dist-info}/METADATA +1 -1
- {lollms_client-0.14.1.dist-info → lollms_client-0.15.0.dist-info}/RECORD +16 -13
- {lollms_client-0.14.1.dist-info → lollms_client-0.15.0.dist-info}/WHEEL +0 -0
- {lollms_client-0.14.1.dist-info → lollms_client-0.15.0.dist-info}/licenses/LICENSE +0 -0
- {lollms_client-0.14.1.dist-info → lollms_client-0.15.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1041 @@
# bindings/llamacpp_server/binding.py
import json
import os
import pprint
import re
import socket
import subprocess
import sys
import threading
import time
from pathlib import Path
from typing import Optional, Callable, List, Union, Dict, Any, Set
import base64
import requests  # For HTTP client
from lollms_client.lollms_llm_binding import LollmsLLMBinding
from lollms_client.lollms_types import MSG_TYPE, ELF_COMPLETION_FORMAT

from ascii_colors import ASCIIColors, trace_exception
import pipmaster as pm
import platform

# Ensure llama-cpp-binaries and requests are installed
pm.ensure_packages(["requests", "pillow"])  # pillow for dummy image in test
if not pm.is_installed("llama-cpp-binaries"):
    def install_llama_cpp():
        system = platform.system()

        if system == "Windows":
            url = "https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124-py3-none-win_amd64.whl"
        elif system == "Linux":
            url = "https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124-py3-none-linux_x86_64.whl"
        else:
            print(f"Unsupported OS: {system}")
            return
        pm.install(url)
    install_llama_cpp()

try:
    import llama_cpp_binaries
except ImportError:
    ASCIIColors.error("llama-cpp-binaries package not found. Please install it.")
    ASCIIColors.error("You can try: pip install llama-cpp-binaries")
    ASCIIColors.error("Or download a wheel from: https://github.com/oobabooga/llama-cpp-binaries/releases")
    llama_cpp_binaries = None


# --- Predefined patterns ---

# Quantization type strings (derived from ggml.h, llama.cpp, and common usage)
# These are the "core component" strings, without separators like '.', '-', or '_'
_QUANT_COMPONENTS_SET: Set[str] = {
    # K-quants (most common, often with S/M/L suffix, and now XS/XXS)
    "Q2_K", "Q3_K", "Q4_K", "Q5_K", "Q6_K",
    "Q2_K_S", "Q3_K_S", "Q4_K_S", "Q5_K_S",  # No Q6_K_S usually
    "Q3_K_M", "Q4_K_M", "Q5_K_M",  # No Q2/Q6_K_M usually
    "Q3_K_L",  # Only Q3_K_L is common
    # Adding XS and XXS variants for K-quants by analogy with IQ types
    "Q2_K_XS", "Q3_K_XS", "Q4_K_XS", "Q5_K_XS", "Q6_K_XS",
    "Q2_K_XXS", "Q3_K_XXS", "Q4_K_XXS", "Q5_K_XXS", "Q6_K_XXS",

    # Non-K-quant legacy types
    "Q4_0", "Q4_1", "Q5_0", "Q5_1", "Q8_0",

    # Floating point types
    "F16", "FP16", "F32", "FP32", "BF16",

    # IQ (Innovative Quantization) types
    "IQ1_S", "IQ1_M",
    "IQ2_XXS", "IQ2_XS", "IQ2_S", "IQ2_M",
    "IQ3_XXS", "IQ3_S", "IQ3_M",
    "IQ4_NL", "IQ4_XS",

    # Newer IQ K-Quant variants (IQ types using K-quant style super-blocks)
    "IQ3_M_K", "IQ3_S_K",  # Adding IQ3_S_K as it's plausible
    "IQ4_XS_K", "IQ4_NL_K",  # Adding IQ4_NL_K as it's plausible

    # Basic integer types (less common in user-facing LLM filenames as primary quantizer)
    "I8", "I16", "I32",

    # Special GGUF type names that might appear (from ggml.c `ggml_type_name`)
    "ALL_F32", "MOSTLY_F16", "MOSTLY_Q4_0", "MOSTLY_Q4_1", "MOSTLY_Q5_0", "MOSTLY_Q5_1",
    "MOSTLY_Q8_0",
    "MOSTLY_Q2_K", "MOSTLY_Q3_K_S", "MOSTLY_Q3_K_M", "MOSTLY_Q3_K_L",
    "MOSTLY_Q4_K_S", "MOSTLY_Q4_K_M", "MOSTLY_Q5_K_S", "MOSTLY_Q5_K_M", "MOSTLY_Q6_K",
    "MOSTLY_IQ1_S", "MOSTLY_IQ1_M",
    "MOSTLY_IQ2_XXS", "MOSTLY_IQ2_XS", "MOSTLY_IQ2_S", "MOSTLY_IQ2_M",
    "MOSTLY_IQ3_XXS", "MOSTLY_IQ3_S", "MOSTLY_IQ3_M",
    "MOSTLY_IQ4_NL", "MOSTLY_IQ4_XS"
}

# Common descriptive suffixes for model names
_MODEL_NAME_SUFFIX_COMPONENTS_SET: Set[str] = {
    "instruct", "chat", "GGUF", "HF", "ggml", "pytorch", "AWQ", "GPTQ", "EXL2",
    "base", "cont", "continue", "ft",  # Fine-tuning related
    "v0.1", "v0.2", "v1.0", "v1.1", "v1.5", "v1.6", "v2.0",  # Common version tags if they are truly suffixes
    # Be cautious with general version numbers (e.g., "v1", "v2") or model sizes (e.g., "7b")
    # as they are often integral parts of the base name. Only add them if they are
    # *always* extraneous suffixes in your context.
    # The ones above are more specific and often appear as full suffix components.
}

# Combine, ensure uniqueness by using sets, then sort by length descending.
# Sorting ensures longer patterns (e.g., "Q4_K_M") are checked before
# shorter sub-patterns (e.g., "Q4_K" or "K_M").
_ALL_REMOVABLE_COMPONENTS: List[str] = sorted(
    list(_QUANT_COMPONENTS_SET.union(_MODEL_NAME_SUFFIX_COMPONENTS_SET)),
    key=len,
    reverse=True
)

def get_gguf_model_base_name(file_path_or_name: Union[str, Path]) -> str:
    """
    Extracts a base model name from a GGUF filename or path by removing
    the .gguf extension and then iteratively stripping known quantization
    patterns and common descriptive suffixes from the end of the name.

    The stripping is case-insensitive and checks for patterns preceded
    by '.', '-', or '_'.

    Args:
        file_path_or_name: The file path (as a string or Path object)
                           or just the filename string.

    Returns:
        The derived base model name string.
    """
    if isinstance(file_path_or_name, str):
        p = Path(file_path_or_name)
    elif isinstance(file_path_or_name, Path):
        p = file_path_or_name
    else:
        raise TypeError(
            "Input must be a string or Path object. "
            f"Got: {type(file_path_or_name)}"
        )

    name_part = p.name  # Full filename, e.g., "MyModel-7B-chat.Q4_K_M.gguf"

    # 1. Remove .gguf extension (case-insensitive)
    if name_part.lower().endswith(".gguf"):
        name_part = name_part[:-5]  # Remove last 5 chars: ".gguf"

    # 2. Iteratively strip known components (quantization, common suffixes).
    #    These components are usually preceded by '.', '-', or '_'
    while True:
        original_name_part_len = len(name_part)
        stripped_in_this_iteration = False

        for component in _ALL_REMOVABLE_COMPONENTS:
            component_lower = component.lower()
            # Check for patterns like ".component", "-component", or "_component"
            for separator in [".", "-", "_"]:
                pattern_to_check = f"{separator}{component_lower}"
                if name_part.lower().endswith(pattern_to_check):
                    # Remove from the original-case name_part
                    name_part = name_part[:-(len(pattern_to_check))]
                    stripped_in_this_iteration = True
                    break  # Break from separator loop
            if stripped_in_this_iteration:
                break  # Break from component loop (found a match, restart while loop with shorter name_part)

        # If no component was stripped in a full pass through _ALL_REMOVABLE_COMPONENTS,
        # or if name_part became empty, we're done.
        if not stripped_in_this_iteration or not name_part:
            break

    # 3. Final cleanup: remove trailing separators if any are left after stripping
    while name_part and name_part[-1] in ['.', '-', '_']:
        name_part = name_part[:-1]

    return name_part
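
A minimal usage sketch of the helper above (hypothetical, assuming the module imports cleanly as lollms_client.llm_bindings.llamacpp; the filename is illustrative):

# Hypothetical usage sketch -- not part of the released file.
from lollms_client.llm_bindings.llamacpp import get_gguf_model_base_name

# ".gguf" is dropped, then "Q4_K_M", "v0.2" and "Instruct" are stripped in turn.
print(get_gguf_model_base_name("Mistral-7B-Instruct-v0.2.Q4_K_M.gguf"))  # -> "Mistral-7B"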


BindingName = "LlamaCppServerBinding"
DEFAULT_LLAMACPP_SERVER_HOST = "127.0.0.1"
DEFAULT_LLAMACPP_SERVER_PORT = 9641
# Based on the LlamaServer class provided in the prompt
class LlamaCppServerProcess:
    def __init__(self, model_path: str | Path, clip_model_path: Optional[str] = None, server_binary_path: Optional[str] = None, port: Optional[int] = None, server_args: Optional[Dict[str, Any]] = None):
        self.model_path = Path(model_path)
        self.clip_model_path = clip_model_path
        if server_binary_path is None:
            # Fall back to the binary shipped with llama-cpp-binaries
            server_binary_path = llama_cpp_binaries.get_binary_path()
        self.server_binary_path = Path(server_binary_path)
        self.port = port if port else DEFAULT_LLAMACPP_SERVER_PORT
        self.server_args = server_args if server_args is not None else {}
        self.process: Optional[subprocess.Popen] = None
        self.session = requests.Session()
        self.host = DEFAULT_LLAMACPP_SERVER_HOST
        self.base_url = f"http://{self.host}:{self.port}"
        self.is_healthy = False
        self._stderr_lines = []  # Store last few stderr lines for debugging
        self._stderr_thread = None

        if not self.model_path.exists():
            raise FileNotFoundError(f"Model file not found: {self.model_path}")
        if not self.server_binary_path.exists():
            raise FileNotFoundError(f"Llama.cpp server binary not found: {self.server_binary_path}")

        self._start_server()

    def _filter_stderr(self, stderr_pipe):
        try:
            for line in iter(stderr_pipe.readline, ''):
                if line:
                    self._stderr_lines.append(line.strip())
                    if len(self._stderr_lines) > 50:  # Keep last 50 lines
                        self._stderr_lines.pop(0)
                    # Simple progress or key info logging
                    if "llama_model_loaded" in line or "error" in line.lower() or "failed" in line.lower():
                        ASCIIColors.debug(f"[LLAMA_SERVER_STDERR] {line.strip()}")
                    elif "running" in line and "port" in line:  # Server startup message
                        ASCIIColors.info(f"[LLAMA_SERVER_STDERR] {line.strip()}")
        except ValueError:  # Pipe closed
            pass
        except Exception as e:
            ASCIIColors.warning(f"Exception in stderr filter thread: {e}")


    def _start_server(self, is_embedding=False):
        cmd = [
            str(self.server_binary_path),
            "--model", str(self.model_path),
            "--host", self.host,
            "--port", str(self.port),
            # Add other common defaults or arguments from self.server_args
        ]

        # Common arguments mapping from LlamaCppBinding to server CLI args
        # (This needs to be kept in sync with llama.cpp server's CLI)
        arg_map = {
            "n_ctx": "--ctx-size", "n_gpu_layers": "--gpu-layers", "main_gpu": "--main-gpu",
            "tensor_split": "--tensor-split", "use_mmap": (lambda v: ["--no-mmap"] if not v else []),
            "use_mlock": (lambda v: ["--mlock"] if v else []), "seed": "--seed",
            "n_batch": "--batch-size", "n_threads": "--threads", "n_threads_batch": "--threads-batch",
            "rope_scaling_type": "--rope-scaling", "rope_freq_base": "--rope-freq-base",
            "rope_freq_scale": "--rope-freq-scale",
            "embedding": (lambda v: ["--embedding"] if v else []),  # Server needs to be started with embedding support
            "verbose": (lambda v: ["--verbose"] if v else []),
            "chat_template": "--chat-template",  # For newer servers if they support jinja chat templates
            # Old llama.cpp server used --chatml or specific format flags
        }

        # For LLaVA, specific args are needed
        if self.clip_model_path:
            cmd.extend(["--mmproj", str(self.clip_model_path)])
            # The server might automatically detect the LLaVA chat format or need a specific flag,
            # e.g., --chat-template llava-1.5 (if the server supports templates).
            # For older servers, a specific chat format flag like --chatml with a LLaVA prompt structure was used.
            # The server from llama-cpp-binaries is usually quite up-to-date.

        for key, cli_arg in arg_map.items():
            val = self.server_args.get(key)
            if val is not None:
                if callable(cli_arg):  # For args like --no-mmap
                    cmd.extend(cli_arg(val))
                else:
                    cmd.extend([cli_arg, str(val)])

        # Add any extra CLI flags directly
        extra_cli_flags = self.server_args.get("extra_cli_flags", [])
        if isinstance(extra_cli_flags, str):  # If it's a string, split it
            extra_cli_flags = extra_cli_flags.split()
        cmd.extend(extra_cli_flags)

        ASCIIColors.info(f"Starting Llama.cpp server with command: {' '.join(cmd)}")

        # Prevent paths with spaces from breaking the command on some OS, though Popen usually handles this.
        # For safety, ensure paths are quoted if necessary, or rely on Popen's list-based command.

        env = os.environ.copy()
        # On Linux, it might be necessary to set LD_LIBRARY_PATH if the server binary has shared lib dependencies in its folder
        if os.name == 'posix' and self.server_binary_path.parent != Path('.'):
            lib_path_str = str(self.server_binary_path.parent.resolve())
            current_ld_path = env.get('LD_LIBRARY_PATH', '')
            if current_ld_path:
                env['LD_LIBRARY_PATH'] = f"{lib_path_str}:{current_ld_path}"
            else:
                env['LD_LIBRARY_PATH'] = lib_path_str

        try:
            ASCIIColors.green(f"running server: {' '.join(cmd)}")
            self.process = subprocess.Popen(
                cmd,
                stderr=subprocess.PIPE,
                stdout=subprocess.PIPE,  # Capture stdout as well for debugging
                text=True,
                bufsize=1,  # Line buffered
                env=env
            )
        except Exception as e:
            ASCIIColors.error(f"Failed to start llama.cpp server process: {e}")
            trace_exception(e)
            raise

        # Start stderr/stdout reading threads
        self._stderr_thread = threading.Thread(target=self._filter_stderr, args=(self.process.stderr,), daemon=True)
        self._stderr_thread.start()
        # self._stdout_thread = threading.Thread(target=self._filter_stderr, args=(self.process.stdout,), daemon=True)  # can use same filter
        # self._stdout_thread.start()

        # Wait for server to be healthy
        health_url = f"{self.base_url}/health"
        max_wait_time = self.server_args.get("server_startup_timeout", 60)  # seconds
        start_time = time.time()

        while time.time() - start_time < max_wait_time:
            if self.process.poll() is not None:
                exit_code = self.process.poll()
                stderr_output = "\n".join(self._stderr_lines[-10:])  # Last 10 lines
                raise RuntimeError(f"Llama.cpp server process terminated unexpectedly with exit code {exit_code} during startup. Stderr:\n{stderr_output}")
            try:
                response = self.session.get(health_url, timeout=2)
                if response.status_code == 200 and response.json().get("status") == "ok":
                    self.is_healthy = True
                    ASCIIColors.green(f"Llama.cpp server started successfully on port {self.port}.")
                    return
            except requests.exceptions.ConnectionError:
                time.sleep(1)  # Wait and retry
            except Exception as e:
                ASCIIColors.warning(f"Health check failed: {e}")
                time.sleep(1)

        self.is_healthy = False
        self.stop()  # Ensure the process is killed if the health check failed
        stderr_output = "\n".join(self._stderr_lines[-10:])
        raise TimeoutError(f"Llama.cpp server failed to become healthy on port {self.port} within {max_wait_time}s. Stderr:\n{stderr_output}")

    def stop(self):
        self.is_healthy = False
        if self.process:
            ASCIIColors.info(f"Stopping Llama.cpp server (PID: {self.process.pid})...")
            try:
                # Try graceful termination first
                if os.name == 'nt':  # Windows
                    # Sending CTRL_C_EVENT to the process group might be more effective for console apps
                    # self.process.send_signal(signal.CTRL_C_EVENT)  # Requires creationflags=subprocess.CREATE_NEW_PROCESS_GROUP
                    self.process.terminate()  # For Windows, terminate is often like kill
                else:  # POSIX
                    self.process.terminate()  # Sends SIGTERM

                self.process.wait(timeout=10)  # Wait for graceful shutdown
            except subprocess.TimeoutExpired:
                ASCIIColors.warning("Llama.cpp server did not terminate gracefully, killing...")
                self.process.kill()  # Force kill
                try:
                    self.process.wait(timeout=5)
                except subprocess.TimeoutExpired:
                    ASCIIColors.error("Failed to kill llama.cpp server process.")
            except Exception as e:
                ASCIIColors.error(f"Error during server stop: {e}")
            finally:
                self.process = None
                if self._stderr_thread and self._stderr_thread.is_alive():
                    self._stderr_thread.join(timeout=1)  # Wait for thread to finish
                ASCIIColors.info("Llama.cpp server stopped.")
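
A minimal sketch of driving LlamaCppServerProcess directly (hypothetical; paths are placeholders, and in normal use the LlamaCppServerBinding class below manages this object):

# Hypothetical usage sketch -- not part of the released file; paths are placeholders.
proc = LlamaCppServerProcess(
    model_path="/path/to/model.gguf",            # must exist or FileNotFoundError is raised
    server_binary_path="/path/to/llama-server",  # e.g. the binary from llama-cpp-binaries
    port=9641,
    server_args={"n_ctx": 4096, "server_startup_timeout": 120},
)
# __init__ blocks until GET /health returns {"status": "ok"} or the timeout expires.
assert proc.is_healthy
proc.stop()  # terminates the subprocess and joins the stderr reader thread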


class LlamaCppServerBinding(LollmsLLMBinding):
    """
    Binding for llama.cpp server using pre-compiled binaries.
    Manages a local llama.cpp server subprocess and communicates via HTTP.
    """
    # Default parameters for the llama.cpp server
    DEFAULT_SERVER_ARGS = {
        "n_gpu_layers": 0,
        "n_ctx": 128000,
        "n_batch": 512,
        "embedding": False,  # Enable if embeddings are needed via /embedding or /v1/embeddings
        "verbose": False,
        "server_startup_timeout": 120,  # seconds
        # "chat_format": "chatml",  # Deprecated in favor of --chat-template, but some old servers might need it
        # For LLaVA
        # "clip_model_path": None,
        # "chat_template": "llava-1.5"  # if server supports it. Or specific prompt structure.
    }

    def __init__(self,
                 model_name: str,  # Name of the GGUF file (e.g., "mistral-7b-instruct-v0.2.Q4_K_M.gguf")
                 models_path: str,
                 clip_model_name: Optional[str] = None,
                 config: Optional[Dict[str, Any]] = None,  # Binding-specific config from global_config.yaml
                 default_completion_format: ELF_COMPLETION_FORMAT = ELF_COMPLETION_FORMAT.Chat,
                 **kwargs  # Overrides for server_args
                 ):

        super().__init__(binding_name=BindingName)

        if llama_cpp_binaries is None:
            raise ImportError("llama-cpp-binaries package is required but not found.")

        self.models_path = Path(models_path)
        self.model_name = model_name
        self.model_path = self.models_path / self.model_name
        self.clip_model_path = self.models_path / clip_model_name if clip_model_name else None
        self.default_completion_format = default_completion_format

        self.server_args = {**self.DEFAULT_SERVER_ARGS, **(config or {})}
        self.server_args.update(kwargs)  # Apply direct kwargs overrides

        self.server_binary_path = self._get_server_binary_path()
        self.current_model_path: Optional[Path] = None
        self.server_process: Optional[LlamaCppServerProcess] = None
        self.port: Optional[int] = None

        # Attempt to load the model (which starts the server)
        self.load_model(str(self.model_path))

    def _get_server_binary_path(self) -> Path:
        try:
            # Check if a custom path is provided in config
            custom_path_str = self.server_args.get("llama_server_binary_path")
            if custom_path_str:
                custom_path = Path(custom_path_str)
                if custom_path.exists() and custom_path.is_file():
                    ASCIIColors.info(f"Using custom llama.cpp server binary path: {custom_path}")
                    return custom_path
                else:
                    ASCIIColors.warning(f"Custom llama.cpp server binary path '{custom_path_str}' not found or not a file. Falling back.")

            # Default to using llama_cpp_binaries
            bin_path_str = llama_cpp_binaries.get_binary_path()  # specify "server"
            if bin_path_str:
                bin_path = Path(bin_path_str)
                if bin_path.exists() and bin_path.is_file():
                    ASCIIColors.info(f"Using llama.cpp server binary from llama-cpp-binaries: {bin_path}")
                    return bin_path

            raise FileNotFoundError("Could not locate llama.cpp server binary via llama-cpp-binaries or custom path.")

        except Exception as e:
            ASCIIColors.error(f"Error getting llama.cpp server binary path: {e}")
            trace_exception(e)
            # As a last resort, try a common name in the system PATH or a known location if Lollms ships one.
            # For now, rely on llama-cpp-binaries or explicit config.
            raise FileNotFoundError(
                "Llama.cpp server binary not found. Ensure 'llama-cpp-binaries' is installed "
                "or provide 'llama_server_binary_path' in the binding's configuration."
            ) from e

    def _resolve_model_path(self, model_path: str) -> Path:
        # Search order:
        # 1. Absolute path
        # 2. Relative to binding-specific models path (e.g., personal_models_path/LlamaCppServerBinding/)
        # 3. Relative to personal_models_path
        # 4. Relative to models_zoo_path

        model_p = Path(model_path)
        if model_p.is_absolute() and model_p.exists():
            return model_p

        paths_to_check = []
        binding_specific_folder_name = self.binding_name  # "LlamaCppServerBinding"
        paths_to_check.append(self.models_path)

        for p in paths_to_check:
            if p.exists() and p.is_file():
                ASCIIColors.info(f"Found model at: {p}")
                return p

        raise FileNotFoundError(f"Model '{model_path}' not found in standard Lollms model paths or as an absolute path.")

    def _find_available_port(self) -> int:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.bind(('', 0))  # Bind to port 0 to get an OS-assigned available port
            return s.getsockname()[1]

    def load_model(self, model_name: str) -> bool:
        resolved_path = self._resolve_model_path(model_name)

        if self.server_process and self.server_process.is_healthy and self.current_model_path == resolved_path:
            ASCIIColors.info(f"Model '{model_name}' is already loaded and server is running.")
            return True

        if self.server_process:
            self.unload_model()  # Stop existing server

        self.model_name = model_name  # Store the name provided by the user
        self.current_model_path = resolved_path
        self.port = self._find_available_port()

        ASCIIColors.info(f"Attempting to start Llama.cpp server for model: {self.current_model_path} on port {self.port}")

        # Prepare server_args specifically for this model load
        current_server_args = self.server_args.copy()

        if not self.clip_model_path:
            # Try to find a corresponding .mmproj file or allow the user to specify one in config,
            # e.g. if the model is llava-v1.5-7b.Q4_K_M.gguf, look for llava-v1.5-7b.mmproj or mmproj-modelname.gguf
            base_name = get_gguf_model_base_name(self.current_model_path.stem)

            potential_clip_paths = [
                self.current_model_path.parent / f"{base_name}.mmproj",
                self.current_model_path.parent / f"mmproj-{base_name}.gguf",  # Common pattern
                self.current_model_path.with_suffix(".mmproj"),
            ]
            found_clip_path = None
            for p_clip in potential_clip_paths:
                if p_clip.exists():
                    found_clip_path = str(p_clip)
                    ASCIIColors.info(f"Auto-detected LLaVA clip model: {found_clip_path}")
                    break
            if found_clip_path:
                self.clip_model_path = found_clip_path
                # Set a default LLaVA chat template if the server supports it, or rely on server auto-detection
                # if not current_server_args.get("chat_template") and not current_server_args.get("chat_format"):
                #     current_server_args["chat_template"] = "llava-1.5"  # Common default
            else:
                ASCIIColors.warning("Vision capabilities will likely not work. Please ensure the .mmproj file is "
                                    "next to the model or specify 'clip_model_path' in the binding config.")

        try:
            self.server_process = LlamaCppServerProcess(
                model_path=str(self.current_model_path),
                clip_model_path=str(self.clip_model_path) if self.clip_model_path else None,
                server_binary_path=str(self.server_binary_path),
                port=self.port,
                server_args=current_server_args,
            )
            return self.server_process.is_healthy
        except Exception as e:
            ASCIIColors.error(f"Failed to load model '{model_name}' and start server: {e}")
            trace_exception(e)
            self.server_process = None
            self.current_model_path = None
            return False

    def unload_model(self):
        if self.server_process:
            self.server_process.stop()
        self.server_process = None
        self.current_model_path = None
        self.port = None
        ASCIIColors.info("Llama.cpp server and model unloaded.")
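
A minimal sketch of constructing the binding (hypothetical; the model name and directory are placeholders). Construction resolves the GGUF file, picks a free port and starts the server immediately:

# Hypothetical usage sketch -- not part of the released file; paths are placeholders.
binding = LlamaCppServerBinding(
    model_name="mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    models_path="/path/to/gguf/models",
    config={"n_gpu_layers": 0, "n_ctx": 4096, "embedding": True},
)
print(binding.get_model_info()["port"])
binding.unload_model()  # stops the managed llama.cpp server subprocess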

    def _get_request_url(self, endpoint: str) -> str:
        if not self.server_process or not self.server_process.is_healthy:
            raise ConnectionError("Llama.cpp server is not running or not healthy.")
        return f"{self.server_process.base_url}{endpoint}"

    def _prepare_generation_payload(self,
                                    prompt: str,
                                    system_prompt: str = "",
                                    n_predict: Optional[int] = None,
                                    temperature: float = 0.7,
                                    top_k: int = 40,
                                    top_p: float = 0.9,
                                    repeat_penalty: float = 1.1,
                                    repeat_last_n: Optional[int] = 64,  # Server calls this repeat_last_n or penalty_last_n
                                    seed: Optional[int] = None,
                                    stream: bool = False,
                                    use_chat_format: bool = True,  # True for /v1/chat/completions, False for /completion
                                    images: Optional[List[str]] = None,
                                    **extra_params  # For things like grammar, mirostat, etc. from server_args
                                    ) -> Dict:

        # Start with defaults from server_args, then override with call params
        payload_params = {
            "temperature": self.server_args.get("temperature", 0.7),
            "top_k": self.server_args.get("top_k", 40),
            "top_p": self.server_args.get("top_p", 0.9),
            "repeat_penalty": self.server_args.get("repeat_penalty", 1.1),
            "repeat_last_n": self.server_args.get("repeat_last_n", 64),
            "mirostat": self.server_args.get("mirostat_mode", 0),  # llama.cpp server uses mirostat (0=disabled, 1=v1, 2=v2)
            "mirostat_tau": self.server_args.get("mirostat_tau", 5.0),
            "mirostat_eta": self.server_args.get("mirostat_eta", 0.1),
            # Add other mappable params from self.server_args like min_p, typical_p, grammar etc.
        }
        if "grammar_string" in self.server_args and self.server_args["grammar_string"]:  # From config
            payload_params["grammar"] = self.server_args["grammar_string"]

        # Override with specific call parameters
        payload_params.update({
            "temperature": temperature, "top_k": top_k, "top_p": top_p,
            "repeat_penalty": repeat_penalty, "repeat_last_n": repeat_last_n,
        })
        if n_predict is not None: payload_params['n_predict'] = n_predict  # Server uses n_predict
        if seed is not None: payload_params['seed'] = seed

        # Filter None values, as the server might not like them
        payload_params = {k: v for k, v in payload_params.items() if v is not None}
        payload_params.update(extra_params)  # Add any other specific params for this call

        if use_chat_format and self.default_completion_format == ELF_COMPLETION_FORMAT.Chat:
            # Use /v1/chat/completions format
            messages = []
            if system_prompt and system_prompt.strip():
                messages.append({"role": "system", "content": system_prompt})

            user_content: Union[str, List[Dict[str, Any]]] = prompt
            if images and self.clip_model_path:  # Check if it's a LLaVA setup
                image_parts = []
                for img_path in images:
                    try:
                        with open(img_path, "rb") as image_file:
                            encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
                        image_type = Path(img_path).suffix[1:].lower() or "png"
                        if image_type == "jpg": image_type = "jpeg"
                        # Llama.cpp server expects image data directly for LLaVA with /completion.
                        # For /v1/chat/completions, it expects OpenAI's format for multimodal content.
                        image_parts.append({
                            "type": "image_url",
                            "image_url": {"url": f"data:image/{image_type};base64,{encoded_string}"}
                        })
                    except Exception as ex:
                        trace_exception(ex)
                user_content = [{"type": "text", "text": prompt}] + image_parts  # type: ignore

            messages.append({"role": "user", "content": user_content})

            final_payload = {"messages": messages, "stream": stream, **payload_params}
            # n_predict is max_tokens for the OpenAI API
            if 'n_predict' in final_payload:
                final_payload['max_tokens'] = final_payload.pop('n_predict')

            return final_payload
        else:
            # Use /completion format (legacy or for raw text).
            # For LLaVA with /completion, images are typically passed in a special way in the prompt
            # or via an 'image_data' field if the server supports it.
            # For simplicity here, we send a text prompt and let the server tokenize it.
            # Llama.cpp server's /completion often expects 'prompt' as a string or tokens.
            # If images are involved with /completion, it needs specific handling, e.g.:
            # 'prompt': "USER: <image>\nWhat is this?\nASSISTANT:", 'image_data': [{'data': base64_image, 'id': 10}]

            full_prompt = prompt
            if system_prompt and system_prompt.strip():
                # Heuristic for instruct models; actual formatting depends on the model/template
                full_prompt = f"{system_prompt}\n\nUSER: {prompt}\nASSISTANT:"

            final_payload = {"prompt": full_prompt, "stream": stream, **payload_params}

            if images and self.clip_model_path:
                image_data_list = []
                for i, img_path in enumerate(images):
                    try:
                        with open(img_path, "rb") as image_file:
                            encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
                        image_data_list.append({"data": encoded_string, "id": i + 10})  # ID needs to be > 9 for llama.cpp server
                    except Exception as e_img:
                        ASCIIColors.error(f"Could not encode image {img_path} for /completion: {e_img}")
                if image_data_list:
                    final_payload["image_data"] = image_data_list
                    # The prompt needs to contain a placeholder like USER: <image 1>\n<prompt>\nASSISTANT:
                    # This part is tricky and model-dependent. For now, we assume the user's prompt is already formatted.
                    # Or, the server (if new enough) might handle it with chat_template even for /completion.

            return final_payload
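
For reference, roughly what the two payload shapes built above look like (hypothetical values, shown as plain Python dicts):

# Hypothetical illustration -- not part of the released file; values are made up.
chat_payload_example = {   # shape posted to /v1/chat/completions
    "messages": [
        {"role": "system", "content": "You are concise."},
        {"role": "user", "content": "What is the capital of Germany?"},
    ],
    "stream": False, "temperature": 0.7, "top_k": 40, "max_tokens": 128,
}
completion_payload_example = {   # shape posted to /completion
    "prompt": "You are concise.\n\nUSER: What is the capital of Germany?\nASSISTANT:",
    "stream": False, "n_predict": 128, "temperature": 0.7,
    # "image_data": [{"data": "<base64>", "id": 10}],  # only when images are passed
}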

    def generate_text(self,
                      prompt: str,
                      images: Optional[List[str]] = None,
                      system_prompt: str = "",
                      n_predict: Optional[int] = None,
                      stream: bool = False,
                      temperature: float = None,  # Use binding's default if None
                      top_k: int = None,
                      top_p: float = None,
                      repeat_penalty: float = None,
                      repeat_last_n: Optional[int] = None,
                      seed: Optional[int] = None,
                      streaming_callback: Optional[Callable[[str, int], bool]] = None,
                      use_chat_format_override: Optional[bool] = None,  # Allow overriding the binding's default format
                      **generation_kwargs
                      ) -> Union[str, Dict[str, Any]]:

        if not self.server_process or not self.server_process.is_healthy:
            return {"status": False, "error": "Llama.cpp server is not running or not healthy."}

        _use_chat_format = use_chat_format_override if use_chat_format_override is not None \
            else (self.default_completion_format == ELF_COMPLETION_FORMAT.Chat)

        payload = self._prepare_generation_payload(
            prompt=prompt, system_prompt=system_prompt, n_predict=n_predict,
            temperature=temperature if temperature is not None else self.server_args.get("temperature", 0.7),
            top_k=top_k if top_k is not None else self.server_args.get("top_k", 40),
            top_p=top_p if top_p is not None else self.server_args.get("top_p", 0.9),
            repeat_penalty=repeat_penalty if repeat_penalty is not None else self.server_args.get("repeat_penalty", 1.1),
            repeat_last_n=repeat_last_n if repeat_last_n is not None else self.server_args.get("repeat_last_n", 64),
            seed=seed if seed is not None else self.server_args.get("seed", -1),  # Use the server's default seed if not provided
            stream=stream, use_chat_format=_use_chat_format, images=images,
            **generation_kwargs
        )

        endpoint = "/v1/chat/completions" if _use_chat_format else "/completion"
        request_url = self._get_request_url(endpoint)

        # For debugging, print the payload (excluding potentially large image data)
        debug_payload = {k: v for k, v in payload.items() if k not in ["image_data"]}
        if "messages" in debug_payload:
            debug_payload["messages"] = [{k: v for k, v in msg.items() if k != "content" or not isinstance(v, list) or not any("image_url" in part for part in v)} for msg in debug_payload["messages"]]
        ASCIIColors.debug(f"Request to {request_url} with payload: {json.dumps(debug_payload, indent=2)[:500]}...")

        full_response_text = ""
        try:
            response = self.server_process.session.post(request_url, json=payload, stream=stream, timeout=self.server_args.get("generation_timeout", 300))
            response.raise_for_status()

            if stream:
                for line in response.iter_lines():
                    if not line: continue
                    line_str = line.decode('utf-8').strip()
                    if line_str.startswith('data: '): line_str = line_str[6:]
                    if line_str == '[DONE]': break  # OpenAI stream end

                    try:
                        chunk_data = json.loads(line_str)
                        chunk_content = ""
                        if _use_chat_format:  # OpenAI /v1/chat/completions format
                            delta = chunk_data.get('choices', [{}])[0].get('delta', {})
                            chunk_content = delta.get('content', '')
                        else:  # /completion format
                            chunk_content = chunk_data.get('content', '')

                        if chunk_content:
                            full_response_text += chunk_content
                            if streaming_callback and not streaming_callback(chunk_content, MSG_TYPE.MSG_TYPE_CHUNK):
                                # If the callback returns False, we should try to stop generation.
                                # Llama.cpp server's /completion doesn't have a direct way to stop mid-stream via the API.
                                # Closing the connection might be the only way if the server supports it.
                                ASCIIColors.info("Streaming callback requested stop.")
                                response.close()  # Attempt to signal the server by closing the connection
                                break
                        if chunk_data.get('stop', False) or chunk_data.get('stopped_eos', False) or chunk_data.get('stopped_limit', False):  # /completion specific stop flags
                            break
                    except json.JSONDecodeError:
                        ASCIIColors.warning(f"Failed to decode JSON stream chunk: {line_str}")
                        continue  # Or handle error
                return full_response_text
            else:  # Not streaming
                response_data = response.json()
                if _use_chat_format:  # /v1/chat/completions response
                    return response_data.get('choices', [{}])[0].get('message', {}).get('content', '')
                # /completion response carries the text in a top-level "content" field
                return response_data.get('content', '')

        except requests.exceptions.RequestException as e:
            error_message = f"Llama.cpp server request error: {e}"
            if e.response is not None:
                try:
                    error_details = e.response.json()
                    error_message += f" - Details: {error_details.get('error', e.response.text)}"
                except json.JSONDecodeError:
                    error_message += f" - Response: {e.response.text[:200]}"
            ASCIIColors.error(error_message)
            return {"status": False, "error": error_message, "details": str(e.response.text if e.response else "No response text")}
        except Exception as ex:
            error_message = f"Llama.cpp generation error: {str(ex)}"
            trace_exception(ex)
            return {"status": False, "error": error_message}

    def tokenize(self, text: str) -> List[int]:
        if not self.server_process or not self.server_process.is_healthy:
            raise ConnectionError("Llama.cpp server is not running.")
        try:
            response = self.server_process.session.post(self._get_request_url("/tokenize"), json={"content": text})
            response.raise_for_status()
            return response.json().get("tokens", [])
        except Exception as e:
            ASCIIColors.error(f"Tokenization error: {e}"); trace_exception(e)
            return []  # Or raise

    def detokenize(self, tokens: List[int]) -> str:
        if not self.server_process or not self.server_process.is_healthy:
            raise ConnectionError("Llama.cpp server is not running.")
        try:
            response = self.server_process.session.post(self._get_request_url("/detokenize"), json={"tokens": tokens})
            response.raise_for_status()
            return response.json().get("content", "")
        except Exception as e:
            ASCIIColors.error(f"Detokenization error: {e}"); trace_exception(e)
            return ""  # Or raise

    def count_tokens(self, text: str) -> int:
        return len(self.tokenize(text))

    def embed(self, text: str, **kwargs) -> List[float]:
        if not self.server_process or not self.server_process.is_healthy:
            raise Exception("Llama.cpp server is not running.")
        if not self.server_args.get("embedding"):
            raise Exception("Embedding support was not enabled in server_args (set 'embedding: true').")

        try:
            # llama.cpp server has an /embedding endpoint (non-OpenAI) and /v1/embeddings (OpenAI-compatible).
            # Try /v1/embeddings first for compatibility.
            payload = {"input": text}
            if "model" in kwargs: payload["model"] = kwargs["model"]  # Can specify a model if the server handles multiple embedding models (unlikely for llama.cpp server)

            request_url = self._get_request_url("/v1/embeddings")
            response = self.server_process.session.post(request_url, json=payload)

            if response.status_code == 404:  # Fall back to /embedding if /v1/embeddings is not found
                ASCIIColors.debug("Trying /embedding endpoint as /v1/embeddings was not found.")
                request_url = self._get_request_url("/embedding")
                response = self.server_process.session.post(request_url, json={"content": text})  # /embedding uses "content"

            response.raise_for_status()
            data = response.json()

            if "data" in data and isinstance(data["data"], list) and "embedding" in data["data"][0]:  # /v1/embeddings format
                return data["data"][0]["embedding"]
            elif "embedding" in data and isinstance(data["embedding"], list):  # /embedding format
                return data["embedding"]
            else:
                raise ValueError(f"Unexpected embedding response format: {data}")

        except requests.exceptions.RequestException as e:
            err_msg = f"Llama.cpp server embedding request error: {e}"
            if e.response: err_msg += f" - {e.response.text[:200]}"
            raise Exception(err_msg) from e
        except Exception as ex:
            trace_exception(ex); raise Exception(f"Llama.cpp embedding failed: {str(ex)}") from ex
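
A minimal sketch of the streaming contract used by generate_text above (hypothetical; `binding` is assumed to be an already-initialized LlamaCppServerBinding, and a callback returning False asks the binding to stop streaming):

# Hypothetical usage sketch -- not part of the released file.
collected = []

def on_chunk(chunk: str, msg_type: int) -> bool:
    collected.append(chunk)
    return sum(len(c) for c in collected) < 500  # stop once ~500 characters have arrived

text = binding.generate_text("Tell me a short story.", stream=True,
                             streaming_callback=on_chunk, n_predict=256)
vector = binding.embed("Some text to embed")  # requires "embedding": True in the server args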

    def get_model_info(self) -> dict:
        info = {
            "name": self.binding_name,
            "model_name": self.model_name,  # User-provided name
            "model_path": str(self.current_model_path) if self.current_model_path else "Not loaded",
            "loaded": self.server_process is not None and self.server_process.is_healthy,
            "server_args": self.server_args,
            "port": self.port if self.port else "N/A"
        }
        if info["loaded"]:
            # Try to get more info from the server's /props or /v1/models
            try:
                props_url = self._get_request_url("/props")  # llama.cpp specific
                props_resp = self.server_process.session.get(props_url, timeout=5).json()
                info.update({
                    "server_n_ctx": props_resp.get("default_generation_settings", {}).get("n_ctx"),  # Example path
                    "server_chat_format": props_resp.get("chat_format"),
                    "server_clip_model": props_resp.get("mmproj"),
                })
            except Exception: pass  # Ignore if /props fails or data is missing

        is_llava = ("llava" in self.model_name.lower() or "bakllava" in self.model_name.lower()) or \
                   (self.server_args.get("clip_model_path") is not None) or \
                   (info.get("server_clip_model") is not None)

        info["supports_vision"] = is_llava
        info["supports_structured_output"] = self.server_args.get("grammar_string") is not None
        return info

    def listModels(self) -> List[Dict[str, str]]:
        # This binding manages one GGUF model at a time by starting a server for it.
        # To "list models", we scan the Lollms model directories for .gguf files.
        models_found = []
        gguf_pattern = "*.gguf"

        search_paths = []
        binding_specific_folder_name = self.binding_name

        search_paths.append(self.models_path)

        unique_models = set()
        for spath in search_paths:
            if spath.exists() and spath.is_dir():
                for model_file in spath.rglob(gguf_pattern):  # rglob for recursive search
                    if model_file.is_file() and model_file.name not in unique_models:
                        models_found.append({
                            'model_name': model_file.name,
                            # Path relative to one of the main model roots for display/selection
                            'path_hint': str(model_file.relative_to(spath.parent) if model_file.is_relative_to(spath.parent) else model_file),
                            'size_gb': f"{model_file.stat().st_size / (1024**3):.2f} GB"
                        })
                        unique_models.add(model_file.name)
        return models_found

    def __del__(self):
        self.unload_model()  # Ensure the server is stopped when the binding is deleted


if __name__ == '__main__':
    ASCIIColors.yellow("Testing LlamaCppServerBinding...")

    # --- Configuration ---
    # This should be the NAME of your GGUF model file. The binding will search for it.
    # e.g., "Mistral-7B-Instruct-v0.2-Q4_K_M.gguf"
    # Ensure this model is placed in one of the Lollms model directories.
    # For testing, you can put a small GGUF model in the same directory as this script
    # and set personal_models_path to "."

    # Adjust current_directory if your models are elsewhere for testing
    current_directory = Path(__file__).parent
    models_path = r"E:\lollms\models\gguf\Mistral-Nemo-Instruct-2407-GGUF"  # replace with your own model path
    model_name = "Mistral-Nemo-Instruct-2407-Q2_K.gguf"

    # Binding config (passed to server_args)
    binding_config = {
        "n_gpu_layers": 0,   # Set to -1 or a number for GPU offload
        "n_ctx": 512,        # Short context for testing
        "embedding": True,   # Enable for embedding tests
        "verbose": False,    # llama.cpp server verbose logs
        # "extra_cli_flags": ["--cont-batching"],  # Example of extra flags
        "server_startup_timeout": 180  # Give the server more time to start, esp. with large models
    }

    active_binding = None
    try:
        ASCIIColors.cyan("\n--- Initializing LlamaCppServerBinding ---")
        active_binding = LlamaCppServerBinding(
            model_name=model_name,
            models_path=models_path,
            config=binding_config
        )
        if not active_binding.server_process or not active_binding.server_process.is_healthy:
            raise RuntimeError("Server process failed to start or become healthy.")

        ASCIIColors.green(f"Binding initialized. Server for '{active_binding.model_name}' running on port {active_binding.port}.")
        ASCIIColors.info(f"Model Info: {json.dumps(active_binding.get_model_info(), indent=2)}")

        # --- List Models (scans configured directories) ---
        ASCIIColors.cyan("\n--- Listing Models (from search paths) ---")
        listed_models = active_binding.listModels()
        if listed_models:
            ASCIIColors.green(f"Found {len(listed_models)} GGUF files. First 5:")
            for m in listed_models[:5]: print(m)
        else: ASCIIColors.warning("No GGUF models found in search paths.")

        # --- Tokenize/Detokenize ---
        ASCIIColors.cyan("\n--- Tokenize/Detokenize ---")
        sample_text = "Hello, Llama.cpp server world!"
        tokens = active_binding.tokenize(sample_text)
        ASCIIColors.green(f"Tokens for '{sample_text}': {tokens[:10]}...")
        token_count = active_binding.count_tokens(sample_text)
        ASCIIColors.green(f"Token count: {token_count}")
        if tokens:  # Only detokenize if tokenization worked
            detokenized_text = active_binding.detokenize(tokens)
            ASCIIColors.green(f"Detokenized text: {detokenized_text}")
            # Note: an exact match might depend on BOS/EOS handling by the server's tokenizer
            # assert detokenized_text.strip() == sample_text.strip(), "Tokenization/Detokenization mismatch!"
        else: ASCIIColors.warning("Tokenization returned empty list, skipping detokenization.")

        # --- Text Generation (Non-Streaming, Chat Format using /v1/chat/completions) ---
        ASCIIColors.cyan("\n--- Text Generation (Non-Streaming, Chat API) ---")
        prompt_text = "What is the capital of Germany?"
        system_prompt_text = "You are a concise geography expert."
        generated_text = active_binding.generate_text(
            prompt_text, system_prompt=system_prompt_text, n_predict=20, stream=False,
            use_chat_format_override=True  # Force /v1/chat/completions
        )
        if isinstance(generated_text, str): ASCIIColors.green(f"Generated text: {generated_text}")
        else: ASCIIColors.error(f"Generation failed: {generated_text}")

        # --- Text Generation (Streaming, /completion API) ---
        ASCIIColors.cyan("\n--- Text Generation (Streaming, Completion API) ---")
        full_streamed_text = ""
        def stream_callback(chunk: str, msg_type: int):
            global full_streamed_text; ASCIIColors.green(f"{chunk}", end="", flush=True)
            full_streamed_text += chunk; return True

        result = active_binding.generate_text(
            prompt_text, system_prompt=system_prompt_text, n_predict=30, stream=True,
            streaming_callback=stream_callback, use_chat_format_override=False  # Force /completion
        )
        print("\n--- End of Stream ---")
        if isinstance(result, str): ASCIIColors.green(f"Full streamed text: {result}")
        else: ASCIIColors.error(f"Streaming generation failed: {result}")

        # --- Embeddings ---
        if binding_config.get("embedding"):
            ASCIIColors.cyan("\n--- Embeddings ---")
            embedding_text = "Test sentence for server-based embeddings."
            try:
                embedding_vector = active_binding.embed(embedding_text)
                ASCIIColors.green(f"Embedding for '{embedding_text}' (first 3 dims): {embedding_vector[:3]}...")
                ASCIIColors.info(f"Embedding vector dimension: {len(embedding_vector)}")
            except Exception as e_emb: ASCIIColors.warning(f"Could not get embedding: {e_emb}")
        else: ASCIIColors.yellow("\n--- Embeddings Skipped (embedding: false in config) ---")

        # --- LLaVA Test (Conceptual - requires a LLaVA model and mmproj) ---
        # To test LLaVA:
        models_path = r"E:\drumber"  # replace with your own model path
        model_name = "llava-v1.6-mistral-7b.Q3_K_XS.gguf"
        model_path = Path(models_path) / model_name
        ASCIIColors.cyan("\n--- LLaVA Vision Test ---")
        dummy_image_path = Path("E:\\drumber\\drumber.png")
        try:
            from PIL import Image, ImageDraw
            img = Image.new('RGB', (150, 70), color='magenta')
            d = ImageDraw.Draw(img); d.text((10, 10), "Server LLaVA", fill='white')
            img.save(dummy_image_path)
            ASCIIColors.info(f"Created dummy image for LLaVA: {dummy_image_path}")

            llava_prompt = "Describe this image."
            # For /v1/chat/completions with LLaVA, images are passed in messages.
            # For /completion with LLaVA, the prompt needs an <image> placeholder and an image_data field.
            llava_response = active_binding.generate_text(
                prompt=llava_prompt, images=[str(dummy_image_path)], n_predict=40, stream=False,
                use_chat_format_override=True  # Use /v1/chat/completions for easier multimodal
            )
            if isinstance(llava_response, str): ASCIIColors.green(f"LLaVA response: {llava_response}")
            else: ASCIIColors.error(f"LLaVA generation failed: {llava_response}")
        except ImportError: ASCIIColors.warning("Pillow not found. Cannot create dummy image for LLaVA.")
        except Exception as e_llava: ASCIIColors.error(f"LLaVA test error: {e_llava}"); trace_exception(e_llava)
        finally:
            if dummy_image_path.exists(): dummy_image_path.unlink()

        # --- Test changing model ---
        # This part is conceptual. You'd need another GGUF model file for a real test.
        # Here we call load_model with the LLaVA model path defined above to exercise the reload logic.

        ASCIIColors.cyan("\n--- Testing Model Change ---")
        reload_success = active_binding.load_model(str(model_path))
        if reload_success and active_binding.server_process and active_binding.server_process.is_healthy:
            ASCIIColors.green(f"Model reloaded/re-confirmed successfully. Server on port {active_binding.port}.")
            # Quick generation test after reload
            reloaded_gen = active_binding.generate_text("Ping", n_predict=5, stream=False)
            if isinstance(reloaded_gen, str): ASCIIColors.green(f"Post-reload ping response: {reloaded_gen.strip()}")
            else: ASCIIColors.error(f"Post-reload generation failed: {reloaded_gen}")
        else:
            ASCIIColors.error("Failed to reload model or server not healthy after reload attempt.")


    except ImportError as e_imp:
        ASCIIColors.error(f"Import error: {e_imp}. Ensure llama-cpp-binaries is installed.")
    except FileNotFoundError as e_fnf:
        ASCIIColors.error(f"File not found error: {e_fnf}. Check model or server binary paths.")
    except ConnectionError as e_conn:
        ASCIIColors.error(f"Connection error (server might have failed to start or is unresponsive): {e_conn}")
    except RuntimeError as e_rt:
        ASCIIColors.error(f"Runtime error (often a server process issue): {e_rt}")
        if active_binding and active_binding.server_process:
            ASCIIColors.error("Last stderr lines from server:")
            for line in active_binding.server_process._stderr_lines[-20:]: print(line)  # Print last 20
    except Exception as e_main:
        ASCIIColors.error(f"An unexpected error occurred: {e_main}")
        trace_exception(e_main)
    finally:
        if active_binding:
            ASCIIColors.cyan("\n--- Unloading Model and Stopping Server ---")
            active_binding.unload_model()
            ASCIIColors.green("Server stopped and model unloaded.")


    ASCIIColors.yellow("\nLlamaCppServerBinding test finished.")