lollms-client 1.7.10__py3-none-any.whl → 1.8.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lollms_client/__init__.py +1 -1
- lollms_client/llm_bindings/claude/__init__.py +0 -1
- lollms_client/llm_bindings/grok/__init__.py +0 -1
- lollms_client/llm_bindings/llama_cpp_server/__init__.py +726 -0
- lollms_client/llm_bindings/ollama/__init__.py +40 -2
- lollms_client/lollms_discussion.py +209 -65
- lollms_client/lollms_llm_binding.py +15 -1
- lollms_client/lollms_mcp_binding.py +15 -3
- lollms_client/lollms_stt_binding.py +16 -2
- lollms_client/lollms_tti_binding.py +16 -2
- lollms_client/lollms_ttm_binding.py +16 -2
- lollms_client/lollms_tts_binding.py +16 -2
- lollms_client/lollms_ttv_binding.py +16 -2
- lollms_client/tti_bindings/diffusers/__init__.py +132 -79
- lollms_client/tti_bindings/diffusers/server/main.py +76 -65
- lollms_client/tti_bindings/open_router/__init__.py +341 -0
- lollms_client/tts_bindings/xtts/__init__.py +1 -1
- {lollms_client-1.7.10.dist-info → lollms_client-1.8.3.dist-info}/METADATA +1 -1
- {lollms_client-1.7.10.dist-info → lollms_client-1.8.3.dist-info}/RECORD +22 -21
- lollms_client/llm_bindings/llamacpp/__init__.py +0 -1155
- {lollms_client-1.7.10.dist-info → lollms_client-1.8.3.dist-info}/WHEEL +0 -0
- {lollms_client-1.7.10.dist-info → lollms_client-1.8.3.dist-info}/licenses/LICENSE +0 -0
- {lollms_client-1.7.10.dist-info → lollms_client-1.8.3.dist-info}/top_level.txt +0 -0
lollms_client/llm_bindings/llama_cpp_server/__init__.py (new file)

@@ -0,0 +1,726 @@
import subprocess
import sys
import os
import time
import requests
import socket
import re
import platform
import shutil  # used by install_llama_cpp() when renaming legacy server binaries
import zipfile
import tarfile
import json
import yaml
import atexit
from pathlib import Path
from typing import Optional, List, Dict, Any, Union, Callable

import pipmaster as pm
from ascii_colors import ASCIIColors, trace_exception
from lollms_client.lollms_llm_binding import LollmsLLMBinding
from lollms_client.lollms_types import MSG_TYPE
from lollms_client.lollms_discussion import LollmsDiscussion

# Ensure dependencies
pm.ensure_packages(["openai", "huggingface_hub", "filelock", "requests", "tqdm", "psutil", "pyyaml"])
import openai
from huggingface_hub import hf_hub_download
from filelock import FileLock
from tqdm import tqdm
import psutil

BindingName = "LlamaCppServerBinding"

def get_free_port(start_port=9624, max_port=10000):
    """
    Finds a free port on localhost.
    Race-condition safe-ish: We bind to it to check, but release it immediately.
    Real safety comes from the FileLock around this call.
    """
    for port in range(start_port, max_port):
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            try:
                sock.bind(('localhost', port))
                return port
            except OSError:
                continue
    raise RuntimeError("No free ports available.")

class LlamaCppServerBinding(LollmsLLMBinding):
    def __init__(self, **kwargs):
        super().__init__(BindingName, **kwargs)
        self.config = kwargs

        # Configuration
        self.host = kwargs.get("host", "localhost")
        self.model_name = kwargs.get("model_name", "")
        self.n_ctx = kwargs.get("ctx_size", 4096)
        self.n_gpu_layers = kwargs.get("n_gpu_layers", -1)
        self.n_threads = kwargs.get("n_threads", None)
        self.n_parallel = kwargs.get("n_parallel", 1)
        self.batch_size = kwargs.get("batch_size", 512)

        # Server Management
        self.max_active_models = int(kwargs.get("max_active_models", 1))
        self.idle_timeout = float(kwargs.get("idle_timeout", -1))

        # Paths
        self.binding_dir = Path(__file__).parent
        self.bin_dir = self.binding_dir / "bin"
        self.models_dir = Path(kwargs.get("models_path", "models/llama_cpp_models")).resolve()

        # Multimodal Registry
        self.mm_registry_path = self.models_dir / "multimodal_bindings.yaml"

        # Registry directory for inter-process coordination
        self.servers_dir = self.models_dir / "servers"
        self.servers_dir.mkdir(parents=True, exist_ok=True)
        self.bin_dir.mkdir(exist_ok=True)

        # Global lock file for all operations on the registry
        self.global_lock_path = self.models_dir / "global_server_manager.lock"

        # Installation check
        if not self._get_server_executable().exists():
            ASCIIColors.warning("Llama.cpp binary not found. Attempting installation...")
            self.install_llama_cpp()

        # Register cleanup for this process
        atexit.register(self.cleanup_orphans_if_needed)

    def _get_server_executable(self) -> Path:
        if platform.system() == "Windows":
            return self.bin_dir / "llama-server.exe"
        else:
            return self.bin_dir / "llama-server"

    def detect_hardware(self) -> str:
        sys_plat = platform.system()
        if sys_plat == "Darwin":
            return "macos"
        try:
            subprocess.check_call(["nvidia-smi"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            return "cuda"
        except:
            pass
        return "cpu"

    def install_llama_cpp(self):
        try:
            ASCIIColors.info("Checking latest llama.cpp release...")
            releases_url = "https://api.github.com/repos/ggerganov/llama.cpp/releases/latest"
            response = requests.get(releases_url)
            response.raise_for_status()
            release_data = response.json()
            assets = release_data.get("assets", [])

            hardware = self.detect_hardware()
            sys_plat = platform.system()

            target_asset = None
            search_terms = []

            if sys_plat == "Windows":
                search_terms.append("win")
                search_terms.append("cuda" if hardware == "cuda" else "avx2")
                search_terms.append("x64")
            elif sys_plat == "Linux":
                search_terms.append("ubuntu")
                search_terms.append("x64")
            elif sys_plat == "Darwin":
                search_terms.append("macos")
                search_terms.append("arm64" if platform.machine() == "arm64" else "x64")

            for asset in assets:
                name = asset["name"].lower()
                if "cudart" in name: continue
                if all(term in name for term in search_terms):
                    if "cuda" in name and "cu11" in name and hardware == "cuda": continue
                    target_asset = asset
                    break

            # Windows CPU fallback
            if not target_asset and sys_plat == "Windows" and hardware == "cpu":
                for asset in assets:
                    if "cudart" in asset["name"].lower(): continue
                    if "win" in asset["name"].lower() and "x64" in asset["name"].lower() and "cuda" not in asset["name"].lower():
                        target_asset = asset
                        break

            if not target_asset:
                raise RuntimeError(f"No suitable binary found for {sys_plat} / {hardware}")

            download_url = target_asset["browser_download_url"]
            filename = target_asset["name"]
            dest_file = self.bin_dir / filename

            ASCIIColors.info(f"Downloading {filename}...")
            with requests.get(download_url, stream=True) as r:
                r.raise_for_status()
                with open(dest_file, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)

            ASCIIColors.info("Extracting...")
            if filename.endswith(".zip"):
                with zipfile.ZipFile(dest_file, 'r') as z: z.extractall(self.bin_dir)
            elif filename.endswith(".tar.gz"):
                with tarfile.open(dest_file, "r:gz") as t: t.extractall(self.bin_dir)

            dest_file.unlink()

            # Normalize binary name
            exe_name = "llama-server.exe" if sys_plat == "Windows" else "llama-server"
            legacy_name = "server.exe" if sys_plat == "Windows" else "server"
            if not (self.bin_dir / exe_name).exists() and (self.bin_dir / legacy_name).exists():
                shutil.move(str(self.bin_dir / legacy_name), str(self.bin_dir / exe_name))

            if sys_plat != "Windows":
                exe_path = self.bin_dir / exe_name
                if exe_path.exists(): os.chmod(exe_path, 0o755)

            ASCIIColors.success("Llama.cpp installed successfully.")
        except Exception as e:
            trace_exception(e)
            ASCIIColors.error(f"Failed to install llama.cpp: {e}")

    # --- Server Management Logic ---

    def _get_registry_file(self, model_name: str) -> Path:
        # Sanitize filename
        safe_name = "".join(c for c in model_name if c.isalnum() or c in ('-', '_', '.'))
        return self.servers_dir / f"{safe_name}.json"

    def _get_server_info(self, model_name: str) -> Optional[Dict]:
        """Reads registry file for a model, returns dict or None if invalid."""
        reg_file = self._get_registry_file(model_name)
        if not reg_file.exists():
            return None

        try:
            with open(reg_file, 'r') as f:
                info = json.load(f)

            # Verify process is alive
            if psutil.pid_exists(info['pid']):
                # Verify it's actually llama-server (optional but safe)
                try:
                    p = psutil.Process(info['pid'])
                    if "llama" in p.name().lower() or "server" in p.name().lower():
                        return info
                except (psutil.NoSuchProcess, psutil.AccessDenied):
                    pass

            # If we get here, process is dead or invalid
            ASCIIColors.warning(f"Found stale registry file for {model_name} (PID {info['pid']}). Cleaning up.")
            reg_file.unlink()
            return None
        except Exception:
            # Corrupt file
            if reg_file.exists(): reg_file.unlink()
            return None

    def _kill_server(self, model_name: str, info: Dict):
        """Kills a server process and removes its registry file."""
        ASCIIColors.info(f"Stopping server for {model_name} (PID {info['pid']})...")
        try:
            p = psutil.Process(info['pid'])
            p.terminate()
            p.wait(timeout=5)
        except psutil.NoSuchProcess:
            pass # Already gone
        except psutil.TimeoutExpired:
            p.kill()
        except Exception as e:
            ASCIIColors.error(f"Error killing process: {e}")

        # Remove registry file
        reg_file = self._get_registry_file(model_name)
        if reg_file.exists():
            reg_file.unlink()

    def _ensure_capacity_locked(self):
        """
        Called while holding the lock. Ensures we have space for a new model.
        """
        registry_files = list(self.servers_dir.glob("*.json"))

        # 1. Clean up stale entries first
        valid_servers = []
        for rf in registry_files:
            try:
                with open(rf, 'r') as f:
                    data = json.load(f)
                if psutil.pid_exists(data['pid']):
                    valid_servers.append((rf, data))
                else:
                    rf.unlink() # Clean stale
            except:
                if rf.exists(): rf.unlink()

        # 2. Check capacity
        if len(valid_servers) >= self.max_active_models:
            # Sort by file modification time (mtime), which acts as our "last used" heartbeat
            # Oldest mtime = Least Recently Used
            valid_servers.sort(key=lambda x: x[0].stat().st_mtime)

            # Kill the oldest
            oldest_file, oldest_info = valid_servers[0]
            model_to_kill = oldest_info.get("model_name", "unknown")
            ASCIIColors.warning(f"Max active models ({self.max_active_models}) reached. Unloading LRU model: {model_to_kill}")
            self._kill_server(model_to_kill, oldest_info)

    def _load_mm_registry(self) -> Dict[str, str]:
        if not self.mm_registry_path.exists():
            return {}
        try:
            with open(self.mm_registry_path, 'r') as f:
                registry = yaml.safe_load(f) or {}

            # Self-healing: remove missing files
            updated = False
            to_remove = []
            for m, p in registry.items():
                if not (self.models_dir / m).exists() or not (self.models_dir / p).exists():
                    to_remove.append(m)
                    updated = True

            for m in to_remove:
                del registry[m]

            if updated:
                self._save_mm_registry(registry)
            return registry
        except Exception as e:
            ASCIIColors.error(f"Failed to load multimodal registry: {e}")
            return {}

    def _save_mm_registry(self, registry: Dict[str, str]):
        try:
            with open(self.mm_registry_path, 'w') as f:
                yaml.dump(registry, f)
        except Exception as e:
            ASCIIColors.error(f"Failed to save multimodal registry: {e}")

    def bind_multimodal_model(self, model_name: str, mmproj_name: str) -> dict:
        """Explicitly binds a model to an mmproj file."""
        if not (self.models_dir / model_name).exists():
            return {"status": False, "error": f"Model {model_name} not found."}
        if not (self.models_dir / mmproj_name).exists():
            return {"status": False, "error": f"Projector {mmproj_name} not found."}

        registry = self._load_mm_registry()
        registry[model_name] = mmproj_name
        self._save_mm_registry(registry)

        ASCIIColors.success(f"Bound {model_name} with {mmproj_name}")
        return {"status": True, "message": f"Bound {model_name} with {mmproj_name}"}

    def _find_mmproj(self, model_path: Path) -> Optional[Path]:
        """Finds a corresponding mmproj file for a given model path."""
        # 1. Check registry first
        registry = self._load_mm_registry()
        if model_path.name in registry:
            proj_path = self.models_dir / registry[model_path.name]
            if proj_path.exists():
                return proj_path

        # 2. Automatic detection patterns
        stem = model_path.stem
        clean_stem = re.sub(r'\.(Q\d_.*|f16|f32)$', '', stem)
        patterns = [
            f"{stem}.mmproj", f"{stem}-mmproj.gguf", f"{stem}.mmproj.gguf",
            f"{clean_stem}.mmproj", f"{clean_stem}-mmproj.gguf",
            f"mmproj-{stem}.gguf", "mmproj.gguf"
        ]

        for p in patterns:
            pot = model_path.parent / p
            if pot.exists():
                return pot

        # 3. Last resort: simple scan
        try:
            for f in model_path.parent.iterdir():
                if f.is_file() and "mmproj" in f.name.lower() and f.name != model_path.name:
                    if f.suffix in [".gguf", ".mmproj", ".bin"]:
                        return f
        except:
            pass

        return None

    def _spawn_server_detached(self, model_name: str):
        """Spawns the server process detached so it survives if this python script ends."""
        exe_path = self._get_server_executable()
        model_path = self.models_dir / model_name

        if not model_path.exists():
            raise FileNotFoundError(f"Model {model_name} not found at {model_path}")

        port = get_free_port()

        cmd = [
            str(exe_path),
            "--model", str(model_path),
            "--host", self.host,
            "--port", str(port),
            "--ctx-size", str(self.n_ctx),
            "--n-gpu-layers", str(self.n_gpu_layers),
            "--parallel", str(self.n_parallel),
            "--batch-size", str(self.batch_size),
            "--embedding"
        ]

        # Automatic detection or Registry-based mmproj
        mmproj_path = self._find_mmproj(model_path)
        if mmproj_path:
            ASCIIColors.info(f"Detected multimodal projector: {mmproj_path}")
            cmd.extend(["--mmproj", str(mmproj_path)])

        if self.n_threads:
            cmd.extend(["--threads", str(self.n_threads)])

        ASCIIColors.info(f"Spawning server for {model_name} on port {port}...")

        # Process creation flags for detachment
        kwargs = {}
        if platform.system() == "Windows":
            kwargs['creationflags'] = subprocess.DETACHED_PROCESS | subprocess.CREATE_NEW_PROCESS_GROUP
        else:
            kwargs['start_new_session'] = True

        proc = subprocess.Popen(
            cmd,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            **kwargs
        )

        # Wait for health check
        url = f"http://{self.host}:{port}/v1"
        start_time = time.time()
        while time.time() - start_time < 120:
            try:
                res = requests.get(f"{url}/models", timeout=1)
                if res.status_code == 200:
                    return proc.pid, port, url
            except:
                pass

            if proc.poll() is not None:
                raise RuntimeError(f"Server process exited immediately with code {proc.returncode}")
            time.sleep(0.5)

        proc.terminate()
        raise TimeoutError(f"Server for {model_name} failed to become responsive.")

    def load_model(self, model_name: str) -> bool:
        """Thread-safe and Process-safe model loading."""
        if not self.global_lock_path.parent.exists():
            self.global_lock_path.parent.mkdir(parents=True)

        lock = FileLock(str(self.global_lock_path))
        try:
            with lock.acquire(timeout=60):
                info = self._get_server_info(model_name)
                if info:
                    try:
                        self._get_registry_file(model_name).touch()
                    except:
                        pass
                    self.model_name = model_name
                    return True

                self._ensure_capacity_locked()
                pid, port, url = self._spawn_server_detached(model_name)

                reg_file = self._get_registry_file(model_name)
                with open(reg_file, 'w') as f:
                    json.dump({
                        "model_name": model_name, "pid": pid, "port": port, "url": url, "started_at": time.time()
                    }, f)

                self.model_name = model_name
                return True
        except Exception as e:
            ASCIIColors.error(f"Error loading model {model_name}: {e}")
            trace_exception(e)
            return False

    def _get_client(self, model_name: str = None) -> openai.OpenAI:
        target_model = model_name or self.model_name
        if not target_model:
            raise ValueError("No model specified.")
        info = self._get_server_info(target_model)
        if not info:
            if self.load_model(target_model):
                info = self._get_server_info(target_model)
            else:
                raise RuntimeError(f"Could not load model {target_model}")
        else:
            try:
                self._get_registry_file(target_model).touch()
            except:
                pass
        if not info:
            raise RuntimeError(f"Model {target_model} failed to load.")
        return openai.OpenAI(base_url=info['url'], api_key="sk-no-key-required")

    def _execute_with_retry(self, func: Callable, *args, **kwargs):
        retries = 60
        for i in range(retries):
            try:
                return func(*args, **kwargs)
            except openai.InternalServerError as e:
                if e.status_code == 503:
                    if i % 10 == 0:
                        ASCIIColors.warning(f"Model is loading (503). Waiting... ({i+1}/{retries})")
                    time.sleep(2)
                    continue
                raise e
            except openai.APIConnectionError:
                if i % 10 == 0:
                    ASCIIColors.warning(f"Connection error. Waiting... ({i+1}/{retries})")
                time.sleep(2)
                continue
        return func(*args, **kwargs)
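
    # All request helpers below (text completion, chat, tokenize/detokenize, embeddings)
    # go through _execute_with_retry so that 503 "model is loading" responses and
    # transient connection errors are retried instead of surfacing to the caller.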

    def generate_text(self, prompt: str, n_predict: int = None, stream: bool = False, **kwargs) -> Union[str, Dict]:
        try:
            client = self._get_client()
            def do_gen():
                return client.completions.create(
                    model=self.model_name, prompt=prompt,
                    max_tokens=n_predict if n_predict else 1024,
                    temperature=kwargs.get("temperature", 0.7),
                    top_p=kwargs.get("top_p", 0.9), stream=stream,
                    extra_body={"top_k": kwargs.get("top_k", 40), "repeat_penalty": kwargs.get("repeat_penalty", 1.1), "n_predict": n_predict}
                )
            completion = self._execute_with_retry(do_gen)
            if stream:
                full_text = ""
                for chunk in completion:
                    content = chunk.choices[0].text
                    full_text += content
                    if kwargs.get("streaming_callback"):
                        if not kwargs["streaming_callback"](content, MSG_TYPE.MSG_TYPE_CHUNK):
                            break
                return full_text
            else:
                return completion.choices[0].text
        except Exception as e:
            trace_exception(e)
            return {"status": False, "error": str(e)}

    def chat(self, discussion: LollmsDiscussion, **kwargs) -> Union[str, Dict]:
        try:
            client = self._get_client()
            messages = discussion.export("openai_chat")
            def do_chat():
                return client.chat.completions.create(
                    model=self.model_name, messages=messages,
                    max_tokens=kwargs.get("n_predict", 1024),
                    temperature=kwargs.get("temperature", 0.7),
                    stream=kwargs.get("stream", False),
                    extra_body={"top_k": kwargs.get("top_k", 40), "repeat_penalty": kwargs.get("repeat_penalty", 1.1)}
                )
            response = self._execute_with_retry(do_chat)
            if kwargs.get("stream", False):
                full_text = ""
                for chunk in response:
                    content = chunk.choices[0].delta.content or ""
                    full_text += content
                    if kwargs.get("streaming_callback"):
                        if not kwargs["streaming_callback"](content, MSG_TYPE.MSG_TYPE_CHUNK):
                            break
                return full_text
            else:
                return response.choices[0].message.content
        except Exception as e:
            trace_exception(e)
            return {"status": False, "error": str(e)}

    def list_models(self) -> List[Dict[str, Any]]:
        models = []
        if self.models_dir.exists():
            for f in self.models_dir.glob("*.gguf"):
                # Hide files explicitly containing 'mmproj' as they are not standalone models
                if "mmproj" in f.name.lower():
                    continue

                if re.search(r'-\d{5}-of-\d{5}\.gguf$', f.name):
                    if "00001-of-" not in f.name: continue
                models.append({"model_name": f.name, "owned_by": "local", "created": time.ctime(f.stat().st_ctime), "size": f.stat().st_size})
        return models

    def get_model_info(self) -> dict:
        info = {"name": BindingName, "version": "source-wrapper", "active_model": self.model_name}
        reg = self._get_server_info(self.model_name)
        if reg: info["host_address"] = reg['url']
        return info

    def tokenize(self, text: str) -> list:
        try:
            client = self._get_client()
            url = client.base_url
            def do_tokenize():
                ep = f"{url}tokenize"
                res = requests.post(ep, json={"content": text})
                if res.status_code == 404:
                    res = requests.post(str(url).replace("/v1/", "/tokenize"), json={"content": text})
                if res.status_code == 503:
                    raise openai.InternalServerError("Loading model", response=res, body=None)
                return res
            res = self._execute_with_retry(do_tokenize)
            if res.status_code == 200: return res.json().get("tokens", [])
        except: pass
        return list(text)

    def detokenize(self, tokens: list) -> str:
        try:
            client = self._get_client()
            url = client.base_url
            def do_detokenize():
                ep = f"{url}detokenize"
                res = requests.post(ep, json={"tokens": tokens})
                if res.status_code == 404:
                    res = requests.post(str(url).replace("/v1/", "/detokenize"), json={"tokens": tokens})
                if res.status_code == 503:
                    raise openai.InternalServerError("Loading model", response=res, body=None)
                return res
            res = self._execute_with_retry(do_detokenize)
            if res.status_code == 200: return res.json().get("content", "")
        except: pass
        return "".join(map(str, tokens))

    def count_tokens(self, text: str) -> int: return len(self.tokenize(text))

    def embed(self, text: str, **kwargs) -> List[float]:
        client = self._get_client()
        def do_embed():
            return client.embeddings.create(input=text, model=self.model_name)
        res = self._execute_with_retry(do_embed)
        return res.data[0].embedding

    def get_zoo(self) -> List[Dict[str, Any]]:
        return [
            # Ministral 3: High-performance edge model (3B)
            {
                "name": "Ministral-3-3B-Instruct-2512-GGUF",
                "description": "Mistral AI Ministral 3 3B Instruct (Bartowski Quant) - Efficient Edge Model",
                "size": "2.2 GB (Q4_K_M)",
                "type": "gguf",
                "link": "bartowski/mistralai_Ministral-3-3B-Instruct-2512-GGUF",
                "filename": "mistralai_Ministral-3-3B-Instruct-2512-Q4_K_M.gguf"
            },
            # Devstral 2 Mini: Agentic coding specialist (24B)
            {
                "name": "Devstral-Small-2-24B-Instruct-GGUF",
                "description": "Mistral AI Devstral Small 2 24B Instruct (Bartowski Quant) - Coding Specialist",
                "size": "14.8 GB (Q4_K_M)",
                "type": "gguf",
                "link": "bartowski/mistralai_Devstral-Small-2-24B-Instruct-2512-GGUF",
                "filename": "mistralai_Devstral-Small-2-24B-Instruct-2512-Q4_K_M.gguf"
            },
            # Llama 4 Scout: Meta's efficient MoE (17B)
            {
                "name": "Llama-4-Scout-17B-Instruct-GGUF",
                "description": "Meta Llama 4 Scout 17B Instruct (Bartowski Quant) - 16-Expert MoE",
                "size": "11.2 GB (Q4_K_M)",
                "type": "gguf",
                "link": "bartowski/meta-llama_Llama-4-Scout-17B-16E-Instruct-old-GGUF",
                "filename": "meta-llama_Llama-4-Scout-17B-16E-Instruct-Q4_K_M.gguf"
            },
            # Qwen 3 VL: Vision-Language with "Thinking" (32B)
            {
                "name": "Qwen3-VL-32B-Thinking-GGUF",
                "description": "Qwen 3 VL 32B Thinking (Bartowski Quant) - Vision CoT Reasoning",
                "size": "19.5 GB (Q4_K_M)",
                "type": "gguf",
                "link": "bartowski/Qwen_Qwen3-VL-32B-Thinking-GGUF",
                "filename": "Qwen_Qwen3-VL-32B-Thinking-Q4_K_M.gguf"
            },
            # Qwen 3: Dense reasoning powerhouse (72B)
            {
                "name": "Qwen3-72B-Embiggened-GGUF",
                "description": "Qwen 3 72B Embiggened (Bartowski Quant) - Enhanced Reasoning Dense Model",
                "size": "43.1 GB (Q4_K_M)",
                "type": "gguf",
                "link": "bartowski/cognitivecomputations_Qwen3-72B-Embiggened-GGUF",
                "filename": "Qwen3-72B-Embiggened-Q4_K_M.gguf"
            },
            # Devstral 2: Massive coding architecture (123B)
            {
                "name": "Devstral-2-123B-Instruct-GGUF",
                "description": "Mistral AI Devstral 2 123B Instruct (Bartowski Quant) - Heavy Duty Coding",
                "size": "71.4 GB (Q4_K_M)",
                "type": "gguf",
                "link": "bartowski/mistralai_Devstral-2-123B-Instruct-2512-GGUF",
                "filename": "Devstral-2-123B-Instruct-2512-Q4_K_M.gguf"
            },
            # ChatGPT OSS: Open weights rival (120B)
            {
                "name": "ChatGPT-OSS-120B-GGUF",
                "description": "OpenAI GPT-OSS 120B (Bartowski Quant) - Open Weight Research Model",
                "size": "69.8 GB (Q4_K_M)",
                "type": "gguf",
                "link": "bartowski/openai_gpt-oss-120b-GGUF",
                "filename": "gpt-oss-120b-Q4_K_M.gguf"
            },
            # DeepSeek V3: The MoE Giant (671B Base / 37B Active)
            {
                "name": "DeepSeek-V3-0324-GGUF",
                "description": "DeepSeek V3 0324 (Bartowski Quant) - 671B MoE",
                "size": "365 GB (Q4_K_M)",
                "type": "gguf",
                "link": "bartowski/deepseek-ai_DeepSeek-V3-0324-GGUF",
                "filename": "DeepSeek-V3-0324-Q4_K_M.gguf"
            }
        ]


    def download_from_zoo(self, index: int, progress_callback: Callable[[dict], None] = None) -> dict:
        zoo = self.get_zoo()
        if index < 0 or index >= len(zoo): return {"status": False, "message": "Index out of bounds"}
        item = zoo[index]
        return self.pull_model(item["link"], item.get("filename"), progress_callback=progress_callback)

    def pull_model(self, repo_id: str, filename: str, mmproj_repo_id: str = None, mmproj_filename: str = None, progress_callback: Callable[[dict], None] = None) -> dict:
        try:
            match = re.match(r"^(.*)-(\d{5})-of-(\d{5})\.gguf$", filename)
            files = []
            if match:
                base, total = match.group(1), int(match.group(3))
                ASCIIColors.info(f"Detected multi-file model with {total} parts.")
                for i in range(1, total + 1): files.append(f"{base}-{i:05d}-of-{total:05d}.gguf")
            else:
                files.append(filename)
            paths = []
            for f in files:
                ASCIIColors.info(f"Downloading {f} from {repo_id}...")
                if progress_callback: progress_callback({"status": "downloading", "message": f"Downloading {f}", "completed": 0, "total": 100})
                p = hf_hub_download(repo_id=repo_id, filename=f, local_dir=self.models_dir, local_dir_use_symlinks=False, resume_download=True)
                paths.append(p)
                ASCIIColors.success(f"Downloaded {f}")

            if mmproj_filename:
                proj_repo = mmproj_repo_id if mmproj_repo_id else repo_id
                ASCIIColors.info(f"Downloading mmproj {mmproj_filename} from {proj_repo}...")
                hf_hub_download(repo_id=proj_repo, filename=mmproj_filename, local_dir=self.models_dir, local_dir_use_symlinks=False, resume_download=True)
                ASCIIColors.success(f"Downloaded mmproj {mmproj_filename}")
                # Automatically bind the model with its projector
                self.bind_multimodal_model(filename, mmproj_filename)

            msg = f"Successfully downloaded model: {filename}"
            if mmproj_filename: msg += f" and bound with projector: {mmproj_filename}"
            if progress_callback: progress_callback({"status": "success", "message": msg, "completed": 100, "total": 100})
            return {"status": True, "message": msg, "path": paths[0]}
        except Exception as e:
            trace_exception(e)
            return {"status": False, "error": str(e)}

    def cleanup_orphans_if_needed(self):
        pass

    def __del__(self):
        pass
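
A minimal usage sketch of the new binding, constructed directly for illustration (in a full application the binding is normally selected through LollmsClient). The constructor keywords, load_model, generate_text, and the streaming-callback signature come from the file above; the model filename and parameter values are illustrative assumptions.

from lollms_client.llm_bindings.llama_cpp_server import LlamaCppServerBinding

# Point the binding at a folder of GGUF files; values here are placeholders.
binding = LlamaCppServerBinding(
    models_path="models/llama_cpp_models",
    model_name="mistralai_Ministral-3-3B-Instruct-2512-Q4_K_M.gguf",
    ctx_size=4096,
    n_gpu_layers=-1,
    max_active_models=1,
)

# Starts (or reuses) a detached llama-server for the model and registers it.
binding.load_model(binding.model_name)

# Stream tokens through a callback; returning False stops the stream.
def on_chunk(chunk, msg_type):
    print(chunk, end="", flush=True)
    return True

text = binding.generate_text(
    "List three uses of llama.cpp.",
    n_predict=128,
    stream=True,
    streaming_callback=on_chunk,
)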