lollms-client 1.5.6__py3-none-any.whl → 1.7.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lollms_client/__init__.py +1 -1
- lollms_client/llm_bindings/azure_openai/__init__.py +2 -2
- lollms_client/llm_bindings/claude/__init__.py +125 -35
- lollms_client/llm_bindings/gemini/__init__.py +261 -159
- lollms_client/llm_bindings/grok/__init__.py +52 -15
- lollms_client/llm_bindings/groq/__init__.py +2 -2
- lollms_client/llm_bindings/hugging_face_inference_api/__init__.py +2 -2
- lollms_client/llm_bindings/litellm/__init__.py +1 -1
- lollms_client/llm_bindings/llama_cpp_server/__init__.py +605 -0
- lollms_client/llm_bindings/llamacpp/__init__.py +18 -11
- lollms_client/llm_bindings/lollms/__init__.py +76 -21
- lollms_client/llm_bindings/lollms_webui/__init__.py +1 -1
- lollms_client/llm_bindings/mistral/__init__.py +2 -2
- lollms_client/llm_bindings/novita_ai/__init__.py +142 -6
- lollms_client/llm_bindings/ollama/__init__.py +345 -89
- lollms_client/llm_bindings/open_router/__init__.py +2 -2
- lollms_client/llm_bindings/openai/__init__.py +81 -20
- lollms_client/llm_bindings/openllm/__init__.py +362 -506
- lollms_client/llm_bindings/openwebui/__init__.py +333 -171
- lollms_client/llm_bindings/perplexity/__init__.py +2 -2
- lollms_client/llm_bindings/pythonllamacpp/__init__.py +3 -3
- lollms_client/llm_bindings/tensor_rt/__init__.py +1 -1
- lollms_client/llm_bindings/transformers/__init__.py +428 -632
- lollms_client/llm_bindings/vllm/__init__.py +1 -1
- lollms_client/lollms_agentic.py +4 -2
- lollms_client/lollms_base_binding.py +61 -0
- lollms_client/lollms_core.py +512 -1890
- lollms_client/lollms_discussion.py +65 -39
- lollms_client/lollms_llm_binding.py +126 -261
- lollms_client/lollms_mcp_binding.py +49 -77
- lollms_client/lollms_stt_binding.py +99 -52
- lollms_client/lollms_tti_binding.py +38 -38
- lollms_client/lollms_ttm_binding.py +38 -42
- lollms_client/lollms_tts_binding.py +43 -18
- lollms_client/lollms_ttv_binding.py +38 -42
- lollms_client/lollms_types.py +4 -2
- lollms_client/stt_bindings/whisper/__init__.py +108 -23
- lollms_client/stt_bindings/whispercpp/__init__.py +7 -1
- lollms_client/tti_bindings/diffusers/__init__.py +464 -803
- lollms_client/tti_bindings/diffusers/server/main.py +1062 -0
- lollms_client/tti_bindings/gemini/__init__.py +182 -239
- lollms_client/tti_bindings/leonardo_ai/__init__.py +6 -3
- lollms_client/tti_bindings/lollms/__init__.py +4 -1
- lollms_client/tti_bindings/novita_ai/__init__.py +5 -2
- lollms_client/tti_bindings/openai/__init__.py +10 -11
- lollms_client/tti_bindings/stability_ai/__init__.py +5 -3
- lollms_client/ttm_bindings/audiocraft/__init__.py +7 -12
- lollms_client/ttm_bindings/beatoven_ai/__init__.py +7 -3
- lollms_client/ttm_bindings/lollms/__init__.py +4 -17
- lollms_client/ttm_bindings/replicate/__init__.py +7 -4
- lollms_client/ttm_bindings/stability_ai/__init__.py +7 -4
- lollms_client/ttm_bindings/topmediai/__init__.py +6 -3
- lollms_client/tts_bindings/bark/__init__.py +7 -10
- lollms_client/tts_bindings/lollms/__init__.py +6 -1
- lollms_client/tts_bindings/piper_tts/__init__.py +8 -11
- lollms_client/tts_bindings/xtts/__init__.py +157 -74
- lollms_client/tts_bindings/xtts/server/main.py +241 -280
- {lollms_client-1.5.6.dist-info → lollms_client-1.7.13.dist-info}/METADATA +113 -5
- lollms_client-1.7.13.dist-info/RECORD +90 -0
- lollms_client-1.5.6.dist-info/RECORD +0 -87
- {lollms_client-1.5.6.dist-info → lollms_client-1.7.13.dist-info}/WHEEL +0 -0
- {lollms_client-1.5.6.dist-info → lollms_client-1.7.13.dist-info}/licenses/LICENSE +0 -0
- {lollms_client-1.5.6.dist-info → lollms_client-1.7.13.dist-info}/top_level.txt +0 -0

lollms_client/llm_bindings/llama_cpp_server/__init__.py (new file)

@@ -0,0 +1,605 @@
+import subprocess
+import sys
+import os
+import time
+import requests
+import socket
+import re
+import platform
+import zipfile
+import tarfile
+import json
+import atexit
+from pathlib import Path
+from typing import Optional, List, Dict, Any, Union, Callable
+
+import pipmaster as pm
+from ascii_colors import ASCIIColors, trace_exception
+from lollms_client.lollms_llm_binding import LollmsLLMBinding
+from lollms_client.lollms_types import MSG_TYPE
+from lollms_client.lollms_discussion import LollmsDiscussion
+
+# Ensure dependencies
+pm.ensure_packages(["openai", "huggingface_hub", "filelock", "requests", "tqdm", "psutil"])
+import openai
+from huggingface_hub import hf_hub_download
+from filelock import FileLock
+from tqdm import tqdm
+import psutil
+
+BindingName = "LlamaCppServerBinding"
+
+def get_free_port(start_port=9624, max_port=10000):
+    """
+    Finds a free port on localhost.
+    Race-condition safe-ish: We bind to it to check, but release it immediately.
+    Real safety comes from the FileLock around this call.
+    """
+    for port in range(start_port, max_port):
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
+            try:
+                sock.bind(('localhost', port))
+                return port
+            except OSError:
+                continue
+    raise RuntimeError("No free ports available.")
+
+class LlamaCppServerBinding(LollmsLLMBinding):
+    def __init__(self, **kwargs):
+        super().__init__(BindingName, **kwargs)
+        self.config = kwargs
+
+        # Configuration
+        self.host = kwargs.get("host", "localhost")
+        self.model_name = kwargs.get("model_name", "")
+        self.n_ctx = kwargs.get("ctx_size", 4096)
+        self.n_gpu_layers = kwargs.get("n_gpu_layers", -1)
+        self.n_threads = kwargs.get("n_threads", None)
+        self.n_parallel = kwargs.get("n_parallel", 1)
+        self.batch_size = kwargs.get("batch_size", 512)
+
+        # Server Management
+        self.max_active_models = int(kwargs.get("max_active_models", 1))
+        self.idle_timeout = float(kwargs.get("idle_timeout", -1))
+
+        # Paths
+        self.binding_dir = Path(__file__).parent
+        self.bin_dir = self.binding_dir / "bin"
+        self.models_dir = Path(kwargs.get("models_path", "models/llama_cpp_models")).resolve()
+
+        # Registry directory for inter-process coordination
+        self.servers_dir = self.models_dir / "servers"
+        self.servers_dir.mkdir(parents=True, exist_ok=True)
+        self.bin_dir.mkdir(exist_ok=True)
+
+        # Global lock file for all operations on the registry
+        self.global_lock_path = self.models_dir / "global_server_manager.lock"
+
+        # Installation check
+        if not self._get_server_executable().exists():
+            ASCIIColors.warning("Llama.cpp binary not found. Attempting installation...")
+            self.install_llama_cpp()
+
+        # Register cleanup for this process
+        atexit.register(self.cleanup_orphans_if_needed)
+
+    def _get_server_executable(self) -> Path:
+        if platform.system() == "Windows":
+            return self.bin_dir / "llama-server.exe"
+        else:
+            return self.bin_dir / "llama-server"
+
+    def detect_hardware(self) -> str:
+        sys_plat = platform.system()
+        if sys_plat == "Darwin":
+            return "macos"
+        try:
+            subprocess.check_call(["nvidia-smi"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+            return "cuda"
+        except:
+            pass
+        return "cpu"
+
+    def install_llama_cpp(self):
+        try:
+            ASCIIColors.info("Checking latest llama.cpp release...")
+            releases_url = "https://api.github.com/repos/ggerganov/llama.cpp/releases/latest"
+            response = requests.get(releases_url)
+            response.raise_for_status()
+            release_data = response.json()
+            assets = release_data.get("assets", [])
+
+            hardware = self.detect_hardware()
+            sys_plat = platform.system()
+
+            target_asset = None
+            search_terms = []
+
+            if sys_plat == "Windows":
+                search_terms.append("win")
+                search_terms.append("cuda" if hardware == "cuda" else "avx2")
+                search_terms.append("x64")
+            elif sys_plat == "Linux":
+                search_terms.append("ubuntu")
+                search_terms.append("x64")
+            elif sys_plat == "Darwin":
+                search_terms.append("macos")
+                search_terms.append("arm64" if platform.machine() == "arm64" else "x64")
+
+            for asset in assets:
+                name = asset["name"].lower()
+                if "cudart" in name: continue
+                if all(term in name for term in search_terms):
+                    if "cuda" in name and "cu11" in name and hardware == "cuda": continue
+                    target_asset = asset
+                    break
+
+            # Windows CPU fallback
+            if not target_asset and sys_plat == "Windows" and hardware == "cpu":
+                for asset in assets:
+                    if "cudart" in asset["name"].lower(): continue
+                    if "win" in asset["name"].lower() and "x64" in asset["name"].lower() and "cuda" not in asset["name"].lower():
+                        target_asset = asset
+                        break
+
+            if not target_asset:
+                raise RuntimeError(f"No suitable binary found for {sys_plat} / {hardware}")
+
+            download_url = target_asset["browser_download_url"]
+            filename = target_asset["name"]
+            dest_file = self.bin_dir / filename
+
+            ASCIIColors.info(f"Downloading {filename}...")
+            with requests.get(download_url, stream=True) as r:
+                r.raise_for_status()
+                with open(dest_file, 'wb') as f:
+                    for chunk in r.iter_content(chunk_size=8192):
+                        f.write(chunk)
+
+            ASCIIColors.info("Extracting...")
+            if filename.endswith(".zip"):
+                with zipfile.ZipFile(dest_file, 'r') as z: z.extractall(self.bin_dir)
+            elif filename.endswith(".tar.gz"):
+                with tarfile.open(dest_file, "r:gz") as t: t.extractall(self.bin_dir)
+
+            dest_file.unlink()
+
+            # Normalize binary name
+            exe_name = "llama-server.exe" if sys_plat == "Windows" else "llama-server"
+            legacy_name = "server.exe" if sys_plat == "Windows" else "server"
+            if not (self.bin_dir / exe_name).exists() and (self.bin_dir / legacy_name).exists():
+                shutil.move(str(self.bin_dir / legacy_name), str(self.bin_dir / exe_name))
+
+            if sys_plat != "Windows":
+                exe_path = self.bin_dir / exe_name
+                if exe_path.exists(): os.chmod(exe_path, 0o755)
+
+            ASCIIColors.success("Llama.cpp installed successfully.")
+        except Exception as e:
+            trace_exception(e)
+            ASCIIColors.error(f"Failed to install llama.cpp: {e}")
+
+    # --- Server Management Logic ---
+
+    def _get_registry_file(self, model_name: str) -> Path:
+        # Sanitize filename
+        safe_name = "".join(c for c in model_name if c.isalnum() or c in ('-', '_', '.'))
+        return self.servers_dir / f"{safe_name}.json"
+
+    def _get_server_info(self, model_name: str) -> Optional[Dict]:
+        """Reads registry file for a model, returns dict or None if invalid."""
+        reg_file = self._get_registry_file(model_name)
+        if not reg_file.exists():
+            return None
+
+        try:
+            with open(reg_file, 'r') as f:
+                info = json.load(f)
+
+            # Verify process is alive
+            if psutil.pid_exists(info['pid']):
+                # Verify it's actually llama-server (optional but safe)
+                try:
+                    p = psutil.Process(info['pid'])
+                    if "llama" in p.name().lower() or "server" in p.name().lower():
+                        return info
+                except (psutil.NoSuchProcess, psutil.AccessDenied):
+                    pass
+
+            # If we get here, process is dead or invalid
+            ASCIIColors.warning(f"Found stale registry file for {model_name} (PID {info['pid']}). Cleaning up.")
+            reg_file.unlink()
+            return None
+        except Exception:
+            # Corrupt file
+            if reg_file.exists(): reg_file.unlink()
+            return None
+
+    def _kill_server(self, model_name: str, info: Dict):
+        """Kills a server process and removes its registry file."""
+        ASCIIColors.info(f"Stopping server for {model_name} (PID {info['pid']})...")
+        try:
+            p = psutil.Process(info['pid'])
+            p.terminate()
+            p.wait(timeout=5)
+        except psutil.NoSuchProcess:
+            pass # Already gone
+        except psutil.TimeoutExpired:
+            p.kill()
+        except Exception as e:
+            ASCIIColors.error(f"Error killing process: {e}")
+
+        # Remove registry file
+        reg_file = self._get_registry_file(model_name)
+        if reg_file.exists():
+            reg_file.unlink()
+
+    def _ensure_capacity_locked(self):
+        """
+        Called while holding the lock. Ensures we have space for a new model.
+        """
+        registry_files = list(self.servers_dir.glob("*.json"))
+
+        # 1. Clean up stale entries first
+        valid_servers = []
+        for rf in registry_files:
+            try:
+                with open(rf, 'r') as f:
+                    data = json.load(f)
+                if psutil.pid_exists(data['pid']):
+                    valid_servers.append((rf, data))
+                else:
+                    rf.unlink() # Clean stale
+            except:
+                if rf.exists(): rf.unlink()
+
+        # 2. Check capacity
+        if len(valid_servers) >= self.max_active_models:
+            # Sort by file modification time (mtime), which acts as our "last used" heartbeat
+            # Oldest mtime = Least Recently Used
+            valid_servers.sort(key=lambda x: x[0].stat().st_mtime)
+
+            # Kill the oldest
+            oldest_file, oldest_info = valid_servers[0]
+            model_to_kill = oldest_info.get("model_name", "unknown")
+            ASCIIColors.warning(f"Max active models ({self.max_active_models}) reached. Unloading LRU model: {model_to_kill}")
+            self._kill_server(model_to_kill, oldest_info)
+
+    def _spawn_server_detached(self, model_name: str):
+        """Spawns the server process detached so it survives if this python script ends."""
+        exe_path = self._get_server_executable()
+        model_path = self.models_dir / model_name
+
+        if not model_path.exists():
+            raise FileNotFoundError(f"Model {model_name} not found at {model_path}")
+
+        port = get_free_port()
+
+        cmd = [
+            str(exe_path),
+            "--model", str(model_path),
+            "--host", self.host,
+            "--port", str(port),
+            "--ctx-size", str(self.n_ctx),
+            "--n-gpu-layers", str(self.n_gpu_layers),
+            "--parallel", str(self.n_parallel),
+            "--batch-size", str(self.batch_size),
+            "--embedding"
+        ]
+
+        if self.n_threads:
+            cmd.extend(["--threads", str(self.n_threads)])
+
+        ASCIIColors.info(f"Spawning server for {model_name} on port {port}...")
+
+        # Process creation flags for detachment
+        kwargs = {}
+        if platform.system() == "Windows":
+            kwargs['creationflags'] = subprocess.DETACHED_PROCESS | subprocess.CREATE_NEW_PROCESS_GROUP
+        else:
+            kwargs['start_new_session'] = True
+
+        proc = subprocess.Popen(
+            cmd,
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.DEVNULL,
+            **kwargs
+        )
+
+        # Wait for health check (WAIT until STATUS 200 OK)
+        url = f"http://{self.host}:{port}/v1"
+        start_time = time.time()
+        # Increased timeout to 120s for larger models
+        while time.time() - start_time < 120:
+            try:
+                res = requests.get(f"{url}/models", timeout=1)
+                # STRICTLY check for 200, as 503 means loading
+                if res.status_code == 200:
+                    return proc.pid, port, url
+            except:
+                pass
+
+            if proc.poll() is not None:
+                raise RuntimeError(f"Server process exited immediately with code {proc.returncode}")
+
+            time.sleep(0.5)
+
+        # Timeout
+        proc.terminate()
+        raise TimeoutError(f"Server for {model_name} failed to become responsive (timeout).")
+
+
+    def load_model(self, model_name: str) -> bool:
+        """
+        Thread-safe and Process-safe model loading.
+        """
+        if not self.global_lock_path.parent.exists():
+            self.global_lock_path.parent.mkdir(parents=True)
+
+        lock = FileLock(str(self.global_lock_path))
+
+        try:
+            with lock.acquire(timeout=60):
+                info = self._get_server_info(model_name)
+
+                if info:
+                    # Update heartbeat
+                    try:
+                        self._get_registry_file(model_name).touch()
+                    except:
+                        pass
+                    self.model_name = model_name
+                    return True
+
+                self._ensure_capacity_locked()
+                pid, port, url = self._spawn_server_detached(model_name)
+
+                reg_file = self._get_registry_file(model_name)
+                with open(reg_file, 'w') as f:
+                    json.dump({
+                        "model_name": model_name,
+                        "pid": pid,
+                        "port": port,
+                        "url": url,
+                        "started_at": time.time()
+                    }, f)
+
+                self.model_name = model_name
+                return True
+
+        except Exception as e:
+            ASCIIColors.error(f"Error loading model {model_name}: {e}")
+            trace_exception(e)
+            return False
+
+    def _get_client(self, model_name: str = None) -> openai.OpenAI:
+        target_model = model_name or self.model_name
+        if not target_model:
+            raise ValueError("No model specified.")
+
+        info = self._get_server_info(target_model)
+
+        if not info:
+            if self.load_model(target_model):
+                info = self._get_server_info(target_model)
+            else:
+                raise RuntimeError(f"Could not load model {target_model}")
+        else:
+            try:
+                self._get_registry_file(target_model).touch()
+            except:
+                pass
+
+        if not info:
+            raise RuntimeError(f"Model {target_model} failed to load.")
+
+        return openai.OpenAI(base_url=info['url'], api_key="sk-no-key-required")
+
+    def _execute_with_retry(self, func: Callable, *args, **kwargs):
+        """
+        Executes an API call with retries for 503 (Model Loading) errors.
+        """
+        retries = 60 # Wait up to ~2 minutes
+        for i in range(retries):
+            try:
+                return func(*args, **kwargs)
+            except openai.InternalServerError as e:
+                # Catch 503 Loading model
+                if e.status_code == 503:
+                    if i % 10 == 0: # Reduce log spam
+                        ASCIIColors.warning(f"Model is loading (503). Waiting... ({i+1}/{retries})")
+                    time.sleep(2)
+                    continue
+                raise e
+            except openai.APIConnectionError:
+                # Server might be briefly unreachable during heavy load or restart
+                if i % 10 == 0:
+                    ASCIIColors.warning(f"Connection error. Waiting... ({i+1}/{retries})")
+                time.sleep(2)
+                continue
+        # Final attempt
+        return func(*args, **kwargs)
+
+    def generate_text(self, prompt: str, n_predict: int = None, stream: bool = False, **kwargs) -> Union[str, Dict]:
+        try:
+            client = self._get_client()
+
+            def do_gen():
+                return client.completions.create(
+                    model=self.model_name,
+                    prompt=prompt,
+                    max_tokens=n_predict if n_predict else 1024,
+                    temperature=kwargs.get("temperature", 0.7),
+                    top_p=kwargs.get("top_p", 0.9),
+                    stream=stream,
+                    extra_body={
+                        "top_k": kwargs.get("top_k", 40),
+                        "repeat_penalty": kwargs.get("repeat_penalty", 1.1),
+                        "n_predict": n_predict
+                    }
+                )
+
+            completion = self._execute_with_retry(do_gen)
+
+            if stream:
+                full_text = ""
+                for chunk in completion:
+                    content = chunk.choices[0].text
+                    full_text += content
+                    if kwargs.get("streaming_callback"):
+                        if not kwargs["streaming_callback"](content, MSG_TYPE.MSG_TYPE_CHUNK):
+                            break
+                return full_text
+            else:
+                return completion.choices[0].text
+        except Exception as e:
+            trace_exception(e)
+            return {"status": False, "error": str(e)}
+
+    def chat(self, discussion: LollmsDiscussion, **kwargs) -> Union[str, Dict]:
+        try:
+            client = self._get_client()
+            messages = discussion.export("openai_chat")
+
+            def do_chat():
+                return client.chat.completions.create(
+                    model=self.model_name,
+                    messages=messages,
+                    max_tokens=kwargs.get("n_predict", 1024),
+                    temperature=kwargs.get("temperature", 0.7),
+                    stream=kwargs.get("stream", False),
+                    extra_body={
+                        "top_k": kwargs.get("top_k", 40),
+                        "repeat_penalty": kwargs.get("repeat_penalty", 1.1)
+                    }
+                )
+
+            response = self._execute_with_retry(do_chat)
+
+            if kwargs.get("stream", False):
+                full_text = ""
+                for chunk in response:
+                    content = chunk.choices[0].delta.content or ""
+                    full_text += content
+                    if kwargs.get("streaming_callback"):
+                        if not kwargs["streaming_callback"](content, MSG_TYPE.MSG_TYPE_CHUNK):
+                            break
+                return full_text
+            else:
+                return response.choices[0].message.content
+        except Exception as e:
+            trace_exception(e)
+            return {"status": False, "error": str(e)}
+
+    def list_models(self) -> List[Dict[str, Any]]:
+        models = []
+        if self.models_dir.exists():
+            for f in self.models_dir.glob("*.gguf"):
+                if re.search(r'-\d{5}-of-\d{5}\.gguf$', f.name):
+                    if "00001-of-" not in f.name: continue
+                models.append({"model_name": f.name, "owned_by": "local", "created": time.ctime(f.stat().st_ctime), "size": f.stat().st_size})
+        return models
+
+    def get_model_info(self) -> dict:
+        info = {"name": BindingName, "version": "source-wrapper", "active_model": self.model_name}
+        reg = self._get_server_info(self.model_name)
+        if reg: info["host_address"] = reg['url']
+        return info
+
+    def tokenize(self, text: str) -> list:
+        try:
+            client = self._get_client()
+            url = client.base_url
+
+            def do_tokenize():
+                # Llama-server specific endpoint
+                ep = f"{url}tokenize"
+                # Strip v1/ if present because tokenize is often at root in older llama-server,
+                # but in recent versions it might be under v1 or root. We try robustly.
+                res = requests.post(ep, json={"content": text})
+                if res.status_code == 404:
+                    res = requests.post(str(url).replace("/v1/", "/tokenize"), json={"content": text})
+
+                if res.status_code == 503:
+                    raise openai.InternalServerError("Loading model", response=res, body=None)
+                return res
+
+            res = self._execute_with_retry(do_tokenize)
+            if res.status_code == 200: return res.json().get("tokens", [])
+        except: pass
+        return list(text)
+
+    def detokenize(self, tokens: list) -> str:
+        try:
+            client = self._get_client()
+            url = client.base_url
+
+            def do_detokenize():
+                ep = f"{url}detokenize"
+                res = requests.post(ep, json={"tokens": tokens})
+                if res.status_code == 404:
+                    res = requests.post(str(url).replace("/v1/", "/detokenize"), json={"tokens": tokens})
+
+                if res.status_code == 503:
+                    raise openai.InternalServerError("Loading model", response=res, body=None)
+                return res
+
+            res = self._execute_with_retry(do_detokenize)
+            if res.status_code == 200: return res.json().get("content", "")
+        except: pass
+        return "".join(map(str, tokens))
+
+    def count_tokens(self, text: str) -> int: return len(self.tokenize(text))
+
+    def embed(self, text: str, **kwargs) -> List[float]:
+        client = self._get_client()
+        def do_embed():
+            return client.embeddings.create(input=text, model=self.model_name)
+        res = self._execute_with_retry(do_embed)
+        return res.data[0].embedding
+
+    def get_zoo(self) -> List[Dict[str, Any]]:
+        return [
+            {"name": "Llama-3-8B-Instruct-v0.1-GGUF", "description": "Meta Llama 3 8B Instruct (Quantized)", "size": "5.7 GB (Q5_K_M)", "type": "gguf", "link": "MaziyarPanahi/Meta-Llama-3-8B-Instruct-GGUF", "filename": "Meta-Llama-3-8B-Instruct.Q5_K_M.gguf"},
+            {"name": "Phi-3-mini-4k-instruct-GGUF", "description": "Microsoft Phi 3 Mini 4k (Quantized)", "size": "2.4 GB (Q4_K_M)", "type": "gguf", "link": "microsoft/Phi-3-mini-4k-instruct-gguf", "filename": "Phi-3-mini-4k-instruct-q4.gguf"},
+            {"name": "Mistral-7B-Instruct-v0.3-GGUF", "description": "Mistral 7B Instruct v0.3 (Quantized)", "size": "4.6 GB (Q4_K_M)", "type": "gguf", "link": "MaziyarPanahi/Mistral-7B-Instruct-v0.3-GGUF", "filename": "Mistral-7B-Instruct-v0.3.Q4_K_M.gguf"},
+            {"name": "Qwen2.5-7B-Instruct-GGUF", "description": "Qwen 2.5 7B Instruct (Quantized)", "size": "4.7 GB (Q5_K_M)", "type": "gguf", "link": "Qwen/Qwen2.5-7B-Instruct-GGUF", "filename": "qwen2.5-7b-instruct-q5_k_m.gguf"}
+        ]
+
+    def download_from_zoo(self, index: int, progress_callback: Callable[[dict], None] = None) -> dict:
+        zoo = self.get_zoo();
+        if index < 0 or index >= len(zoo): return {"status": False, "message": "Index out of bounds"}
+        item = zoo[index]
+        return self.pull_model(item["link"], item.get("filename"), progress_callback)
+
+    def pull_model(self, repo_id: str, filename: str, progress_callback: Callable[[dict], None] = None) -> dict:
+        try:
+            match = re.match(r"^(.*)-(\d{5})-of-(\d{5})\.gguf$", filename)
+            files = []
+            if match:
+                base, total = match.group(1), int(match.group(3))
+                ASCIIColors.info(f"Detected multi-file model with {total} parts.")
+                for i in range(1, total + 1): files.append(f"{base}-{i:05d}-of-{total:05d}.gguf")
+            else:
+                files.append(filename)
+
+            paths = []
+            for f in files:
+                ASCIIColors.info(f"Downloading {f} from {repo_id}...")
+                if progress_callback: progress_callback({"status": "downloading", "message": f"Downloading {f}", "completed": 0, "total": 100})
+                p = hf_hub_download(repo_id=repo_id, filename=f, local_dir=self.models_dir, local_dir_use_symlinks=False, resume_download=True)
+                paths.append(p)
+                ASCIIColors.success(f"Downloaded {f}")
+
+            msg = f"Successfully downloaded model: {filename}"
+            if progress_callback: progress_callback({"status": "success", "message": msg, "completed": 100, "total": 100})
+            return {"status": True, "message": msg, "path": paths[0]}
+        except Exception as e:
+            trace_exception(e)
+            return {"status": False, "error": str(e)}
+
+    def cleanup_orphans_if_needed(self):
+        pass
+
+    def __del__(self):
+        pass
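The largest addition in this release is the new llama_cpp_server binding above: it fetches a prebuilt llama-server binary from the llama.cpp releases, keeps one JSON registry file per running server under the models folder, evicts the least recently used server once max_active_models is exceeded, and talks to the spawned process through its OpenAI-compatible API with a retry loop for 503 "model loading" responses. The sketch below is one way the class could be exercised directly; the models path and GGUF filename are placeholders, and in normal use the binding would be constructed through LollmsClient rather than by hand.

```python
# Minimal usage sketch, assuming direct instantiation of the new binding.
# The models_path and GGUF filename are placeholders, not shipped defaults.
from lollms_client.llm_bindings.llama_cpp_server import LlamaCppServerBinding
from lollms_client.lollms_types import MSG_TYPE

binding = LlamaCppServerBinding(
    models_path="models/llama_cpp_models",   # directory scanned for *.gguf files
    model_name="my-model.Q4_K_M.gguf",       # placeholder filename
    ctx_size=4096,
    n_gpu_layers=-1,
    max_active_models=1,                     # LRU eviction kicks in above this count
)

print(binding.list_models())                 # [{"model_name": ..., "owned_by": "local", ...}]

def on_chunk(chunk: str, msg_type) -> bool:
    print(chunk, end="", flush=True)
    return True                              # returning False stops the stream

# load_model() spawns (or reuses) a detached llama-server process and records it
# in the per-model JSON registry guarded by the global FileLock.
if binding.load_model("my-model.Q4_K_M.gguf"):
    binding.generate_text(
        "Explain GGUF in one sentence.",
        n_predict=128,
        stream=True,
        streaming_callback=on_chunk,
    )
    vec = binding.embed("hello world")       # served by the /v1/embeddings endpoint
```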
lollms_client/llm_bindings/llamacpp/__init__.py

@@ -66,20 +66,27 @@ pm.ensure_packages(["requests", "pillow", "psutil"]) # pillow for dummy image in
 if not pm.is_installed("llama-cpp-binaries"):
     def install_llama_cpp():
         system = platform.system()
-        python_version_simple = f"py{sys.version_info.major}"
-
-        cuda_suffix = "+cu124"
+        python_version_simple = f"py{sys.version_info.major}{sys.version_info.minor}" # e.g. py310 for 3.10
 
+        version_tag = "v0.56.0"
+        cuda_suffix = "+cu124"
 
         if system == "Windows":
-
-
+            # Try version-specific URL first
+            url = f"https://github.com/oobabooga/llama-cpp-binaries/releases/download/{version_tag}/llama_cpp_binaries-{version_tag.lstrip('v')}{cuda_suffix}-{python_version_simple}-none-win_amd64.whl"
+            # Fallback to generic py3 if version-specific doesn't exist
+            fallback_url = f"https://github.com/oobabooga/llama-cpp-binaries/releases/download/{version_tag}/llama_cpp_binaries-{version_tag.lstrip('v')}{cuda_suffix}-py3-none-win_amd64.whl"
         elif system == "Linux":
-
-
+            # Try version-specific URL first
+            url = f"https://github.com/oobabooga/llama-cpp-binaries/releases/download/{version_tag}/llama_cpp_binaries-{version_tag.lstrip('v')}{cuda_suffix}-{python_version_simple}-none-linux_x86_64.whl"
+            # Fallback to generic py3 if version-specific doesn't exist
+            fallback_url = f"https://github.com/oobabooga/llama-cpp-binaries/releases/download/{version_tag}/llama_cpp_binaries-{version_tag.lstrip('v')}{cuda_suffix}-py3-none-linux_x86_64.whl"
         else:
-            ASCIIColors.
-
+            ASCIIColors.error(f"Unsupported OS for precompiled llama-cpp-binaries: {system}. "
+                              "You might need to set 'llama_server_binary_path' in the binding config "
+                              "to point to a manually compiled llama.cpp server binary.")
+            return False
+
 
         ASCIIColors.info(f"Attempting to install llama-cpp-binaries from: {url}")
         try:
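The hunk above only shows the two candidate wheel URLs being assembled (a Python-version-specific tag such as py310, plus a generic py3 fallback); the actual install call sits outside the visible context. A plausible try-then-fallback flow is sketched below using plain pip through subprocess; the URL layout is copied from the diff, while the install mechanism itself is an assumption, not necessarily what the binding does.

```python
# Hedged sketch: install the version-specific wheel, fall back to the py3 wheel.
# URL pattern taken from the diff above; the use of `pip install <url>` is assumed.
import platform
import subprocess
import sys

version_tag = "v0.56.0"
cuda_suffix = "+cu124"
py_tag = f"py{sys.version_info.major}{sys.version_info.minor}"   # e.g. py310
plat_tag = "win_amd64" if platform.system() == "Windows" else "linux_x86_64"
base = f"https://github.com/oobabooga/llama-cpp-binaries/releases/download/{version_tag}"
ver = version_tag.lstrip("v")

url = f"{base}/llama_cpp_binaries-{ver}{cuda_suffix}-{py_tag}-none-{plat_tag}.whl"
fallback_url = f"{base}/llama_cpp_binaries-{ver}{cuda_suffix}-py3-none-{plat_tag}.whl"

for candidate in (url, fallback_url):
    result = subprocess.run([sys.executable, "-m", "pip", "install", candidate])
    if result.returncode == 0:
        break   # version-specific wheel worked, or the generic py3 fallback did
```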
@@ -628,7 +635,7 @@ class LlamaCppServerBinding(LollmsLLMBinding):
 
         if not model_to_load:
             self._scan_models()
-            available_models = self.
+            available_models = self.list_models()
             if not available_models:
                 ASCIIColors.error("No model specified and no GGUF models found in models path.")
                 return False
@@ -964,7 +971,7 @@ class LlamaCppServerBinding(LollmsLLMBinding):
 
         ASCIIColors.info(f"Scanned {len(self._model_path_map)} models from {self.models_path}.")
 
-    def
+    def list_models(self) -> List[Dict[str, Any]]:
         self._scan_models()
         models_found = []
         for unique_name, model_path in self._model_path_map.items():