lollms-client 1.7.10__py3-none-any.whl → 1.7.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lollms_client/__init__.py +1 -1
- lollms_client/llm_bindings/claude/__init__.py +0 -1
- lollms_client/llm_bindings/grok/__init__.py +0 -1
- lollms_client/llm_bindings/llama_cpp_server/__init__.py +605 -0
- lollms_client/llm_bindings/ollama/__init__.py +40 -2
- lollms_client/lollms_discussion.py +40 -28
- lollms_client/lollms_llm_binding.py +15 -1
- lollms_client/lollms_mcp_binding.py +15 -2
- lollms_client/lollms_stt_binding.py +15 -1
- lollms_client/lollms_tti_binding.py +15 -1
- lollms_client/lollms_ttm_binding.py +15 -1
- lollms_client/lollms_tts_binding.py +15 -1
- lollms_client/lollms_ttv_binding.py +15 -1
- lollms_client/tti_bindings/diffusers/__init__.py +132 -79
- lollms_client/tti_bindings/diffusers/server/main.py +76 -65
- lollms_client/tts_bindings/xtts/__init__.py +1 -1
- {lollms_client-1.7.10.dist-info → lollms_client-1.7.13.dist-info}/METADATA +1 -1
- {lollms_client-1.7.10.dist-info → lollms_client-1.7.13.dist-info}/RECORD +21 -20
- {lollms_client-1.7.10.dist-info → lollms_client-1.7.13.dist-info}/WHEEL +0 -0
- {lollms_client-1.7.10.dist-info → lollms_client-1.7.13.dist-info}/licenses/LICENSE +0 -0
- {lollms_client-1.7.10.dist-info → lollms_client-1.7.13.dist-info}/top_level.txt +0 -0
lollms_client/__init__.py
CHANGED

@@ -8,7 +8,7 @@ from lollms_client.lollms_utilities import PromptReshaper # Keep general utilities
 from lollms_client.lollms_mcp_binding import LollmsMCPBinding, LollmsMCPBindingManager
 from lollms_client.lollms_llm_binding import LollmsLLMBindingManager
 
-__version__ = "1.7.10"
+__version__ = "1.7.13" # Updated version
 
 # Optionally, you could define __all__ if you want to be explicit about exports
 __all__ = [
lollms_client/llm_bindings/claude/__init__.py
CHANGED

@@ -469,7 +469,6 @@ class ClaudeBinding(LollmsLLMBinding):
         url = f"{ANTHROPIC_API_BASE_URL}/models"
 
         try:
-            ASCIIColors.info("Fetching available models from Anthropic API...")
             response = requests.get(url, headers=headers, timeout=15)
             response.raise_for_status()
 
lollms_client/llm_bindings/grok/__init__.py
CHANGED

@@ -418,7 +418,6 @@ class GrokBinding(LollmsLLMBinding):
             return self._cached_models
 
         try:
-            ASCIIColors.info("Fetching available models from xAI API...")
             response = requests.get(f"{self.base_url}/models", headers=self.headers, timeout=15)
             response.raise_for_status()
 
lollms_client/llm_bindings/llama_cpp_server/__init__.py
ADDED

@@ -0,0 +1,605 @@
import subprocess
import sys
import os
import time
import requests
import socket
import re
import platform
import shutil  # required by shutil.move in install_llama_cpp
import zipfile
import tarfile
import json
import atexit
from pathlib import Path
from typing import Optional, List, Dict, Any, Union, Callable

import pipmaster as pm
from ascii_colors import ASCIIColors, trace_exception
from lollms_client.lollms_llm_binding import LollmsLLMBinding
from lollms_client.lollms_types import MSG_TYPE
from lollms_client.lollms_discussion import LollmsDiscussion

# Ensure dependencies
pm.ensure_packages(["openai", "huggingface_hub", "filelock", "requests", "tqdm", "psutil"])
import openai
from huggingface_hub import hf_hub_download
from filelock import FileLock
from tqdm import tqdm
import psutil

BindingName = "LlamaCppServerBinding"

def get_free_port(start_port=9624, max_port=10000):
    """
    Finds a free port on localhost.
    Race-condition safe-ish: We bind to it to check, but release it immediately.
    Real safety comes from the FileLock around this call.
    """
    for port in range(start_port, max_port):
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            try:
                sock.bind(('localhost', port))
                return port
            except OSError:
                continue
    raise RuntimeError("No free ports available.")
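Aside (not part of the diff): the docstring above is explicit that the socket probe alone is not race-free; the real safety comes from holding the registry FileLock while picking the port and spawning the server, which is what load_model does further down. A minimal sketch of that pattern, with the lock path taken from the constructor below:

    lock = FileLock("models/llama_cpp_models/global_server_manager.lock")
    with lock.acquire(timeout=60):
        port = get_free_port()   # the probe is only trustworthy while the lock is held
        # ... spawn llama-server on `port` and write its registry entry before releasing ...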
class LlamaCppServerBinding(LollmsLLMBinding):
    def __init__(self, **kwargs):
        super().__init__(BindingName, **kwargs)
        self.config = kwargs

        # Configuration
        self.host = kwargs.get("host", "localhost")
        self.model_name = kwargs.get("model_name", "")
        self.n_ctx = kwargs.get("ctx_size", 4096)
        self.n_gpu_layers = kwargs.get("n_gpu_layers", -1)
        self.n_threads = kwargs.get("n_threads", None)
        self.n_parallel = kwargs.get("n_parallel", 1)
        self.batch_size = kwargs.get("batch_size", 512)

        # Server Management
        self.max_active_models = int(kwargs.get("max_active_models", 1))
        self.idle_timeout = float(kwargs.get("idle_timeout", -1))

        # Paths
        self.binding_dir = Path(__file__).parent
        self.bin_dir = self.binding_dir / "bin"
        self.models_dir = Path(kwargs.get("models_path", "models/llama_cpp_models")).resolve()

        # Registry directory for inter-process coordination
        self.servers_dir = self.models_dir / "servers"
        self.servers_dir.mkdir(parents=True, exist_ok=True)
        self.bin_dir.mkdir(exist_ok=True)

        # Global lock file for all operations on the registry
        self.global_lock_path = self.models_dir / "global_server_manager.lock"

        # Installation check
        if not self._get_server_executable().exists():
            ASCIIColors.warning("Llama.cpp binary not found. Attempting installation...")
            self.install_llama_cpp()

        # Register cleanup for this process
        atexit.register(self.cleanup_orphans_if_needed)
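For reference (not part of the diff), a minimal construction sketch using the keyword arguments read above. The model filename comes from the zoo list further down and is assumed to already sit under models_path (pull_model can fetch it otherwise):

    binding = LlamaCppServerBinding(
        model_name="Meta-Llama-3-8B-Instruct.Q5_K_M.gguf",
        models_path="models/llama_cpp_models",
        ctx_size=8192,
        n_gpu_layers=-1,        # offload all layers when a GPU is available
        max_active_models=2,    # keep at most two llama-server processes alive
    )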
    def _get_server_executable(self) -> Path:
        if platform.system() == "Windows":
            return self.bin_dir / "llama-server.exe"
        else:
            return self.bin_dir / "llama-server"

    def detect_hardware(self) -> str:
        sys_plat = platform.system()
        if sys_plat == "Darwin":
            return "macos"
        try:
            subprocess.check_call(["nvidia-smi"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            return "cuda"
        except:
            pass
        return "cpu"

    def install_llama_cpp(self):
        try:
            ASCIIColors.info("Checking latest llama.cpp release...")
            releases_url = "https://api.github.com/repos/ggerganov/llama.cpp/releases/latest"
            response = requests.get(releases_url)
            response.raise_for_status()
            release_data = response.json()
            assets = release_data.get("assets", [])

            hardware = self.detect_hardware()
            sys_plat = platform.system()

            target_asset = None
            search_terms = []

            if sys_plat == "Windows":
                search_terms.append("win")
                search_terms.append("cuda" if hardware == "cuda" else "avx2")
                search_terms.append("x64")
            elif sys_plat == "Linux":
                search_terms.append("ubuntu")
                search_terms.append("x64")
            elif sys_plat == "Darwin":
                search_terms.append("macos")
                search_terms.append("arm64" if platform.machine() == "arm64" else "x64")

            for asset in assets:
                name = asset["name"].lower()
                if "cudart" in name: continue
                if all(term in name for term in search_terms):
                    if "cuda" in name and "cu11" in name and hardware == "cuda": continue
                    target_asset = asset
                    break

            # Windows CPU fallback
            if not target_asset and sys_plat == "Windows" and hardware == "cpu":
                for asset in assets:
                    if "cudart" in asset["name"].lower(): continue
                    if "win" in asset["name"].lower() and "x64" in asset["name"].lower() and "cuda" not in asset["name"].lower():
                        target_asset = asset
                        break

            if not target_asset:
                raise RuntimeError(f"No suitable binary found for {sys_plat} / {hardware}")

            download_url = target_asset["browser_download_url"]
            filename = target_asset["name"]
            dest_file = self.bin_dir / filename

            ASCIIColors.info(f"Downloading {filename}...")
            with requests.get(download_url, stream=True) as r:
                r.raise_for_status()
                with open(dest_file, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)

            ASCIIColors.info("Extracting...")
            if filename.endswith(".zip"):
                with zipfile.ZipFile(dest_file, 'r') as z: z.extractall(self.bin_dir)
            elif filename.endswith(".tar.gz"):
                with tarfile.open(dest_file, "r:gz") as t: t.extractall(self.bin_dir)

            dest_file.unlink()

            # Normalize binary name
            exe_name = "llama-server.exe" if sys_plat == "Windows" else "llama-server"
            legacy_name = "server.exe" if sys_plat == "Windows" else "server"
            if not (self.bin_dir / exe_name).exists() and (self.bin_dir / legacy_name).exists():
                shutil.move(str(self.bin_dir / legacy_name), str(self.bin_dir / exe_name))

            if sys_plat != "Windows":
                exe_path = self.bin_dir / exe_name
                if exe_path.exists(): os.chmod(exe_path, 0o755)

            ASCIIColors.success("Llama.cpp installed successfully.")
        except Exception as e:
            trace_exception(e)
            ASCIIColors.error(f"Failed to install llama.cpp: {e}")
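As an illustration (not in the package), the asset-selection rule above on a Windows machine with an NVIDIA GPU searches for ["win", "cuda", "x64"], skips the cudart runtime bundles, and takes the first archive matching every term; the asset names here are made up:

    assets = [
        {"name": "cudart-llama-bin-win-cuda-x64.zip"},   # skipped: cudart runtime bundle
        {"name": "llama-bxxxx-bin-win-cuda-x64.zip"},    # first full match -> selected
        {"name": "llama-bxxxx-bin-ubuntu-x64.tar.gz"},   # wrong platform
    ]
    search_terms = ["win", "cuda", "x64"]
    target = next(a for a in assets
                  if "cudart" not in a["name"].lower()
                  and all(t in a["name"].lower() for t in search_terms))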
    # --- Server Management Logic ---

    def _get_registry_file(self, model_name: str) -> Path:
        # Sanitize filename
        safe_name = "".join(c for c in model_name if c.isalnum() or c in ('-', '_', '.'))
        return self.servers_dir / f"{safe_name}.json"

    def _get_server_info(self, model_name: str) -> Optional[Dict]:
        """Reads registry file for a model, returns dict or None if invalid."""
        reg_file = self._get_registry_file(model_name)
        if not reg_file.exists():
            return None

        try:
            with open(reg_file, 'r') as f:
                info = json.load(f)

            # Verify process is alive
            if psutil.pid_exists(info['pid']):
                # Verify it's actually llama-server (optional but safe)
                try:
                    p = psutil.Process(info['pid'])
                    if "llama" in p.name().lower() or "server" in p.name().lower():
                        return info
                except (psutil.NoSuchProcess, psutil.AccessDenied):
                    pass

            # If we get here, process is dead or invalid
            ASCIIColors.warning(f"Found stale registry file for {model_name} (PID {info['pid']}). Cleaning up.")
            reg_file.unlink()
            return None
        except Exception:
            # Corrupt file
            if reg_file.exists(): reg_file.unlink()
            return None
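Aside (not part of the diff): the registry files validated above are plain JSON, one per model, written by load_model further down; the file's mtime doubles as the "last used" heartbeat that the LRU eviction below sorts on. A representative entry (PID and timestamp invented):

    models/llama_cpp_models/servers/Meta-Llama-3-8B-Instruct.Q5_K_M.gguf.json
    {
        "model_name": "Meta-Llama-3-8B-Instruct.Q5_K_M.gguf",
        "pid": 12345,
        "port": 9624,
        "url": "http://localhost:9624/v1",
        "started_at": 1733000000.0
    }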
    def _kill_server(self, model_name: str, info: Dict):
        """Kills a server process and removes its registry file."""
        ASCIIColors.info(f"Stopping server for {model_name} (PID {info['pid']})...")
        try:
            p = psutil.Process(info['pid'])
            p.terminate()
            p.wait(timeout=5)
        except psutil.NoSuchProcess:
            pass # Already gone
        except psutil.TimeoutExpired:
            p.kill()
        except Exception as e:
            ASCIIColors.error(f"Error killing process: {e}")

        # Remove registry file
        reg_file = self._get_registry_file(model_name)
        if reg_file.exists():
            reg_file.unlink()

    def _ensure_capacity_locked(self):
        """
        Called while holding the lock. Ensures we have space for a new model.
        """
        registry_files = list(self.servers_dir.glob("*.json"))

        # 1. Clean up stale entries first
        valid_servers = []
        for rf in registry_files:
            try:
                with open(rf, 'r') as f:
                    data = json.load(f)
                if psutil.pid_exists(data['pid']):
                    valid_servers.append((rf, data))
                else:
                    rf.unlink() # Clean stale
            except:
                if rf.exists(): rf.unlink()

        # 2. Check capacity
        if len(valid_servers) >= self.max_active_models:
            # Sort by file modification time (mtime), which acts as our "last used" heartbeat
            # Oldest mtime = Least Recently Used
            valid_servers.sort(key=lambda x: x[0].stat().st_mtime)

            # Kill the oldest
            oldest_file, oldest_info = valid_servers[0]
            model_to_kill = oldest_info.get("model_name", "unknown")
            ASCIIColors.warning(f"Max active models ({self.max_active_models}) reached. Unloading LRU model: {model_to_kill}")
            self._kill_server(model_to_kill, oldest_info)

    def _spawn_server_detached(self, model_name: str):
        """Spawns the server process detached so it survives if this python script ends."""
        exe_path = self._get_server_executable()
        model_path = self.models_dir / model_name

        if not model_path.exists():
            raise FileNotFoundError(f"Model {model_name} not found at {model_path}")

        port = get_free_port()

        cmd = [
            str(exe_path),
            "--model", str(model_path),
            "--host", self.host,
            "--port", str(port),
            "--ctx-size", str(self.n_ctx),
            "--n-gpu-layers", str(self.n_gpu_layers),
            "--parallel", str(self.n_parallel),
            "--batch-size", str(self.batch_size),
            "--embedding"
        ]

        if self.n_threads:
            cmd.extend(["--threads", str(self.n_threads)])

        ASCIIColors.info(f"Spawning server for {model_name} on port {port}...")

        # Process creation flags for detachment
        kwargs = {}
        if platform.system() == "Windows":
            kwargs['creationflags'] = subprocess.DETACHED_PROCESS | subprocess.CREATE_NEW_PROCESS_GROUP
        else:
            kwargs['start_new_session'] = True

        proc = subprocess.Popen(
            cmd,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            **kwargs
        )

        # Wait for health check (WAIT until STATUS 200 OK)
        url = f"http://{self.host}:{port}/v1"
        start_time = time.time()
        # Increased timeout to 120s for larger models
        while time.time() - start_time < 120:
            try:
                res = requests.get(f"{url}/models", timeout=1)
                # STRICTLY check for 200, as 503 means loading
                if res.status_code == 200:
                    return proc.pid, port, url
            except:
                pass

            if proc.poll() is not None:
                raise RuntimeError(f"Server process exited immediately with code {proc.returncode}")

            time.sleep(0.5)

        # Timeout
        proc.terminate()
        raise TimeoutError(f"Server for {model_name} failed to become responsive (timeout).")
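For reference (not part of the diff), with the constructor defaults the cmd list above resolves to roughly this llama-server invocation (binary location and model name shown as examples):

    bin/llama-server --model models/llama_cpp_models/Meta-Llama-3-8B-Instruct.Q5_K_M.gguf \
        --host localhost --port 9624 --ctx-size 4096 --n-gpu-layers -1 \
        --parallel 1 --batch-size 512 --embedding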
    def load_model(self, model_name: str) -> bool:
        """
        Thread-safe and Process-safe model loading.
        """
        if not self.global_lock_path.parent.exists():
            self.global_lock_path.parent.mkdir(parents=True)

        lock = FileLock(str(self.global_lock_path))

        try:
            with lock.acquire(timeout=60):
                info = self._get_server_info(model_name)

                if info:
                    # Update heartbeat
                    try:
                        self._get_registry_file(model_name).touch()
                    except:
                        pass
                    self.model_name = model_name
                    return True

                self._ensure_capacity_locked()
                pid, port, url = self._spawn_server_detached(model_name)

                reg_file = self._get_registry_file(model_name)
                with open(reg_file, 'w') as f:
                    json.dump({
                        "model_name": model_name,
                        "pid": pid,
                        "port": port,
                        "url": url,
                        "started_at": time.time()
                    }, f)

                self.model_name = model_name
                return True

        except Exception as e:
            ASCIIColors.error(f"Error loading model {model_name}: {e}")
            trace_exception(e)
            return False
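Aside (not part of the diff): because both the lock and the registry live on disk, a second binding instance, typically in another process, that asks for the same model finds the live PID and only refreshes the heartbeat instead of spawning a duplicate server:

    b1 = LlamaCppServerBinding(models_path="models/llama_cpp_models")
    b1.load_model("Meta-Llama-3-8B-Instruct.Q5_K_M.gguf")   # spawns llama-server, writes the registry JSON
    b2 = LlamaCppServerBinding(models_path="models/llama_cpp_models")
    b2.load_model("Meta-Llama-3-8B-Instruct.Q5_K_M.gguf")   # finds the live PID, just touches the file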
    def _get_client(self, model_name: str = None) -> openai.OpenAI:
        target_model = model_name or self.model_name
        if not target_model:
            raise ValueError("No model specified.")

        info = self._get_server_info(target_model)

        if not info:
            if self.load_model(target_model):
                info = self._get_server_info(target_model)
            else:
                raise RuntimeError(f"Could not load model {target_model}")
        else:
            try:
                self._get_registry_file(target_model).touch()
            except:
                pass

        if not info:
            raise RuntimeError(f"Model {target_model} failed to load.")

        return openai.OpenAI(base_url=info['url'], api_key="sk-no-key-required")

    def _execute_with_retry(self, func: Callable, *args, **kwargs):
        """
        Executes an API call with retries for 503 (Model Loading) errors.
        """
        retries = 60 # Wait up to ~2 minutes
        for i in range(retries):
            try:
                return func(*args, **kwargs)
            except openai.InternalServerError as e:
                # Catch 503 Loading model
                if e.status_code == 503:
                    if i % 10 == 0: # Reduce log spam
                        ASCIIColors.warning(f"Model is loading (503). Waiting... ({i+1}/{retries})")
                    time.sleep(2)
                    continue
                raise e
            except openai.APIConnectionError:
                # Server might be briefly unreachable during heavy load or restart
                if i % 10 == 0:
                    ASCIIColors.warning(f"Connection error. Waiting... ({i+1}/{retries})")
                time.sleep(2)
                continue
        # Final attempt
        return func(*args, **kwargs)
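Aside (not part of the diff): any callable that raises openai.InternalServerError with status 503, or openai.APIConnectionError, gets the same wait-and-retry treatment; the public methods below all route their requests through it, e.g.:

    client = binding._get_client()
    models = binding._execute_with_retry(lambda: client.models.list())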
    def generate_text(self, prompt: str, n_predict: int = None, stream: bool = False, **kwargs) -> Union[str, Dict]:
        try:
            client = self._get_client()

            def do_gen():
                return client.completions.create(
                    model=self.model_name,
                    prompt=prompt,
                    max_tokens=n_predict if n_predict else 1024,
                    temperature=kwargs.get("temperature", 0.7),
                    top_p=kwargs.get("top_p", 0.9),
                    stream=stream,
                    extra_body={
                        "top_k": kwargs.get("top_k", 40),
                        "repeat_penalty": kwargs.get("repeat_penalty", 1.1),
                        "n_predict": n_predict
                    }
                )

            completion = self._execute_with_retry(do_gen)

            if stream:
                full_text = ""
                for chunk in completion:
                    content = chunk.choices[0].text
                    full_text += content
                    if kwargs.get("streaming_callback"):
                        if not kwargs["streaming_callback"](content, MSG_TYPE.MSG_TYPE_CHUNK):
                            break
                return full_text
            else:
                return completion.choices[0].text
        except Exception as e:
            trace_exception(e)
            return {"status": False, "error": str(e)}
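For reference (not part of the diff), a short sketch of the streaming path; the callback must return True to keep receiving chunks, since the loop above breaks as soon as it returns a falsy value:

    def on_chunk(chunk, msg_type):
        print(chunk, end="", flush=True)
        return True                      # returning False aborts the stream

    text = binding.generate_text("Explain GGUF quantization in one paragraph.",
                                 n_predict=256, stream=True,
                                 streaming_callback=on_chunk)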
    def chat(self, discussion: LollmsDiscussion, **kwargs) -> Union[str, Dict]:
        try:
            client = self._get_client()
            messages = discussion.export("openai_chat")

            def do_chat():
                return client.chat.completions.create(
                    model=self.model_name,
                    messages=messages,
                    max_tokens=kwargs.get("n_predict", 1024),
                    temperature=kwargs.get("temperature", 0.7),
                    stream=kwargs.get("stream", False),
                    extra_body={
                        "top_k": kwargs.get("top_k", 40),
                        "repeat_penalty": kwargs.get("repeat_penalty", 1.1)
                    }
                )

            response = self._execute_with_retry(do_chat)

            if kwargs.get("stream", False):
                full_text = ""
                for chunk in response:
                    content = chunk.choices[0].delta.content or ""
                    full_text += content
                    if kwargs.get("streaming_callback"):
                        if not kwargs["streaming_callback"](content, MSG_TYPE.MSG_TYPE_CHUNK):
                            break
                return full_text
            else:
                return response.choices[0].message.content
        except Exception as e:
            trace_exception(e)
            return {"status": False, "error": str(e)}
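Aside (not part of the diff): the chat path follows the same contract but takes a LollmsDiscussion, which it serializes through export("openai_chat"). Sketch, assuming `discussion` is an existing LollmsDiscussion and on_chunk is the callback from the previous example:

    reply = binding.chat(discussion, n_predict=512, stream=True,
                         streaming_callback=on_chunk)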
    def list_models(self) -> List[Dict[str, Any]]:
        models = []
        if self.models_dir.exists():
            for f in self.models_dir.glob("*.gguf"):
                if re.search(r'-\d{5}-of-\d{5}\.gguf$', f.name):
                    if "00001-of-" not in f.name: continue
                models.append({"model_name": f.name, "owned_by": "local", "created": time.ctime(f.stat().st_ctime), "size": f.stat().st_size})
        return models

    def get_model_info(self) -> dict:
        info = {"name": BindingName, "version": "source-wrapper", "active_model": self.model_name}
        reg = self._get_server_info(self.model_name)
        if reg: info["host_address"] = reg['url']
        return info

    def tokenize(self, text: str) -> list:
        try:
            client = self._get_client()
            url = client.base_url

            def do_tokenize():
                # Llama-server specific endpoint
                ep = f"{url}tokenize"
                # Strip v1/ if present because tokenize is often at root in older llama-server,
                # but in recent versions it might be under v1 or root. We try robustly.
                res = requests.post(ep, json={"content": text})
                if res.status_code == 404:
                    res = requests.post(str(url).replace("/v1/", "/tokenize"), json={"content": text})

                if res.status_code == 503:
                    raise openai.InternalServerError("Loading model", response=res, body=None)
                return res

            res = self._execute_with_retry(do_tokenize)
            if res.status_code == 200: return res.json().get("tokens", [])
        except: pass
        return list(text)

    def detokenize(self, tokens: list) -> str:
        try:
            client = self._get_client()
            url = client.base_url

            def do_detokenize():
                ep = f"{url}detokenize"
                res = requests.post(ep, json={"tokens": tokens})
                if res.status_code == 404:
                    res = requests.post(str(url).replace("/v1/", "/detokenize"), json={"tokens": tokens})

                if res.status_code == 503:
                    raise openai.InternalServerError("Loading model", response=res, body=None)
                return res

            res = self._execute_with_retry(do_detokenize)
            if res.status_code == 200: return res.json().get("content", "")
        except: pass
        return "".join(map(str, tokens))

    def count_tokens(self, text: str) -> int: return len(self.tokenize(text))

    def embed(self, text: str, **kwargs) -> List[float]:
        client = self._get_client()
        def do_embed():
            return client.embeddings.create(input=text, model=self.model_name)
        res = self._execute_with_retry(do_embed)
        return res.data[0].embedding

    def get_zoo(self) -> List[Dict[str, Any]]:
        return [
            {"name": "Llama-3-8B-Instruct-v0.1-GGUF", "description": "Meta Llama 3 8B Instruct (Quantized)", "size": "5.7 GB (Q5_K_M)", "type": "gguf", "link": "MaziyarPanahi/Meta-Llama-3-8B-Instruct-GGUF", "filename": "Meta-Llama-3-8B-Instruct.Q5_K_M.gguf"},
            {"name": "Phi-3-mini-4k-instruct-GGUF", "description": "Microsoft Phi 3 Mini 4k (Quantized)", "size": "2.4 GB (Q4_K_M)", "type": "gguf", "link": "microsoft/Phi-3-mini-4k-instruct-gguf", "filename": "Phi-3-mini-4k-instruct-q4.gguf"},
            {"name": "Mistral-7B-Instruct-v0.3-GGUF", "description": "Mistral 7B Instruct v0.3 (Quantized)", "size": "4.6 GB (Q4_K_M)", "type": "gguf", "link": "MaziyarPanahi/Mistral-7B-Instruct-v0.3-GGUF", "filename": "Mistral-7B-Instruct-v0.3.Q4_K_M.gguf"},
            {"name": "Qwen2.5-7B-Instruct-GGUF", "description": "Qwen 2.5 7B Instruct (Quantized)", "size": "4.7 GB (Q5_K_M)", "type": "gguf", "link": "Qwen/Qwen2.5-7B-Instruct-GGUF", "filename": "qwen2.5-7b-instruct-q5_k_m.gguf"}
        ]

    def download_from_zoo(self, index: int, progress_callback: Callable[[dict], None] = None) -> dict:
        zoo = self.get_zoo()
        if index < 0 or index >= len(zoo): return {"status": False, "message": "Index out of bounds"}
        item = zoo[index]
        return self.pull_model(item["link"], item.get("filename"), progress_callback)

    def pull_model(self, repo_id: str, filename: str, progress_callback: Callable[[dict], None] = None) -> dict:
        try:
            match = re.match(r"^(.*)-(\d{5})-of-(\d{5})\.gguf$", filename)
            files = []
            if match:
                base, total = match.group(1), int(match.group(3))
                ASCIIColors.info(f"Detected multi-file model with {total} parts.")
                for i in range(1, total + 1): files.append(f"{base}-{i:05d}-of-{total:05d}.gguf")
            else:
                files.append(filename)

            paths = []
            for f in files:
                ASCIIColors.info(f"Downloading {f} from {repo_id}...")
                if progress_callback: progress_callback({"status": "downloading", "message": f"Downloading {f}", "completed": 0, "total": 100})
                p = hf_hub_download(repo_id=repo_id, filename=f, local_dir=self.models_dir, local_dir_use_symlinks=False, resume_download=True)
                paths.append(p)
                ASCIIColors.success(f"Downloaded {f}")

            msg = f"Successfully downloaded model: {filename}"
            if progress_callback: progress_callback({"status": "success", "message": msg, "completed": 100, "total": 100})
            return {"status": True, "message": msg, "path": paths[0]}
        except Exception as e:
            trace_exception(e)
            return {"status": False, "error": str(e)}
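For reference (not part of the diff), a model can be fetched either through the zoo index or directly by repository and filename; a split filename such as "some-model-00001-of-00003.gguf" (hypothetical) is expanded by the regex above into all of its parts before downloading:

    binding.download_from_zoo(0)        # Llama 3 8B Instruct entry from get_zoo()
    # equivalent to:
    binding.pull_model("MaziyarPanahi/Meta-Llama-3-8B-Instruct-GGUF",
                       "Meta-Llama-3-8B-Instruct.Q5_K_M.gguf")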
    def cleanup_orphans_if_needed(self):
        pass

    def __del__(self):
        pass