lollms-client 1.7.10__py3-none-any.whl → 1.8.3__py3-none-any.whl

This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
@@ -0,0 +1,727 @@
+ import subprocess
+ import sys
+ import os
+ import time
+ import requests
+ import socket
+ import re
+ import platform
+ import zipfile
+ import tarfile
+ import shutil  # needed below by install_llama_cpp (shutil.move)
+ import json
+ import yaml
+ import atexit
+ from pathlib import Path
+ from typing import Optional, List, Dict, Any, Union, Callable
+
+ import pipmaster as pm
+ from ascii_colors import ASCIIColors, trace_exception
+ from lollms_client.lollms_llm_binding import LollmsLLMBinding
+ from lollms_client.lollms_types import MSG_TYPE
+ from lollms_client.lollms_discussion import LollmsDiscussion
+
+ # Ensure dependencies
+ pm.ensure_packages(["openai", "huggingface_hub", "filelock", "requests", "tqdm", "psutil", "pyyaml"])
+ import openai
+ from huggingface_hub import hf_hub_download
+ from filelock import FileLock
+ from tqdm import tqdm
+ import psutil
+
+ BindingName = "LlamaCppServerBinding"
+
+ def get_free_port(start_port=9624, max_port=10000):
+     """
+     Finds a free port on localhost.
+     Race-condition safe-ish: We bind to it to check, but release it immediately.
+     Real safety comes from the FileLock around this call.
+     """
+     for port in range(start_port, max_port):
+         with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
+             try:
+                 sock.bind(('localhost', port))
+                 return port
+             except OSError:
+                 continue
+     raise RuntimeError("No free ports available.")
+
+ class LlamaCppServerBinding(LollmsLLMBinding):
+     def __init__(self, **kwargs):
+         super().__init__(BindingName, **kwargs)
+         self.config = kwargs
+
+         # Configuration
+         self.host = kwargs.get("host", "localhost")
+         self.model_name = kwargs.get("model_name", "")
+         self.n_ctx = kwargs.get("ctx_size", 4096)
+         self.n_gpu_layers = kwargs.get("n_gpu_layers", -1)
+         self.n_threads = kwargs.get("n_threads", None)
+         self.n_parallel = kwargs.get("n_parallel", 1)
+         self.batch_size = kwargs.get("batch_size", 512)
+
+         # Server Management
+         self.max_active_models = int(kwargs.get("max_active_models", 1))
+         self.idle_timeout = float(kwargs.get("idle_timeout", -1))
+
+         # Paths
+         self.binding_dir = Path(__file__).parent
+         self.bin_dir = self.binding_dir / "bin"
+         self.models_dir = Path(kwargs.get("models_path", "models/llama_cpp_models")).resolve()
+
+         # Multimodal Registry
+         self.mm_registry_path = self.models_dir / "multimodal_bindings.yaml"
+
+         # Registry directory for inter-process coordination
+         self.servers_dir = self.models_dir / "servers"
+         self.servers_dir.mkdir(parents=True, exist_ok=True)
+         self.bin_dir.mkdir(exist_ok=True)
+
+         # Global lock file for all operations on the registry
+         self.global_lock_path = self.models_dir / "global_server_manager.lock"
+
+         # Installation check
+         if not self._get_server_executable().exists():
+             ASCIIColors.warning("Llama.cpp binary not found. Attempting installation...")
+             self.install_llama_cpp()
+
+         # Register cleanup for this process
+         atexit.register(self.cleanup_orphans_if_needed)
+
+     def _get_server_executable(self) -> Path:
+         if platform.system() == "Windows":
+             return self.bin_dir / "llama-server.exe"
+         else:
+             return self.bin_dir / "llama-server"
+
+     def detect_hardware(self) -> str:
+         sys_plat = platform.system()
+         if sys_plat == "Darwin":
+             return "macos"
+         try:
+             subprocess.check_call(["nvidia-smi"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+             return "cuda"
+         except:
+             pass
+         return "cpu"
+
+     def install_llama_cpp(self):
+         try:
+             ASCIIColors.info("Checking latest llama.cpp release...")
+             releases_url = "https://api.github.com/repos/ggerganov/llama.cpp/releases/latest"
+             response = requests.get(releases_url)
+             response.raise_for_status()
+             release_data = response.json()
+             assets = release_data.get("assets", [])
+
+             hardware = self.detect_hardware()
+             sys_plat = platform.system()
+
+             target_asset = None
+             search_terms = []
+
+             if sys_plat == "Windows":
+                 search_terms.append("win")
+                 search_terms.append("cuda" if hardware == "cuda" else "avx2")
+                 search_terms.append("x64")
+             elif sys_plat == "Linux":
+                 search_terms.append("ubuntu")
+                 search_terms.append("x64")
+             elif sys_plat == "Darwin":
+                 search_terms.append("macos")
+                 search_terms.append("arm64" if platform.machine() == "arm64" else "x64")
+
+             for asset in assets:
+                 name = asset["name"].lower()
+                 if "cudart" in name: continue
+                 if all(term in name for term in search_terms):
+                     if "cuda" in name and "cu11" in name and hardware == "cuda": continue
+                     target_asset = asset
+                     break
+
+             # Windows CPU fallback
+             if not target_asset and sys_plat == "Windows" and hardware == "cpu":
+                 for asset in assets:
+                     if "cudart" in asset["name"].lower(): continue
+                     if "win" in asset["name"].lower() and "x64" in asset["name"].lower() and "cuda" not in asset["name"].lower():
+                         target_asset = asset
+                         break
+
+             if not target_asset:
+                 raise RuntimeError(f"No suitable binary found for {sys_plat} / {hardware}")
+
+             download_url = target_asset["browser_download_url"]
+             filename = target_asset["name"]
+             dest_file = self.bin_dir / filename
+
+             ASCIIColors.info(f"Downloading {filename}...")
+             with requests.get(download_url, stream=True) as r:
+                 r.raise_for_status()
+                 with open(dest_file, 'wb') as f:
+                     for chunk in r.iter_content(chunk_size=8192):
+                         f.write(chunk)
+
+             ASCIIColors.info("Extracting...")
+             if filename.endswith(".zip"):
+                 with zipfile.ZipFile(dest_file, 'r') as z: z.extractall(self.bin_dir)
+             elif filename.endswith(".tar.gz"):
+                 with tarfile.open(dest_file, "r:gz") as t: t.extractall(self.bin_dir)
+
+             dest_file.unlink()
+
+             # Normalize binary name
+             exe_name = "llama-server.exe" if sys_plat == "Windows" else "llama-server"
+             legacy_name = "server.exe" if sys_plat == "Windows" else "server"
+             if not (self.bin_dir / exe_name).exists() and (self.bin_dir / legacy_name).exists():
+                 shutil.move(str(self.bin_dir / legacy_name), str(self.bin_dir / exe_name))
+
+             if sys_plat != "Windows":
+                 exe_path = self.bin_dir / exe_name
+                 if exe_path.exists(): os.chmod(exe_path, 0o755)
+
+             ASCIIColors.success("Llama.cpp installed successfully.")
+         except Exception as e:
+             trace_exception(e)
+             ASCIIColors.error(f"Failed to install llama.cpp: {e}")
+
+     # --- Server Management Logic ---
+
+     def _get_registry_file(self, model_name: str) -> Path:
+         # Sanitize filename
+         safe_name = "".join(c for c in model_name if c.isalnum() or c in ('-', '_', '.'))
+         return self.servers_dir / f"{safe_name}.json"
+
+     def _get_server_info(self, model_name: str) -> Optional[Dict]:
+         """Reads registry file for a model, returns dict or None if invalid."""
+         reg_file = self._get_registry_file(model_name)
+         if not reg_file.exists():
+             return None
+
+         try:
+             with open(reg_file, 'r') as f:
+                 info = json.load(f)
+
+             # Verify process is alive
+             if psutil.pid_exists(info['pid']):
+                 # Verify it's actually llama-server (optional but safe)
+                 try:
+                     p = psutil.Process(info['pid'])
+                     if "llama" in p.name().lower() or "server" in p.name().lower():
+                         return info
+                 except (psutil.NoSuchProcess, psutil.AccessDenied):
+                     pass
+
+             # If we get here, process is dead or invalid
+             ASCIIColors.warning(f"Found stale registry file for {model_name} (PID {info['pid']}). Cleaning up.")
+             reg_file.unlink()
+             return None
+         except Exception:
+             # Corrupt file
+             if reg_file.exists(): reg_file.unlink()
+             return None
+
+     def _kill_server(self, model_name: str, info: Dict):
+         """Kills a server process and removes its registry file."""
+         ASCIIColors.info(f"Stopping server for {model_name} (PID {info['pid']})...")
+         try:
+             p = psutil.Process(info['pid'])
+             p.terminate()
+             p.wait(timeout=5)
+         except psutil.NoSuchProcess:
+             pass # Already gone
+         except psutil.TimeoutExpired:
+             p.kill()
+         except Exception as e:
+             ASCIIColors.error(f"Error killing process: {e}")
+
+         # Remove registry file
+         reg_file = self._get_registry_file(model_name)
+         if reg_file.exists():
+             reg_file.unlink()
+
+     def _ensure_capacity_locked(self):
+         """
+         Called while holding the lock. Ensures we have space for a new model.
+         """
+         registry_files = list(self.servers_dir.glob("*.json"))
+
+         # 1. Clean up stale entries first
+         valid_servers = []
+         for rf in registry_files:
+             try:
+                 with open(rf, 'r') as f:
+                     data = json.load(f)
+                 if psutil.pid_exists(data['pid']):
+                     valid_servers.append((rf, data))
+                 else:
+                     rf.unlink() # Clean stale
+             except:
+                 if rf.exists(): rf.unlink()
+
+         # 2. Check capacity
+         if len(valid_servers) >= self.max_active_models:
+             # Sort by file modification time (mtime), which acts as our "last used" heartbeat
+             # Oldest mtime = Least Recently Used
+             valid_servers.sort(key=lambda x: x[0].stat().st_mtime)
+
+             # Kill the oldest
+             oldest_file, oldest_info = valid_servers[0]
+             model_to_kill = oldest_info.get("model_name", "unknown")
+             ASCIIColors.warning(f"Max active models ({self.max_active_models}) reached. Unloading LRU model: {model_to_kill}")
+             self._kill_server(model_to_kill, oldest_info)
+
+     def _load_mm_registry(self) -> Dict[str, str]:
+         if not self.mm_registry_path.exists():
+             return {}
+         try:
+             with open(self.mm_registry_path, 'r') as f:
+                 registry = yaml.safe_load(f) or {}
+
+             # Self-healing: remove missing files
+             updated = False
+             to_remove = []
+             for m, p in registry.items():
+                 if not (self.models_dir / m).exists() or not (self.models_dir / p).exists():
+                     to_remove.append(m)
+                     updated = True
+
+             for m in to_remove:
+                 del registry[m]
+
+             if updated:
+                 self._save_mm_registry(registry)
+             return registry
+         except Exception as e:
+             ASCIIColors.error(f"Failed to load multimodal registry: {e}")
+             return {}
+
+     def _save_mm_registry(self, registry: Dict[str, str]):
+         try:
+             with open(self.mm_registry_path, 'w') as f:
+                 yaml.dump(registry, f)
+         except Exception as e:
+             ASCIIColors.error(f"Failed to save multimodal registry: {e}")
+
+     def bind_multimodal_model(self, model_name: str, mmproj_name: str) -> dict:
+         """Explicitly binds a model to an mmproj file."""
+         if not (self.models_dir / model_name).exists():
+             return {"status": False, "error": f"Model {model_name} not found."}
+         if not (self.models_dir / mmproj_name).exists():
+             return {"status": False, "error": f"Projector {mmproj_name} not found."}
+
+         registry = self._load_mm_registry()
+         registry[model_name] = mmproj_name
+         self._save_mm_registry(registry)
+
+         ASCIIColors.success(f"Bound {model_name} with {mmproj_name}")
+         return {"status": True, "message": f"Bound {model_name} with {mmproj_name}"}
+
+     def _find_mmproj(self, model_path: Path) -> Optional[Path]:
+         """Finds a corresponding mmproj file for a given model path."""
+         # 1. Check registry first
+         registry = self._load_mm_registry()
+         if model_path.name in registry:
+             proj_path = self.models_dir / registry[model_path.name]
+             if proj_path.exists():
+                 return proj_path
+
+         # 2. Automatic detection patterns
+         stem = model_path.stem
+         clean_stem = re.sub(r'\.(Q\d_.*|f16|f32)$', '', stem)
+         patterns = [
+             f"{stem}.mmproj", f"{stem}-mmproj.gguf", f"{stem}.mmproj.gguf",
+             f"{clean_stem}.mmproj", f"{clean_stem}-mmproj.gguf",
+             f"mmproj-{stem}.gguf", "mmproj.gguf"
+         ]
+
+         for p in patterns:
+             pot = model_path.parent / p
+             if pot.exists():
+                 return pot
+
+         # 3. Last resort: simple scan
+         try:
+             for f in model_path.parent.iterdir():
+                 if f.is_file() and "mmproj" in f.name.lower() and f.name != model_path.name:
+                     if f.suffix in [".gguf", ".mmproj", ".bin"]:
+                         return f
+         except:
+             pass
+
+         return None
+
+     def _spawn_server_detached(self, model_name: str):
+         """Spawns the server process detached so it survives if this python script ends."""
+         exe_path = self._get_server_executable()
+         model_path = self.models_dir / model_name
+
+         if not model_path.exists():
+             raise FileNotFoundError(f"Model {model_name} not found at {model_path}")
+
+         port = get_free_port()
+
+         cmd = [
+             str(exe_path),
+             "--model", str(model_path),
+             "--host", self.host,
+             "--port", str(port),
+             "--ctx-size", str(self.n_ctx),
+             "--n-gpu-layers", str(self.n_gpu_layers),
+             "--parallel", str(self.n_parallel),
+             "--batch-size", str(self.batch_size),
+             "--embedding"
+         ]
+
+         # Automatic detection or Registry-based mmproj
+         mmproj_path = self._find_mmproj(model_path)
+         if mmproj_path:
+             ASCIIColors.info(f"Detected multimodal projector: {mmproj_path}")
+             cmd.extend(["--mmproj", str(mmproj_path)])
+
+         if self.n_threads:
+             cmd.extend(["--threads", str(self.n_threads)])
+
+         ASCIIColors.info(f"Spawning server for {model_name} on port {port}...")
+
+         # Process creation flags for detachment
+         kwargs = {}
+         if platform.system() == "Windows":
+             kwargs['creationflags'] = subprocess.DETACHED_PROCESS | subprocess.CREATE_NEW_PROCESS_GROUP
+         else:
+             kwargs['start_new_session'] = True
+
+         proc = subprocess.Popen(
+             cmd,
+             stdout=subprocess.DEVNULL,
+             stderr=subprocess.DEVNULL,
+             **kwargs
+         )
+
+         # Wait for health check
+         url = f"http://{self.host}:{port}/v1"
+         start_time = time.time()
+         while time.time() - start_time < 120:
+             try:
+                 res = requests.get(f"{url}/models", timeout=1)
+                 if res.status_code == 200:
+                     return proc.pid, port, url
+             except:
+                 pass
+
+             if proc.poll() is not None:
+                 raise RuntimeError(f"Server process exited immediately with code {proc.returncode}")
+             time.sleep(0.5)
+
+         proc.terminate()
+         raise TimeoutError(f"Server for {model_name} failed to become responsive.")
+
+     def load_model(self, model_name: str) -> bool:
+         """Thread-safe and Process-safe model loading."""
+         if not self.global_lock_path.parent.exists():
+             self.global_lock_path.parent.mkdir(parents=True)
+
+         lock = FileLock(str(self.global_lock_path))
+         try:
+             with lock.acquire(timeout=60):
+                 info = self._get_server_info(model_name)
+                 if info:
+                     try:
+                         self._get_registry_file(model_name).touch()
+                     except:
+                         pass
+                     self.model_name = model_name
+                     return True
+
+                 self._ensure_capacity_locked()
+                 pid, port, url = self._spawn_server_detached(model_name)
+
+                 reg_file = self._get_registry_file(model_name)
+                 with open(reg_file, 'w') as f:
+                     json.dump({
+                         "model_name": model_name, "pid": pid, "port": port, "url": url, "started_at": time.time()
+                     }, f)
+
+                 self.model_name = model_name
+                 return True
+         except Exception as e:
+             ASCIIColors.error(f"Error loading model {model_name}: {e}")
+             trace_exception(e)
+             return False
+
+     def _get_client(self, model_name: str = None) -> openai.OpenAI:
+         target_model = model_name or self.model_name
+         if not target_model:
+             raise ValueError("No model specified.")
+         info = self._get_server_info(target_model)
+         if not info:
+             if self.load_model(target_model):
+                 info = self._get_server_info(target_model)
+             else:
+                 raise RuntimeError(f"Could not load model {target_model}")
+         else:
+             try:
+                 self._get_registry_file(target_model).touch()
+             except:
+                 pass
+         if not info:
+             raise RuntimeError(f"Model {target_model} failed to load.")
+         return openai.OpenAI(base_url=info['url'], api_key="sk-no-key-required")
+
+     def _execute_with_retry(self, func: Callable, *args, **kwargs):
+         retries = 60
+         for i in range(retries):
+             try:
+                 return func(*args, **kwargs)
+             except openai.InternalServerError as e:
+                 if e.status_code == 503:
+                     if i % 10 == 0:
+                         ASCIIColors.warning(f"Model is loading (503). Waiting... ({i+1}/{retries})")
+                     time.sleep(2)
+                     continue
+                 raise e
+             except openai.APIConnectionError:
+                 if i % 10 == 0:
+                     ASCIIColors.warning(f"Connection error. Waiting... ({i+1}/{retries})")
+                 time.sleep(2)
+                 continue
+         return func(*args, **kwargs)
+
+     def generate_text(self, prompt: str, n_predict: int = None, stream: bool = False, **kwargs) -> Union[str, Dict]:
+         try:
+             client = self._get_client()
+             def do_gen():
+                 return client.completions.create(
+                     model=self.model_name, prompt=prompt,
+                     max_tokens=n_predict if n_predict else 1024,
+                     temperature=kwargs.get("temperature", 0.7),
+                     top_p=kwargs.get("top_p", 0.9), stream=stream,
+                     extra_body={"top_k": kwargs.get("top_k", 40), "repeat_penalty": kwargs.get("repeat_penalty", 1.1), "n_predict": n_predict}
+                 )
+             completion = self._execute_with_retry(do_gen)
+             if stream:
+                 full_text = ""
+                 for chunk in completion:
+                     content = chunk.choices[0].text
+                     full_text += content
+                     if kwargs.get("streaming_callback"):
+                         if not kwargs["streaming_callback"](content, MSG_TYPE.MSG_TYPE_CHUNK):
+                             break
+                 return full_text
+             else:
+                 return completion.choices[0].text
+         except Exception as e:
+             trace_exception(e)
+             return {"status": False, "error": str(e)}
+
+     def chat(self, discussion: LollmsDiscussion, **kwargs) -> Union[str, Dict]:
+         try:
+             client = self._get_client()
+             messages = discussion.export("openai_chat")
+             def do_chat():
+                 return client.chat.completions.create(
+                     model=self.model_name, messages=messages,
+                     max_tokens=kwargs.get("n_predict", 1024),
+                     temperature=kwargs.get("temperature", 0.7),
+                     stream=kwargs.get("stream", False),
+                     extra_body={"top_k": kwargs.get("top_k", 40), "repeat_penalty": kwargs.get("repeat_penalty", 1.1)}
+                 )
+             response = self._execute_with_retry(do_chat)
+             if kwargs.get("stream", False):
+                 full_text = ""
+                 for chunk in response:
+                     content = chunk.choices[0].delta.content or ""
+                     full_text += content
+                     if kwargs.get("streaming_callback"):
+                         if not kwargs["streaming_callback"](content, MSG_TYPE.MSG_TYPE_CHUNK):
+                             break
+                 return full_text
+             else:
+                 return response.choices[0].message.content
+         except Exception as e:
+             trace_exception(e)
+             return {"status": False, "error": str(e)}
+
+     def list_models(self) -> List[Dict[str, Any]]:
+         models = []
+         if self.models_dir.exists():
+             for f in self.models_dir.glob("*.gguf"):
+                 # Hide files explicitly containing 'mmproj' as they are not standalone models
+                 if "mmproj" in f.name.lower():
+                     continue
+
+                 if re.search(r'-\d{5}-of-\d{5}\.gguf$', f.name):
+                     if "00001-of-" not in f.name: continue
+                 models.append({"model_name": f.name, "owned_by": "local", "created": time.ctime(f.stat().st_ctime), "size": f.stat().st_size})
+         return models
+
+     def get_model_info(self) -> dict:
+         info = {"name": BindingName, "version": "source-wrapper", "active_model": self.model_name}
+         reg = self._get_server_info(self.model_name)
+         if reg: info["host_address"] = reg['url']
+         return info
+
+     def tokenize(self, text: str) -> list:
+         try:
+             client = self._get_client()
+             url = client.base_url
+             def do_tokenize():
+                 ep = f"{url}tokenize"
+                 res = requests.post(ep, json={"content": text})
+                 if res.status_code == 404:
+                     res = requests.post(str(url).replace("/v1/", "/tokenize"), json={"content": text})
+                 if res.status_code == 503:
+                     raise openai.InternalServerError("Loading model", response=res, body=None)
+                 return res
+             res = self._execute_with_retry(do_tokenize)
+             if res.status_code == 200: return res.json().get("tokens", [])
+         except: pass
+         return list(text)
+
+     def detokenize(self, tokens: list) -> str:
+         try:
+             client = self._get_client()
+             url = client.base_url
+             def do_detokenize():
+                 ep = f"{url}detokenize"
+                 res = requests.post(ep, json={"tokens": tokens})
+                 if res.status_code == 404:
+                     res = requests.post(str(url).replace("/v1/", "/detokenize"), json={"tokens": tokens})
+                 if res.status_code == 503:
+                     raise openai.InternalServerError("Loading model", response=res, body=None)
+                 return res
+             res = self._execute_with_retry(do_detokenize)
+             if res.status_code == 200: return res.json().get("content", "")
+         except: pass
+         return "".join(map(str, tokens))
+
+     def count_tokens(self, text: str) -> int: return len(self.tokenize(text))
+
+     def embed(self, text: str, **kwargs) -> List[float]:
+         client = self._get_client()
+         def do_embed():
+             return client.embeddings.create(input=text, model=self.model_name)
+         res = self._execute_with_retry(do_embed)
+         return res.data[0].embedding
+
+     def get_zoo(self) -> List[Dict[str, Any]]:
+         return [
+             # Ministral 3: High-performance edge model (3B)
+             {
+                 "name": "Ministral-3-3B-Instruct-2512-GGUF",
+                 "description": "Mistral AI Ministral 3 3B Instruct (Bartowski Quant) - Efficient Edge Model",
+                 "size": "2.2 GB (Q4_K_M)",
+                 "type": "gguf",
+                 "link": "bartowski/mistralai_Ministral-3-3B-Instruct-2512-GGUF",
+                 "filename": "mistralai_Ministral-3-3B-Instruct-2512-Q4_K_M.gguf"
+             },
+             # Devstral 2 Mini: Agentic coding specialist (24B)
+             {
+                 "name": "Devstral-Small-2-24B-Instruct-GGUF",
+                 "description": "Mistral AI Devstral Small 2 24B Instruct (Bartowski Quant) - Coding Specialist",
+                 "size": "14.8 GB (Q4_K_M)",
+                 "type": "gguf",
+                 "link": "bartowski/mistralai_Devstral-Small-2-24B-Instruct-2512-GGUF",
+                 "filename": "mistralai_Devstral-Small-2-24B-Instruct-2512-Q4_K_M.gguf"
+             },
+             # Llama 4 Scout: Meta's efficient MoE (17B)
+             {
+                 "name": "Llama-4-Scout-17B-Instruct-GGUF",
+                 "description": "Meta Llama 4 Scout 17B Instruct (Bartowski Quant) - 16-Expert MoE",
+                 "size": "11.2 GB (Q4_K_M)",
+                 "type": "gguf",
+                 "link": "bartowski/meta-llama_Llama-4-Scout-17B-16E-Instruct-old-GGUF",
+                 "filename": "meta-llama_Llama-4-Scout-17B-16E-Instruct-Q4_K_M.gguf"
+             },
+             # Qwen 3 VL: Vision-Language with "Thinking" (32B)
+             {
+                 "name": "Qwen3-VL-32B-Thinking-GGUF",
+                 "description": "Qwen 3 VL 32B Thinking (Bartowski Quant) - Vision CoT Reasoning",
+                 "size": "19.5 GB (Q4_K_M)",
+                 "type": "gguf",
+                 "link": "bartowski/Qwen_Qwen3-VL-32B-Thinking-GGUF",
+                 "filename": "Qwen_Qwen3-VL-32B-Thinking-Q4_K_M.gguf"
+             },
+             # Qwen 3: Dense reasoning powerhouse (72B)
+             {
+                 "name": "Qwen3-72B-Embiggened-GGUF",
+                 "description": "Qwen 3 72B Embiggened (Bartowski Quant) - Enhanced Reasoning Dense Model",
+                 "size": "43.1 GB (Q4_K_M)",
+                 "type": "gguf",
+                 "link": "bartowski/cognitivecomputations_Qwen3-72B-Embiggened-GGUF",
+                 "filename": "Qwen3-72B-Embiggened-Q4_K_M.gguf"
+             },
+             # Devstral 2: Massive coding architecture (123B)
+             {
+                 "name": "Devstral-2-123B-Instruct-GGUF",
+                 "description": "Mistral AI Devstral 2 123B Instruct (Bartowski Quant) - Heavy Duty Coding",
+                 "size": "71.4 GB (Q4_K_M)",
+                 "type": "gguf",
+                 "link": "bartowski/mistralai_Devstral-2-123B-Instruct-2512-GGUF",
+                 "filename": "Devstral-2-123B-Instruct-2512-Q4_K_M.gguf"
+             },
+             # ChatGPT OSS: Open weights rival (120B)
+             {
+                 "name": "ChatGPT-OSS-120B-GGUF",
+                 "description": "OpenAI GPT-OSS 120B (Bartowski Quant) - Open Weight Research Model",
+                 "size": "69.8 GB (Q4_K_M)",
+                 "type": "gguf",
+                 "link": "bartowski/openai_gpt-oss-120b-GGUF",
+                 "filename": "gpt-oss-120b-Q4_K_M.gguf"
+             },
+             # DeepSeek V3: The MoE Giant (671B Base / 37B Active)
+             {
+                 "name": "DeepSeek-V3-0324-GGUF",
+                 "description": "DeepSeek V3 0324 (Bartowski Quant) - 671B MoE",
+                 "size": "365 GB (Q4_K_M)",
+                 "type": "gguf",
+                 "link": "bartowski/deepseek-ai_DeepSeek-V3-0324-GGUF",
+                 "filename": "DeepSeek-V3-0324-Q4_K_M.gguf"
+             }
+         ]
+
+
+     def download_from_zoo(self, index: int, progress_callback: Callable[[dict], None] = None) -> dict:
+         zoo = self.get_zoo()
+         if index < 0 or index >= len(zoo): return {"status": False, "message": "Index out of bounds"}
+         item = zoo[index]
+         return self.pull_model(item["link"], item.get("filename"), progress_callback=progress_callback)
+
+     def pull_model(self, repo_id: str, filename: str, mmproj_repo_id: str = None, mmproj_filename: str = None, progress_callback: Callable[[dict], None] = None) -> dict:
+         try:
+             match = re.match(r"^(.*)-(\d{5})-of-(\d{5})\.gguf$", filename)
+             files = []
+             if match:
+                 base, total = match.group(1), int(match.group(3))
+                 ASCIIColors.info(f"Detected multi-file model with {total} parts.")
+                 for i in range(1, total + 1): files.append(f"{base}-{i:05d}-of-{total:05d}.gguf")
+             else:
+                 files.append(filename)
+             paths = []
+             for f in files:
+                 ASCIIColors.info(f"Downloading {f} from {repo_id}...")
+                 if progress_callback: progress_callback({"status": "downloading", "message": f"Downloading {f}", "completed": 0, "total": 100})
+                 p = hf_hub_download(repo_id=repo_id, filename=f, local_dir=self.models_dir, local_dir_use_symlinks=False, resume_download=True)
+                 paths.append(p)
+                 ASCIIColors.success(f"Downloaded {f}")
+
+             if mmproj_filename:
+                 proj_repo = mmproj_repo_id if mmproj_repo_id else repo_id
+                 ASCIIColors.info(f"Downloading mmproj {mmproj_filename} from {proj_repo}...")
+                 hf_hub_download(repo_id=proj_repo, filename=mmproj_filename, local_dir=self.models_dir, local_dir_use_symlinks=False, resume_download=True)
+                 ASCIIColors.success(f"Downloaded mmproj {mmproj_filename}")
+                 # Automatically bind the model with its projector
+                 self.bind_multimodal_model(filename, mmproj_filename)
+
+             msg = f"Successfully downloaded model: {filename}"
+             if mmproj_filename: msg += f" and bound with projector: {mmproj_filename}"
+             if progress_callback: progress_callback({"status": "success", "message": msg, "completed": 100, "total": 100})
+             return {"status": True, "message": msg, "path": paths[0]}
+         except Exception as e:
+             trace_exception(e)
+             return {"status": False, "error": str(e)}
+
+     def cleanup_orphans_if_needed(self):
+         pass
+
+     def __del__(self):
+         pass
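
For orientation, here is a minimal usage sketch of the new binding. The import path, model filename, and parameter values are illustrative assumptions and not part of the diff above; only the constructor keywords and method names come from the file itself.

# Hypothetical usage sketch: the module path and the GGUF filename below are assumptions.
from lollms_client.llm_bindings.llamacpp_server import LlamaCppServerBinding

binding = LlamaCppServerBinding(
    models_path="models/llama_cpp_models",  # folder scanned for .gguf files
    ctx_size=4096,
    n_gpu_layers=-1,
    max_active_models=1,                    # LRU cap on concurrently running llama-server processes
)

print([m["model_name"] for m in binding.list_models()])     # locally available GGUF files
if binding.load_model("example-model.Q4_K_M.gguf"):         # hypothetical filename
    print(binding.generate_text("Hello!", n_predict=64))    # spawns or reuses a detached llama-server and queries it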