lollms-client 1.5.6__py3-none-any.whl → 1.7.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. lollms_client/__init__.py +1 -1
  2. lollms_client/llm_bindings/azure_openai/__init__.py +2 -2
  3. lollms_client/llm_bindings/claude/__init__.py +125 -35
  4. lollms_client/llm_bindings/gemini/__init__.py +261 -159
  5. lollms_client/llm_bindings/grok/__init__.py +52 -15
  6. lollms_client/llm_bindings/groq/__init__.py +2 -2
  7. lollms_client/llm_bindings/hugging_face_inference_api/__init__.py +2 -2
  8. lollms_client/llm_bindings/litellm/__init__.py +1 -1
  9. lollms_client/llm_bindings/llama_cpp_server/__init__.py +605 -0
  10. lollms_client/llm_bindings/llamacpp/__init__.py +18 -11
  11. lollms_client/llm_bindings/lollms/__init__.py +76 -21
  12. lollms_client/llm_bindings/lollms_webui/__init__.py +1 -1
  13. lollms_client/llm_bindings/mistral/__init__.py +2 -2
  14. lollms_client/llm_bindings/novita_ai/__init__.py +142 -6
  15. lollms_client/llm_bindings/ollama/__init__.py +345 -89
  16. lollms_client/llm_bindings/open_router/__init__.py +2 -2
  17. lollms_client/llm_bindings/openai/__init__.py +81 -20
  18. lollms_client/llm_bindings/openllm/__init__.py +362 -506
  19. lollms_client/llm_bindings/openwebui/__init__.py +333 -171
  20. lollms_client/llm_bindings/perplexity/__init__.py +2 -2
  21. lollms_client/llm_bindings/pythonllamacpp/__init__.py +3 -3
  22. lollms_client/llm_bindings/tensor_rt/__init__.py +1 -1
  23. lollms_client/llm_bindings/transformers/__init__.py +428 -632
  24. lollms_client/llm_bindings/vllm/__init__.py +1 -1
  25. lollms_client/lollms_agentic.py +4 -2
  26. lollms_client/lollms_base_binding.py +61 -0
  27. lollms_client/lollms_core.py +512 -1890
  28. lollms_client/lollms_discussion.py +65 -39
  29. lollms_client/lollms_llm_binding.py +126 -261
  30. lollms_client/lollms_mcp_binding.py +49 -77
  31. lollms_client/lollms_stt_binding.py +99 -52
  32. lollms_client/lollms_tti_binding.py +38 -38
  33. lollms_client/lollms_ttm_binding.py +38 -42
  34. lollms_client/lollms_tts_binding.py +43 -18
  35. lollms_client/lollms_ttv_binding.py +38 -42
  36. lollms_client/lollms_types.py +4 -2
  37. lollms_client/stt_bindings/whisper/__init__.py +108 -23
  38. lollms_client/stt_bindings/whispercpp/__init__.py +7 -1
  39. lollms_client/tti_bindings/diffusers/__init__.py +464 -803
  40. lollms_client/tti_bindings/diffusers/server/main.py +1062 -0
  41. lollms_client/tti_bindings/gemini/__init__.py +182 -239
  42. lollms_client/tti_bindings/leonardo_ai/__init__.py +6 -3
  43. lollms_client/tti_bindings/lollms/__init__.py +4 -1
  44. lollms_client/tti_bindings/novita_ai/__init__.py +5 -2
  45. lollms_client/tti_bindings/openai/__init__.py +10 -11
  46. lollms_client/tti_bindings/stability_ai/__init__.py +5 -3
  47. lollms_client/ttm_bindings/audiocraft/__init__.py +7 -12
  48. lollms_client/ttm_bindings/beatoven_ai/__init__.py +7 -3
  49. lollms_client/ttm_bindings/lollms/__init__.py +4 -17
  50. lollms_client/ttm_bindings/replicate/__init__.py +7 -4
  51. lollms_client/ttm_bindings/stability_ai/__init__.py +7 -4
  52. lollms_client/ttm_bindings/topmediai/__init__.py +6 -3
  53. lollms_client/tts_bindings/bark/__init__.py +7 -10
  54. lollms_client/tts_bindings/lollms/__init__.py +6 -1
  55. lollms_client/tts_bindings/piper_tts/__init__.py +8 -11
  56. lollms_client/tts_bindings/xtts/__init__.py +157 -74
  57. lollms_client/tts_bindings/xtts/server/main.py +241 -280
  58. {lollms_client-1.5.6.dist-info → lollms_client-1.7.13.dist-info}/METADATA +113 -5
  59. lollms_client-1.7.13.dist-info/RECORD +90 -0
  60. lollms_client-1.5.6.dist-info/RECORD +0 -87
  61. {lollms_client-1.5.6.dist-info → lollms_client-1.7.13.dist-info}/WHEEL +0 -0
  62. {lollms_client-1.5.6.dist-info → lollms_client-1.7.13.dist-info}/licenses/LICENSE +0 -0
  63. {lollms_client-1.5.6.dist-info → lollms_client-1.7.13.dist-info}/top_level.txt +0 -0
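The headline change is the new llama_cpp_server binding (file 9, lollms_client/llm_bindings/llama_cpp_server/__init__.py, +605 lines), whose full diff follows. As a minimal, illustrative sketch of how the class defined in that diff could be driven directly, assuming a GGUF file already sits in the configured models_path (the model filename below is purely illustrative, and only constructor keywords visible in the diff are used):

from lollms_client.llm_bindings.llama_cpp_server import LlamaCppServerBinding

# Keyword arguments mirror the kwargs read in __init__ below.
binding = LlamaCppServerBinding(
    models_path="models/llama_cpp_models",                 # folder scanned for *.gguf files
    model_name="Mistral-7B-Instruct-v0.3.Q4_K_M.gguf",     # illustrative filename
    ctx_size=4096,
    n_gpu_layers=-1,
    max_active_models=1,
)

if binding.load_model(binding.model_name):                  # spawns or reuses a detached llama-server
    print(binding.generate_text("Hello", n_predict=64))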
@@ -0,0 +1,605 @@
+ import subprocess
+ import sys
+ import os
+ import time
+ import requests
+ import socket
+ import re
+ import platform
+ import shutil  # used by install_llama_cpp to normalize the downloaded binary name
+ import zipfile
+ import tarfile
+ import json
+ import atexit
+ from pathlib import Path
+ from typing import Optional, List, Dict, Any, Union, Callable
+
+ import pipmaster as pm
+ from ascii_colors import ASCIIColors, trace_exception
+ from lollms_client.lollms_llm_binding import LollmsLLMBinding
+ from lollms_client.lollms_types import MSG_TYPE
+ from lollms_client.lollms_discussion import LollmsDiscussion
+
+ # Ensure dependencies
+ pm.ensure_packages(["openai", "huggingface_hub", "filelock", "requests", "tqdm", "psutil"])
+ import openai
+ from huggingface_hub import hf_hub_download
+ from filelock import FileLock
+ from tqdm import tqdm
+ import psutil
+
+ BindingName = "LlamaCppServerBinding"
+
+ def get_free_port(start_port=9624, max_port=10000):
+     """
+     Finds a free port on localhost.
+     Race-condition safe-ish: we bind to it to check, but release it immediately.
+     Real safety comes from the FileLock around this call.
+     """
+     for port in range(start_port, max_port):
+         with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
+             try:
+                 sock.bind(('localhost', port))
+                 return port
+             except OSError:
+                 continue
+     raise RuntimeError("No free ports available.")
+
+ class LlamaCppServerBinding(LollmsLLMBinding):
+     def __init__(self, **kwargs):
+         super().__init__(BindingName, **kwargs)
+         self.config = kwargs
+
+         # Configuration
+         self.host = kwargs.get("host", "localhost")
+         self.model_name = kwargs.get("model_name", "")
+         self.n_ctx = kwargs.get("ctx_size", 4096)
+         self.n_gpu_layers = kwargs.get("n_gpu_layers", -1)
+         self.n_threads = kwargs.get("n_threads", None)
+         self.n_parallel = kwargs.get("n_parallel", 1)
+         self.batch_size = kwargs.get("batch_size", 512)
+
+         # Server management
+         self.max_active_models = int(kwargs.get("max_active_models", 1))
+         self.idle_timeout = float(kwargs.get("idle_timeout", -1))
+
+         # Paths
+         self.binding_dir = Path(__file__).parent
+         self.bin_dir = self.binding_dir / "bin"
+         self.models_dir = Path(kwargs.get("models_path", "models/llama_cpp_models")).resolve()
+
+         # Registry directory for inter-process coordination
+         self.servers_dir = self.models_dir / "servers"
+         self.servers_dir.mkdir(parents=True, exist_ok=True)
+         self.bin_dir.mkdir(exist_ok=True)
+
+         # Global lock file for all operations on the registry
+         self.global_lock_path = self.models_dir / "global_server_manager.lock"
+
+         # Installation check
+         if not self._get_server_executable().exists():
+             ASCIIColors.warning("Llama.cpp binary not found. Attempting installation...")
+             self.install_llama_cpp()
+
+         # Register cleanup for this process
+         atexit.register(self.cleanup_orphans_if_needed)
+
+     def _get_server_executable(self) -> Path:
+         if platform.system() == "Windows":
+             return self.bin_dir / "llama-server.exe"
+         else:
+             return self.bin_dir / "llama-server"
+
+     def detect_hardware(self) -> str:
+         sys_plat = platform.system()
+         if sys_plat == "Darwin":
+             return "macos"
+         try:
+             subprocess.check_call(["nvidia-smi"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+             return "cuda"
+         except:
+             pass
+         return "cpu"
+
+     def install_llama_cpp(self):
+         try:
+             ASCIIColors.info("Checking latest llama.cpp release...")
+             releases_url = "https://api.github.com/repos/ggerganov/llama.cpp/releases/latest"
+             response = requests.get(releases_url)
+             response.raise_for_status()
+             release_data = response.json()
+             assets = release_data.get("assets", [])
+
+             hardware = self.detect_hardware()
+             sys_plat = platform.system()
+
+             target_asset = None
+             search_terms = []
+
+             if sys_plat == "Windows":
+                 search_terms.append("win")
+                 search_terms.append("cuda" if hardware == "cuda" else "avx2")
+                 search_terms.append("x64")
+             elif sys_plat == "Linux":
+                 search_terms.append("ubuntu")
+                 search_terms.append("x64")
+             elif sys_plat == "Darwin":
+                 search_terms.append("macos")
+                 search_terms.append("arm64" if platform.machine() == "arm64" else "x64")
+
+             for asset in assets:
+                 name = asset["name"].lower()
+                 if "cudart" in name: continue
+                 if all(term in name for term in search_terms):
+                     if "cuda" in name and "cu11" in name and hardware == "cuda": continue
+                     target_asset = asset
+                     break
+
+             # Windows CPU fallback
+             if not target_asset and sys_plat == "Windows" and hardware == "cpu":
+                 for asset in assets:
+                     if "cudart" in asset["name"].lower(): continue
+                     if "win" in asset["name"].lower() and "x64" in asset["name"].lower() and "cuda" not in asset["name"].lower():
+                         target_asset = asset
+                         break
+
+             if not target_asset:
+                 raise RuntimeError(f"No suitable binary found for {sys_plat} / {hardware}")
+
+             download_url = target_asset["browser_download_url"]
+             filename = target_asset["name"]
+             dest_file = self.bin_dir / filename
+
+             ASCIIColors.info(f"Downloading {filename}...")
+             with requests.get(download_url, stream=True) as r:
+                 r.raise_for_status()
+                 with open(dest_file, 'wb') as f:
+                     for chunk in r.iter_content(chunk_size=8192):
+                         f.write(chunk)
+
+             ASCIIColors.info("Extracting...")
+             if filename.endswith(".zip"):
+                 with zipfile.ZipFile(dest_file, 'r') as z: z.extractall(self.bin_dir)
+             elif filename.endswith(".tar.gz"):
+                 with tarfile.open(dest_file, "r:gz") as t: t.extractall(self.bin_dir)
+
+             dest_file.unlink()
+
+             # Normalize binary name
+             exe_name = "llama-server.exe" if sys_plat == "Windows" else "llama-server"
+             legacy_name = "server.exe" if sys_plat == "Windows" else "server"
+             if not (self.bin_dir / exe_name).exists() and (self.bin_dir / legacy_name).exists():
+                 shutil.move(str(self.bin_dir / legacy_name), str(self.bin_dir / exe_name))
+
+             if sys_plat != "Windows":
+                 exe_path = self.bin_dir / exe_name
+                 if exe_path.exists(): os.chmod(exe_path, 0o755)
+
+             ASCIIColors.success("Llama.cpp installed successfully.")
+         except Exception as e:
+             trace_exception(e)
+             ASCIIColors.error(f"Failed to install llama.cpp: {e}")
+
+     # --- Server Management Logic ---
+
+     def _get_registry_file(self, model_name: str) -> Path:
+         # Sanitize filename
+         safe_name = "".join(c for c in model_name if c.isalnum() or c in ('-', '_', '.'))
+         return self.servers_dir / f"{safe_name}.json"
+
+     def _get_server_info(self, model_name: str) -> Optional[Dict]:
+         """Reads the registry file for a model; returns a dict, or None if invalid."""
+         reg_file = self._get_registry_file(model_name)
+         if not reg_file.exists():
+             return None
+
+         try:
+             with open(reg_file, 'r') as f:
+                 info = json.load(f)
+
+             # Verify process is alive
+             if psutil.pid_exists(info['pid']):
+                 # Verify it's actually llama-server (optional but safe)
+                 try:
+                     p = psutil.Process(info['pid'])
+                     if "llama" in p.name().lower() or "server" in p.name().lower():
+                         return info
+                 except (psutil.NoSuchProcess, psutil.AccessDenied):
+                     pass
+
+             # If we get here, the process is dead or invalid
+             ASCIIColors.warning(f"Found stale registry file for {model_name} (PID {info['pid']}). Cleaning up.")
+             reg_file.unlink()
+             return None
+         except Exception:
+             # Corrupt file
+             if reg_file.exists(): reg_file.unlink()
+             return None
+
+     def _kill_server(self, model_name: str, info: Dict):
+         """Kills a server process and removes its registry file."""
+         ASCIIColors.info(f"Stopping server for {model_name} (PID {info['pid']})...")
+         try:
+             p = psutil.Process(info['pid'])
+             p.terminate()
+             p.wait(timeout=5)
+         except psutil.NoSuchProcess:
+             pass  # Already gone
+         except psutil.TimeoutExpired:
+             p.kill()
+         except Exception as e:
+             ASCIIColors.error(f"Error killing process: {e}")
+
+         # Remove registry file
+         reg_file = self._get_registry_file(model_name)
+         if reg_file.exists():
+             reg_file.unlink()
+
+     def _ensure_capacity_locked(self):
+         """
+         Called while holding the lock. Ensures we have space for a new model.
+         """
+         registry_files = list(self.servers_dir.glob("*.json"))
+
+         # 1. Clean up stale entries first
+         valid_servers = []
+         for rf in registry_files:
+             try:
+                 with open(rf, 'r') as f:
+                     data = json.load(f)
+                 if psutil.pid_exists(data['pid']):
+                     valid_servers.append((rf, data))
+                 else:
+                     rf.unlink()  # Clean stale
+             except:
+                 if rf.exists(): rf.unlink()
+
+         # 2. Check capacity
+         if len(valid_servers) >= self.max_active_models:
+             # Sort by file modification time (mtime), which acts as our "last used" heartbeat.
+             # Oldest mtime = least recently used.
+             valid_servers.sort(key=lambda x: x[0].stat().st_mtime)
+
+             # Kill the oldest
+             oldest_file, oldest_info = valid_servers[0]
+             model_to_kill = oldest_info.get("model_name", "unknown")
+             ASCIIColors.warning(f"Max active models ({self.max_active_models}) reached. Unloading LRU model: {model_to_kill}")
+             self._kill_server(model_to_kill, oldest_info)
+
+     def _spawn_server_detached(self, model_name: str):
+         """Spawns the server process detached so it survives if this python script ends."""
+         exe_path = self._get_server_executable()
+         model_path = self.models_dir / model_name
+
+         if not model_path.exists():
+             raise FileNotFoundError(f"Model {model_name} not found at {model_path}")
+
+         port = get_free_port()
+
+         cmd = [
+             str(exe_path),
+             "--model", str(model_path),
+             "--host", self.host,
+             "--port", str(port),
+             "--ctx-size", str(self.n_ctx),
+             "--n-gpu-layers", str(self.n_gpu_layers),
+             "--parallel", str(self.n_parallel),
+             "--batch-size", str(self.batch_size),
+             "--embedding"
+         ]
+
+         if self.n_threads:
+             cmd.extend(["--threads", str(self.n_threads)])
+
+         ASCIIColors.info(f"Spawning server for {model_name} on port {port}...")
+
+         # Process creation flags for detachment
+         kwargs = {}
+         if platform.system() == "Windows":
+             kwargs['creationflags'] = subprocess.DETACHED_PROCESS | subprocess.CREATE_NEW_PROCESS_GROUP
+         else:
+             kwargs['start_new_session'] = True
+
+         proc = subprocess.Popen(
+             cmd,
+             stdout=subprocess.DEVNULL,
+             stderr=subprocess.DEVNULL,
+             **kwargs
+         )
+
+         # Wait for health check (wait until HTTP 200 OK)
+         url = f"http://{self.host}:{port}/v1"
+         start_time = time.time()
+         # Increased timeout to 120s for larger models
+         while time.time() - start_time < 120:
+             try:
+                 res = requests.get(f"{url}/models", timeout=1)
+                 # Strictly check for 200, as 503 means the model is still loading
+                 if res.status_code == 200:
+                     return proc.pid, port, url
+             except:
+                 pass
+
+             if proc.poll() is not None:
+                 raise RuntimeError(f"Server process exited immediately with code {proc.returncode}")
+
+             time.sleep(0.5)
+
+         # Timeout
+         proc.terminate()
+         raise TimeoutError(f"Server for {model_name} failed to become responsive (timeout).")
+
+
+     def load_model(self, model_name: str) -> bool:
+         """
+         Thread-safe and process-safe model loading.
+         """
+         if not self.global_lock_path.parent.exists():
+             self.global_lock_path.parent.mkdir(parents=True)
+
+         lock = FileLock(str(self.global_lock_path))
+
+         try:
+             with lock.acquire(timeout=60):
+                 info = self._get_server_info(model_name)
+
+                 if info:
+                     # Update heartbeat
+                     try:
+                         self._get_registry_file(model_name).touch()
+                     except:
+                         pass
+                     self.model_name = model_name
+                     return True
+
+                 self._ensure_capacity_locked()
+                 pid, port, url = self._spawn_server_detached(model_name)
+
+                 reg_file = self._get_registry_file(model_name)
+                 with open(reg_file, 'w') as f:
+                     json.dump({
+                         "model_name": model_name,
+                         "pid": pid,
+                         "port": port,
+                         "url": url,
+                         "started_at": time.time()
+                     }, f)
+
+                 self.model_name = model_name
+                 return True
+
+         except Exception as e:
+             ASCIIColors.error(f"Error loading model {model_name}: {e}")
+             trace_exception(e)
+             return False
+
+     def _get_client(self, model_name: str = None) -> openai.OpenAI:
+         target_model = model_name or self.model_name
+         if not target_model:
+             raise ValueError("No model specified.")
+
+         info = self._get_server_info(target_model)
+
+         if not info:
+             if self.load_model(target_model):
+                 info = self._get_server_info(target_model)
+             else:
+                 raise RuntimeError(f"Could not load model {target_model}")
+         else:
+             try:
+                 self._get_registry_file(target_model).touch()
+             except:
+                 pass
+
+         if not info:
+             raise RuntimeError(f"Model {target_model} failed to load.")
+
+         return openai.OpenAI(base_url=info['url'], api_key="sk-no-key-required")
+
+     def _execute_with_retry(self, func: Callable, *args, **kwargs):
+         """
+         Executes an API call with retries for 503 (model loading) errors.
+         """
+         retries = 60  # Wait up to ~2 minutes
+         for i in range(retries):
+             try:
+                 return func(*args, **kwargs)
+             except openai.InternalServerError as e:
+                 # Catch 503 "loading model"
+                 if e.status_code == 503:
+                     if i % 10 == 0:  # Reduce log spam
+                         ASCIIColors.warning(f"Model is loading (503). Waiting... ({i+1}/{retries})")
+                     time.sleep(2)
+                     continue
+                 raise e
+             except openai.APIConnectionError:
+                 # Server might be briefly unreachable during heavy load or restart
+                 if i % 10 == 0:
+                     ASCIIColors.warning(f"Connection error. Waiting... ({i+1}/{retries})")
+                 time.sleep(2)
+                 continue
+         # Final attempt
+         return func(*args, **kwargs)
+
+     def generate_text(self, prompt: str, n_predict: int = None, stream: bool = False, **kwargs) -> Union[str, Dict]:
+         try:
+             client = self._get_client()
+
+             def do_gen():
+                 return client.completions.create(
+                     model=self.model_name,
+                     prompt=prompt,
+                     max_tokens=n_predict if n_predict else 1024,
+                     temperature=kwargs.get("temperature", 0.7),
+                     top_p=kwargs.get("top_p", 0.9),
+                     stream=stream,
+                     extra_body={
+                         "top_k": kwargs.get("top_k", 40),
+                         "repeat_penalty": kwargs.get("repeat_penalty", 1.1),
+                         "n_predict": n_predict
+                     }
+                 )
+
+             completion = self._execute_with_retry(do_gen)
+
+             if stream:
+                 full_text = ""
+                 for chunk in completion:
+                     content = chunk.choices[0].text
+                     full_text += content
+                     if kwargs.get("streaming_callback"):
+                         if not kwargs["streaming_callback"](content, MSG_TYPE.MSG_TYPE_CHUNK):
+                             break
+                 return full_text
+             else:
+                 return completion.choices[0].text
+         except Exception as e:
+             trace_exception(e)
+             return {"status": False, "error": str(e)}
+
+     def chat(self, discussion: LollmsDiscussion, **kwargs) -> Union[str, Dict]:
+         try:
+             client = self._get_client()
+             messages = discussion.export("openai_chat")
+
+             def do_chat():
+                 return client.chat.completions.create(
+                     model=self.model_name,
+                     messages=messages,
+                     max_tokens=kwargs.get("n_predict", 1024),
+                     temperature=kwargs.get("temperature", 0.7),
+                     stream=kwargs.get("stream", False),
+                     extra_body={
+                         "top_k": kwargs.get("top_k", 40),
+                         "repeat_penalty": kwargs.get("repeat_penalty", 1.1)
+                     }
+                 )
+
+             response = self._execute_with_retry(do_chat)
+
+             if kwargs.get("stream", False):
+                 full_text = ""
+                 for chunk in response:
+                     content = chunk.choices[0].delta.content or ""
+                     full_text += content
+                     if kwargs.get("streaming_callback"):
+                         if not kwargs["streaming_callback"](content, MSG_TYPE.MSG_TYPE_CHUNK):
+                             break
+                 return full_text
+             else:
+                 return response.choices[0].message.content
+         except Exception as e:
+             trace_exception(e)
+             return {"status": False, "error": str(e)}
+
+     def list_models(self) -> List[Dict[str, Any]]:
+         models = []
+         if self.models_dir.exists():
+             for f in self.models_dir.glob("*.gguf"):
+                 if re.search(r'-\d{5}-of-\d{5}\.gguf$', f.name):
+                     if "00001-of-" not in f.name: continue
+                 models.append({"model_name": f.name, "owned_by": "local", "created": time.ctime(f.stat().st_ctime), "size": f.stat().st_size})
+         return models
+
+     def get_model_info(self) -> dict:
+         info = {"name": BindingName, "version": "source-wrapper", "active_model": self.model_name}
+         reg = self._get_server_info(self.model_name)
+         if reg: info["host_address"] = reg['url']
+         return info
+
+     def tokenize(self, text: str) -> list:
+         try:
+             client = self._get_client()
+             url = client.base_url
+
+             def do_tokenize():
+                 # Llama-server specific endpoint
+                 ep = f"{url}tokenize"
+                 # Strip v1/ if present because tokenize is often at root in older llama-server,
+                 # but in recent versions it might be under v1 or root. We try robustly.
+                 res = requests.post(ep, json={"content": text})
+                 if res.status_code == 404:
+                     res = requests.post(str(url).replace("/v1/", "/tokenize"), json={"content": text})
+
+                 if res.status_code == 503:
+                     raise openai.InternalServerError("Loading model", response=res, body=None)
+                 return res
+
+             res = self._execute_with_retry(do_tokenize)
+             if res.status_code == 200: return res.json().get("tokens", [])
+         except: pass
+         return list(text)
+
+     def detokenize(self, tokens: list) -> str:
+         try:
+             client = self._get_client()
+             url = client.base_url
+
+             def do_detokenize():
+                 ep = f"{url}detokenize"
+                 res = requests.post(ep, json={"tokens": tokens})
+                 if res.status_code == 404:
+                     res = requests.post(str(url).replace("/v1/", "/detokenize"), json={"tokens": tokens})
+
+                 if res.status_code == 503:
+                     raise openai.InternalServerError("Loading model", response=res, body=None)
+                 return res
+
+             res = self._execute_with_retry(do_detokenize)
+             if res.status_code == 200: return res.json().get("content", "")
+         except: pass
+         return "".join(map(str, tokens))
+
+     def count_tokens(self, text: str) -> int: return len(self.tokenize(text))
+
+     def embed(self, text: str, **kwargs) -> List[float]:
+         client = self._get_client()
+         def do_embed():
+             return client.embeddings.create(input=text, model=self.model_name)
+         res = self._execute_with_retry(do_embed)
+         return res.data[0].embedding
+
+     def get_zoo(self) -> List[Dict[str, Any]]:
+         return [
+             {"name": "Llama-3-8B-Instruct-v0.1-GGUF", "description": "Meta Llama 3 8B Instruct (Quantized)", "size": "5.7 GB (Q5_K_M)", "type": "gguf", "link": "MaziyarPanahi/Meta-Llama-3-8B-Instruct-GGUF", "filename": "Meta-Llama-3-8B-Instruct.Q5_K_M.gguf"},
+             {"name": "Phi-3-mini-4k-instruct-GGUF", "description": "Microsoft Phi 3 Mini 4k (Quantized)", "size": "2.4 GB (Q4_K_M)", "type": "gguf", "link": "microsoft/Phi-3-mini-4k-instruct-gguf", "filename": "Phi-3-mini-4k-instruct-q4.gguf"},
+             {"name": "Mistral-7B-Instruct-v0.3-GGUF", "description": "Mistral 7B Instruct v0.3 (Quantized)", "size": "4.6 GB (Q4_K_M)", "type": "gguf", "link": "MaziyarPanahi/Mistral-7B-Instruct-v0.3-GGUF", "filename": "Mistral-7B-Instruct-v0.3.Q4_K_M.gguf"},
+             {"name": "Qwen2.5-7B-Instruct-GGUF", "description": "Qwen 2.5 7B Instruct (Quantized)", "size": "4.7 GB (Q5_K_M)", "type": "gguf", "link": "Qwen/Qwen2.5-7B-Instruct-GGUF", "filename": "qwen2.5-7b-instruct-q5_k_m.gguf"}
+         ]
+
+     def download_from_zoo(self, index: int, progress_callback: Callable[[dict], None] = None) -> dict:
+         zoo = self.get_zoo()
+         if index < 0 or index >= len(zoo): return {"status": False, "message": "Index out of bounds"}
+         item = zoo[index]
+         return self.pull_model(item["link"], item.get("filename"), progress_callback)
+
+     def pull_model(self, repo_id: str, filename: str, progress_callback: Callable[[dict], None] = None) -> dict:
+         try:
+             match = re.match(r"^(.*)-(\d{5})-of-(\d{5})\.gguf$", filename)
+             files = []
+             if match:
+                 base, total = match.group(1), int(match.group(3))
+                 ASCIIColors.info(f"Detected multi-file model with {total} parts.")
+                 for i in range(1, total + 1): files.append(f"{base}-{i:05d}-of-{total:05d}.gguf")
+             else:
+                 files.append(filename)
+
+             paths = []
+             for f in files:
+                 ASCIIColors.info(f"Downloading {f} from {repo_id}...")
+                 if progress_callback: progress_callback({"status": "downloading", "message": f"Downloading {f}", "completed": 0, "total": 100})
+                 p = hf_hub_download(repo_id=repo_id, filename=f, local_dir=self.models_dir, local_dir_use_symlinks=False, resume_download=True)
+                 paths.append(p)
+                 ASCIIColors.success(f"Downloaded {f}")
+
+             msg = f"Successfully downloaded model: {filename}"
+             if progress_callback: progress_callback({"status": "success", "message": msg, "completed": 100, "total": 100})
+             return {"status": True, "message": msg, "path": paths[0]}
+         except Exception as e:
+             trace_exception(e)
+             return {"status": False, "error": str(e)}
+
+     def cleanup_orphans_if_needed(self):
+         pass
+
+     def __del__(self):
+         pass
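Cross-process coordination in the binding above is file-based: load_model() writes one JSON file per running server under <models_path>/servers/, reuse touch()es that file as a heartbeat, and _ensure_capacity_locked() evicts the least recently used server by file mtime. A small inspection sketch, assuming the same models_path as in the earlier example (the printed values are illustrative):

import json, time
from pathlib import Path

servers_dir = Path("models/llama_cpp_models") / "servers"    # registry directory used by the binding
for reg_file in servers_dir.glob("*.json"):
    info = json.loads(reg_file.read_text())
    # Keys written by load_model(): model_name, pid, port, url, started_at
    idle = time.time() - reg_file.stat().st_mtime            # mtime doubles as the LRU heartbeat
    print(f"{info['model_name']}: pid={info['pid']} url={info['url']} idle for ~{idle:.0f}s")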
@@ -66,20 +66,27 @@ pm.ensure_packages(["requests", "pillow", "psutil"]) # pillow for dummy image in
  if not pm.is_installed("llama-cpp-binaries"):
      def install_llama_cpp():
          system = platform.system()
-         python_version_simple = f"py{sys.version_info.major}" # e.g. py310 for 3.10
-
-         cuda_suffix = "+cu124"
+         python_version_simple = f"py{sys.version_info.major}{sys.version_info.minor}" # e.g. py310 for 3.10

+         version_tag = "v0.56.0"
+         cuda_suffix = "+cu124"

          if system == "Windows":
-             url = f"https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0{cuda_suffix}-{python_version_simple}-none-win_amd64.whl"
-             fallback_url = "https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cu124-py3-none-win_amd64.whl" # Generic py3
+             # Try version-specific URL first
+             url = f"https://github.com/oobabooga/llama-cpp-binaries/releases/download/{version_tag}/llama_cpp_binaries-{version_tag.lstrip('v')}{cuda_suffix}-{python_version_simple}-none-win_amd64.whl"
+             # Fallback to generic py3 if version-specific doesn't exist
+             fallback_url = f"https://github.com/oobabooga/llama-cpp-binaries/releases/download/{version_tag}/llama_cpp_binaries-{version_tag.lstrip('v')}{cuda_suffix}-py3-none-win_amd64.whl"
          elif system == "Linux":
-             url = f"https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0{cuda_suffix}-{python_version_simple}-none-linux_x86_64.whl"
-             fallback_url = "https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cu124-py3-none-linux_x86_64.whl" # Generic py3
+             # Try version-specific URL first
+             url = f"https://github.com/oobabooga/llama-cpp-binaries/releases/download/{version_tag}/llama_cpp_binaries-{version_tag.lstrip('v')}{cuda_suffix}-{python_version_simple}-none-linux_x86_64.whl"
+             # Fallback to generic py3 if version-specific doesn't exist
+             fallback_url = f"https://github.com/oobabooga/llama-cpp-binaries/releases/download/{version_tag}/llama_cpp_binaries-{version_tag.lstrip('v')}{cuda_suffix}-py3-none-linux_x86_64.whl"
          else:
-             ASCIIColors.warning(f"Unsupported OS for prebuilt llama-cpp-binaries: {system}. Please install manually.")
-             return
+             ASCIIColors.error(f"Unsupported OS for precompiled llama-cpp-binaries: {system}. "
+                               "You might need to set 'llama_server_binary_path' in the binding config "
+                               "to point to a manually compiled llama.cpp server binary.")
+             return False
+

          ASCIIColors.info(f"Attempting to install llama-cpp-binaries from: {url}")
          try:
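The hunk above moves the llama-cpp-binaries wheel URL from a pinned v0.39.0 release with a major-only Python tag to v0.56.0 with the running interpreter's major.minor tag, keeping a generic py3 wheel as fallback. For illustration only, the string the updated code builds, assuming Python 3.11 on a CUDA Linux machine (whether that exact wheel exists on the release page is not verified here):

# Hypothetical values standing in for the runtime-derived ones
version_tag = "v0.56.0"
cuda_suffix = "+cu124"
python_version_simple = "py311"   # f"py{sys.version_info.major}{sys.version_info.minor}"
url = (f"https://github.com/oobabooga/llama-cpp-binaries/releases/download/{version_tag}/"
       f"llama_cpp_binaries-{version_tag.lstrip('v')}{cuda_suffix}-{python_version_simple}-none-linux_x86_64.whl")
print(url)
# -> https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cu124-py311-none-linux_x86_64.whl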
@@ -628,7 +635,7 @@ class LlamaCppServerBinding(LollmsLLMBinding):

          if not model_to_load:
              self._scan_models()
-             available_models = self.listModels()
+             available_models = self.list_models()
              if not available_models:
                  ASCIIColors.error("No model specified and no GGUF models found in models path.")
                  return False
@@ -964,7 +971,7 @@ class LlamaCppServerBinding(LollmsLLMBinding):

          ASCIIColors.info(f"Scanned {len(self._model_path_map)} models from {self.models_path}.")

-     def listModels(self) -> List[Dict[str, Any]]:
+     def list_models(self) -> List[Dict[str, Any]]:
          self._scan_models()
          models_found = []
          for unique_name, model_path in self._model_path_map.items():
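These last hunks rename listModels to list_models, matching the snake_case method exposed by the new llama_cpp_server binding. A call-site sketch, reusing the binding object from the first example (the keys follow the dicts built by that binding's list_models(); the renamed method in this file may return a slightly different shape):

for m in binding.list_models():
    print(m.get("model_name"), m.get("size"))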