lollms-client 0.12.6__py3-none-any.whl → 0.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (34)
  1. examples/article_summary/article_summary.py +58 -0
  2. examples/deep_analyze/deep_analyse.py +30 -0
  3. examples/deep_analyze/deep_analyze_multiple_files.py +32 -0
  4. examples/function_call/functions_call_with images.py +52 -0
  5. examples/personality_test/chat_test.py +37 -0
  6. examples/personality_test/chat_with_aristotle.py +42 -0
  7. examples/personality_test/tesks_test.py +62 -0
  8. examples/simple_text_gen_test.py +171 -0
  9. examples/simple_text_gen_with_image_test.py +166 -0
  10. examples/test_local_models/local_chat.py +9 -0
  11. examples/text_2_audio.py +77 -0
  12. examples/text_2_image.py +140 -0
  13. examples/text_and_image_2_audio.py +59 -0
  14. examples/text_gen.py +28 -0
  15. lollms_client/__init__.py +2 -1
  16. lollms_client/llm_bindings/lollms/__init__.py +13 -11
  17. lollms_client/llm_bindings/ollama/__init__.py +8 -7
  18. lollms_client/llm_bindings/openai/__init__.py +69 -29
  19. lollms_client/llm_bindings/tensor_rt/__init__.py +603 -0
  20. lollms_client/llm_bindings/transformers/__init__.py +7 -11
  21. lollms_client/llm_bindings/vllm/__init__.py +603 -0
  22. lollms_client/lollms_core.py +0 -3
  23. lollms_client/lollms_llm_binding.py +5 -25
  24. {lollms_client-0.12.6.dist-info → lollms_client-0.13.0.dist-info}/METADATA +12 -12
  25. lollms_client-0.13.0.dist-info/RECORD +52 -0
  26. {lollms_client-0.12.6.dist-info → lollms_client-0.13.0.dist-info}/WHEEL +1 -1
  27. {lollms_client-0.12.6.dist-info → lollms_client-0.13.0.dist-info}/top_level.txt +1 -0
  28. lollms_client/lollms_personality.py +0 -403
  29. lollms_client/lollms_personality_worker.py +0 -1485
  30. lollms_client/lollms_stt.py +0 -35
  31. lollms_client/lollms_tti.py +0 -35
  32. lollms_client/lollms_tts.py +0 -39
  33. lollms_client-0.12.6.dist-info/RECORD +0 -41
  34. {lollms_client-0.12.6.dist-info → lollms_client-0.13.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,603 @@
+# lollms_client/llm_bindings/vllm/__init__.py
+
+import os
+import shutil
+from pathlib import Path
+from typing import Optional, Callable, List, Union, Dict, Any, Tuple
+import json
+import threading
+import gc
+import importlib.util
+import platform
+
+def detect_os():
+    system = platform.system()
+    if system == "Windows":
+        return "Windows"
+    elif system == "Linux":
+        return "Linux"
+    elif system == "Darwin":
+        return "macOS"
+    else:
+        return "Unknown OS"
+
+if detect_os() == "Windows":
+    raise Exception("Windows is not supported by vLLM; use WSL.")
+
+# --- Package Management and Conditional Imports ---
+try:
+    # Pipmaster is assumed to be installed by the parent lollms_client.
+    # We ensure specific packages for this binding.
+
+    # Check if vllm is already importable to avoid re-running ensure_packages unnecessarily
+    # on subsequent imports within the same session if it was successful once.
+    _vllm_already_imported = 'vllm' in globals() or importlib.util.find_spec('vllm') is not None
+
+    if not _vllm_already_imported:
+        import pipmaster as pm  # Assuming pipmaster is available
+        pm.ensure_packages([
+            "vllm",
+            "torch",
+            "transformers>=4.37.0",
+            "huggingface_hub>=0.20.0",
+            "pillow"
+        ])
+
+    from vllm import LLM, SamplingParams
+    from PIL import Image
+    import torch
+    from transformers import AutoTokenizer
+    from huggingface_hub import hf_hub_download, HfFileSystem, snapshot_download
+    import vllm  # To get __version__
+
+    _vllm_deps_installed = True
+    _vllm_installation_error = None
+except Exception as e:
+    _vllm_deps_installed = False
+    _vllm_installation_error = e
+    # Define placeholders if imports fail
+    LLM, SamplingParams, Image = None, None, None
+    torch, AutoTokenizer, hf_hub_download, HfFileSystem, snapshot_download, vllm = None, None, None, None, None, None
+
+
+# --- LOLLMS Client Imports ---
+from lollms_client.lollms_llm_binding import LollmsLLMBinding
+from lollms_client.lollms_types import MSG_TYPE, ELF_COMPLETION_FORMAT  # Assuming ELF_COMPLETION_FORMAT is in lollms_types
+from ascii_colors import ASCIIColors, trace_exception
+
+
+# --- Constants ---
+BindingName = "VLLMBinding"
+DEFAULT_models_folder = Path.home() / ".lollms" / "bindings_models" / "vllm_models"
+
+
+# --- VLLM Engine Manager ---
+class VLLMEngineManager:
+    _instance = None
+    _lock = threading.Lock()
+
+    def __new__(cls, *args, **kwargs):
+        if not _vllm_deps_installed:
+            raise RuntimeError(f"vLLM or its dependencies not installed. Cannot create VLLMEngineManager. Error: {_vllm_installation_error}")
+        with cls._lock:
+            if cls._instance is None:
+                cls._instance = super().__new__(cls)
+                cls._instance._initialized = False
+            return cls._instance
+
+    def __init__(self):
+        if hasattr(self, '_initialized') and self._initialized:
+            return
+        with self._lock:
+            if hasattr(self, '_initialized') and self._initialized:
+                return
+            # Stores: key -> (LLM_engine, tokenizer, ref_count, engine_kwargs_tuple_key)
+            self._engines: Dict[Tuple, Tuple[Optional[LLM], Optional[Any], int, Tuple]] = {}
+            self._engine_locks: Dict[Tuple, threading.Lock] = {}  # Per-engine initialization lock
+            self._initialized = True
+            ASCIIColors.green("VLLMEngineManager initialized.")
+
+    def _get_engine_config_key(self, resolved_model_path: Path, engine_params: Dict[str, Any]) -> Tuple:
+        critical_params = [
+            'tensor_parallel_size', 'quantization', 'dtype', 'max_model_len',
+            'trust_remote_code', 'enforce_eager', 'gpu_memory_utilization',
+            'swap_space', 'max_num_seqs', 'max_num_batched_tokens', 'tokenizer', 'tokenizer_mode',
+            'image_input_type', 'image_token_id', 'image_feature_size', 'image_input_shape'  # Common vision params
+        ]
+        key_parts = [str(resolved_model_path)]
+        for param_name in sorted(critical_params):
+            if param_name in engine_params:
+                value = engine_params[param_name]
+                # Make common mutable types hashable for the key
+                if isinstance(value, list): value = tuple(value)
+                elif isinstance(value, dict): value = tuple(sorted(value.items()))
+                key_parts.append((param_name, value))
+        return tuple(key_parts)
+
+    def get_engine(self,
+                   resolved_model_path: Path,
+                   is_gguf: bool,
+                   engine_params: Dict[str, Any]
+                   ) -> Tuple[Optional[LLM], Optional[Any]]:
+
+        engine_key = self._get_engine_config_key(resolved_model_path, engine_params)
+
+        with self._lock:
+            if engine_key not in self._engine_locks:
+                self._engine_locks[engine_key] = threading.Lock()
+
+        with self._engine_locks[engine_key]:
+            with self._lock:
+                if engine_key in self._engines:
+                    llm_engine, tokenizer, ref_count, _ = self._engines[engine_key]
+                    self._engines[engine_key] = (llm_engine, tokenizer, ref_count + 1, engine_key)
+                    ASCIIColors.info(f"Reusing vLLM engine for {resolved_model_path.name}. Key: {engine_key}. Ref count: {ref_count + 1}")
+                    return llm_engine, tokenizer
+
+            ASCIIColors.info(f"Creating new vLLM engine for {resolved_model_path.name} with key: {engine_key}")
+            try:
+                llm_args = {"model": str(resolved_model_path), **engine_params}
+                if is_gguf and "quantization" not in llm_args:  # Only set if not overridden by user
+                    llm_args["quantization"] = "gguf"
+
+                new_llm_engine = LLM(**llm_args)
+                new_tokenizer = None
+                try:
+                    if hasattr(new_llm_engine, 'get_tokenizer'):
+                        new_tokenizer = new_llm_engine.get_tokenizer()
+                    else: raise AttributeError("get_tokenizer not on LLM object.")
+                except Exception as e_vllm_tok:
+                    ASCIIColors.warning(f"vLLM engine tokenizer error ({e_vllm_tok}). Loading with AutoTokenizer.")
+                    tok_path_hint = engine_params.get('tokenizer', str(resolved_model_path.parent if is_gguf else resolved_model_path))
+                    if not Path(tok_path_hint).exists() and "/" not in tok_path_hint:
+                        tok_path_hint = str(resolved_model_path.parent if is_gguf else resolved_model_path)
+                    try:
+                        new_tokenizer = AutoTokenizer.from_pretrained(
+                            tok_path_hint, trust_remote_code=engine_params.get("trust_remote_code", False)
+                        )
+                    except Exception as e_hf_tok:
+                        ASCIIColors.error(f"AutoTokenizer failed for {tok_path_hint}: {e_hf_tok}")
+
+                with self._lock:
+                    self._engines[engine_key] = (new_llm_engine, new_tokenizer, 1, engine_key)
+                ASCIIColors.green(f"New vLLM engine for {resolved_model_path.name} created. Ref count: 1")
+                return new_llm_engine, new_tokenizer
+
+            except Exception as e:
+                trace_exception(e)
+                ASCIIColors.error(f"Failed to create vLLM engine for {resolved_model_path.name}: {e}")
+                return None, None
+
+    def release_engine(self, resolved_model_path: Path, engine_params: Dict[str, Any]):
+        engine_key = self._get_engine_config_key(resolved_model_path, engine_params)
+        with self._lock:
+            if engine_key in self._engines:
+                llm_engine, tokenizer, ref_count, _ = self._engines[engine_key]
+                if ref_count <= 1:
+                    ASCIIColors.info(f"Releasing vLLM engine for {resolved_model_path.name} (key: {engine_key}). Final reference.")
+                    del self._engines[engine_key]
+                    if engine_key in self._engine_locks: del self._engine_locks[engine_key]
+                    del llm_engine
+                    del tokenizer
+                    if torch and torch.cuda.is_available(): torch.cuda.empty_cache()
+                    gc.collect()
+                    ASCIIColors.green(f"Engine for {resolved_model_path.name} removed.")
+                else:
+                    self._engines[engine_key] = (llm_engine, tokenizer, ref_count - 1, engine_key)
+                    ASCIIColors.info(f"Decremented ref count for {resolved_model_path.name}. New: {ref_count - 1}")
+            else:
+                ASCIIColors.warning(f"Release called for non-managed engine key: {engine_key}")
+
+if _vllm_deps_installed:
+    engine_manager = VLLMEngineManager()
+else:
+    engine_manager = None
+
+
+# --- Helper Functions ---
+def is_hf_model_id(model_name: str) -> bool:
+    return "/" in model_name and not Path(model_name).exists() and not model_name.endswith(".gguf")
+
+def is_hf_gguf_model_id(model_name: str) -> bool:
+    if "/" in model_name and model_name.endswith(".gguf"):
+        return len(model_name.split("/")) > 1
+    return False
+
+def resolve_hf_model_path(model_id_or_gguf_id: str, models_base_path: Path) -> Path:
+    if not _vllm_deps_installed: raise RuntimeError("Hugging Face utilities not available.")
+
+    is_single_gguf = is_hf_gguf_model_id(model_id_or_gguf_id)
+
+    if is_single_gguf:
+        parts = model_id_or_gguf_id.split("/")
+        repo_id, gguf_filename = "/".join(parts[:-1]), parts[-1]
+        local_repo_name = repo_id.replace("/", "__")
+        local_gguf_dir = models_base_path / local_repo_name
+        local_gguf_path = local_gguf_dir / gguf_filename
+
+        if not local_gguf_path.exists():
+            ASCIIColors.info(f"Downloading GGUF {model_id_or_gguf_id} to {local_gguf_dir}...")
+            local_gguf_dir.mkdir(parents=True, exist_ok=True)
+            hf_hub_download(repo_id=repo_id, filename=gguf_filename, local_dir=local_gguf_dir, local_dir_use_symlinks=False, resume_download=True)
+        return local_gguf_path
+    else:
+        local_model_dir_name = model_id_or_gguf_id.replace("/", "__")
+        local_model_path = models_base_path / local_model_dir_name
+        if not local_model_path.exists() or not any(local_model_path.iterdir()):
+            ASCIIColors.info(f"Downloading model repo {model_id_or_gguf_id} to {local_model_path}...")
+            snapshot_download(repo_id=model_id_or_gguf_id, local_dir=local_model_path, local_dir_use_symlinks=False, resume_download=True)
+        return local_model_path
+
+
+# --- VLLM Binding Class ---
+class VLLMBinding(LollmsLLMBinding):
+    def __init__(self,
+                 models_folder: Optional[Union[str, Path]] = None,
+                 model_name: str = "",
+                 service_key: Optional[str] = None,
+                 verify_ssl_certificate: bool = True,
+                 default_completion_format: ELF_COMPLETION_FORMAT = ELF_COMPLETION_FORMAT.Chat,
+                 **kwargs
+                 ):
+        if not _vllm_deps_installed:
+            raise ImportError(f"vLLM or its dependencies not installed. Binding unusable. Error: {_vllm_installation_error}")
+        if engine_manager is None:
+            raise RuntimeError("VLLMEngineManager failed to initialize. Binding unusable.")
+
+        _models_folder = Path(models_folder) if models_folder is not None else DEFAULT_models_folder
+        _models_folder.mkdir(parents=True, exist_ok=True)
+
+        super().__init__(BindingName)
+        self.models_folder: Path = _models_folder
+        self.model_name = model_name
+        self.default_completion_format = default_completion_format
+
+        self.llm_engine: Optional[LLM] = None
+        self.tokenizer = None
+        self.current_model_name_or_id: Optional[str] = None
+        self.current_resolved_model_path: Optional[Path] = None
+        self.current_engine_params: Optional[Dict[str, Any]] = None
+        self.vllm_engine_kwargs_config = kwargs.copy()
+
+        if model_name:
+            try:
+                self.load_model(model_name)
+            except Exception as e:
+                ASCIIColors.error(f"Auto-load model '{model_name}' failed: {e}")
+                trace_exception(e)
+
+    def _get_vllm_engine_params_for_load(self) -> Dict[str, Any]:
+        params = self.vllm_engine_kwargs_config.copy()
+        if torch and torch.cuda.is_available():
+            params.setdefault('tensor_parallel_size', torch.cuda.device_count())
+            params.setdefault('gpu_memory_utilization', 0.90)
+            params.setdefault('dtype', 'auto')
+        else:
+            params.setdefault('tensor_parallel_size', 1)
+            params.setdefault('gpu_memory_utilization', 0)
+            params.setdefault('enforce_eager', True)
+            ASCIIColors.warning("No CUDA GPU detected by PyTorch; vLLM will run on CPU or may fail to load.")
+        params.setdefault('trust_remote_code', False)  # Important default
+        return params
+
+    def load_model(self, model_name_or_id: str) -> bool:
+        ASCIIColors.info(f"Binding {id(self)} loading model: {model_name_or_id}")
+        self.close()  # Release any existing model held by this instance
+
+        resolved_model_path: Path
+        is_gguf_model = False
+        effective_engine_params = self._get_vllm_engine_params_for_load()
+
+        potential_local_path = Path(model_name_or_id)
+        if potential_local_path.is_absolute():
+            if not potential_local_path.exists():
+                ASCIIColors.error(f"Absolute path not found: {potential_local_path}")
+                return False
+            resolved_model_path = potential_local_path
+        else:
+            path_in_models_dir = self.models_folder / model_name_or_id
+            if path_in_models_dir.exists():
+                resolved_model_path = path_in_models_dir
+            elif is_hf_model_id(model_name_or_id) or is_hf_gguf_model_id(model_name_or_id):
+                try:
+                    resolved_model_path = resolve_hf_model_path(model_name_or_id, self.models_folder)
+                except Exception as e:
+                    ASCIIColors.error(f"HF model resolve/download failed for {model_name_or_id}: {e}"); return False
+            else:
+                ASCIIColors.error(f"Model '{model_name_or_id}' not found locally or as an HF ID."); return False
+
+        if resolved_model_path.is_file() and resolved_model_path.suffix.lower() == ".gguf":
+            is_gguf_model = True
+        elif not resolved_model_path.is_dir():
+            ASCIIColors.error(f"Resolved path {resolved_model_path} is not a valid model."); return False
+
+        self.llm_engine, self.tokenizer = engine_manager.get_engine(resolved_model_path, is_gguf_model, effective_engine_params)
+
+        if self.llm_engine:
+            self.current_model_name_or_id = model_name_or_id
+            self.current_resolved_model_path = resolved_model_path
+            self.current_engine_params = effective_engine_params
+            self.model_name = model_name_or_id  # Update superclass attribute
+            ASCIIColors.green(f"Binding {id(self)} obtained engine for: {model_name_or_id}")
+            if not self.tokenizer: ASCIIColors.warning("Tokenizer unavailable for current model.")
+            return True
+        else:
+            ASCIIColors.error(f"Binding {id(self)} failed to get engine for: {model_name_or_id}")
+            self.close()  # Clear any partial state
+            return False
+
+    def generate_text(self,
+                      prompt: str,
+                      images: Optional[List[str]] = None,
+                      n_predict: Optional[int] = 1024,
+                      stream: bool = False,  # vLLM's generate() is blocking; streaming is pseudo-streaming
+                      temperature: float = 0.7,
+                      top_k: int = 50,
+                      top_p: float = 0.95,
+                      repeat_penalty: float = 1.1,
+                      repeat_last_n: int = 64,  # Note: vLLM applies the penalty to the full context
+                      seed: Optional[int] = None,
+                      n_threads: int = 8,  # Note: vLLM manages its own threading/parallelism
+                      streaming_callback: Optional[Callable[[str, int], bool]] = None
+                      ) -> Union[str, Dict[str, Any]]:
+        if not self.llm_engine: return {"status": False, "error": "Engine not loaded."}
+
+        sampling_dict = {
+            "temperature": float(temperature) if float(temperature) > 0.001 else 0.001,  # Temperature must be > 0
+            "top_p": float(top_p), "top_k": int(top_k) if top_k > 0 else -1,
+            "max_tokens": int(n_predict) if n_predict is not None else 1024,
+            "repetition_penalty": float(repeat_penalty),
+        }
+        if sampling_dict["temperature"] <= 0.001 and sampling_dict["top_k"] != 1:  # Greedy-like decoding
+            sampling_dict["top_k"] = 1
+            sampling_dict["temperature"] = 1.0  # Valid combination for greedy decoding
+
+        if seed is not None: sampling_dict["seed"] = int(seed)
+
+        sampling_params = SamplingParams(**sampling_dict)
+        gen_kwargs = {}
+
+        if images:
+            if not self.tokenizer: return {"status": False, "error": "Tokenizer needed for multimodal."}
+            # Vision model image processing is complex and model-specific.
+            # This is a simplified placeholder for LLaVA-like models.
+            # Requires vLLM >= 0.4.0 and appropriate model/engine_params.
+            try:
+                pil_images = [Image.open(img_path).convert('RGB') for img_path in images]
+
+                # The prompt might need an image token, e.g. <image>. This should be part of `self.current_engine_params`.
+                image_token_str = self.current_engine_params.get("image_token_str", "<image>")
+                if image_token_str not in prompt and images:
+                    prompt = f"{image_token_str}\n{prompt}"
+
+                # This is a simplified view. `process_multimodal_inputs` in vLLM is more robust.
+                # The structure of multi_modal_data can vary.
+                if len(pil_images) == 1: mm_data_content = pil_images[0]
+                else: mm_data_content = pil_images
+
+                # For vLLM, prompts can be text or token IDs.
+                # If providing multi_modal_data, usually prompt_token_ids are also needed.
+                # This can get complex as it depends on how the model expects images to be interleaved.
+                # For a simple case where the image comes first:
+                encoded_prompt_ids = self.tokenizer.encode(prompt)
+                gen_kwargs["prompt_token_ids"] = [encoded_prompt_ids]  # List of lists
+                gen_kwargs["multi_modal_data"] = [{"image": mm_data_content}]  # List of dicts
+                gen_kwargs["prompts"] = None  # Don't use prompts if prompt_token_ids is used
+                ASCIIColors.info("Prepared basic multimodal inputs.")
+            except Exception as e_mm:
+                return {"status": False, "error": f"Multimodal prep error: {e_mm}"}
+        else:
+            gen_kwargs["prompts"] = [prompt]
+
+        try:
+            outputs = self.llm_engine.generate(**gen_kwargs, sampling_params=sampling_params)
+            full_response_text = outputs[0].outputs[0].text
+            if stream and streaming_callback:
+                if not streaming_callback(full_response_text, MSG_TYPE.MSG_TYPE_CHUNK):
+                    ASCIIColors.info("Streaming callback stopped (pseudo-stream).")
+            return full_response_text
+        except Exception as e:
+            trace_exception(e); return {"status": False, "error": f"vLLM generation error: {e}"}
+
+    def tokenize(self, text: str) -> List[int]:
+        if not self.tokenizer: ASCIIColors.warning("Tokenizer unavailable."); return [ord(c) for c in text]
+        try:
+            encoded = self.tokenizer.encode(text)
+            return encoded.ids if hasattr(encoded, 'ids') else encoded
+        except Exception as e: trace_exception(e); return []
+
+    def detokenize(self, tokens: List[int]) -> str:
+        if not self.tokenizer: ASCIIColors.warning("Tokenizer unavailable."); return "".join(map(chr, tokens))  # Crude fallback
+        try: return self.tokenizer.decode(tokens, skip_special_tokens=True)
+        except Exception as e: trace_exception(e); return ""
+
+    def count_tokens(self, text: str) -> int:
+        if not self.tokenizer: return len(text)
+        return len(self.tokenize(text))
+
+    def embed(self, text: str, **kwargs) -> list:
+        raise NotImplementedError("VLLMBinding does not provide generic text embedding.")
+
+    def get_model_info(self) -> dict:
+        info = {
+            "binding_name": self.binding_name,
+            "vllm_version": vllm.__version__ if vllm else "N/A",
+            "models_folder": str(self.models_folder),
+            "loaded_model_name_or_id": self.current_model_name_or_id,
+            "resolved_model_path": str(self.current_resolved_model_path) if self.current_resolved_model_path else None,
+            "engine_parameters_used": self.current_engine_params,
+            "supports_structured_output": False,  # Can be True with outlines, not with the basic engine
+            "supports_vision": "multi_modal_data" in LLM.generate.__annotations__ if LLM else False
+        }
+        if self.llm_engine and hasattr(self.llm_engine, 'llm_engine') and hasattr(self.llm_engine.llm_engine, 'model_config'):
+            cfg = self.llm_engine.llm_engine.model_config
+            hf_cfg = getattr(cfg, 'hf_config', None)
+            info["loaded_model_config_details"] = {
+                "model_type": getattr(hf_cfg, 'model_type', getattr(cfg, 'model_type', "N/A")),
+                "vocab_size": getattr(hf_cfg, 'vocab_size', getattr(cfg, 'vocab_size', "N/A")),
+                "max_model_len": getattr(cfg, 'max_model_len', "N/A"),
+                "quantization": getattr(self.llm_engine.llm_engine, 'quantization_method', "N/A"),
+                "dtype": str(getattr(cfg, 'dtype', "N/A")),
+            }
+        return info
+
+    def listModels(self) -> List[Dict[str, Any]]:
+        local_models = []
+        if not self.models_folder.exists(): return []
+        for item_path in self.models_folder.rglob('*'):
+            try:
+                model_info = {"model_name": None, "path": str(item_path), "type": None, "size_gb": None}
+                if item_path.is_dir() and ((item_path / "config.json").exists() or list(item_path.glob("*.safetensors"))):
+                    is_sub_dir = any(Path(m["path"]) == item_path.parent for m in local_models if m["type"] == "HuggingFace Directory")
+                    if is_sub_dir: continue
+                    model_info.update({
+                        "model_name": item_path.name, "type": "HuggingFace Directory",
+                        "size_gb": round(sum(f.stat().st_size for f in item_path.glob('**/*') if f.is_file()) / (1024**3), 2)
+                    })
+                    local_models.append(model_info)
+                elif item_path.is_file() and item_path.suffix.lower() == ".gguf":
+                    model_info.update({
+                        "model_name": str(item_path.relative_to(self.models_folder)), "type": "GGUF File",
+                        "size_gb": round(item_path.stat().st_size / (1024**3), 2)
+                    })
+                    local_models.append(model_info)
+            except Exception as e: ASCIIColors.warning(f"Error processing {item_path}: {e}")
+        return local_models
+
+    def __del__(self):
+        self.close()
+
+    def close(self):
+        if self.llm_engine and self.current_resolved_model_path and self.current_engine_params:
+            ASCIIColors.info(f"Binding {id(self)} close(). Releasing engine for: {self.current_resolved_model_path.name}")
+            engine_manager.release_engine(self.current_resolved_model_path, self.current_engine_params)
+        self.llm_engine = None
+        self.tokenizer = None
+        self.current_model_name_or_id = None
+        self.current_resolved_model_path = None
+        self.current_engine_params = None
+        self.model_name = ""
+
+
+# --- Exports for LOLLMS ---
+__all__ = ["VLLMBinding", "BindingName"]
+
+
+# --- Main Test Block (Example Usage) ---
+if __name__ == '__main__':
+    if not _vllm_deps_installed:
+        print(f"{ASCIIColors.RED}vLLM dependencies not met. Skipping tests. Error: {_vllm_installation_error}{ASCIIColors.RESET}")
+        exit()
+
+    ASCIIColors.yellow("--- VLLMBinding Test ---")
+    test_models_dir = DEFAULT_models_folder / "test_run_vllm_binding"
+    test_models_dir.mkdir(parents=True, exist_ok=True)
+    ASCIIColors.info(f"Using test models directory: {test_models_dir}")
+
+    # Choose small models for testing to save time/resources
+    # test_hf_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+    test_hf_id = "microsoft/phi-2"  # Needs trust_remote_code=True
+    # test_gguf_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/tinyllama-1.1b-chat-v1.0.Q2_K.gguf"
+    test_gguf_id = "bartowski/Phi-2-GGUF/phi-2.Q2_K.gguf"  # May need tokenizer="microsoft/phi-2"
+
+    common_engine_args = {"trust_remote_code": True, "gpu_memory_utilization": 0.5}  # Lower utilization for testing
+
+    def test_binding_instance(name: str, model_id_to_load: str, specific_engine_args: Dict = {}):
+        ASCIIColors.cyan(f"\n--- Testing Instance: {name} with Model: {model_id_to_load} ---")
+        final_engine_args = {**common_engine_args, **specific_engine_args}
+        binding = None
+        try:
+            binding = VLLMBinding(models_folder=test_models_dir, **final_engine_args)
+            if binding.load_model(model_id_to_load):
+                ASCIIColors.green(f"Model {binding.current_model_name_or_id} loaded by {name}.")
+                info = binding.get_model_info()
+                ASCIIColors.magenta(f"Model Info for {name}: {json.dumps(info.get('loaded_model_config_details', 'N/A'), indent=2, default=str)}")
+
+                test_prompt = "What is the main purpose of a CPU in a computer?"
+                if "phi-2" in model_id_to_load.lower():  # Phi-2 uses a specific prompt format
+                    test_prompt = f"Instruct: {test_prompt}\nOutput:"
+
+                ASCIIColors.info(f"Prompt for {name}: {test_prompt}")
+                response = binding.generate_text(test_prompt, n_predict=50, temperature=0.1)
+                if isinstance(response, str): ASCIIColors.green(f"Response from {name}: {response}")
+                else: ASCIIColors.error(f"Generation failed for {name}: {response}")
+
+                tokens = binding.tokenize("Test tokenization.")
+                ASCIIColors.info(f"Token count for {name} ('Test tokenization.'): {len(tokens)}")
+
+            else:
+                ASCIIColors.error(f"Failed to load model {model_id_to_load} for {name}.")
+        except Exception as e:
+            ASCIIColors.error(f"Error during test for {name} with {model_id_to_load}: {e}")
+            trace_exception(e)
+        finally:
+            if binding:
+                binding.close()
+                ASCIIColors.info(f"Closed binding for {name}.")
+                # After closing a binding, the engine_manager ref count should decrease.
+                # If it was the last reference, the engine should be removed from the manager.
+                # This can be verified by checking engine_manager._engines (for debugging):
+                # print(f"DEBUG: Engines in manager after closing {name}: {engine_manager._engines.keys()}")
+
+    # Test different models
+    test_binding_instance("HF_Phi2_Instance1", test_hf_id)
+    test_binding_instance("GGUF_Phi2_Instance", test_gguf_id, specific_engine_args={"tokenizer": "microsoft/phi-2"})
+
+    # Test sharing: two instances requesting the same model config
+    ASCIIColors.cyan("\n--- Testing Model Sharing (Two instances, same HF model) ---")
+    args_for_shared = {**common_engine_args, "max_model_len": 2048}  # Add a param to make the engine key specific
+    binding_A = VLLMBinding(models_folder=test_models_dir, **args_for_shared)
+    binding_B = VLLMBinding(models_folder=test_models_dir, **args_for_shared)
+
+    loaded_A = binding_A.load_model(test_hf_id)
+    if loaded_A: ASCIIColors.green(f"Binding A loaded {test_hf_id}. Manager should have 1 ref.")
+    # print(f"DEBUG: Engines after A loads: {engine_manager._engines.keys()}")
+
+    loaded_B = binding_B.load_model(test_hf_id)  # Should reuse the engine loaded by A
+    if loaded_B: ASCIIColors.green(f"Binding B loaded {test_hf_id}. Manager should have 2 refs for this engine.")
+    # print(f"DEBUG: Engines after B loads: {engine_manager._engines.keys()}")
+
+    if loaded_A:
+        resp_A = binding_A.generate_text("Instruct: Hello from A!\nOutput:", n_predict=10)
+        ASCIIColors.info(f"Response from A (shared model): {resp_A}")
+    if loaded_B:
+        resp_B = binding_B.generate_text("Instruct: Hello from B!\nOutput:", n_predict=10)
+        ASCIIColors.info(f"Response from B (shared model): {resp_B}")
+
+    binding_A.close()
+    ASCIIColors.info("Binding A closed. Manager should have 1 ref left for this engine.")
+    # print(f"DEBUG: Engines after A closes: {engine_manager._engines.keys()}")
+    binding_B.close()
+    ASCIIColors.info("Binding B closed. Manager should have 0 refs; engine should be removed.")
+    # print(f"DEBUG: Engines after B closes: {engine_manager._engines.keys()}")
+
+    # Vision Test (Conceptual - requires a real vision model and setup)
+    ASCIIColors.cyan("\n--- Conceptual Vision Test ---")
+    # test_vision_model_id = "llava-hf/llava-1.5-7b-hf"  # Example LLaVA model
+    # vision_args = {**common_engine_args, "image_input_type": "pixel_values", "image_token_id": 32000, "image_feature_size": 576}
+    # try:
+    #     # Create a dummy image
+    #     dummy_img_path = "dummy_vision_test.png"
+    #     img = Image.new('RGB', (224, 224), color='blue')
+    #     img.save(dummy_img_path)
+    #     binding_vision = VLLMBinding(models_folder=test_models_dir, **vision_args)
+    #     if binding_vision.load_model(test_vision_model_id):
+    #         # Prompts for LLaVA often include <image>
+    #         vision_prompt = "USER: <image>\nWhat is in this image?\nASSISTANT:"
+    #         response = binding_vision.generate_text(vision_prompt, images=[dummy_img_path], n_predict=30)
+    #         ASCIIColors.green(f"Vision response: {response}")
+    #     else:
+    #         ASCIIColors.warning(f"Could not load vision model {test_vision_model_id}")
+    #     if Path(dummy_img_path).exists(): Path(dummy_img_path).unlink()
+    # except Exception as e_vis:
+    #     ASCIIColors.warning(f"Vision test block skipped or failed: {e_vis}. This often requires a specific model and VRAM.")
+
+
+    ASCIIColors.yellow("\n--- VLLMBinding Test Finished ---")
+    # Optional: Clean up the test directory
+    # import shutil
+    # if input(f"Clean up {test_models_dir}? (y/N): ").lower() == 'y':
+    #     shutil.rmtree(test_models_dir)
+    #     ASCIIColors.info(f"Cleaned up {test_models_dir}")
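
For orientation, here is a minimal usage sketch of the binding added above, based only on the API visible in this diff; the model id is the same test id the file's own __main__ block uses, and extra keyword arguments are forwarded to the vLLM engine via vllm_engine_kwargs_config:

    from lollms_client.llm_bindings.vllm import VLLMBinding

    # Extra kwargs (e.g. trust_remote_code) are forwarded to vLLM's LLM(...) constructor.
    # models_folder defaults to ~/.lollms/bindings_models/vllm_models when omitted.
    binding = VLLMBinding(trust_remote_code=True)
    if binding.load_model("microsoft/phi-2"):  # accepts an HF repo id, a local path, or "repo/file.gguf"
        print(binding.generate_text("Instruct: Say hello.\nOutput:", n_predict=32, temperature=0.1))
    binding.close()  # decrements the shared engine's ref count in VLLMEngineManager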
@@ -32,17 +32,15 @@ class TransformersBinding(LollmsLLMBinding):
     """Transformers-specific binding implementation"""
 
     def __init__(self,
-                 host_address: str = None,
+                 models_folder: str = "./models",
                  model_name: str = "",
-                 service_key: str = None,
-                 verify_ssl_certificate: bool = True,
                  default_completion_format: ELF_COMPLETION_FORMAT = ELF_COMPLETION_FORMAT.Chat,
                  prompt_template: Optional[str] = None):
         """
         Initialize the Transformers binding.
 
         Args:
-            host_address (str): Host address for the service. Defaults to None.
+            models_folder (str): The folder where we can find local models.
             model_name (str): Name of the model to use. Defaults to empty string.
             service_key (str): Authentication key for the service. Defaults to None.
             verify_ssl_certificate (bool): Whether to verify SSL certificates. Defaults to True.
@@ -50,14 +48,12 @@ class TransformersBinding(LollmsLLMBinding):
             prompt_template (Optional[str]): Custom prompt template. If None, inferred from model.
         """
         super().__init__(
-            binding_name = "transformers",
-            host_address=host_address,
-            model_name=model_name,
-            service_key=service_key,
-            verify_ssl_certificate=verify_ssl_certificate,
-            default_completion_format=default_completion_format
+            binding_name = "transformers"
         )
-
+        self.models_folder = models_folder
+        self.model_name = model_name
+        self.default_completion_format = default_completion_format
+
         # Configure 4-bit quantization
         quantization_config = BitsAndBytesConfig(
             load_in_4bit=True,
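
The hunks above replace the remote-service constructor parameters of TransformersBinding with a local models folder. A call site migrating from 0.12.6 would change roughly as follows (a sketch; the model name is a placeholder):

    from lollms_client.llm_bindings.transformers import TransformersBinding

    # 0.12.6 style (removed): TransformersBinding(host_address=None, service_key=None, ...)
    # 0.13.0 style: the binding is purely local and reads models from models_folder.
    binding = TransformersBinding(models_folder="./models", model_name="some_local_model")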