lollms-client 0.12.6__py3-none-any.whl → 0.13.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lollms-client might be problematic.
- examples/article_summary/article_summary.py +58 -0
- examples/deep_analyze/deep_analyse.py +30 -0
- examples/deep_analyze/deep_analyze_multiple_files.py +32 -0
- examples/function_call/functions_call_with images.py +52 -0
- examples/personality_test/chat_test.py +37 -0
- examples/personality_test/chat_with_aristotle.py +42 -0
- examples/personality_test/tesks_test.py +62 -0
- examples/simple_text_gen_test.py +173 -0
- examples/simple_text_gen_with_image_test.py +166 -0
- examples/test_local_models/local_chat.py +9 -0
- examples/text_2_audio.py +77 -0
- examples/text_2_image.py +140 -0
- examples/text_and_image_2_audio.py +59 -0
- examples/text_gen.py +28 -0
- lollms_client/__init__.py +3 -2
- lollms_client/llm_bindings/lollms/__init__.py +13 -11
- lollms_client/llm_bindings/ollama/__init__.py +44 -60
- lollms_client/llm_bindings/openai/__init__.py +69 -29
- lollms_client/llm_bindings/tensor_rt/__init__.py +603 -0
- lollms_client/llm_bindings/transformers/__init__.py +7 -11
- lollms_client/llm_bindings/vllm/__init__.py +603 -0
- lollms_client/lollms_core.py +14 -4
- lollms_client/lollms_llm_binding.py +5 -25
- {lollms_client-0.12.6.dist-info → lollms_client-0.13.1.dist-info}/METADATA +19 -12
- lollms_client-0.13.1.dist-info/RECORD +52 -0
- {lollms_client-0.12.6.dist-info → lollms_client-0.13.1.dist-info}/WHEEL +1 -1
- {lollms_client-0.12.6.dist-info → lollms_client-0.13.1.dist-info}/top_level.txt +1 -0
- lollms_client/lollms_personality.py +0 -403
- lollms_client/lollms_personality_worker.py +0 -1485
- lollms_client/lollms_stt.py +0 -35
- lollms_client/lollms_tti.py +0 -35
- lollms_client/lollms_tts.py +0 -39
- lollms_client-0.12.6.dist-info/RECORD +0 -41
- {lollms_client-0.12.6.dist-info → lollms_client-0.13.1.dist-info}/licenses/LICENSE +0 -0

lollms_client/llm_bindings/vllm/__init__.py
ADDED

@@ -0,0 +1,603 @@
# lollms_client/llm_bindings/vllm/__init__.py

import os
import shutil
from pathlib import Path
from typing import Optional, Callable, List, Union, Dict, Any, Tuple
import json
import threading
import gc
import importlib
import platform

def detect_os():
    system = platform.system()
    if system == "Windows":
        return "Windows"
    elif system == "Linux":
        return "Linux"
    elif system == "Darwin":
        return "macOS"
    else:
        return "Unknown OS"

if detect_os()=="Windows":
    raise Exception("Windows is not supported by vllm, use wsl")

# --- Package Management and Conditional Imports ---
try:
    # Pipmaster is assumed to be installed by the parent lollms_client.
    # We ensure specific packages for this binding.

    # Check if vllm is already importable to avoid re-running ensure_packages unnecessarily
    # on subsequent imports within the same session if it was successful once.
    _vllm_already_imported = 'vllm' in globals() or importlib.util.find_spec('vllm') is not None

    if not _vllm_already_imported:
        import pipmaster as pm # Assuming pipmaster is available
        pm.ensure_packages([
            "vllm",
            "torch",
            "transformers>=4.37.0",
            "huggingface_hub>=0.20.0",
            "pillow"
        ])

    from vllm import LLM, SamplingParams
    from PIL import Image
    import torch
    from transformers import AutoTokenizer
    from huggingface_hub import hf_hub_download, HfFileSystem, snapshot_download
    import vllm # To get __version__

    _vllm_deps_installed = True
    _vllm_installation_error = None
except Exception as e:
    _vllm_deps_installed = False
    _vllm_installation_error = e
    # Define placeholders if imports fail
    LLM, SamplingParams, Image, vllm_multimodal_utils = None, None, None, None
    torch, AutoTokenizer, hf_hub_download, HfFileSystem, snapshot_download, vllm = None, None, None, None, None, None


# --- LOLLMS Client Imports ---
from lollms_client.lollms_llm_binding import LollmsLLMBinding
from lollms_client.lollms_types import MSG_TYPE, ELF_COMPLETION_FORMAT # Assuming ELF_COMPLETION_FORMAT is in lollms_types
from ascii_colors import ASCIIColors, trace_exception


# --- Constants ---
BindingName = "VLLMBinding"
DEFAULT_models_folder = Path.home() / ".lollms" / "bindings_models" / "vllm_models"


# --- VLLM Engine Manager ---
class VLLMEngineManager:
    _instance = None
    _lock = threading.Lock()

    def __new__(cls, *args, **kwargs):
        if not _vllm_deps_installed:
            raise RuntimeError(f"vLLM or its dependencies not installed. Cannot create VLLMEngineManager. Error: {_vllm_installation_error}")
        with cls._lock:
            if cls._instance is None:
                cls._instance = super().__new__(cls)
                cls._instance._initialized = False
        return cls._instance

    def __init__(self):
        if hasattr(self, '_initialized') and self._initialized:
            return
        with self._lock:
            if hasattr(self, '_initialized') and self._initialized:
                return
            # Stores: key -> (LLM_engine, tokenizer, ref_count, engine_kwargs_tuple_key)
            self._engines: Dict[Tuple, Tuple[Optional[LLM], Optional[Any], int, Tuple]] = {}
            self._engine_locks: Dict[Tuple, threading.Lock] = {} # Per-engine initialization lock
            self._initialized = True
            ASCIIColors.green("VLLMEngineManager initialized.")

    def _get_engine_config_key(self, resolved_model_path: Path, engine_params: Dict[str, Any]) -> Tuple:
        critical_params = [
            'tensor_parallel_size', 'quantization', 'dtype', 'max_model_len',
            'trust_remote_code', 'enforce_eager', 'gpu_memory_utilization',
            'swap_space', 'max_num_seqs', 'max_num_batched_tokens', 'tokenizer', 'tokenizer_mode',
            'image_input_type', 'image_token_id', 'image_feature_size', 'image_input_shape' # Common vision params
        ]
        key_parts = [str(resolved_model_path)]
        for param_name in sorted(critical_params):
            if param_name in engine_params:
                value = engine_params[param_name]
                # Make common mutable types hashable for the key
                if isinstance(value, list): value = tuple(value)
                elif isinstance(value, dict): value = tuple(sorted(value.items()))
                key_parts.append((param_name, value))
        return tuple(key_parts)

    def get_engine(self,
                   resolved_model_path: Path,
                   is_gguf: bool,
                   engine_params: Dict[str, Any]
                   ) -> Tuple[Optional[LLM], Optional[Any]]:

        engine_key = self._get_engine_config_key(resolved_model_path, engine_params)

        with self._lock:
            if engine_key not in self._engine_locks:
                self._engine_locks[engine_key] = threading.Lock()

        with self._engine_locks[engine_key]:
            with self._lock:
                if engine_key in self._engines:
                    llm_engine, tokenizer, ref_count, _ = self._engines[engine_key]
                    self._engines[engine_key] = (llm_engine, tokenizer, ref_count + 1, engine_key)
                    ASCIIColors.info(f"Reusing vLLM engine for {resolved_model_path.name}. Key: {engine_key}. Ref count: {ref_count + 1}")
                    return llm_engine, tokenizer

            ASCIIColors.info(f"Creating new vLLM engine for {resolved_model_path.name} with key: {engine_key}")
            try:
                llm_args = {"model": str(resolved_model_path), **engine_params}
                if is_gguf and "quantization" not in llm_args: # Only set if not overridden by user
                    llm_args["quantization"] = "gguf"

                new_llm_engine = LLM(**llm_args)
                new_tokenizer = None
                try:
                    if hasattr(new_llm_engine, 'get_tokenizer'):
                        new_tokenizer = new_llm_engine.get_tokenizer()
                    else: raise AttributeError("get_tokenizer not on LLM object.")
                except Exception as e_vllm_tok:
                    ASCIIColors.warning(f"vLLM engine tokenizer error ({e_vllm_tok}). Loading with AutoTokenizer.")
                    tok_path_hint = engine_params.get('tokenizer', str(resolved_model_path.parent if is_gguf else resolved_model_path))
                    if not Path(tok_path_hint).exists() and "/" not in tok_path_hint:
                        tok_path_hint = str(resolved_model_path.parent if is_gguf else resolved_model_path)
                    try:
                        new_tokenizer = AutoTokenizer.from_pretrained(
                            tok_path_hint, trust_remote_code=engine_params.get("trust_remote_code", False)
                        )
                    except Exception as e_hf_tok:
                        ASCIIColors.error(f"AutoTokenizer failed for {tok_path_hint}: {e_hf_tok}")

                with self._lock:
                    self._engines[engine_key] = (new_llm_engine, new_tokenizer, 1, engine_key)
                ASCIIColors.green(f"New vLLM engine for {resolved_model_path.name} created. Ref count: 1")
                return new_llm_engine, new_tokenizer

            except Exception as e:
                trace_exception(e)
                ASCIIColors.error(f"Failed to create vLLM engine for {resolved_model_path.name}: {e}")
                return None, None

    def release_engine(self, resolved_model_path: Path, engine_params: Dict[str, Any]):
        engine_key = self._get_engine_config_key(resolved_model_path, engine_params)
        with self._lock:
            if engine_key in self._engines:
                llm_engine, tokenizer, ref_count, _ = self._engines[engine_key]
                if ref_count <= 1:
                    ASCIIColors.info(f"Releasing vLLM engine for {resolved_model_path.name} (key: {engine_key}). Final reference.")
                    del self._engines[engine_key]
                    if engine_key in self._engine_locks: del self._engine_locks[engine_key]
                    del llm_engine
                    del tokenizer
                    if torch and torch.cuda.is_available(): torch.cuda.empty_cache()
                    gc.collect()
                    ASCIIColors.green(f"Engine for {resolved_model_path.name} removed.")
                else:
                    self._engines[engine_key] = (llm_engine, tokenizer, ref_count - 1, engine_key)
                    ASCIIColors.info(f"Decremented ref count for {resolved_model_path.name}. New: {ref_count - 1}")
            else:
                ASCIIColors.warning(f"Release called for non-managed engine key: {engine_key}")

if _vllm_deps_installed:
    engine_manager = VLLMEngineManager()
else:
    engine_manager = None


# --- Helper Functions ---
def is_hf_model_id(model_name: str) -> bool:
    return "/" in model_name and not Path(model_name).exists() and not model_name.endswith(".gguf")

def is_hf_gguf_model_id(model_name: str) -> bool:
    if "/" in model_name and model_name.endswith(".gguf"):
        return len(model_name.split("/")) > 1
    return False

def resolve_hf_model_path(model_id_or_gguf_id: str, models_base_path: Path) -> Path:
    if not _vllm_deps_installed: raise RuntimeError("Hugging Face utilities not available.")

    is_single_gguf = is_hf_gguf_model_id(model_id_or_gguf_id)

    if is_single_gguf:
        parts = model_id_or_gguf_id.split("/")
        repo_id, gguf_filename = "/".join(parts[:-1]), parts[-1]
        local_repo_name = repo_id.replace("/", "__")
        local_gguf_dir = models_base_path / local_repo_name
        local_gguf_path = local_gguf_dir / gguf_filename

        if not local_gguf_path.exists():
            ASCIIColors.info(f"Downloading GGUF {model_id_or_gguf_id} to {local_gguf_dir}...")
            local_gguf_dir.mkdir(parents=True, exist_ok=True)
            hf_hub_download(repo_id=repo_id, filename=gguf_filename, local_dir=local_gguf_dir, local_dir_use_symlinks=False, resume_download=True)
        return local_gguf_path
    else:
        local_model_dir_name = model_id_or_gguf_id.replace("/", "__")
        local_model_path = models_base_path / local_model_dir_name
        if not local_model_path.exists() or not any(local_model_path.iterdir()):
            ASCIIColors.info(f"Downloading model repo {model_id_or_gguf_id} to {local_model_path}...")
            snapshot_download(repo_id=model_id_or_gguf_id, local_dir=local_model_path, local_dir_use_symlinks=False, resume_download=True)
        return local_model_path


# --- VLLM Binding Class ---
class VLLMBinding(LollmsLLMBinding):
    def __init__(self,
                 models_folder: Optional[Union[str, Path]] = None,
                 model_name: str = "",
                 service_key: Optional[str] = None,
                 verify_ssl_certificate: bool = True,
                 default_completion_format: ELF_COMPLETION_FORMAT = ELF_COMPLETION_FORMAT.Chat,
                 **kwargs
                 ):
        if not _vllm_deps_installed:
            raise ImportError(f"vLLM or its dependencies not installed. Binding unusable. Error: {_vllm_installation_error}")
        if engine_manager is None:
            raise RuntimeError("VLLMEngineManager failed to initialize. Binding unusable.")

        _models_folder = Path(models_folder) if models_folder is not None else DEFAULT_models_folder
        _models_folder.mkdir(parents=True, exist_ok=True)

        super().__init__(BindingName)
        self.models_folder= models_folder
        self.model_name=model_name
        self.default_completion_format=default_completion_format


        self.models_folder: Path = _models_folder
        self.llm_engine: Optional[LLM] = None
        self.tokenizer = None
        self.current_model_name_or_id: Optional[str] = None
        self.current_resolved_model_path: Optional[Path] = None
        self.current_engine_params: Optional[Dict[str, Any]] = None
        self.vllm_engine_kwargs_config = kwargs.copy()

        if model_name:
            try:
                self.load_model(model_name)
            except Exception as e:
                ASCIIColors.error(f"Auto-load model '{model_name}' failed: {e}")
                trace_exception(e)

    def _get_vllm_engine_params_for_load(self) -> Dict[str, Any]:
        params = self.vllm_engine_kwargs_config.copy()
        if torch and torch.cuda.is_available():
            params.setdefault('tensor_parallel_size', torch.cuda.device_count())
            params.setdefault('gpu_memory_utilization', 0.90)
            params.setdefault('dtype', 'auto')
        else:
            params.setdefault('tensor_parallel_size', 1)
            params.setdefault('gpu_memory_utilization', 0)
            params.setdefault('enforce_eager', True)
            if not (torch and torch.cuda.is_available()): ASCIIColors.warning("No CUDA GPU by PyTorch, vLLM on CPU or may fail.")
        params.setdefault('trust_remote_code', False) # Important default
        return params

    def load_model(self, model_name_or_id: str) -> bool:
        ASCIIColors.info(f"Binding {id(self)} loading model: {model_name_or_id}")
        self.close() # Release any existing model held by this instance

        resolved_model_path: Path
        is_gguf_model = False
        effective_engine_params = self._get_vllm_engine_params_for_load()

        potential_local_path = Path(model_name_or_id)
        if potential_local_path.is_absolute():
            if not potential_local_path.exists():
                ASCIIColors.error(f"Absolute path not found: {potential_local_path}")
                return False
            resolved_model_path = potential_local_path
        else:
            path_in_models_dir = self.models_folder / model_name_or_id
            if path_in_models_dir.exists():
                resolved_model_path = path_in_models_dir
            elif is_hf_model_id(model_name_or_id) or is_hf_gguf_model_id(model_name_or_id):
                try:
                    resolved_model_path = resolve_hf_model_path(model_name_or_id, self.models_folder)
                except Exception as e:
                    ASCIIColors.error(f"HF model resolve/download failed for {model_name_or_id}: {e}"); return False
            else:
                ASCIIColors.error(f"Model '{model_name_or_id}' not found locally or as HF ID."); return False

        if resolved_model_path.is_file() and resolved_model_path.suffix.lower() == ".gguf":
            is_gguf_model = True
        elif not resolved_model_path.is_dir():
            ASCIIColors.error(f"Resolved path {resolved_model_path} not valid model."); return False

        self.llm_engine, self.tokenizer = engine_manager.get_engine(resolved_model_path, is_gguf_model, effective_engine_params)

        if self.llm_engine:
            self.current_model_name_or_id = model_name_or_id
            self.current_resolved_model_path = resolved_model_path
            self.current_engine_params = effective_engine_params
            self.model_name = model_name_or_id # Update superclass
            ASCIIColors.green(f"Binding {id(self)} obtained engine for: {model_name_or_id}")
            if not self.tokenizer: ASCIIColors.warning("Tokenizer unavailable for current model.")
            return True
        else:
            ASCIIColors.error(f"Binding {id(self)} failed to get engine for: {model_name_or_id}")
            self.close() # Clear any partial state
            return False

    def generate_text(self,
                      prompt: str,
                      images: Optional[List[str]] = None,
                      n_predict: Optional[int] = 1024,
                      stream: bool = False, # vLLM's generate is blocking, stream is pseudo
                      temperature: float = 0.7,
                      top_k: int = 50,
                      top_p: float = 0.95,
                      repeat_penalty: float = 1.1,
                      repeat_last_n: int = 64, # Note: vLLM applies penalty to full context
                      seed: Optional[int] = None,
                      n_threads: int = 8, # Note: vLLM manages its own threading/parallelism
                      streaming_callback: Optional[Callable[[str, int], bool]] = None
                      ) -> Union[str, Dict[str, any]]:
        if not self.llm_engine: return {"status": False, "error": "Engine not loaded."}

        sampling_dict = {
            "temperature": float(temperature) if float(temperature) > 0.001 else 0.001, # Temp > 0
            "top_p": float(top_p), "top_k": int(top_k) if top_k > 0 else -1,
            "max_tokens": int(n_predict) if n_predict is not None else 1024,
            "repetition_penalty": float(repeat_penalty),
        }
        if sampling_dict["temperature"] <= 0.001 and sampling_dict["top_k"] !=1 : # Greedy like
            sampling_dict["top_k"] = 1
            sampling_dict["temperature"] = 1.0 # Valid combination for greedy

        if seed is not None: sampling_dict["seed"] = int(seed)

        sampling_params = SamplingParams(**sampling_dict)
        gen_kwargs = {}

        if images:
            if not self.tokenizer: return {"status": False, "error": "Tokenizer needed for multimodal."}
            # Vision model image processing is complex and model-specific.
            # This is a simplified placeholder for LLaVA-like models.
            # Requires vLLM >= 0.4.0 and appropriate model/engine_params.
            try:
                pil_images = [Image.open(img_path).convert('RGB') for img_path in images]

                # The prompt might need an image token, e.g. <image>. This should be part of `self.current_engine_params`
                image_token_str = self.current_engine_params.get("image_token_str", "<image>")
                if image_token_str not in prompt and images:
                    prompt = f"{image_token_str}\n{prompt}"

                # This is a simplified view. `process_multimodal_inputs` in vLLM is more robust.
                # The structure of multi_modal_data can vary.
                if len(pil_images) == 1: mm_data_content = pil_images[0]
                else: mm_data_content = pil_images

                # For vLLM, prompts can be text or token IDs.
                # If providing multi_modal_data, usually prompt_token_ids are also needed.
                # This can get complex as it depends on how the model expects images to be interleaved.
                # For a simple case where image comes first:
                encoded_prompt_ids = self.tokenizer.encode(prompt)
                gen_kwargs["prompt_token_ids"] = [encoded_prompt_ids] # List of lists
                gen_kwargs["multi_modal_data"] = [{"image": mm_data_content}] # List of dicts
                gen_kwargs["prompts"] = None # Don't use prompts if prompt_token_ids is used
                ASCIIColors.info("Prepared basic multimodal inputs.")
            except Exception as e_mm:
                return {"status": False, "error": f"Multimodal prep error: {e_mm}"}
        else:
            gen_kwargs["prompts"] = [prompt]

        try:
            outputs = self.llm_engine.generate(**gen_kwargs, sampling_params=sampling_params)
            full_response_text = outputs[0].outputs[0].text
            if stream and streaming_callback:
                if not streaming_callback(full_response_text, MSG_TYPE.MSG_TYPE_CHUNK):
                    ASCIIColors.info("Streaming callback stopped (pseudo-stream).")
            return full_response_text
        except Exception as e:
            trace_exception(e); return {"status": False, "error": f"vLLM generation error: {e}"}

    def tokenize(self, text: str) -> List[int]:
        if not self.tokenizer: ASCIIColors.warning("Tokenizer unavailable."); return [ord(c) for c in text]
        try:
            encoded = self.tokenizer.encode(text)
            return encoded.ids if hasattr(encoded, 'ids') else encoded
        except Exception as e: trace_exception(e); return []

    def detokenize(self, tokens: List[int]) -> str:
        if not self.tokenizer: ASCIIColors.warning("Tokenizer unavailable."); return "".join(map(chr, tokens)) # Crude fallback
        try: return self.tokenizer.decode(tokens, skip_special_tokens=True)
        except Exception as e: trace_exception(e); return ""

    def count_tokens(self, text: str) -> int:
        if not self.tokenizer: return len(text)
        return len(self.tokenize(text))

    def embed(self, text: str, **kwargs) -> list:
        raise NotImplementedError("VLLMBinding does not provide generic text embedding.")

    def get_model_info(self) -> dict:
        info = {
            "binding_name": self.binding_name,
            "vllm_version": vllm.__version__ if vllm else "N/A",
            "models_folder": str(self.models_folder),
            "loaded_model_name_or_id": self.current_model_name_or_id,
            "resolved_model_path": str(self.current_resolved_model_path) if self.current_resolved_model_path else None,
            "engine_parameters_used": self.current_engine_params,
            "supports_structured_output": False, # Can be True with outlines, not basic
            "supports_vision": "multi_modal_data" in LLM.generate.__annotations__ if LLM else False
        }
        if self.llm_engine and hasattr(self.llm_engine, 'llm_engine') and hasattr(self.llm_engine.llm_engine, 'model_config'):
            cfg = self.llm_engine.llm_engine.model_config
            hf_cfg = getattr(cfg, 'hf_config', None)
            info["loaded_model_config_details"] = {
                "model_type": getattr(hf_cfg, 'model_type', getattr(cfg, 'model_type', "N/A")),
                "vocab_size": getattr(hf_cfg, 'vocab_size', getattr(cfg, 'vocab_size', "N/A")),
                "max_model_len": getattr(cfg, 'max_model_len', "N/A"),
                "quantization": getattr(self.llm_engine.llm_engine, 'quantization_method', "N/A"),
                "dtype": str(getattr(cfg, 'dtype', "N/A")),
            }
        return info

    def listModels(self) -> List[Dict[str, Any]]:
        local_models = []
        if not self.models_folder.exists(): return []
        for item_path in self.models_folder.rglob('*'):
            try:
                model_info = {"model_name": None, "path": str(item_path), "type": None, "size_gb": None}
                if item_path.is_dir() and ((item_path / "config.json").exists() or list(item_path.glob("*.safetensors"))):
                    is_sub_dir = any(Path(m["path"]) == item_path.parent for m in local_models if m["type"] == "HuggingFace Directory")
                    if is_sub_dir: continue
                    model_info.update({
                        "model_name": item_path.name, "type": "HuggingFace Directory",
                        "size_gb": round(sum(f.stat().st_size for f in item_path.glob('**/*') if f.is_file()) / (1024**3), 2)
                    })
                    local_models.append(model_info)
                elif item_path.is_file() and item_path.suffix.lower() == ".gguf":
                    model_info.update({
                        "model_name": str(item_path.relative_to(self.models_folder)), "type": "GGUF File",
                        "size_gb": round(item_path.stat().st_size / (1024**3), 2)
                    })
                    local_models.append(model_info)
            except Exception as e: ASCIIColors.warning(f"Error processing {item_path}: {e}")
        return local_models

    def __del__(self):
        self.close()

    def close(self):
        if self.llm_engine and self.current_resolved_model_path and self.current_engine_params:
            ASCIIColors.info(f"Binding {id(self)} close(). Releasing engine for: {self.current_resolved_model_path.name}")
            engine_manager.release_engine(self.current_resolved_model_path, self.current_engine_params)
        self.llm_engine = None
        self.tokenizer = None
        self.current_model_name_or_id = None
        self.current_resolved_model_path = None
        self.current_engine_params = None
        self.model_name = ""


# --- Exports for LOLLMS ---
__all__ = ["VLLMBinding", "BindingName"]


# --- Main Test Block (Example Usage) ---
if __name__ == '__main__':
    if not _vllm_deps_installed:
        print(f"{ASCIIColors.RED}VLLM dependencies not met. Skipping tests. Error: {_vllm_installation_error}{ASCIIColors.RESET}")
        exit()

    ASCIIColors.yellow("--- VLLMBinding Test ---")
    test_models_dir = DEFAULT_models_folder / "test_run_vllm_binding"
    test_models_dir.mkdir(parents=True, exist_ok=True)
    ASCIIColors.info(f"Using test models directory: {test_models_dir}")

    # Choose small models for testing to save time/resources
    # test_hf_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    test_hf_id = "microsoft/phi-2" # Needs trust_remote_code=True
    # test_gguf_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/tinyllama-1.1b-chat-v1.0.Q2_K.gguf"
    test_gguf_id = "bartowski/Phi-2-GGUF/phi-2.Q2_K.gguf" # May need tokenizer="microsoft/phi-2"

    common_engine_args = {"trust_remote_code": True, "gpu_memory_utilization": 0.5} # Lower util for testing

    def test_binding_instance(name: str, model_id_to_load: str, specific_engine_args: Dict = {}):
        ASCIIColors.cyan(f"\n--- Testing Instance: {name} with Model: {model_id_to_load} ---")
        final_engine_args = {**common_engine_args, **specific_engine_args}
        binding = None
        try:
            binding = VLLMBinding(models_folder=test_models_dir, **final_engine_args)
            if binding.load_model(model_id_to_load):
                ASCIIColors.green(f"Model {binding.current_model_name_or_id} loaded by {name}.")
                info = binding.get_model_info()
                ASCIIColors.magenta(f"Model Info for {name}: {json.dumps(info['loaded_model_config_details'] if 'loaded_model_config_details' in info else 'N/A', indent=2, default=str)}")

                test_prompt = "What is the main purpose of a CPU in a computer?"
                if "phi-2" in model_id_to_load.lower(): # Phi-2 uses a specific prompt format
                    test_prompt = f"Instruct: {test_prompt}\nOutput:"

                ASCIIColors.info(f"Prompt for {name}: {test_prompt}")
                response = binding.generate_text(test_prompt, n_predict=50, temperature=0.1)
                if isinstance(response, str): ASCIIColors.green(f"Response from {name}: {response}")
                else: ASCIIColors.error(f"Generation failed for {name}: {response}")

                tokens = binding.tokenize("Test tokenization.")
                ASCIIColors.info(f"Token count for {name} ('Test tokenization.'): {len(tokens)}")

            else:
                ASCIIColors.error(f"Failed to load model {model_id_to_load} for {name}.")
        except Exception as e:
            ASCIIColors.error(f"Error during test for {name} with {model_id_to_load}: {e}")
            trace_exception(e)
        finally:
            if binding:
                binding.close()
                ASCIIColors.info(f"Closed binding for {name}.")
            # After closing a binding, the engine_manager ref count should decrease.
            # If it was the last reference, the engine should be removed from manager.
            # This can be verified by checking engine_manager._engines (for debugging)
            # print(f"DEBUG: Engines in manager after closing {name}: {engine_manager._engines.keys()}")

    # Test different models
    test_binding_instance("HF_Phi2_Instance1", test_hf_id)
    test_binding_instance("GGUF_Phi2_Instance", test_gguf_id, specific_engine_args={"tokenizer": "microsoft/phi-2"})

    # Test sharing: Two instances requesting the same model config
    ASCIIColors.cyan("\n--- Testing Model Sharing (Two instances, same HF model) ---")
    args_for_shared = {**common_engine_args, "max_model_len": 2048} # Add a param to make key specific
    binding_A = VLLMBinding(models_folder=test_models_dir, **args_for_shared)
    binding_B = VLLMBinding(models_folder=test_models_dir, **args_for_shared)

    loaded_A = binding_A.load_model(test_hf_id)
    if loaded_A: ASCIIColors.green(f"Binding A loaded {test_hf_id}. Manager should have 1 ref.")
    # print(f"DEBUG: Engines after A loads: {engine_manager._engines.keys()}") # For debug

    loaded_B = binding_B.load_model(test_hf_id) # Should reuse the engine loaded by A
    if loaded_B: ASCIIColors.green(f"Binding B loaded {test_hf_id}. Manager should have 2 refs for this engine.")
    # print(f"DEBUG: Engines after B loads: {engine_manager._engines.keys()}") # For debug

    if loaded_A:
        resp_A = binding_A.generate_text(f"Instruct: Hello from A!\nOutput:", n_predict=10)
        ASCIIColors.info(f"Response from A (shared model): {resp_A}")
    if loaded_B:
        resp_B = binding_B.generate_text(f"Instruct: Hello from B!\nOutput:", n_predict=10)
        ASCIIColors.info(f"Response from B (shared model): {resp_B}")

    binding_A.close()
    ASCIIColors.info("Binding A closed. Manager should have 1 ref left for this engine.")
    # print(f"DEBUG: Engines after A closes: {engine_manager._engines.keys()}") # For debug
    binding_B.close()
    ASCIIColors.info("Binding B closed. Manager should have 0 refs, engine should be removed.")
    # print(f"DEBUG: Engines after B closes: {engine_manager._engines.keys()}") # For debug

    # Vision Test (Conceptual - requires a real vision model and setup)
    ASCIIColors.cyan("\n--- Conceptual Vision Test ---")
    # test_vision_model_id = "llava-hf/llava-1.5-7b-hf" # Example LLaVA model
    # vision_args = {**common_engine_args, "image_input_type": "pixel_values", "image_token_id": 32000, "image_feature_size":576}
    # try:
    #     # Create a dummy image
    #     dummy_img_path = "dummy_vision_test.png"
    #     img = Image.new('RGB', (224, 224), color = 'blue')
    #     img.save(dummy_img_path)
    #     binding_vision = VLLMBinding(models_folder=test_models_dir, **vision_args)
    #     if binding_vision.load_model(test_vision_model_id):
    #         # Prompt for LLaVA often includes <image>
    #         vision_prompt = "USER: <image>\nWhat is in this image?\nASSISTANT:"
    #         response = binding_vision.generate_text(vision_prompt, images=[dummy_img_path], n_predict=30)
    #         ASCIIColors.green(f"Vision response: {response}")
    #     else:
    #         ASCIIColors.warning(f"Could not load vision model {test_vision_model_id}")
    #     if Path(dummy_img_path).exists(): Path(dummy_img_path).unlink()
    # except Exception as e_vis:
    #     ASCIIColors.warning(f"Vision test block skipped or failed: {e_vis}. This often requires specific model and VRAM.")


    ASCIIColors.yellow("\n--- VLLMBinding Test Finished ---")
    # Optional: Clean up test directory
    # import shutil
    # if input(f"Clean up {test_models_dir}? (y/N): ").lower() == 'y':
    #     shutil.rmtree(test_models_dir)
    #     ASCIIColors.info(f"Cleaned up {test_models_dir}")
lollms_client/lollms_core.py
CHANGED

@@ -32,7 +32,6 @@ class LollmsClient():
                 model_name: str = "",
                 llm_bindings_dir: Path = Path(__file__).parent / "llm_bindings",
                 llm_binding_config: Optional[Dict[str, any]] = None, # Renamed for clarity
-                personality: Optional[int] = None, # Specific to LLM lollms binding

                 # Optional Modality Binding Names
                 tts_binding_name: Optional[str] = None,
@@ -73,7 +72,6 @@ class LollmsClient():
            model_name (str): Default model name for the LLM binding.
            llm_bindings_dir (Path): Directory for LLM binding implementations.
            llm_binding_config (Optional[Dict]): Additional config for the LLM binding.
-           personality (Optional[int]): Personality ID (used by LLM 'lollms' binding).
            tts_binding_name (Optional[str]): Name of the TTS binding to use (e.g., "lollms").
            tti_binding_name (Optional[str]): Name of the TTI binding to use (e.g., "lollms").
            stt_binding_name (Optional[str]): Name of the STT binding to use (e.g., "lollms").
@@ -115,7 +113,6 @@ class LollmsClient():
            model_name=model_name,
            service_key=service_key,
            verify_ssl_certificate=verify_ssl_certificate,
-           personality=personality,
            # Pass LLM specific config if needed
            **(llm_binding_config or {})
        )
@@ -281,7 +278,20 @@ class LollmsClient():
        if self.binding:
            return self.binding.detokenize(tokens)
        raise RuntimeError("LLM binding not initialized.")
+   def count_tokens(self, text: str) -> int:
+       """
+       Counts how many tokens are there in the text using the active LLM binding.
+
+       Args:
+           text (str): The text to tokenize.

+       Returns:
+           int: Number of tokens.
+       """
+       if self.binding:
+           return self.binding.count_tokens(text)
+       raise RuntimeError("LLM binding not initialized.")
+
    def get_model_details(self) -> dict:
        """
        Get model information from the active LLM binding.
@@ -1614,4 +1624,4 @@ def chunk_text(text, tokenizer, detokenizer, chunk_size, overlap, use_separators
        if current_pos >= len(text):
            break

-   return chunks
+   return chunks
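
The count_tokens method added above simply forwards to the active LLM binding's own count_tokens. A minimal usage sketch; the constructor arguments are illustrative and depend on the binding you configure:

    from lollms_client.lollms_core import LollmsClient

    client = LollmsClient(model_name="microsoft/phi-2")   # illustrative; add the binding options your setup needs
    n = client.count_tokens("What is the main purpose of a CPU in a computer?")
    print(f"{n} tokens")                                  # delegates to self.binding.count_tokens(...)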