lollms-client 1.5.6__py3-none-any.whl → 1.7.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. lollms_client/__init__.py +1 -1
  2. lollms_client/llm_bindings/azure_openai/__init__.py +2 -2
  3. lollms_client/llm_bindings/claude/__init__.py +125 -35
  4. lollms_client/llm_bindings/gemini/__init__.py +261 -159
  5. lollms_client/llm_bindings/grok/__init__.py +52 -15
  6. lollms_client/llm_bindings/groq/__init__.py +2 -2
  7. lollms_client/llm_bindings/hugging_face_inference_api/__init__.py +2 -2
  8. lollms_client/llm_bindings/litellm/__init__.py +1 -1
  9. lollms_client/llm_bindings/llama_cpp_server/__init__.py +605 -0
  10. lollms_client/llm_bindings/llamacpp/__init__.py +18 -11
  11. lollms_client/llm_bindings/lollms/__init__.py +76 -21
  12. lollms_client/llm_bindings/lollms_webui/__init__.py +1 -1
  13. lollms_client/llm_bindings/mistral/__init__.py +2 -2
  14. lollms_client/llm_bindings/novita_ai/__init__.py +142 -6
  15. lollms_client/llm_bindings/ollama/__init__.py +345 -89
  16. lollms_client/llm_bindings/open_router/__init__.py +2 -2
  17. lollms_client/llm_bindings/openai/__init__.py +81 -20
  18. lollms_client/llm_bindings/openllm/__init__.py +362 -506
  19. lollms_client/llm_bindings/openwebui/__init__.py +333 -171
  20. lollms_client/llm_bindings/perplexity/__init__.py +2 -2
  21. lollms_client/llm_bindings/pythonllamacpp/__init__.py +3 -3
  22. lollms_client/llm_bindings/tensor_rt/__init__.py +1 -1
  23. lollms_client/llm_bindings/transformers/__init__.py +428 -632
  24. lollms_client/llm_bindings/vllm/__init__.py +1 -1
  25. lollms_client/lollms_agentic.py +4 -2
  26. lollms_client/lollms_base_binding.py +61 -0
  27. lollms_client/lollms_core.py +512 -1890
  28. lollms_client/lollms_discussion.py +65 -39
  29. lollms_client/lollms_llm_binding.py +126 -261
  30. lollms_client/lollms_mcp_binding.py +49 -77
  31. lollms_client/lollms_stt_binding.py +99 -52
  32. lollms_client/lollms_tti_binding.py +38 -38
  33. lollms_client/lollms_ttm_binding.py +38 -42
  34. lollms_client/lollms_tts_binding.py +43 -18
  35. lollms_client/lollms_ttv_binding.py +38 -42
  36. lollms_client/lollms_types.py +4 -2
  37. lollms_client/stt_bindings/whisper/__init__.py +108 -23
  38. lollms_client/stt_bindings/whispercpp/__init__.py +7 -1
  39. lollms_client/tti_bindings/diffusers/__init__.py +464 -803
  40. lollms_client/tti_bindings/diffusers/server/main.py +1062 -0
  41. lollms_client/tti_bindings/gemini/__init__.py +182 -239
  42. lollms_client/tti_bindings/leonardo_ai/__init__.py +6 -3
  43. lollms_client/tti_bindings/lollms/__init__.py +4 -1
  44. lollms_client/tti_bindings/novita_ai/__init__.py +5 -2
  45. lollms_client/tti_bindings/openai/__init__.py +10 -11
  46. lollms_client/tti_bindings/stability_ai/__init__.py +5 -3
  47. lollms_client/ttm_bindings/audiocraft/__init__.py +7 -12
  48. lollms_client/ttm_bindings/beatoven_ai/__init__.py +7 -3
  49. lollms_client/ttm_bindings/lollms/__init__.py +4 -17
  50. lollms_client/ttm_bindings/replicate/__init__.py +7 -4
  51. lollms_client/ttm_bindings/stability_ai/__init__.py +7 -4
  52. lollms_client/ttm_bindings/topmediai/__init__.py +6 -3
  53. lollms_client/tts_bindings/bark/__init__.py +7 -10
  54. lollms_client/tts_bindings/lollms/__init__.py +6 -1
  55. lollms_client/tts_bindings/piper_tts/__init__.py +8 -11
  56. lollms_client/tts_bindings/xtts/__init__.py +157 -74
  57. lollms_client/tts_bindings/xtts/server/main.py +241 -280
  58. {lollms_client-1.5.6.dist-info → lollms_client-1.7.13.dist-info}/METADATA +113 -5
  59. lollms_client-1.7.13.dist-info/RECORD +90 -0
  60. lollms_client-1.5.6.dist-info/RECORD +0 -87
  61. {lollms_client-1.5.6.dist-info → lollms_client-1.7.13.dist-info}/WHEEL +0 -0
  62. {lollms_client-1.5.6.dist-info → lollms_client-1.7.13.dist-info}/licenses/LICENSE +0 -0
  63. {lollms_client-1.5.6.dist-info → lollms_client-1.7.13.dist-info}/top_level.txt +0 -0
@@ -1,707 +1,503 @@
1
- # bindings/huggingface_hub/binding.py
1
+ # bindings/huggingface/__init__.py
2
2
  import json
3
3
  import os
4
- import pprint
5
- import re
6
- import socket # Not used directly for server, but good to keep for consistency if needed elsewhere
7
- import subprocess # Not used for server
8
- import sys
9
4
  import threading
10
5
  import time
6
+ import shutil
11
7
  from pathlib import Path
12
- from typing import Optional, Callable, List, Union, Dict, Any, Set
13
- import base64 # For potential image data handling, though PIL.Image is primary
14
- import requests # Not used for server, but for consistency
8
+ from typing import Optional, Callable, List, Union, Dict, Any
15
9
 
10
+ import psutil
16
11
  from lollms_client.lollms_llm_binding import LollmsLLMBinding
17
- from lollms_client.lollms_types import MSG_TYPE, ELF_COMPLETION_FORMAT
12
+ from lollms_client.lollms_types import MSG_TYPE
13
+ from lollms_client.lollms_discussion import LollmsDiscussion
18
14
 
19
15
  from ascii_colors import ASCIIColors, trace_exception
20
16
  import pipmaster as pm
21
17
 
22
- # --- Pipmaster: Ensure dependencies ---
18
+ # --- Dependencies ---
23
19
  pm.ensure_packages([
24
- "torch",
25
- "transformers",
26
- "accelerate", # For device_map="auto" and advanced model loading
27
- "bitsandbytes", # For 4-bit/8-bit quantization (works best on CUDA)
28
- "sentence_transformers", # For robust embedding generation
29
- "pillow" # For image handling (vision models)
20
+ "torch", "transformers", "accelerate", "bitsandbytes",
21
+ "sentence_transformers", "pillow", "scipy", "huggingface_hub",
22
+ "psutil", "peft", "trl", "datasets"
30
23
  ])
31
24
 
32
25
  try:
33
26
  import torch
34
27
  from transformers import (
35
28
  AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer,
36
- BitsAndBytesConfig, AutoConfig, GenerationConfig,
37
- AutoProcessor, LlavaForConditionalGeneration, LlavaNextForConditionalGeneration, # Common LLaVA models
38
- StoppingCriteria, StoppingCriteriaList
29
+ BitsAndBytesConfig, AutoConfig, AutoProcessor,
30
+ LlavaForConditionalGeneration, LlavaNextForConditionalGeneration,
31
+ TrainingArguments
39
32
  )
33
+ from peft import PeftModel, LoraConfig, get_peft_model, prepare_model_for_kbit_training
34
+ from trl import SFTTrainer
35
+ from datasets import load_dataset
40
36
  from sentence_transformers import SentenceTransformer
37
+ from huggingface_hub import snapshot_download
41
38
  from PIL import Image
42
39
  except ImportError as e:
43
40
  ASCIIColors.error(f"Failed to import core libraries: {e}")
44
- ASCIIColors.error("Please ensure torch, transformers, accelerate, bitsandbytes, sentence_transformers, and pillow are installed.")
45
- trace_exception(e)
46
- # Set them to None so the binding can report failure cleanly if __init__ is still called.
47
41
  torch = None
48
42
  transformers = None
49
- sentence_transformers = None
50
- Image = None
51
43
 
52
-
53
- # --- Custom Stopping Criteria for Hugging Face generate ---
54
- class StopOnWords(StoppingCriteria):
55
- def __init__(self, tokenizer, stop_words: List[str]):
56
- super().__init__()
44
+ # --- Container ---
45
+ class ModelContainer:
46
+ def __init__(self, model_id, model, tokenizer, processor=None, device="cpu", quant=None):
47
+ self.model_id = model_id
48
+ self.model = model
57
49
  self.tokenizer = tokenizer
58
- self.stop_sequences_token_ids = []
59
- for word in stop_words:
60
- # Encode stop words without adding special tokens to get their raw token IDs
61
- token_ids = tokenizer.encode(word, add_special_tokens=False)
62
- if token_ids:
63
- self.stop_sequences_token_ids.append(torch.tensor(token_ids))
64
-
65
- def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
66
- for stop_seq_ids in self.stop_sequences_token_ids:
67
- if input_ids.shape[1] >= stop_seq_ids.shape[0]:
68
- # Check if the end of input_ids matches the stop sequence
69
- if torch.equal(input_ids[0, -stop_seq_ids.shape[0]:], stop_seq_ids.to(input_ids.device)):
70
- return True
71
- return False
50
+ self.processor = processor
51
+ self.device = device
52
+ self.quantization = quant
53
+ self.last_used = time.time()
54
+ self.supports_vision = processor is not None
72
55
 
56
+ def update_usage(self):
57
+ self.last_used = time.time()
73
58
 
74
- BindingName = "HuggingFaceHubBinding"
59
+ BindingName = "HuggingFace"
75
60
 
76
- class HuggingFaceHubBinding(LollmsLLMBinding):
61
+ class HuggingFace(LollmsLLMBinding):
77
62
  DEFAULT_CONFIG_ARGS = {
78
- "device": "auto", # "auto", "cuda", "mps", "cpu"
79
- "quantize": False, # False, "8bit", "4bit" (8bit/4bit require CUDA and bitsandbytes)
80
- "torch_dtype": "auto", # "auto", "float16", "bfloat16", "float32"
81
- "max_new_tokens": 2048, # Default for generation
63
+ "device": "auto",
64
+ "quantize": False,
65
+ "torch_dtype": "auto",
66
+ "max_new_tokens": 4096,
82
67
  "temperature": 0.7,
83
- "top_k": 50,
84
- "top_p": 0.95,
85
- "repetition_penalty": 1.1,
86
- "trust_remote_code": False, # Set to True for models like Phi, some LLaVA, etc.
87
- "use_flash_attention_2": False, # If supported by hardware/model & transformers version
88
- "embedding_model_name": "sentence-transformers/all-MiniLM-L6-v2", # Default for embed()
89
- "generation_timeout": 300, # Timeout for non-streaming generation
90
- "stop_words": [], # List of strings to stop generation
68
+ "trust_remote_code": False,
69
+ "use_flash_attention_2": False,
70
+ "embedding_model_name": "sentence-transformers/all-MiniLM-L6-v2",
71
+ "max_active_models": 1,
72
+ "local_models_path": "" # If empty, dynamic default is used
91
73
  }
92
74
 
93
- def __init__(self,
94
- **kwargs # Overrides for config_args
95
- ):
96
- """
97
- Initializes the Hugging Face Hub binding.
98
- Args:
99
- model_name (str): Hugging Face Hub model ID or local folder name.
100
- models_path (str or Path): Path to the directory containing local models.
101
- config (Optional[Dict[str, Any]]): Optional configuration dictionary to override defaults.
102
- default_completion_format (ELF_COMPLETION_FORMAT): Default format for text generation.
103
- """
75
+ def __init__(self, **kwargs):
104
76
  super().__init__(BindingName, **kwargs)
77
+
78
+ if torch is None or transformers is None:
79
+ raise ImportError("Core libraries not available.")
105
80
 
106
- model_name_or_id = kwargs.get("model_name")
107
- models_path = kwargs.get("models_path")
108
- config = kwargs.get("config")
109
- default_completion_format = kwargs.get("default_completion_format", ELF_COMPLETION_FORMAT.Chat)
110
-
111
- if torch is None or transformers is None: # Check if core imports failed
112
- raise ImportError("Core libraries (torch, transformers) not available. Binding cannot function.")
113
-
114
- self.models_path = Path(models_path)
115
- self.config = {**self.DEFAULT_CONFIG_ARGS, **(config or {}), **kwargs}
116
- self.default_completion_format = default_completion_format
81
+ self.config = {**self.DEFAULT_CONFIG_ARGS, **kwargs.get("config", {}), **kwargs}
82
+
83
+ # --- 1. Setup Local Models Path ---
84
+ # Priority: Config Override -> Lollms Personal Path -> Default relative path
85
+ if self.config["local_models_path"]:
86
+ self.local_models_path = Path(self.config["local_models_path"])
87
+ elif kwargs.get("lollms_paths"):
88
+ self.local_models_path = Path(kwargs["lollms_paths"].personal_models_path) / "huggingface"
89
+ else:
90
+ self.local_models_path = Path("models/huggingface")
117
91
 
118
- self.model_identifier: Optional[str] = None
119
- self.model_name: Optional[str] = None # User-friendly name (folder name or hub id)
120
- self.model: Optional[Union[AutoModelForCausalLM, LlavaForConditionalGeneration, LlavaNextForConditionalGeneration]] = None
121
- self.tokenizer: Optional[AutoTokenizer] = None
122
- self.processor: Optional[AutoProcessor] = None # For vision models
123
- self.embedding_model: Optional[SentenceTransformer] = None
124
- self.device: Optional[str] = None
125
- self.torch_dtype: Optional[torch.dtype] = None
126
- self.supports_vision: bool = False
127
-
128
- # Attempt to load the model during initialization
129
- if not self.load_model(model_name_or_id):
130
- # load_model will print errors. Here we can raise if critical.
131
- ASCIIColors.error(f"Initial model load failed for {model_name_or_id}. Binding may not be functional.")
132
- # Depending on Lollms behavior, this might be acceptable if user can select another model later.
133
-
134
- def _resolve_model_path_or_id(self, model_name_or_id: str) -> str:
135
- # 1. Check if it's an absolute path to a model directory
136
- abs_path = Path(model_name_or_id)
137
- if abs_path.is_absolute() and abs_path.is_dir() and (abs_path / "config.json").exists():
138
- ASCIIColors.info(f"Using absolute model path: {abs_path}")
139
- return str(abs_path)
140
-
141
- # 2. Check if it's a name relative to self.models_path
142
- local_model_path = self.models_path / model_name_or_id
143
- if local_model_path.is_dir() and (local_model_path / "config.json").exists():
144
- ASCIIColors.info(f"Found local model in models_path: {local_model_path}")
145
- return str(local_model_path)
92
+ self.local_models_path.mkdir(parents=True, exist_ok=True)
93
+ ASCIIColors.info(f"HuggingFace Local Storage: {self.local_models_path}")
94
+
95
+ # State
96
+ self.loaded_models: Dict[str, ModelContainer] = {}
97
+ self.active_model_id: Optional[str] = None
98
+ self.inference_lock = threading.Lock()
99
+ self.is_training = False
146
100
 
147
- # 3. Assume it's a Hugging Face Hub ID
148
- ASCIIColors.info(f"Assuming '{model_name_or_id}' is a Hugging Face Hub ID.")
149
- return model_name_or_id
101
+ # Load Embeddings
102
+ self.embedding_model = None
103
+ self.load_embedding_model()
104
+
105
+ # Initial Load
106
+ model_name = kwargs.get("model_name")
107
+ if model_name:
108
+ self.load_model(model_name)
109
+
110
+ def load_embedding_model(self):
111
+ name = self.config.get("embedding_model_name")
112
+ if name:
113
+ try:
114
+ ASCIIColors.info(f"Loading embedding model: {name}")
115
+ device = "cuda" if torch.cuda.is_available() else "cpu"
116
+ self.embedding_model = SentenceTransformer(name, device=device)
117
+ except Exception as e:
118
+ ASCIIColors.warning(f"Failed to load embedding model: {e}")
119
+
120
+ def _manage_memory(self):
121
+ max_models = int(self.config.get("max_active_models", 1))
122
+ while len(self.loaded_models) >= max_models:
123
+ lru_id = min(self.loaded_models, key=lambda k: self.loaded_models[k].last_used)
124
+ # Avoid unloading the active one if possible, unless it's the only one and we need a swap
125
+ if lru_id == self.active_model_id and len(self.loaded_models) == 1:
126
+ pass
127
+ ASCIIColors.info(f"Unloading {lru_id} to free memory.")
128
+ self.unload_model_by_id(lru_id)
150
129
 
151
130
  def load_model(self, model_name_or_id: str) -> bool:
152
- if self.model is not None:
153
- self.unload_model()
154
-
155
- self.model_identifier = self._resolve_model_path_or_id(model_name_or_id)
156
- self.model_name = Path(self.model_identifier).name # User-friendly name
157
-
158
- # --- Device Selection ---
159
- device_pref = self.config.get("device", "auto")
160
- if device_pref == "auto":
161
- if torch.cuda.is_available(): self.device = "cuda"
162
- elif torch.backends.mps.is_available(): self.device = "mps" # For Apple Silicon
163
- else: self.device = "cpu"
164
- else:
165
- self.device = device_pref
166
- ASCIIColors.info(f"Using device: {self.device}")
167
-
168
- # --- Dtype Selection ---
169
- dtype_pref = self.config.get("torch_dtype", "auto")
170
- if dtype_pref == "auto":
171
- if self.device == "cuda": self.torch_dtype = torch.float16 # bfloat16 is better for Ampere+
172
- else: self.torch_dtype = torch.float32 # MPS and CPU generally use float32
173
- elif dtype_pref == "float16": self.torch_dtype = torch.float16
174
- elif dtype_pref == "bfloat16": self.torch_dtype = torch.bfloat16
175
- else: self.torch_dtype = torch.float32
176
- ASCIIColors.info(f"Using DType: {self.torch_dtype}")
177
-
178
- # --- Quantization ---
179
- quantize_mode = self.config.get("quantize", False)
180
- load_in_8bit = False
181
- load_in_4bit = False
182
- bnb_config = None
131
+ """
132
+ Loads a model. Priorities:
133
+ 1. Local folder (self.local_models_path / model_name_or_id)
134
+ 2. Hugging Face Hub (download/cache automatically)
135
+ """
136
+ # --- Resolve Path ---
137
+ # Clean naming for folder lookup
138
+ folder_name = model_name_or_id.replace("/", "_") # Sanitize potential subdirs if user types "meta-llama/Llama-2"
139
+
140
+ # Check standard path mapping
141
+ possible_paths = [
142
+ self.local_models_path / model_name_or_id, # Exact match (subfolders)
143
+ self.local_models_path / folder_name, # Flattened match
144
+ Path(model_name_or_id) # Absolute path provided by user
145
+ ]
146
+
147
+ model_path_to_use = model_name_or_id # Default to ID for HF Hub
148
+
149
+ for p in possible_paths:
150
+ if p.exists() and p.is_dir() and (p / "config.json").exists():
151
+ ASCIIColors.info(f"Found local model at: {p}")
152
+ model_path_to_use = str(p)
153
+ break
154
+
155
+ # Check if already loaded
156
+ if model_name_or_id in self.loaded_models:
157
+ self.active_model_id = model_name_or_id
158
+ self.loaded_models[model_name_or_id].update_usage()
159
+ return True
183
160
 
184
- if self.device == "cuda": # bitsandbytes primarily for CUDA
185
- if quantize_mode == "8bit":
186
- load_in_8bit = True
187
- ASCIIColors.info("Quantizing model to 8-bit.")
188
- elif quantize_mode == "4bit":
189
- load_in_4bit = True
190
- bnb_config = BitsAndBytesConfig(
191
- load_in_4bit=True,
192
- bnb_4bit_quant_type="nf4",
193
- bnb_4bit_use_double_quant=True,
194
- bnb_4bit_compute_dtype=self.torch_dtype # e.g., torch.bfloat16 for computation
195
- )
196
- ASCIIColors.info("Quantizing model to 4-bit.")
197
- elif quantize_mode and self.device != "cuda":
198
- ASCIIColors.warning(f"Quantization ('{quantize_mode}') is selected but device is '{self.device}'. bitsandbytes works best on CUDA. Proceeding without quantization.")
199
- quantize_mode = False
161
+ self._manage_memory()
162
+ if self.is_training:
163
+ ASCIIColors.error("Training in progress. Cannot load new model.")
164
+ return False
200
165
 
166
+ ASCIIColors.info(f"Loading {model_name_or_id} (Path/ID: {model_path_to_use})...")
167
+
168
+ # --- Config & Device ---
169
+ device = "cuda" if torch.cuda.is_available() and self.config["device"]=="auto" else "cpu"
170
+ if self.config["device"] != "auto": device = self.config["device"]
171
+
172
+ dtype_map = {"auto": torch.float16 if device=="cuda" else torch.float32,
173
+ "float16": torch.float16, "bfloat16": torch.bfloat16, "float32": torch.float32}
174
+ dtype = dtype_map.get(self.config["torch_dtype"], torch.float32)
175
+
176
+ quant_mode = self.config.get("quantize", False)
177
+ load_in_4bit = str(quant_mode) == "4bit"
178
+ load_in_8bit = str(quant_mode) == "8bit"
179
+
180
+ bnb_config = None
181
+ if device == "cuda" and load_in_4bit:
182
+ bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4",
183
+ bnb_4bit_use_double_quant=True, bnb_4bit_compute_dtype=dtype)
201
184
 
202
- # --- Model Loading Arguments ---
203
- model_load_args = {
185
+ model_args = {
204
186
  "trust_remote_code": self.config.get("trust_remote_code", False),
205
- # torch_dtype is handled by BitsAndBytesConfig if quantizing, otherwise set directly
206
- "torch_dtype": self.torch_dtype if not (load_in_8bit or load_in_4bit) else None,
187
+ "torch_dtype": dtype if not (load_in_4bit or load_in_8bit) else None,
188
+ "device_map": "auto" if device == "cuda" else None
207
189
  }
208
- if self.config.get("use_flash_attention_2", False) and self.device == "cuda":
209
- if hasattr(transformers, " আসছেAttention"): # Check for Flash Attention support in transformers version
210
- model_load_args["attn_implementation"] = "flash_attention_2"
211
- ASCIIColors.info("Attempting to use Flash Attention 2.")
212
- else:
213
- ASCIIColors.warning("Flash Attention 2 requested but not found in this transformers version. Using default.")
214
190
 
191
+ if self.config.get("use_flash_attention_2") and device == "cuda":
192
+ try:
193
+ import flash_attn
194
+ model_args["attn_implementation"] = "flash_attention_2"
195
+ except ImportError:
196
+ ASCIIColors.warning("Flash Attention 2 enabled but not installed.")
197
+
198
+ if load_in_4bit: model_args["quantization_config"] = bnb_config
199
+ if load_in_8bit: model_args["load_in_8bit"] = True
215
200
 
216
- if load_in_8bit: model_load_args["load_in_8bit"] = True
217
- if load_in_4bit: model_load_args["quantization_config"] = bnb_config
218
-
219
- # device_map="auto" for multi-GPU or when quantizing on CUDA
220
- if self.device == "cuda" and (load_in_8bit or load_in_4bit or torch.cuda.device_count() > 1):
221
- model_load_args["device_map"] = "auto"
222
- ASCIIColors.info("Using device_map='auto'.")
223
-
224
201
  try:
225
- ASCIIColors.info(f"Loading tokenizer for '{self.model_identifier}'...")
226
- self.tokenizer = AutoTokenizer.from_pretrained(
227
- self.model_identifier,
228
- trust_remote_code=model_load_args["trust_remote_code"]
229
- )
230
- if self.tokenizer.pad_token is None:
231
- self.tokenizer.pad_token = self.tokenizer.eos_token
232
- ASCIIColors.info("Tokenizer `pad_token` was None, set to `eos_token`.")
233
-
234
- # --- Determine if it's a LLaVA-like vision model ---
235
- model_config_hf = AutoConfig.from_pretrained(self.model_identifier, trust_remote_code=model_load_args["trust_remote_code"])
236
- self.supports_vision = "llava" in model_config_hf.model_type.lower() or \
237
- any("Llava" in arch for arch in getattr(model_config_hf, "architectures", [])) or \
238
- "vision_tower" in model_config_hf.to_dict() # Common LLaVA config key
239
-
240
- if self.supports_vision:
241
- ASCIIColors.info(f"Detected LLaVA-like vision model: '{self.model_identifier}'.")
242
- self.processor = AutoProcessor.from_pretrained(
243
- self.model_identifier,
244
- trust_remote_code=model_load_args["trust_remote_code"]
245
- )
246
- # Choose appropriate LLaVA model class
247
- if "llava-next" in self.model_identifier.lower() or any("LlavaNext" in arch for arch in getattr(model_config_hf, "architectures", [])):
248
- ModelClass = LlavaNextForConditionalGeneration
249
- elif "llava" in self.model_identifier.lower() or any("LlavaForConditionalGeneration" in arch for arch in getattr(model_config_hf, "architectures", [])):
250
- ModelClass = LlavaForConditionalGeneration
251
- else: # Fallback if specific Llava class not matched by name
252
- ASCIIColors.warning("Could not determine specific LLaVA class, using AutoModelForCausalLM. Vision capabilities might be limited.")
253
- ModelClass = AutoModelForCausalLM # This might not fully work for all LLaVAs
254
-
255
- self.model = ModelClass.from_pretrained(self.model_identifier, **model_load_args)
256
- else:
257
- ASCIIColors.info(f"Loading text model '{self.model_identifier}'...")
258
- self.model = AutoModelForCausalLM.from_pretrained(self.model_identifier, **model_load_args)
202
+ # Tokenizer
203
+ tokenizer = AutoTokenizer.from_pretrained(model_path_to_use, trust_remote_code=model_args["trust_remote_code"])
204
+ if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token
259
205
 
260
- # If not using device_map, move model to the selected device
261
- if "device_map" not in model_load_args and self.device != "cpu":
262
- self.model.to(self.device)
206
+ # Architecture Detection
207
+ config = AutoConfig.from_pretrained(model_path_to_use, trust_remote_code=model_args["trust_remote_code"])
208
+ processor = None
263
209
 
264
- self.model.eval() # Set to evaluation mode
265
-
266
- # --- Load Embedding Model ---
267
- emb_model_name = self.config.get("embedding_model_name")
268
- if emb_model_name:
269
- try:
270
- ASCIIColors.info(f"Loading embedding model: {emb_model_name} on device: {self.device}")
271
- self.embedding_model = SentenceTransformer(emb_model_name, device=self.device)
272
- except Exception as e_emb:
273
- ASCIIColors.warning(f"Failed to load embedding model '{emb_model_name}': {e_emb}. Embeddings will not be available.")
274
- self.embedding_model = None
210
+ # LLaVA Check
211
+ if "llava" in config.model_type.lower() or "Llava" in str(getattr(config, "architectures", [])):
212
+ processor = AutoProcessor.from_pretrained(model_path_to_use, trust_remote_code=model_args["trust_remote_code"])
213
+ ModelClass = LlavaNextForConditionalGeneration if "next" in config.model_type.lower() else LlavaForConditionalGeneration
214
+ model = ModelClass.from_pretrained(model_path_to_use, **model_args)
275
215
  else:
276
- ASCIIColors.info("No embedding_model_name configured. Skipping embedding model load.")
277
- self.embedding_model = None
216
+ model = AutoModelForCausalLM.from_pretrained(model_path_to_use, **model_args)
278
217
 
279
- ASCIIColors.green(f"Model '{self.model_identifier}' loaded successfully.")
218
+ # Fallback for device placement
219
+ if not model_args.get("device_map") and device != "cpu" and not (load_in_4bit or load_in_8bit):
220
+ model.to(device)
221
+
222
+ model.eval()
223
+
224
+ container = ModelContainer(model_name_or_id, model, tokenizer, processor, device, quant_mode)
225
+ self.loaded_models[model_name_or_id] = container
226
+ self.active_model_id = model_name_or_id
227
+ ASCIIColors.success(f"Loaded {model_name_or_id}")
280
228
  return True
281
-
229
+
282
230
  except Exception as e:
283
- ASCIIColors.error(f"Failed to load model '{self.model_identifier}': {e}")
231
+ ASCIIColors.error(f"Load failed: {e}")
284
232
  trace_exception(e)
285
- self.unload_model() # Ensure partial loads are cleaned up
286
233
  return False
287
234
 
288
- def unload_model(self):
289
- if self.model is not None:
290
- del self.model
291
- self.model = None
292
- if self.tokenizer is not None:
293
- del self.tokenizer
294
- self.tokenizer = None
295
- if self.processor is not None:
296
- del self.processor
297
- self.processor = None
298
- if self.embedding_model is not None:
299
- del self.embedding_model
300
- self.embedding_model = None
235
+ def unload_model_by_id(self, model_id: str):
236
+ if model_id in self.loaded_models:
237
+ del self.loaded_models[model_id]
238
+ if torch.cuda.is_available(): torch.cuda.empty_cache()
239
+ import gc; gc.collect()
240
+
241
+ def get_container(self):
242
+ return self.loaded_models.get(self.active_model_id)
243
+
244
+ # --- Generation ---
245
+ def generate_text(self, prompt, images=None, system_prompt="", stream=False, streaming_callback=None, split=False, n_predict=None, **kwargs):
246
+ if self.is_training: return {"status": False, "error": "Training in progress."}
301
247
 
302
- if self.device == "cuda":
303
- torch.cuda.empty_cache()
248
+ container = self.get_container()
249
+ if not container: return {"status": False, "error": "No model loaded."}
304
250
 
305
- self.model_identifier = None
306
- self.model_name = None
307
- self.supports_vision = False
308
- ASCIIColors.info("Hugging Face model unloaded.")
309
-
310
- def generate_text(self,
311
- prompt: str,
312
- images: Optional[List[str]] = None,
313
- system_prompt: str = "",
314
- n_predict: Optional[int] = None,
315
- stream: bool = False,
316
- temperature: float = None,
317
- top_k: int = None,
318
- top_p: float = None,
319
- repeat_penalty: float = None,
320
- seed: Optional[int] = None,
321
- stop_words: Optional[List[str]] = None, # Added custom stop_words
322
- streaming_callback: Optional[Callable[[str, int], bool]] = None,
323
- split:Optional[bool]=False, # put to true if the prompt is a discussion
324
- user_keyword:Optional[str]="!@>user:",
325
- ai_keyword:Optional[str]="!@>assistant:",
326
- use_chat_format_override: Optional[bool] = None,
327
- **generation_kwargs
328
- ) -> Union[str, Dict[str, Any]]:
329
-
330
- if self.model is None or self.tokenizer is None:
331
- return {"status": False, "error": "Model not loaded."}
332
-
333
- if seed is not None:
334
- torch.manual_seed(seed)
335
- if self.device == "cuda": torch.cuda.manual_seed_all(seed)
336
-
337
- _use_chat_format = use_chat_format_override if use_chat_format_override is not None \
338
- else (self.default_completion_format == ELF_COMPLETION_FORMAT.Chat)
339
-
340
- # --- Prepare Inputs ---
341
- inputs_dict = {}
342
- processed_images = []
343
- if self.supports_vision and self.processor and images:
251
+ container.update_usage()
252
+
253
+ with self.inference_lock:
254
+ inputs = {}
255
+ # Vision
256
+ if container.supports_vision and images:
257
+ pil_images = [Image.open(p).convert("RGB") for p in images]
258
+ inputs = container.processor(text=prompt, images=pil_images, return_tensors="pt").to(container.model.device)
259
+ # Text / Chat
260
+ else:
261
+ if hasattr(container.tokenizer, 'apply_chat_template') and not split:
262
+ messages = []
263
+ if system_prompt: messages.append({"role": "system", "content": system_prompt})
264
+ messages.append({"role": "user", "content": prompt})
265
+ try:
266
+ text = container.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
267
+ inputs = container.tokenizer(text, return_tensors="pt").to(container.model.device)
268
+ except:
269
+ inputs = container.tokenizer(prompt, return_tensors="pt").to(container.model.device)
270
+ else:
271
+ inputs = container.tokenizer(prompt, return_tensors="pt").to(container.model.device)
272
+
273
+ gen_kwargs = {
274
+ "max_new_tokens": n_predict or self.config.get("max_new_tokens"),
275
+ "temperature": kwargs.get("temperature", self.config.get("temperature")),
276
+ "do_sample": kwargs.get("temperature", 0.7) > 0,
277
+ "pad_token_id": container.tokenizer.eos_token_id
278
+ }
279
+
344
280
  try:
345
- for img_path in images:
346
- processed_images.append(Image.open(img_path).convert("RGB"))
347
- # LLaVA processor typically takes text and images, returns combined inputs
348
- inputs_dict = self.processor(text=prompt, images=processed_images, return_tensors="pt").to(self.model.device)
349
- ASCIIColors.debug("Processed inputs with LLaVA processor.")
350
- except Exception as e_img:
351
- ASCIIColors.error(f"Error processing images for LLaVA: {e_img}")
352
- return {"status": False, "error": f"Image processing error: {e_img}"}
353
-
354
- elif _use_chat_format and hasattr(self.tokenizer, 'apply_chat_template'):
355
- messages = []
356
- if system_prompt: messages.append({"role": "system", "content": system_prompt})
281
+ if stream and streaming_callback:
282
+ streamer = TextIteratorStreamer(container.tokenizer, skip_prompt=True, skip_special_tokens=True)
283
+ gen_kwargs["streamer"] = streamer
284
+ t = threading.Thread(target=container.model.generate, kwargs={**inputs, **gen_kwargs})
285
+ t.start()
286
+
287
+ full = ""
288
+ for chunk in streamer:
289
+ full += chunk
290
+ if not streaming_callback(chunk, MSG_TYPE.MSG_TYPE_CHUNK): break
291
+ t.join()
292
+ return full
293
+ else:
294
+ outputs = container.model.generate(**inputs, **gen_kwargs)
295
+ text = container.tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
296
+ return text
297
+ except Exception as e:
298
+ trace_exception(e)
299
+ return {"status": False, "error": str(e)}
300
+
301
+ # --- Commands ---
302
+
303
+ def list_models(self) -> List[Dict[str, str]]:
304
+ """Scans the designated local_models_path."""
305
+ models = []
306
+ if self.local_models_path.exists():
307
+ for item in self.local_models_path.iterdir():
308
+ if item.is_dir():
309
+ # Simple heuristic to check if it's a valid HF model folder
310
+ if (item / "config.json").exists() or (item / "adapter_config.json").exists():
311
+ try:
312
+ size_gb = sum(f.stat().st_size for f in item.rglob('*') if f.is_file()) / (1024**3)
313
+ except: size_gb = 0
314
+
315
+ models.append({
316
+ "model_name": item.name,
317
+ "path": str(item),
318
+ "size": f"{size_gb:.2f} GB",
319
+ "source": "Local Storage"
320
+ })
321
+ return models
322
+
323
+ def pull_model(self, model_name: str) -> dict:
324
+ """Downloads model files directly to self.local_models_path."""
325
+ try:
326
+ ASCIIColors.info(f"Downloading {model_name} to {self.local_models_path}...")
357
327
 
358
- # Newer chat templates can handle images directly in content if tokenizer supports it
359
- # Example: [{"type": "text", "text": "..."}, {"type": "image_url", "image_url": {"url": "path/to/image.jpg"}}]
360
- # For now, this example keeps LLaVA processor separate.
361
- messages.append({"role": "user", "content": prompt})
328
+ # We preserve the folder structure simply using the last part of the repo name
329
+ # e.g. 'meta-llama/Llama-2-7b' -> 'Llama-2-7b' folder in local path.
330
+ # OR use the full 'meta-llama_Llama-2-7b' to avoid name collisions.
331
+ folder_name = model_name.replace("/", "_")
332
+ target_dir = self.local_models_path / folder_name
362
333
 
363
- try:
364
- input_text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
365
- inputs_dict = self.tokenizer(input_text, return_tensors="pt").to(self.model.device)
366
- ASCIIColors.debug("Applied chat template.")
367
- except Exception as e_tmpl: # Some tokenizers might fail if template is complex or not well-defined
368
- ASCIIColors.warning(f"Failed to apply chat template ({e_tmpl}). Falling back to raw prompt.")
369
- _use_chat_format = False # Fallback
370
-
371
- if not _use_chat_format or not inputs_dict: # Raw prompt or fallback
372
- full_prompt_text = ""
373
- if system_prompt: full_prompt_text += system_prompt + "\n\n"
374
- full_prompt_text += prompt
375
- inputs_dict = self.tokenizer(full_prompt_text, return_tensors="pt").to(self.model.device)
376
- ASCIIColors.debug("Using raw prompt format.")
377
-
378
- input_ids = inputs_dict.get("input_ids")
379
- if input_ids is None: return {"status": False, "error": "Failed to tokenize prompt."}
380
-
381
- current_input_length = input_ids.shape[1]
382
-
383
- # --- Generation Parameters ---
384
- gen_conf = GenerationConfig.from_model_config(self.model.config) # Start with model's default
385
-
386
- gen_conf.max_new_tokens = n_predict if n_predict is not None else self.config.get("max_new_tokens")
387
- gen_conf.temperature = temperature if temperature is not None else self.config.get("temperature")
388
- gen_conf.top_k = top_k if top_k is not None else self.config.get("top_k")
389
- gen_conf.top_p = top_p if top_p is not None else self.config.get("top_p")
390
- gen_conf.repetition_penalty = repeat_penalty if repeat_penalty is not None else self.config.get("repetition_penalty")
391
- gen_conf.pad_token_id = self.tokenizer.eos_token_id # Crucial for stopping
392
- gen_conf.eos_token_id = self.tokenizer.eos_token_id
393
-
394
- # Apply any other valid GenerationConfig parameters from generation_kwargs
395
- for key, value in generation_kwargs.items():
396
- if hasattr(gen_conf, key): setattr(gen_conf, key, value)
397
-
398
- # --- Stopping Criteria ---
399
- stopping_criteria_list = StoppingCriteriaList()
400
- effective_stop_words = stop_words if stop_words is not None else self.config.get("stop_words", [])
401
- if effective_stop_words:
402
- stopping_criteria_list.append(StopOnWords(self.tokenizer, effective_stop_words))
403
-
404
- # --- Generation ---
405
- try:
406
- if stream and streaming_callback:
407
- streamer = TextIteratorStreamer(self.tokenizer, skip_prompt=True, skip_special_tokens=True)
408
- generation_thread_kwargs = {
409
- **inputs_dict, # input_ids, attention_mask, pixel_values (if vision)
410
- "generation_config": gen_conf,
411
- "streamer": streamer,
412
- "stopping_criteria": stopping_criteria_list if effective_stop_words else None
413
- }
414
-
415
- thread = threading.Thread(target=self.model.generate, kwargs=generation_thread_kwargs)
416
- thread.start()
417
-
418
- full_response_text = ""
419
- for new_text_chunk in streamer:
420
- if streaming_callback(new_text_chunk, MSG_TYPE.MSG_TYPE_CHUNK):
421
- full_response_text += new_text_chunk
422
- else: # Callback requested stop
423
- ASCIIColors.info("Streaming callback requested stop.")
424
- # Note: stopping the model.generate thread externally is complex.
425
- # The thread will complete its current generation.
426
- break
427
- thread.join(timeout=self.config.get("generation_timeout", 300))
428
- if thread.is_alive():
429
- ASCIIColors.warning("Generation thread did not finish in time after streaming.")
430
- return full_response_text
431
- else: # Non-streaming
432
- outputs = self.model.generate(
433
- **inputs_dict,
434
- generation_config=gen_conf,
435
- stopping_criteria=stopping_criteria_list if effective_stop_words else None
436
- )
437
- # outputs contains the full sequence (prompt + new tokens)
438
- generated_tokens = outputs[0][current_input_length:]
439
- generated_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
440
- return generated_text.strip()
441
-
334
+ # local_dir ensures actual files are downloaded, not just cache pointers
335
+ path = snapshot_download(repo_id=model_name, local_dir=target_dir, local_dir_use_symlinks=False)
336
+
337
+ msg = f"Model downloaded successfully to {path}"
338
+ ASCIIColors.success(msg)
339
+ return {"status": True, "message": msg, "path": str(path)}
442
340
  except Exception as e:
443
- ASCIIColors.error(f"Error during text generation: {e}")
444
- trace_exception(e)
445
- return {"status": False, "error": str(e)}
446
-
447
- def tokenize(self, text: str) -> List[int]:
448
- if self.tokenizer is None: raise RuntimeError("Tokenizer not loaded.")
449
- return self.tokenizer.encode(text)
341
+ return {"status": False, "message": str(e)}
450
342
 
451
- def detokenize(self, tokens: List[int]) -> str:
452
- if self.tokenizer is None: raise RuntimeError("Tokenizer not loaded.")
453
- return self.tokenizer.decode(tokens)
343
+ def train(self, base_model_name: str, dataset_path: str, new_model_name: str, num_epochs=1, batch_size=1, learning_rate=2e-4) -> dict:
344
+ if self.is_training: return {"status": False, "message": "Busy."}
345
+
346
+ # Output to local path
347
+ output_dir = self.local_models_path / new_model_name
348
+ if output_dir.exists(): return {"status": False, "message": "Model exists."}
349
+
350
+ # Resolve base model path (is it local or remote?)
351
+ # Reuse logic from load_model's resolution if strictly needed, or let HF handle it.
352
+ # But for QLoRA, we usually want the base model weights.
353
+ # We pass 'base_model_name' directly; if it matches a local folder in `load_model`,
354
+ # the user should probably pass that full path or we resolve it here.
355
+ # Let's resolve it against local path:
356
+ possible_local = self.local_models_path / base_model_name
357
+ if possible_local.exists():
358
+ base_model_path = str(possible_local)
359
+ else:
360
+ base_model_path = base_model_name
454
361
 
455
- def count_tokens(self, text: str) -> int:
456
- if self.tokenizer is None: raise RuntimeError("Tokenizer not loaded.")
457
- return len(self.tokenizer.encode(text))
362
+ t = threading.Thread(target=self._run_training_job, args=(base_model_path, dataset_path, str(output_dir), num_epochs, batch_size, learning_rate))
363
+ t.start()
364
+ return {"status": True, "message": f"Training started. Output: {output_dir}"}
458
365
 
459
- def embed(self, text: Union[str, List[str]], **kwargs) -> Union[List[float], List[List[float]]]:
460
- if self.embedding_model is None:
461
- raise RuntimeError("Embedding model not loaded. Configure 'embedding_model_name'.")
366
+ def _run_training_job(self, base_model, dataset_path, output_dir, epochs, batch_size, lr):
367
+ self.is_training = True
368
+ self.inference_lock.acquire()
462
369
  try:
463
- # SentenceTransformer's encode can take a string or list of strings
464
- embeddings_np = self.embedding_model.encode(text, **kwargs)
465
- if isinstance(text, str): # Single text input
466
- return embeddings_np.tolist()
467
- else: # List of texts input
468
- return [emb.tolist() for emb in embeddings_np]
370
+ ASCIIColors.info(f"Training Base: {base_model}")
371
+
372
+ # Dataset
373
+ ext = "json" if dataset_path.endswith("json") else "text"
374
+ dataset = load_dataset(ext, data_files=dataset_path, split="train")
375
+
376
+ # QLoRA Setup
377
+ bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16)
378
+
379
+ model = AutoModelForCausalLM.from_pretrained(
380
+ base_model, quantization_config=bnb_config, device_map="auto",
381
+ trust_remote_code=self.config.get("trust_remote_code", False)
382
+ )
383
+ model.config.use_cache = False
384
+ model = prepare_model_for_kbit_training(model)
385
+
386
+ tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
387
+ if not tokenizer.pad_token: tokenizer.pad_token = tokenizer.eos_token
388
+ tokenizer.padding_side = "right"
389
+
390
+ peft_config = LoraConfig(r=64, lora_alpha=16, target_modules=["q_proj", "v_proj"], task_type="CAUSAL_LM", bias="none", lora_dropout=0.1)
391
+ model = get_peft_model(model, peft_config)
392
+
393
+ # Formatting
394
+ def format_prompts(examples):
395
+ texts = []
396
+ for i in range(len(examples.get("instruction", []))):
397
+ ins = examples["instruction"][i]
398
+ inp = examples.get("input", [""])[i]
399
+ out = examples.get("output", [""])[i]
400
+ if inp: text = f"### Instruction:\n{ins}\n\n### Input:\n{inp}\n\n### Response:\n{out}<|endoftext|>"
401
+ else: text = f"### Instruction:\n{ins}\n\n### Response:\n{out}<|endoftext|>"
402
+ texts.append(text)
403
+ return texts if texts else examples.get("text", [])
404
+
405
+ trainer = SFTTrainer(
406
+ model=model, train_dataset=dataset, peft_config=peft_config,
407
+ formatting_func=format_prompts, tokenizer=tokenizer,
408
+ args=TrainingArguments(
409
+ output_dir=output_dir, num_train_epochs=epochs,
410
+ per_device_train_batch_size=batch_size, gradient_accumulation_steps=4,
411
+ learning_rate=lr, fp16=True, logging_steps=10, save_strategy="epoch", optim="paged_adamw_32bit"
412
+ )
413
+ )
414
+ trainer.train()
415
+ trainer.save_model(output_dir)
416
+ ASCIIColors.success("Training Finished.")
469
417
  except Exception as e:
470
- ASCIIColors.error(f"Embedding generation failed: {e}")
418
+ ASCIIColors.error(f"Training error: {e}")
471
419
  trace_exception(e)
472
- raise
473
-
474
- def get_model_info(self) -> dict:
475
- info = {
476
- "binding_name": self.binding_name,
477
- "model_name": self.model_name,
478
- "model_identifier": self.model_identifier,
479
- "loaded": self.model is not None,
480
- "config": self.config, # Binding's own config
481
- "device": self.device,
482
- "torch_dtype": str(self.torch_dtype),
483
- "supports_vision": self.supports_vision,
484
- "embedding_model_name": self.config.get("embedding_model_name") if self.embedding_model else None,
485
- }
486
- if self.model and hasattr(self.model, 'config'):
487
- model_hf_config = self.model.config.to_dict()
488
- info["model_hf_config"] = {k: str(v)[:200] for k,v in model_hf_config.items()} # Truncate long values
489
- info["max_model_len"] = getattr(self.model.config, "max_position_embeddings", "N/A")
420
+ finally:
421
+ self.inference_lock.release()
422
+ self.is_training = False
423
+
424
+ def merge_lora(self, base_model_name, lora_model_name, new_model_name):
425
+ # Resolve Base
426
+ possible_base = self.local_models_path / base_model_name
427
+ base_path = str(possible_base) if possible_base.exists() else base_model_name
490
428
 
491
- info["supports_structured_output"] = False # HF models don't inherently support grammar like llama.cpp server
492
- # (unless using external libraries like outlines)
493
- return info
494
-
495
- def listModels(self) -> List[Dict[str, str]]:
496
- models_found = []
497
- unique_model_names = set()
498
-
499
- if self.models_path.exists() and self.models_path.is_dir():
500
- for item in self.models_path.iterdir():
501
- if item.is_dir(): # HF models are directories
502
- # Basic check for a config file to qualify as a model dir
503
- if (item / "config.json").exists():
504
- model_name = item.name
505
- if model_name not in unique_model_names:
506
- try:
507
- # Calculating size can be slow for large model repos
508
- # total_size = sum(f.stat().st_size for f in item.rglob('*') if f.is_file())
509
- # size_gb_str = f"{total_size / (1024**3):.2f} GB"
510
- size_gb_str = "N/A (size calculation disabled for speed)"
511
- except Exception:
512
- size_gb_str = "N/A"
513
-
514
- models_found.append({
515
- "model_name": model_name, # This is the folder name
516
- "path_hint": str(item.relative_to(self.models_path.parent) if item.is_relative_to(self.models_path.parent) else item),
517
- "size_gb": size_gb_str
518
- })
519
- unique_model_names.add(model_name)
429
+ # Resolve LoRA (Usually local if trained here)
430
+ possible_lora = self.local_models_path / lora_model_name
431
+ lora_path = str(possible_lora) if possible_lora.exists() else lora_model_name
432
+
433
+ save_path = self.local_models_path / new_model_name
520
434
 
521
- ASCIIColors.info("Tip: You can also use any Hugging Face Hub model ID directly (e.g., 'mistralai/Mistral-7B-Instruct-v0.1').")
522
- return models_found
523
-
524
- def __del__(self):
525
- self.unload_model()
526
-
527
-
528
- if __name__ == '__main__':
529
- global full_streamed_text
530
- ASCIIColors.yellow("Testing HuggingFaceHubBinding...")
531
-
532
- # --- Configuration ---
533
- # For testing, you might need to download a model first or use a small Hub ID.
534
- # Option 1: Use a small model from Hugging Face Hub
535
- # test_model_name = "gpt2" # Small, good for quick tests
536
- test_model_name = "microsoft/phi-2" # Small, good quality, requires trust_remote_code=True
537
- # test_model_name = "HuggingFaceH4/zephyr-7b-beta" # Larger, powerful
538
-
539
- # Option 2: Path to a local model folder (if you have one)
540
- # Replace 'path/to/your/models' with the PARENT directory of your HF model folders.
541
- # And 'your-local-model-folder' with the actual folder name.
542
- # Example:
543
- # test_models_parent_path = Path.home() / "lollms_models" # Example path
544
- # test_model_name = "phi-2" # if "phi-2" folder is inside test_models_parent_path
545
-
546
- # For local testing, models_path should be where your HF model *folders* are.
547
- # If using a Hub ID like "gpt2", models_path is less critical unless you expect
548
- # the binding to *only* look there (which it doesn't, it prioritizes Hub IDs).
549
- # Let's use a dummy path for models_path for Hub ID testing.
550
-
551
- # Adjust current_directory for local model testing if needed
552
- # For this test, we'll assume a Hub ID. `models_path` is where `listModels` would scan.
553
- test_models_parent_path = Path("./test_hf_models_dir") # Create a dummy for listModels scan
554
- test_models_parent_path.mkdir(exist_ok=True)
555
-
556
- binding_config = {
557
- "device": "auto", # "cuda", "mps", "cpu"
558
- "quantize": False, # False, "4bit", "8bit" (requires CUDA & bitsandbytes for 4/8 bit)
559
- "torch_dtype": "auto", # "float16" or "bfloat16" on CUDA for speed
560
- "max_new_tokens": 100, # Limit generation length for tests
561
- "trust_remote_code": True, # Needed for models like Phi-2
562
- "stop_words": ["\nHuman:", "\nUSER:"], # Example stop words
563
- # "embedding_model_name": "sentence-transformers/paraphrase-MiniLM-L3-v2" # Smaller embedding model
564
- }
435
+ try:
436
+ ASCIIColors.info(f"Merging {base_path} + {lora_path} -> {save_path}")
437
+ base = AutoModelForCausalLM.from_pretrained(base_path, return_dict=True, torch_dtype=torch.float16, device_map="auto", trust_remote_code=self.config.get("trust_remote_code"))
438
+ tokenizer = AutoTokenizer.from_pretrained(base_path)
439
+
440
+ merged = PeftModel.from_pretrained(base, lora_path).merge_and_unload()
441
+ merged.save_pretrained(save_path)
442
+ tokenizer.save_pretrained(save_path)
443
+ return {"status": True, "message": "Merged."}
444
+ except Exception as e:
445
+ return {"status": False, "message": str(e)}
565
446
 
566
- active_binding = None
567
- try:
568
- ASCIIColors.cyan("\n--- Initializing HuggingFaceHubBinding ---")
569
- active_binding = HuggingFaceHubBinding(
570
- model_name_or_id=test_model_name,
571
- models_path=test_models_parent_path,
572
- config=binding_config
573
- )
574
- if not active_binding.model:
575
- raise RuntimeError(f"Model '{test_model_name}' failed to load.")
576
-
577
- ASCIIColors.green(f"Binding initialized. Model '{active_binding.model_name}' loaded on {active_binding.device}.")
578
- ASCIIColors.info(f"Model Info: {json.dumps(active_binding.get_model_info(), indent=2, default=str)}")
579
-
580
- # --- List Models (scans configured models_path) ---
581
- ASCIIColors.cyan("\n--- Listing Models (from models_path) ---")
582
- # To make this test useful, you could manually place a model folder in `test_hf_models_dir`
583
- # e.g., download "gpt2" and put it in `test_hf_models_dir/gpt2`
584
- # For now, it will likely be empty unless you do that.
585
- listed_models = active_binding.listModels()
586
- if listed_models:
587
- ASCIIColors.green(f"Found {len(listed_models)} potential model folders. First 5:")
588
- for m in listed_models[:5]: print(m)
589
- else: ASCIIColors.warning(f"No model folders found in '{test_models_parent_path}'. This is normal if it's empty.")
590
-
591
- # --- Tokenize/Detokenize ---
592
- ASCIIColors.cyan("\n--- Tokenize/Detokenize ---")
593
- sample_text = "Hello, Hugging Face world!"
594
- tokens = active_binding.tokenize(sample_text)
595
- ASCIIColors.green(f"Tokens for '{sample_text}': {tokens[:10]}...")
596
- token_count = active_binding.count_tokens(sample_text)
597
- ASCIIColors.green(f"Token count: {token_count}")
598
- if tokens:
599
- detokenized_text = active_binding.detokenize(tokens)
600
- ASCIIColors.green(f"Detokenized text: {detokenized_text}")
601
- else: ASCIIColors.warning("Tokenization returned empty list.")
602
-
603
- # --- Text Generation (Non-Streaming, Chat Format if supported) ---
604
- ASCIIColors.cyan("\n--- Text Generation (Non-Streaming) ---")
605
- prompt_text = "What is the capital of France?"
606
- # For Phi-2, system prompt might need specific formatting if not using apply_chat_template strictly
607
- # For models like Zephyr, system_prompt is part of chat template
608
- system_prompt_text = "You are a helpful AI assistant."
609
- generated_text = active_binding.generate_text(
610
- prompt_text, system_prompt=system_prompt_text, stream=False,
611
- n_predict=30 # Override default max_new_tokens for this call
612
- )
613
- if isinstance(generated_text, str): ASCIIColors.green(f"Generated text: {generated_text}")
614
- else: ASCIIColors.error(f"Generation failed: {generated_text}")
615
-
616
- # --- Text Generation (Streaming) ---
617
- ASCIIColors.cyan("\n--- Text Generation (Streaming) ---")
618
- full_streamed_text = ""
619
- def stream_callback(chunk: str, msg_type: int):
620
- global full_streamed_text
621
- ASCIIColors.green(f"{chunk}", end="", flush=True)
622
- full_streamed_text += chunk
623
- return True # Continue streaming
447
+ def ps(self) -> Dict[str, List[Dict]]:
448
+ """
449
+ Returns the process status of loaded models, including memory usage.
450
+ """
451
+ models_status = []
624
452
 
625
- result = active_binding.generate_text(
626
- "Tell me a short story about a brave robot.",
627
- stream=True,
628
- streaming_callback=stream_callback,
629
- n_predict=70
630
- )
631
- print("\n--- End of Stream ---")
632
- if isinstance(result, str): ASCIIColors.green(f"Full streamed text collected: {result}")
633
- else: ASCIIColors.error(f"Streaming generation failed: {result}")
634
-
635
- # --- Embeddings ---
636
- if active_binding.embedding_model:
637
- ASCIIColors.cyan("\n--- Embeddings ---")
638
- embedding_text = "This is a test sentence for Hugging Face embeddings."
639
- try:
640
- embedding_vector = active_binding.embed(embedding_text)
641
- ASCIIColors.green(f"Embedding for '{embedding_text}' (first 3 dims): {embedding_vector[:3]}...")
642
- ASCIIColors.info(f"Embedding vector dimension: {len(embedding_vector)}")
643
-
644
- # Test batch embedding
645
- batch_texts = ["First sentence.", "Second sentence, quite different."]
646
- batch_embeddings = active_binding.embed(batch_texts)
647
- ASCIIColors.green(f"Batch embeddings generated for {len(batch_texts)} texts.")
648
- ASCIIColors.info(f"First batch embedding (first 3 dims): {batch_embeddings[0][:3]}...")
649
-
650
- except Exception as e_emb: ASCIIColors.warning(f"Could not get embedding: {e_emb}")
651
- else: ASCIIColors.yellow("\n--- Embeddings Skipped (no embedding model loaded) ---")
652
-
653
- # --- LLaVA Vision Test (Conceptual - requires a LLaVA model and an image) ---
654
- # To test LLaVA properly:
655
- # 1. Set `test_model_name` to a LLaVA model, e.g., "llava-hf/llava-1.5-7b-hf" (very large!)
656
- # or a smaller one like "unum-cloud/uform-gen2-qwen-500m" (check its specific prompting style).
657
- # 2. Ensure `trust_remote_code=True` might be needed.
658
- # 3. Provide a real image path.
659
- if active_binding.supports_vision:
660
- ASCIIColors.cyan("\n--- LLaVA Vision Test ---")
661
- dummy_image_path = Path("test_dummy_image.png")
453
+ # Get global GPU info once
454
+ gpu_total_mem = 0
455
+ if torch.cuda.is_available():
662
456
  try:
663
- # Create a dummy image for testing
664
- img = Image.new('RGB', (200, 100), color = ('skyblue'))
665
- from PIL import ImageDraw
666
- d = ImageDraw.Draw(img)
667
- d.text((10,10), "Hello LLaVA from HF!", fill=('black'))
668
- img.save(dummy_image_path)
669
- ASCIIColors.info(f"Created dummy image: {dummy_image_path}")
670
-
671
- llava_prompt = "Describe this image." # LLaVA models often use "<image>\nUSER: <prompt>\nASSISTANT:"
672
- # or just the prompt if processor handles template.
673
- # For AutoProcessor, often just the text part of the prompt.
674
- llava_response = active_binding.generate_text(
675
- prompt=llava_prompt,
676
- images=[str(dummy_image_path)],
677
- n_predict=50,
678
- stream=False
679
- )
680
- if isinstance(llava_response, str): ASCIIColors.green(f"LLaVA response: {llava_response}")
681
- else: ASCIIColors.error(f"LLaVA generation failed: {llava_response}")
457
+ gpu_total_mem = torch.cuda.get_device_properties(0).total_memory
458
+ except:
459
+ gpu_total_mem = 0
682
460
 
683
- except ImportError: ASCIIColors.warning("Pillow's ImageDraw not found for dummy image text.")
684
- except Exception as e_llava: ASCIIColors.error(f"LLaVA test error: {e_llava}"); trace_exception(e_llava)
685
- finally:
686
- if dummy_image_path.exists(): dummy_image_path.unlink()
687
- else:
688
- ASCIIColors.yellow("\n--- LLaVA Vision Test Skipped (model does not support vision or not configured for it) ---")
689
-
690
- except ImportError as e_imp:
691
- ASCIIColors.error(f"Import error: {e_imp}. Check installations.")
692
- except RuntimeError as e_rt:
693
- ASCIIColors.error(f"Runtime error: {e_rt}")
694
- except Exception as e_main:
695
- ASCIIColors.error(f"An unexpected error occurred: {e_main}")
696
- trace_exception(e_main)
697
- finally:
698
- if active_binding:
699
- ASCIIColors.cyan("\n--- Unloading Model ---")
700
- active_binding.unload_model()
701
- ASCIIColors.green("Model unloaded.")
702
- if test_models_parent_path.exists() and not any(test_models_parent_path.iterdir()): # cleanup dummy dir if empty
703
- try: os.rmdir(test_models_parent_path)
704
- except: pass
705
-
706
-
707
- ASCIIColors.yellow("\nHuggingFaceHubBinding test finished.")
461
+ system_mem = psutil.virtual_memory()
462
+
463
+ for mid, container in self.loaded_models.items():
464
+ # 1. Calculate Model Size (Bytes)
465
+ try:
466
+ # Hugging Face models track their own footprint
467
+ size_bytes = container.model.get_memory_footprint()
468
+ except Exception:
469
+ size_bytes = 0
470
+
471
+ # 2. Split into VRAM/RAM based on device
472
+ size_vram = 0
473
+ size_ram = 0
474
+
475
+ if container.device == "cuda":
476
+ size_vram = size_bytes
477
+ else:
478
+ size_ram = size_bytes
479
+
480
+ # 3. Calculate Percentages
481
+ gpu_usage_percent = 0
482
+ if gpu_total_mem > 0:
483
+ gpu_usage_percent = (size_vram / gpu_total_mem) * 100
484
+
485
+ # For CPU, we compare against total system RAM
486
+ cpu_usage_percent = 0
487
+ if system_mem.total > 0:
488
+ cpu_usage_percent = (size_ram / system_mem.total) * 100
489
+
490
+ models_status.append({
491
+ "model_name": mid, # UI Standard: 'model_name'
492
+ "active": mid == self.active_model_id,
493
+ "size": size_bytes, # Total size in bytes
494
+ "size_vram": size_vram, # GPU memory usage in bytes
495
+ "size_ram": size_ram, # RAM usage in bytes
496
+ "device": container.device,
497
+ "gpu_usage_percent": round(gpu_usage_percent, 2),
498
+ "cpu_usage_percent": round(cpu_usage_percent, 2),
499
+ "loader": "HuggingFace"
500
+ })
501
+
502
+ # Return a dictionary matching the YAML output definition
503
+ return {"models": models_status}