lollms-client 1.5.6__py3-none-any.whl → 1.7.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lollms_client/__init__.py +1 -1
- lollms_client/llm_bindings/azure_openai/__init__.py +2 -2
- lollms_client/llm_bindings/claude/__init__.py +125 -34
- lollms_client/llm_bindings/gemini/__init__.py +261 -159
- lollms_client/llm_bindings/grok/__init__.py +52 -14
- lollms_client/llm_bindings/groq/__init__.py +2 -2
- lollms_client/llm_bindings/hugging_face_inference_api/__init__.py +2 -2
- lollms_client/llm_bindings/litellm/__init__.py +1 -1
- lollms_client/llm_bindings/llamacpp/__init__.py +18 -11
- lollms_client/llm_bindings/lollms/__init__.py +76 -21
- lollms_client/llm_bindings/lollms_webui/__init__.py +1 -1
- lollms_client/llm_bindings/mistral/__init__.py +2 -2
- lollms_client/llm_bindings/novita_ai/__init__.py +142 -6
- lollms_client/llm_bindings/ollama/__init__.py +307 -89
- lollms_client/llm_bindings/open_router/__init__.py +2 -2
- lollms_client/llm_bindings/openai/__init__.py +81 -20
- lollms_client/llm_bindings/openllm/__init__.py +362 -506
- lollms_client/llm_bindings/openwebui/__init__.py +333 -171
- lollms_client/llm_bindings/perplexity/__init__.py +2 -2
- lollms_client/llm_bindings/pythonllamacpp/__init__.py +3 -3
- lollms_client/llm_bindings/tensor_rt/__init__.py +1 -1
- lollms_client/llm_bindings/transformers/__init__.py +428 -632
- lollms_client/llm_bindings/vllm/__init__.py +1 -1
- lollms_client/lollms_agentic.py +4 -2
- lollms_client/lollms_base_binding.py +61 -0
- lollms_client/lollms_core.py +512 -1890
- lollms_client/lollms_discussion.py +25 -11
- lollms_client/lollms_llm_binding.py +112 -261
- lollms_client/lollms_mcp_binding.py +34 -75
- lollms_client/lollms_stt_binding.py +85 -52
- lollms_client/lollms_tti_binding.py +23 -37
- lollms_client/lollms_ttm_binding.py +24 -42
- lollms_client/lollms_tts_binding.py +28 -17
- lollms_client/lollms_ttv_binding.py +24 -42
- lollms_client/lollms_types.py +4 -2
- lollms_client/stt_bindings/whisper/__init__.py +108 -23
- lollms_client/stt_bindings/whispercpp/__init__.py +7 -1
- lollms_client/tti_bindings/diffusers/__init__.py +418 -810
- lollms_client/tti_bindings/diffusers/server/main.py +1051 -0
- lollms_client/tti_bindings/gemini/__init__.py +182 -239
- lollms_client/tti_bindings/leonardo_ai/__init__.py +6 -3
- lollms_client/tti_bindings/lollms/__init__.py +4 -1
- lollms_client/tti_bindings/novita_ai/__init__.py +5 -2
- lollms_client/tti_bindings/openai/__init__.py +10 -11
- lollms_client/tti_bindings/stability_ai/__init__.py +5 -3
- lollms_client/ttm_bindings/audiocraft/__init__.py +7 -12
- lollms_client/ttm_bindings/beatoven_ai/__init__.py +7 -3
- lollms_client/ttm_bindings/lollms/__init__.py +4 -17
- lollms_client/ttm_bindings/replicate/__init__.py +7 -4
- lollms_client/ttm_bindings/stability_ai/__init__.py +7 -4
- lollms_client/ttm_bindings/topmediai/__init__.py +6 -3
- lollms_client/tts_bindings/bark/__init__.py +7 -10
- lollms_client/tts_bindings/lollms/__init__.py +6 -1
- lollms_client/tts_bindings/piper_tts/__init__.py +8 -11
- lollms_client/tts_bindings/xtts/__init__.py +157 -74
- lollms_client/tts_bindings/xtts/server/main.py +241 -280
- {lollms_client-1.5.6.dist-info → lollms_client-1.7.10.dist-info}/METADATA +113 -5
- lollms_client-1.7.10.dist-info/RECORD +89 -0
- lollms_client-1.5.6.dist-info/RECORD +0 -87
- {lollms_client-1.5.6.dist-info → lollms_client-1.7.10.dist-info}/WHEEL +0 -0
- {lollms_client-1.5.6.dist-info → lollms_client-1.7.10.dist-info}/licenses/LICENSE +0 -0
- {lollms_client-1.5.6.dist-info → lollms_client-1.7.10.dist-info}/top_level.txt +0 -0
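
The largest hunk below rewrites a Hugging Face Transformers binding: the old single-model implementation is replaced by a `HuggingFace` class that keeps several `ModelContainer` instances in memory, resolves model names against a local storage folder before the Hub, and adds `pull_model`, QLoRA `train`, `merge_lora`, and a `ps()` memory report. The sketch below shows how that new class might be driven; it is a hypothetical example based only on the signatures visible in the diff (the import path, model name, and config values are assumptions, not part of the package's documented API):

```python
# Hypothetical usage sketch of the new HuggingFace binding shown in the diff below.
# Import HuggingFace from whichever lollms_client.llm_bindings module ships it in your install.

binding = HuggingFace(
    model_name="microsoft/phi-2",  # looked up under local_models_path first, otherwise pulled from the Hub
    config={"device": "auto", "quantize": False, "max_new_tokens": 512},
)

def on_chunk(chunk: str, msg_type) -> bool:
    """Streaming callback: print chunks as they arrive; returning False stops the stream."""
    print(chunk, end="", flush=True)
    return True

answer = binding.generate_text(
    "Explain what a LoRA adapter is.",
    system_prompt="You are a concise assistant.",
    stream=True,
    streaming_callback=on_chunk,
)
print(binding.ps())  # {"models": [...]} with per-model RAM/VRAM usage
```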
@@ -1,707 +1,503 @@
-# bindings/
+# bindings/huggingface/__init__.py
 import json
 import os
-import pprint
-import re
-import socket # Not used directly for server, but good to keep for consistency if needed elsewhere
-import subprocess # Not used for server
-import sys
 import threading
 import time
+import shutil
 from pathlib import Path
-from typing import Optional, Callable, List, Union, Dict, Any
-import base64 # For potential image data handling, though PIL.Image is primary
-import requests # Not used for server, but for consistency
+from typing import Optional, Callable, List, Union, Dict, Any

+import psutil
 from lollms_client.lollms_llm_binding import LollmsLLMBinding
-from lollms_client.lollms_types import MSG_TYPE
+from lollms_client.lollms_types import MSG_TYPE
+from lollms_client.lollms_discussion import LollmsDiscussion

 from ascii_colors import ASCIIColors, trace_exception
 import pipmaster as pm

-# ---
+# --- Dependencies ---
 pm.ensure_packages([
-    "torch",
-    "
-    "
-    "bitsandbytes", # For 4-bit/8-bit quantization (works best on CUDA)
-    "sentence_transformers", # For robust embedding generation
-    "pillow" # For image handling (vision models)
+    "torch", "transformers", "accelerate", "bitsandbytes",
+    "sentence_transformers", "pillow", "scipy", "huggingface_hub",
+    "psutil", "peft", "trl", "datasets"
 ])

 try:
     import torch
     from transformers import (
         AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer,
-        BitsAndBytesConfig, AutoConfig,
-
-
+        BitsAndBytesConfig, AutoConfig, AutoProcessor,
+        LlavaForConditionalGeneration, LlavaNextForConditionalGeneration,
+        TrainingArguments
     )
+    from peft import PeftModel, LoraConfig, get_peft_model, prepare_model_for_kbit_training
+    from trl import SFTTrainer
+    from datasets import load_dataset
     from sentence_transformers import SentenceTransformer
+    from huggingface_hub import snapshot_download
     from PIL import Image
 except ImportError as e:
     ASCIIColors.error(f"Failed to import core libraries: {e}")
-    ASCIIColors.error("Please ensure torch, transformers, accelerate, bitsandbytes, sentence_transformers, and pillow are installed.")
-    trace_exception(e)
-    # Set them to None so the binding can report failure cleanly if __init__ is still called.
     torch = None
     transformers = None
-    sentence_transformers = None
-    Image = None

-
-
-
-
-
+# --- Container ---
+class ModelContainer:
+    def __init__(self, model_id, model, tokenizer, processor=None, device="cpu", quant=None):
+        self.model_id = model_id
+        self.model = model
         self.tokenizer = tokenizer
-        self.
-
-
-
-
-            self.stop_sequences_token_ids.append(torch.tensor(token_ids))
-
-    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
-        for stop_seq_ids in self.stop_sequences_token_ids:
-            if input_ids.shape[1] >= stop_seq_ids.shape[0]:
-                # Check if the end of input_ids matches the stop sequence
-                if torch.equal(input_ids[0, -stop_seq_ids.shape[0]:], stop_seq_ids.to(input_ids.device)):
-                    return True
-        return False
+        self.processor = processor
+        self.device = device
+        self.quantization = quant
+        self.last_used = time.time()
+        self.supports_vision = processor is not None

+    def update_usage(self):
+        self.last_used = time.time()

-BindingName = "
+BindingName = "HuggingFace"

-class 
+class HuggingFace(LollmsLLMBinding):
     DEFAULT_CONFIG_ARGS = {
-        "device": "auto",
-        "quantize": False,
-        "torch_dtype": "auto",
-        "max_new_tokens": 
+        "device": "auto",
+        "quantize": False,
+        "torch_dtype": "auto",
+        "max_new_tokens": 4096,
         "temperature": 0.7,
-        "
-        "
-        "
-        "
-        "
-        "embedding_model_name": "sentence-transformers/all-MiniLM-L6-v2", # Default for embed()
-        "generation_timeout": 300, # Timeout for non-streaming generation
-        "stop_words": [], # List of strings to stop generation
+        "trust_remote_code": False,
+        "use_flash_attention_2": False,
+        "embedding_model_name": "sentence-transformers/all-MiniLM-L6-v2",
+        "max_active_models": 1,
+        "local_models_path": "" # If empty, dynamic default is used
     }

-    def __init__(self,
-                 **kwargs # Overrides for config_args
-                 ):
-        """
-        Initializes the Hugging Face Hub binding.
-        Args:
-            model_name (str): Hugging Face Hub model ID or local folder name.
-            models_path (str or Path): Path to the directory containing local models.
-            config (Optional[Dict[str, Any]]): Optional configuration dictionary to override defaults.
-            default_completion_format (ELF_COMPLETION_FORMAT): Default format for text generation.
-        """
+    def __init__(self, **kwargs):
         super().__init__(BindingName, **kwargs)
+
+        if torch is None or transformers is None:
+            raise ImportError("Core libraries not available.")

-
-
-
-
-
-
-
-
-
-
-        self.default_completion_format = default_completion_format
+        self.config = {**self.DEFAULT_CONFIG_ARGS, **kwargs.get("config", {}), **kwargs}
+
+        # --- 1. Setup Local Models Path ---
+        # Priority: Config Override -> Lollms Personal Path -> Default relative path
+        if self.config["local_models_path"]:
+            self.local_models_path = Path(self.config["local_models_path"])
+        elif kwargs.get("lollms_paths"):
+            self.local_models_path = Path(kwargs["lollms_paths"].personal_models_path) / "huggingface"
+        else:
+            self.local_models_path = Path("models/huggingface")

-        self.
-
-
-
-        self.
-        self.
-        self.
-        self.
-        self.supports_vision: bool = False
-
-        # Attempt to load the model during initialization
-        if not self.load_model(model_name_or_id):
-            # load_model will print errors. Here we can raise if critical.
-            ASCIIColors.error(f"Initial model load failed for {model_name_or_id}. Binding may not be functional.")
-            # Depending on Lollms behavior, this might be acceptable if user can select another model later.
-
-    def _resolve_model_path_or_id(self, model_name_or_id: str) -> str:
-        # 1. Check if it's an absolute path to a model directory
-        abs_path = Path(model_name_or_id)
-        if abs_path.is_absolute() and abs_path.is_dir() and (abs_path / "config.json").exists():
-            ASCIIColors.info(f"Using absolute model path: {abs_path}")
-            return str(abs_path)
-
-        # 2. Check if it's a name relative to self.models_path
-        local_model_path = self.models_path / model_name_or_id
-        if local_model_path.is_dir() and (local_model_path / "config.json").exists():
-            ASCIIColors.info(f"Found local model in models_path: {local_model_path}")
-            return str(local_model_path)
+        self.local_models_path.mkdir(parents=True, exist_ok=True)
+        ASCIIColors.info(f"HuggingFace Local Storage: {self.local_models_path}")
+
+        # State
+        self.loaded_models: Dict[str, ModelContainer] = {}
+        self.active_model_id: Optional[str] = None
+        self.inference_lock = threading.Lock()
+        self.is_training = False

-        #
-
-
+        # Load Embeddings
+        self.embedding_model = None
+        self.load_embedding_model()
+
+        # Initial Load
+        model_name = kwargs.get("model_name")
+        if model_name:
+            self.load_model(model_name)
+
+    def load_embedding_model(self):
+        name = self.config.get("embedding_model_name")
+        if name:
+            try:
+                ASCIIColors.info(f"Loading embedding model: {name}")
+                device = "cuda" if torch.cuda.is_available() else "cpu"
+                self.embedding_model = SentenceTransformer(name, device=device)
+            except Exception as e:
+                ASCIIColors.warning(f"Failed to load embedding model: {e}")
+
+    def _manage_memory(self):
+        max_models = int(self.config.get("max_active_models", 1))
+        while len(self.loaded_models) >= max_models:
+            lru_id = min(self.loaded_models, key=lambda k: self.loaded_models[k].last_used)
+            # Avoid unloading the active one if possible, unless it's the only one and we need a swap
+            if lru_id == self.active_model_id and len(self.loaded_models) == 1:
+                pass
+            ASCIIColors.info(f"Unloading {lru_id} to free memory.")
+            self.unload_model_by_id(lru_id)

     def load_model(self, model_name_or_id: str) -> bool:
-
-
-
-
-
-
-        #
-
-
-
-
-
-
-
-
-        #
-
-
-        if 
-
-
-
-
-
-
-
-
-
-        load_in_4bit = False
-        bnb_config = None
+        """
+        Loads a model. Priorities:
+        1. Local folder (self.local_models_path / model_name_or_id)
+        2. Hugging Face Hub (download/cache automatically)
+        """
+        # --- Resolve Path ---
+        # Clean naming for folder lookup
+        folder_name = model_name_or_id.replace("/", "_") # Sanitize potential subdirs if user types "meta-llama/Llama-2"
+
+        # Check standard path mapping
+        possible_paths = [
+            self.local_models_path / model_name_or_id, # Exact match (subfolders)
+            self.local_models_path / folder_name, # Flattened match
+            Path(model_name_or_id) # Absolute path provided by user
+        ]
+
+        model_path_to_use = model_name_or_id # Default to ID for HF Hub
+
+        for p in possible_paths:
+            if p.exists() and p.is_dir() and (p / "config.json").exists():
+                ASCIIColors.info(f"Found local model at: {p}")
+                model_path_to_use = str(p)
+                break
+
+        # Check if already loaded
+        if model_name_or_id in self.loaded_models:
+            self.active_model_id = model_name_or_id
+            self.loaded_models[model_name_or_id].update_usage()
+            return True

-
-
-
-
-        elif quantize_mode == "4bit":
-            load_in_4bit = True
-            bnb_config = BitsAndBytesConfig(
-                load_in_4bit=True,
-                bnb_4bit_quant_type="nf4",
-                bnb_4bit_use_double_quant=True,
-                bnb_4bit_compute_dtype=self.torch_dtype # e.g., torch.bfloat16 for computation
-            )
-            ASCIIColors.info("Quantizing model to 4-bit.")
-        elif quantize_mode and self.device != "cuda":
-            ASCIIColors.warning(f"Quantization ('{quantize_mode}') is selected but device is '{self.device}'. bitsandbytes works best on CUDA. Proceeding without quantization.")
-            quantize_mode = False
+        self._manage_memory()
+        if self.is_training:
+            ASCIIColors.error("Training in progress. Cannot load new model.")
+            return False

+        ASCIIColors.info(f"Loading {model_name_or_id} (Path/ID: {model_path_to_use})...")
+
+        # --- Config & Device ---
+        device = "cuda" if torch.cuda.is_available() and self.config["device"]=="auto" else "cpu"
+        if self.config["device"] != "auto": device = self.config["device"]
+
+        dtype_map = {"auto": torch.float16 if device=="cuda" else torch.float32,
+                     "float16": torch.float16, "bfloat16": torch.bfloat16, "float32": torch.float32}
+        dtype = dtype_map.get(self.config["torch_dtype"], torch.float32)
+
+        quant_mode = self.config.get("quantize", False)
+        load_in_4bit = str(quant_mode) == "4bit"
+        load_in_8bit = str(quant_mode) == "8bit"
+
+        bnb_config = None
+        if device == "cuda" and load_in_4bit:
+            bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4",
+                                            bnb_4bit_use_double_quant=True, bnb_4bit_compute_dtype=dtype)

-
-        model_load_args = {
+        model_args = {
             "trust_remote_code": self.config.get("trust_remote_code", False),
-
-            "
+            "torch_dtype": dtype if not (load_in_4bit or load_in_8bit) else None,
+            "device_map": "auto" if device == "cuda" else None
         }
-        if self.config.get("use_flash_attention_2", False) and self.device == "cuda":
-            if hasattr(transformers, " আসছেAttention"): # Check for Flash Attention support in transformers version
-                model_load_args["attn_implementation"] = "flash_attention_2"
-                ASCIIColors.info("Attempting to use Flash Attention 2.")
-            else:
-                ASCIIColors.warning("Flash Attention 2 requested but not found in this transformers version. Using default.")

+        if self.config.get("use_flash_attention_2") and device == "cuda":
+            try:
+                import flash_attn
+                model_args["attn_implementation"] = "flash_attention_2"
+            except ImportError:
+                ASCIIColors.warning("Flash Attention 2 enabled but not installed.")
+
+        if load_in_4bit: model_args["quantization_config"] = bnb_config
+        if load_in_8bit: model_args["load_in_8bit"] = True

-        if load_in_8bit: model_load_args["load_in_8bit"] = True
-        if load_in_4bit: model_load_args["quantization_config"] = bnb_config
-
-        # device_map="auto" for multi-GPU or when quantizing on CUDA
-        if self.device == "cuda" and (load_in_8bit or load_in_4bit or torch.cuda.device_count() > 1):
-            model_load_args["device_map"] = "auto"
-            ASCIIColors.info("Using device_map='auto'.")
-
         try:
-
-
-
-                trust_remote_code=model_load_args["trust_remote_code"]
-            )
-            if self.tokenizer.pad_token is None:
-                self.tokenizer.pad_token = self.tokenizer.eos_token
-                ASCIIColors.info("Tokenizer `pad_token` was None, set to `eos_token`.")
-
-            # --- Determine if it's a LLaVA-like vision model ---
-            model_config_hf = AutoConfig.from_pretrained(self.model_identifier, trust_remote_code=model_load_args["trust_remote_code"])
-            self.supports_vision = "llava" in model_config_hf.model_type.lower() or \
-                                   any("Llava" in arch for arch in getattr(model_config_hf, "architectures", [])) or \
-                                   "vision_tower" in model_config_hf.to_dict() # Common LLaVA config key
-
-            if self.supports_vision:
-                ASCIIColors.info(f"Detected LLaVA-like vision model: '{self.model_identifier}'.")
-                self.processor = AutoProcessor.from_pretrained(
-                    self.model_identifier,
-                    trust_remote_code=model_load_args["trust_remote_code"]
-                )
-                # Choose appropriate LLaVA model class
-                if "llava-next" in self.model_identifier.lower() or any("LlavaNext" in arch for arch in getattr(model_config_hf, "architectures", [])):
-                    ModelClass = LlavaNextForConditionalGeneration
-                elif "llava" in self.model_identifier.lower() or any("LlavaForConditionalGeneration" in arch for arch in getattr(model_config_hf, "architectures", [])):
-                    ModelClass = LlavaForConditionalGeneration
-                else: # Fallback if specific Llava class not matched by name
-                    ASCIIColors.warning("Could not determine specific LLaVA class, using AutoModelForCausalLM. Vision capabilities might be limited.")
-                    ModelClass = AutoModelForCausalLM # This might not fully work for all LLaVAs
-
-                self.model = ModelClass.from_pretrained(self.model_identifier, **model_load_args)
-            else:
-                ASCIIColors.info(f"Loading text model '{self.model_identifier}'...")
-                self.model = AutoModelForCausalLM.from_pretrained(self.model_identifier, **model_load_args)
+            # Tokenizer
+            tokenizer = AutoTokenizer.from_pretrained(model_path_to_use, trust_remote_code=model_args["trust_remote_code"])
+            if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token

-            #
-
-
+            # Architecture Detection
+            config = AutoConfig.from_pretrained(model_path_to_use, trust_remote_code=model_args["trust_remote_code"])
+            processor = None

-
-
-
-
-
-            try:
-                ASCIIColors.info(f"Loading embedding model: {emb_model_name} on device: {self.device}")
-                self.embedding_model = SentenceTransformer(emb_model_name, device=self.device)
-            except Exception as e_emb:
-                ASCIIColors.warning(f"Failed to load embedding model '{emb_model_name}': {e_emb}. Embeddings will not be available.")
-                self.embedding_model = None
+            # LLaVA Check
+            if "llava" in config.model_type.lower() or "Llava" in str(getattr(config, "architectures", [])):
+                processor = AutoProcessor.from_pretrained(model_path_to_use, trust_remote_code=model_args["trust_remote_code"])
+                ModelClass = LlavaNextForConditionalGeneration if "next" in config.model_type.lower() else LlavaForConditionalGeneration
+                model = ModelClass.from_pretrained(model_path_to_use, **model_args)
             else:
-
-                self.embedding_model = None
+                model = AutoModelForCausalLM.from_pretrained(model_path_to_use, **model_args)

-
+            # Fallback for device placement
+            if not model_args.get("device_map") and device != "cpu" and not (load_in_4bit or load_in_8bit):
+                model.to(device)
+
+            model.eval()
+
+            container = ModelContainer(model_name_or_id, model, tokenizer, processor, device, quant_mode)
+            self.loaded_models[model_name_or_id] = container
+            self.active_model_id = model_name_or_id
+            ASCIIColors.success(f"Loaded {model_name_or_id}")
             return True
-
+
         except Exception as e:
-            ASCIIColors.error(f"
+            ASCIIColors.error(f"Load failed: {e}")
             trace_exception(e)
-            self.unload_model() # Ensure partial loads are cleaned up
             return False

-    def 
-        if self.
-            del self.
-
-
-
-
-
-
-
-
-            self.embedding_model = None
+    def unload_model_by_id(self, model_id: str):
+        if model_id in self.loaded_models:
+            del self.loaded_models[model_id]
+            if torch.cuda.is_available(): torch.cuda.empty_cache()
+            import gc; gc.collect()
+
+    def get_container(self):
+        return self.loaded_models.get(self.active_model_id)
+
+    # --- Generation ---
+    def generate_text(self, prompt, images=None, system_prompt="", stream=False, streaming_callback=None, split=False, n_predict=None, **kwargs):
+        if self.is_training: return {"status": False, "error": "Training in progress."}

-
-
+        container = self.get_container()
+        if not container: return {"status": False, "error": "No model loaded."}

-
-
-        self.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        torch.manual_seed(seed)
-        if self.device == "cuda": torch.cuda.manual_seed_all(seed)
-
-        _use_chat_format = use_chat_format_override if use_chat_format_override is not None \
-                           else (self.default_completion_format == ELF_COMPLETION_FORMAT.Chat)
-
-        # --- Prepare Inputs ---
-        inputs_dict = {}
-        processed_images = []
-        if self.supports_vision and self.processor and images:
+        container.update_usage()
+
+        with self.inference_lock:
+            inputs = {}
+            # Vision
+            if container.supports_vision and images:
+                pil_images = [Image.open(p).convert("RGB") for p in images]
+                inputs = container.processor(text=prompt, images=pil_images, return_tensors="pt").to(container.model.device)
+            # Text / Chat
+            else:
+                if hasattr(container.tokenizer, 'apply_chat_template') and not split:
+                    messages = []
+                    if system_prompt: messages.append({"role": "system", "content": system_prompt})
+                    messages.append({"role": "user", "content": prompt})
+                    try:
+                        text = container.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+                        inputs = container.tokenizer(text, return_tensors="pt").to(container.model.device)
+                    except:
+                        inputs = container.tokenizer(prompt, return_tensors="pt").to(container.model.device)
+                else:
+                    inputs = container.tokenizer(prompt, return_tensors="pt").to(container.model.device)
+
+            gen_kwargs = {
+                "max_new_tokens": n_predict or self.config.get("max_new_tokens"),
+                "temperature": kwargs.get("temperature", self.config.get("temperature")),
+                "do_sample": kwargs.get("temperature", 0.7) > 0,
+                "pad_token_id": container.tokenizer.eos_token_id
+            }
+
             try:
-
-
-
-
-
-
-
-
-
-
-
-
+                if stream and streaming_callback:
+                    streamer = TextIteratorStreamer(container.tokenizer, skip_prompt=True, skip_special_tokens=True)
+                    gen_kwargs["streamer"] = streamer
+                    t = threading.Thread(target=container.model.generate, kwargs={**inputs, **gen_kwargs})
+                    t.start()
+
+                    full = ""
+                    for chunk in streamer:
+                        full += chunk
+                        if not streaming_callback(chunk, MSG_TYPE.MSG_TYPE_CHUNK): break
+                    t.join()
+                    return full
+                else:
+                    outputs = container.model.generate(**inputs, **gen_kwargs)
+                    text = container.tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
+                    return text
+            except Exception as e:
+                trace_exception(e)
+                return {"status": False, "error": str(e)}
+
+    # --- Commands ---
+
+    def list_models(self) -> List[Dict[str, str]]:
+        """Scans the designated local_models_path."""
+        models = []
+        if self.local_models_path.exists():
+            for item in self.local_models_path.iterdir():
+                if item.is_dir():
+                    # Simple heuristic to check if it's a valid HF model folder
+                    if (item / "config.json").exists() or (item / "adapter_config.json").exists():
+                        try:
+                            size_gb = sum(f.stat().st_size for f in item.rglob('*') if f.is_file()) / (1024**3)
+                        except: size_gb = 0
+
+                        models.append({
+                            "model_name": item.name,
+                            "path": str(item),
+                            "size": f"{size_gb:.2f} GB",
+                            "source": "Local Storage"
+                        })
+        return models
+
+    def pull_model(self, model_name: str) -> dict:
+        """Downloads model files directly to self.local_models_path."""
+        try:
+            ASCIIColors.info(f"Downloading {model_name} to {self.local_models_path}...")

-            #
-            #
-            #
-
+            # We preserve the folder structure simply using the last part of the repo name
+            # e.g. 'meta-llama/Llama-2-7b' -> 'Llama-2-7b' folder in local path.
+            # OR use the full 'meta-llama_Llama-2-7b' to avoid name collisions.
+            folder_name = model_name.replace("/", "_")
+            target_dir = self.local_models_path / folder_name

-
-
-
-
-
-
-            _use_chat_format = False # Fallback
-
-            if not _use_chat_format or not inputs_dict: # Raw prompt or fallback
-                full_prompt_text = ""
-                if system_prompt: full_prompt_text += system_prompt + "\n\n"
-                full_prompt_text += prompt
-                inputs_dict = self.tokenizer(full_prompt_text, return_tensors="pt").to(self.model.device)
-                ASCIIColors.debug("Using raw prompt format.")
-
-            input_ids = inputs_dict.get("input_ids")
-            if input_ids is None: return {"status": False, "error": "Failed to tokenize prompt."}
-
-            current_input_length = input_ids.shape[1]
-
-            # --- Generation Parameters ---
-            gen_conf = GenerationConfig.from_model_config(self.model.config) # Start with model's default
-
-            gen_conf.max_new_tokens = n_predict if n_predict is not None else self.config.get("max_new_tokens")
-            gen_conf.temperature = temperature if temperature is not None else self.config.get("temperature")
-            gen_conf.top_k = top_k if top_k is not None else self.config.get("top_k")
-            gen_conf.top_p = top_p if top_p is not None else self.config.get("top_p")
-            gen_conf.repetition_penalty = repeat_penalty if repeat_penalty is not None else self.config.get("repetition_penalty")
-            gen_conf.pad_token_id = self.tokenizer.eos_token_id # Crucial for stopping
-            gen_conf.eos_token_id = self.tokenizer.eos_token_id
-
-            # Apply any other valid GenerationConfig parameters from generation_kwargs
-            for key, value in generation_kwargs.items():
-                if hasattr(gen_conf, key): setattr(gen_conf, key, value)
-
-            # --- Stopping Criteria ---
-            stopping_criteria_list = StoppingCriteriaList()
-            effective_stop_words = stop_words if stop_words is not None else self.config.get("stop_words", [])
-            if effective_stop_words:
-                stopping_criteria_list.append(StopOnWords(self.tokenizer, effective_stop_words))
-
-            # --- Generation ---
-            try:
-                if stream and streaming_callback:
-                    streamer = TextIteratorStreamer(self.tokenizer, skip_prompt=True, skip_special_tokens=True)
-                    generation_thread_kwargs = {
-                        **inputs_dict, # input_ids, attention_mask, pixel_values (if vision)
-                        "generation_config": gen_conf,
-                        "streamer": streamer,
-                        "stopping_criteria": stopping_criteria_list if effective_stop_words else None
-                    }
-
-                    thread = threading.Thread(target=self.model.generate, kwargs=generation_thread_kwargs)
-                    thread.start()
-
-                    full_response_text = ""
-                    for new_text_chunk in streamer:
-                        if streaming_callback(new_text_chunk, MSG_TYPE.MSG_TYPE_CHUNK):
-                            full_response_text += new_text_chunk
-                        else: # Callback requested stop
-                            ASCIIColors.info("Streaming callback requested stop.")
-                            # Note: stopping the model.generate thread externally is complex.
-                            # The thread will complete its current generation.
-                            break
-                    thread.join(timeout=self.config.get("generation_timeout", 300))
-                    if thread.is_alive():
-                        ASCIIColors.warning("Generation thread did not finish in time after streaming.")
-                    return full_response_text
-                else: # Non-streaming
-                    outputs = self.model.generate(
-                        **inputs_dict,
-                        generation_config=gen_conf,
-                        stopping_criteria=stopping_criteria_list if effective_stop_words else None
-                    )
-                    # outputs contains the full sequence (prompt + new tokens)
-                    generated_tokens = outputs[0][current_input_length:]
-                    generated_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
-                    return generated_text.strip()
-
+            # local_dir ensures actual files are downloaded, not just cache pointers
+            path = snapshot_download(repo_id=model_name, local_dir=target_dir, local_dir_use_symlinks=False)
+
+            msg = f"Model downloaded successfully to {path}"
+            ASCIIColors.success(msg)
+            return {"status": True, "message": msg, "path": str(path)}
         except Exception as e:
-
-            trace_exception(e)
-            return {"status": False, "error": str(e)}
-
-    def tokenize(self, text: str) -> List[int]:
-        if self.tokenizer is None: raise RuntimeError("Tokenizer not loaded.")
-        return self.tokenizer.encode(text)
+            return {"status": False, "message": str(e)}

-    def 
-        if self.
-
+    def train(self, base_model_name: str, dataset_path: str, new_model_name: str, num_epochs=1, batch_size=1, learning_rate=2e-4) -> dict:
+        if self.is_training: return {"status": False, "message": "Busy."}
+
+        # Output to local path
+        output_dir = self.local_models_path / new_model_name
+        if output_dir.exists(): return {"status": False, "message": "Model exists."}
+
+        # Resolve base model path (is it local or remote?)
+        # Reuse logic from load_model's resolution if strictly needed, or let HF handle it.
+        # But for QLoRA, we usually want the base model weights.
+        # We pass 'base_model_name' directly; if it matches a local folder in `load_model`,
+        # the user should probably pass that full path or we resolve it here.
+        # Let's resolve it against local path:
+        possible_local = self.local_models_path / base_model_name
+        if possible_local.exists():
+            base_model_path = str(possible_local)
+        else:
+            base_model_path = base_model_name

-
-
-        return 
+        t = threading.Thread(target=self._run_training_job, args=(base_model_path, dataset_path, str(output_dir), num_epochs, batch_size, learning_rate))
+        t.start()
+        return {"status": True, "message": f"Training started. Output: {output_dir}"}

-    def 
-
-
+    def _run_training_job(self, base_model, dataset_path, output_dir, epochs, batch_size, lr):
+        self.is_training = True
+        self.inference_lock.acquire()
         try:
-
-
-
-
-
-
+            ASCIIColors.info(f"Training Base: {base_model}")
+
+            # Dataset
+            ext = "json" if dataset_path.endswith("json") else "text"
+            dataset = load_dataset(ext, data_files=dataset_path, split="train")
+
+            # QLoRA Setup
+            bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16)
+
+            model = AutoModelForCausalLM.from_pretrained(
+                base_model, quantization_config=bnb_config, device_map="auto",
+                trust_remote_code=self.config.get("trust_remote_code", False)
+            )
+            model.config.use_cache = False
+            model = prepare_model_for_kbit_training(model)
+
+            tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
+            if not tokenizer.pad_token: tokenizer.pad_token = tokenizer.eos_token
+            tokenizer.padding_side = "right"
+
+            peft_config = LoraConfig(r=64, lora_alpha=16, target_modules=["q_proj", "v_proj"], task_type="CAUSAL_LM", bias="none", lora_dropout=0.1)
+            model = get_peft_model(model, peft_config)
+
+            # Formatting
+            def format_prompts(examples):
+                texts = []
+                for i in range(len(examples.get("instruction", []))):
+                    ins = examples["instruction"][i]
+                    inp = examples.get("input", [""])[i]
+                    out = examples.get("output", [""])[i]
+                    if inp: text = f"### Instruction:\n{ins}\n\n### Input:\n{inp}\n\n### Response:\n{out}<|endoftext|>"
+                    else: text = f"### Instruction:\n{ins}\n\n### Response:\n{out}<|endoftext|>"
+                    texts.append(text)
+                return texts if texts else examples.get("text", [])
+
+            trainer = SFTTrainer(
+                model=model, train_dataset=dataset, peft_config=peft_config,
+                formatting_func=format_prompts, tokenizer=tokenizer,
+                args=TrainingArguments(
+                    output_dir=output_dir, num_train_epochs=epochs,
+                    per_device_train_batch_size=batch_size, gradient_accumulation_steps=4,
+                    learning_rate=lr, fp16=True, logging_steps=10, save_strategy="epoch", optim="paged_adamw_32bit"
+                )
+            )
+            trainer.train()
+            trainer.save_model(output_dir)
+            ASCIIColors.success("Training Finished.")
         except Exception as e:
-            ASCIIColors.error(f"
+            ASCIIColors.error(f"Training error: {e}")
             trace_exception(e)
-
-
-
-
-
-
-
-
-            "config": self.config, # Binding's own config
-            "device": self.device,
-            "torch_dtype": str(self.torch_dtype),
-            "supports_vision": self.supports_vision,
-            "embedding_model_name": self.config.get("embedding_model_name") if self.embedding_model else None,
-        }
-        if self.model and hasattr(self.model, 'config'):
-            model_hf_config = self.model.config.to_dict()
-            info["model_hf_config"] = {k: str(v)[:200] for k,v in model_hf_config.items()} # Truncate long values
-            info["max_model_len"] = getattr(self.model.config, "max_position_embeddings", "N/A")
+        finally:
+            self.inference_lock.release()
+            self.is_training = False
+
+    def merge_lora(self, base_model_name, lora_model_name, new_model_name):
+        # Resolve Base
+        possible_base = self.local_models_path / base_model_name
+        base_path = str(possible_base) if possible_base.exists() else base_model_name

-
-
-
-
-
-        models_found = []
-        unique_model_names = set()
-
-        if self.models_path.exists() and self.models_path.is_dir():
-            for item in self.models_path.iterdir():
-                if item.is_dir(): # HF models are directories
-                    # Basic check for a config file to qualify as a model dir
-                    if (item / "config.json").exists():
-                        model_name = item.name
-                        if model_name not in unique_model_names:
-                            try:
-                                # Calculating size can be slow for large model repos
-                                # total_size = sum(f.stat().st_size for f in item.rglob('*') if f.is_file())
-                                # size_gb_str = f"{total_size / (1024**3):.2f} GB"
-                                size_gb_str = "N/A (size calculation disabled for speed)"
-                            except Exception:
-                                size_gb_str = "N/A"
-
-                            models_found.append({
-                                "model_name": model_name, # This is the folder name
-                                "path_hint": str(item.relative_to(self.models_path.parent) if item.is_relative_to(self.models_path.parent) else item),
-                                "size_gb": size_gb_str
-                            })
-                            unique_model_names.add(model_name)
+        # Resolve LoRA (Usually local if trained here)
+        possible_lora = self.local_models_path / lora_model_name
+        lora_path = str(possible_lora) if possible_lora.exists() else lora_model_name
+
+        save_path = self.local_models_path / new_model_name

-
-
-
-
-
-
-
-
-
-
-
-    # --- Configuration ---
-    # For testing, you might need to download a model first or use a small Hub ID.
-    # Option 1: Use a small model from Hugging Face Hub
-    # test_model_name = "gpt2" # Small, good for quick tests
-    test_model_name = "microsoft/phi-2" # Small, good quality, requires trust_remote_code=True
-    # test_model_name = "HuggingFaceH4/zephyr-7b-beta" # Larger, powerful
-
-    # Option 2: Path to a local model folder (if you have one)
-    # Replace 'path/to/your/models' with the PARENT directory of your HF model folders.
-    # And 'your-local-model-folder' with the actual folder name.
-    # Example:
-    # test_models_parent_path = Path.home() / "lollms_models" # Example path
-    # test_model_name = "phi-2" # if "phi-2" folder is inside test_models_parent_path
-
-    # For local testing, models_path should be where your HF model *folders* are.
-    # If using a Hub ID like "gpt2", models_path is less critical unless you expect
-    # the binding to *only* look there (which it doesn't, it prioritizes Hub IDs).
-    # Let's use a dummy path for models_path for Hub ID testing.
-
-    # Adjust current_directory for local model testing if needed
-    # For this test, we'll assume a Hub ID. `models_path` is where `listModels` would scan.
-    test_models_parent_path = Path("./test_hf_models_dir") # Create a dummy for listModels scan
-    test_models_parent_path.mkdir(exist_ok=True)
-
-    binding_config = {
-        "device": "auto", # "cuda", "mps", "cpu"
-        "quantize": False, # False, "4bit", "8bit" (requires CUDA & bitsandbytes for 4/8 bit)
-        "torch_dtype": "auto", # "float16" or "bfloat16" on CUDA for speed
-        "max_new_tokens": 100, # Limit generation length for tests
-        "trust_remote_code": True, # Needed for models like Phi-2
-        "stop_words": ["\nHuman:", "\nUSER:"], # Example stop words
-        # "embedding_model_name": "sentence-transformers/paraphrase-MiniLM-L3-v2" # Smaller embedding model
-    }
+        try:
+            ASCIIColors.info(f"Merging {base_path} + {lora_path} -> {save_path}")
+            base = AutoModelForCausalLM.from_pretrained(base_path, return_dict=True, torch_dtype=torch.float16, device_map="auto", trust_remote_code=self.config.get("trust_remote_code"))
+            tokenizer = AutoTokenizer.from_pretrained(base_path)
+
+            merged = PeftModel.from_pretrained(base, lora_path).merge_and_unload()
+            merged.save_pretrained(save_path)
+            tokenizer.save_pretrained(save_path)
+            return {"status": True, "message": "Merged."}
+        except Exception as e:
+            return {"status": False, "message": str(e)}

-
-
-
-
-
-        models_path=test_models_parent_path,
-        config=binding_config
-    )
-    if not active_binding.model:
-        raise RuntimeError(f"Model '{test_model_name}' failed to load.")
-
-    ASCIIColors.green(f"Binding initialized. Model '{active_binding.model_name}' loaded on {active_binding.device}.")
-    ASCIIColors.info(f"Model Info: {json.dumps(active_binding.get_model_info(), indent=2, default=str)}")
-
-    # --- List Models (scans configured models_path) ---
-    ASCIIColors.cyan("\n--- Listing Models (from models_path) ---")
-    # To make this test useful, you could manually place a model folder in `test_hf_models_dir`
-    # e.g., download "gpt2" and put it in `test_hf_models_dir/gpt2`
-    # For now, it will likely be empty unless you do that.
-    listed_models = active_binding.listModels()
-    if listed_models:
-        ASCIIColors.green(f"Found {len(listed_models)} potential model folders. First 5:")
-        for m in listed_models[:5]: print(m)
-    else: ASCIIColors.warning(f"No model folders found in '{test_models_parent_path}'. This is normal if it's empty.")
-
-    # --- Tokenize/Detokenize ---
-    ASCIIColors.cyan("\n--- Tokenize/Detokenize ---")
-    sample_text = "Hello, Hugging Face world!"
-    tokens = active_binding.tokenize(sample_text)
-    ASCIIColors.green(f"Tokens for '{sample_text}': {tokens[:10]}...")
-    token_count = active_binding.count_tokens(sample_text)
-    ASCIIColors.green(f"Token count: {token_count}")
-    if tokens:
-        detokenized_text = active_binding.detokenize(tokens)
-        ASCIIColors.green(f"Detokenized text: {detokenized_text}")
-    else: ASCIIColors.warning("Tokenization returned empty list.")
-
-    # --- Text Generation (Non-Streaming, Chat Format if supported) ---
-    ASCIIColors.cyan("\n--- Text Generation (Non-Streaming) ---")
-    prompt_text = "What is the capital of France?"
-    # For Phi-2, system prompt might need specific formatting if not using apply_chat_template strictly
-    # For models like Zephyr, system_prompt is part of chat template
-    system_prompt_text = "You are a helpful AI assistant."
-    generated_text = active_binding.generate_text(
-        prompt_text, system_prompt=system_prompt_text, stream=False,
-        n_predict=30 # Override default max_new_tokens for this call
-    )
-    if isinstance(generated_text, str): ASCIIColors.green(f"Generated text: {generated_text}")
-    else: ASCIIColors.error(f"Generation failed: {generated_text}")
-
-    # --- Text Generation (Streaming) ---
-    ASCIIColors.cyan("\n--- Text Generation (Streaming) ---")
-    full_streamed_text = ""
-    def stream_callback(chunk: str, msg_type: int):
-        global full_streamed_text
-        ASCIIColors.green(f"{chunk}", end="", flush=True)
-        full_streamed_text += chunk
-        return True # Continue streaming
+    def ps(self) -> Dict[str, List[Dict]]:
+        """
+        Returns the process status of loaded models, including memory usage.
+        """
+        models_status = []

-
-
-
-        streaming_callback=stream_callback,
-        n_predict=70
-    )
-    print("\n--- End of Stream ---")
-    if isinstance(result, str): ASCIIColors.green(f"Full streamed text collected: {result}")
-    else: ASCIIColors.error(f"Streaming generation failed: {result}")
-
-    # --- Embeddings ---
-    if active_binding.embedding_model:
-        ASCIIColors.cyan("\n--- Embeddings ---")
-        embedding_text = "This is a test sentence for Hugging Face embeddings."
-        try:
-            embedding_vector = active_binding.embed(embedding_text)
-            ASCIIColors.green(f"Embedding for '{embedding_text}' (first 3 dims): {embedding_vector[:3]}...")
-            ASCIIColors.info(f"Embedding vector dimension: {len(embedding_vector)}")
-
-            # Test batch embedding
-            batch_texts = ["First sentence.", "Second sentence, quite different."]
-            batch_embeddings = active_binding.embed(batch_texts)
-            ASCIIColors.green(f"Batch embeddings generated for {len(batch_texts)} texts.")
-            ASCIIColors.info(f"First batch embedding (first 3 dims): {batch_embeddings[0][:3]}...")
-
-        except Exception as e_emb: ASCIIColors.warning(f"Could not get embedding: {e_emb}")
-    else: ASCIIColors.yellow("\n--- Embeddings Skipped (no embedding model loaded) ---")
-
-    # --- LLaVA Vision Test (Conceptual - requires a LLaVA model and an image) ---
-    # To test LLaVA properly:
-    # 1. Set `test_model_name` to a LLaVA model, e.g., "llava-hf/llava-1.5-7b-hf" (very large!)
-    #    or a smaller one like "unum-cloud/uform-gen2-qwen-500m" (check its specific prompting style).
-    # 2. Ensure `trust_remote_code=True` might be needed.
-    # 3. Provide a real image path.
-    if active_binding.supports_vision:
-        ASCIIColors.cyan("\n--- LLaVA Vision Test ---")
-        dummy_image_path = Path("test_dummy_image.png")
+        # Get global GPU info once
+        gpu_total_mem = 0
+        if torch.cuda.is_available():
            try:
-
-
-
-                d = ImageDraw.Draw(img)
-                d.text((10,10), "Hello LLaVA from HF!", fill=('black'))
-                img.save(dummy_image_path)
-                ASCIIColors.info(f"Created dummy image: {dummy_image_path}")
-
-                llava_prompt = "Describe this image." # LLaVA models often use "<image>\nUSER: <prompt>\nASSISTANT:"
-                # or just the prompt if processor handles template.
-                # For AutoProcessor, often just the text part of the prompt.
-                llava_response = active_binding.generate_text(
-                    prompt=llava_prompt,
-                    images=[str(dummy_image_path)],
-                    n_predict=50,
-                    stream=False
-                )
-                if isinstance(llava_response, str): ASCIIColors.green(f"LLaVA response: {llava_response}")
-                else: ASCIIColors.error(f"LLaVA generation failed: {llava_response}")
+                gpu_total_mem = torch.cuda.get_device_properties(0).total_memory
+            except:
+                gpu_total_mem = 0

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        system_mem = psutil.virtual_memory()
+
+        for mid, container in self.loaded_models.items():
+            # 1. Calculate Model Size (Bytes)
+            try:
+                # Hugging Face models track their own footprint
+                size_bytes = container.model.get_memory_footprint()
+            except Exception:
+                size_bytes = 0
+
+            # 2. Split into VRAM/RAM based on device
+            size_vram = 0
+            size_ram = 0
+
+            if container.device == "cuda":
+                size_vram = size_bytes
+            else:
+                size_ram = size_bytes
+
+            # 3. Calculate Percentages
+            gpu_usage_percent = 0
+            if gpu_total_mem > 0:
+                gpu_usage_percent = (size_vram / gpu_total_mem) * 100
+
+            # For CPU, we compare against total system RAM
+            cpu_usage_percent = 0
+            if system_mem.total > 0:
+                cpu_usage_percent = (size_ram / system_mem.total) * 100
+
+            models_status.append({
+                "model_name": mid, # UI Standard: 'model_name'
+                "active": mid == self.active_model_id,
+                "size": size_bytes, # Total size in bytes
+                "size_vram": size_vram, # GPU memory usage in bytes
+                "size_ram": size_ram, # RAM usage in bytes
+                "device": container.device,
+                "gpu_usage_percent": round(gpu_usage_percent, 2),
+                "cpu_usage_percent": round(cpu_usage_percent, 2),
+                "loader": "HuggingFace"
+            })
+
+        # Return a dictionary matching the YAML output definition
+        return {"models": models_status}
|