xinference 1.7.1__py3-none-any.whl → 1.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xinference/_version.py +3 -3
- xinference/client/restful/async_restful_client.py +8 -13
- xinference/client/restful/restful_client.py +6 -2
- xinference/core/chat_interface.py +6 -4
- xinference/core/media_interface.py +5 -0
- xinference/core/model.py +1 -5
- xinference/core/supervisor.py +117 -68
- xinference/core/worker.py +49 -37
- xinference/deploy/test/test_cmdline.py +2 -6
- xinference/model/audio/__init__.py +26 -23
- xinference/model/audio/chattts.py +3 -2
- xinference/model/audio/core.py +49 -98
- xinference/model/audio/cosyvoice.py +3 -2
- xinference/model/audio/custom.py +28 -73
- xinference/model/audio/f5tts.py +3 -2
- xinference/model/audio/f5tts_mlx.py +3 -2
- xinference/model/audio/fish_speech.py +3 -2
- xinference/model/audio/funasr.py +17 -4
- xinference/model/audio/kokoro.py +3 -2
- xinference/model/audio/megatts.py +3 -2
- xinference/model/audio/melotts.py +3 -2
- xinference/model/audio/model_spec.json +572 -171
- xinference/model/audio/utils.py +0 -6
- xinference/model/audio/whisper.py +3 -2
- xinference/model/audio/whisper_mlx.py +3 -2
- xinference/model/cache_manager.py +141 -0
- xinference/model/core.py +6 -49
- xinference/model/custom.py +174 -0
- xinference/model/embedding/__init__.py +67 -56
- xinference/model/embedding/cache_manager.py +35 -0
- xinference/model/embedding/core.py +104 -84
- xinference/model/embedding/custom.py +55 -78
- xinference/model/embedding/embed_family.py +80 -31
- xinference/model/embedding/flag/core.py +21 -5
- xinference/model/embedding/llama_cpp/__init__.py +0 -0
- xinference/model/embedding/llama_cpp/core.py +234 -0
- xinference/model/embedding/model_spec.json +968 -103
- xinference/model/embedding/sentence_transformers/core.py +30 -20
- xinference/model/embedding/vllm/core.py +11 -5
- xinference/model/flexible/__init__.py +8 -2
- xinference/model/flexible/core.py +26 -119
- xinference/model/flexible/custom.py +69 -0
- xinference/model/flexible/launchers/image_process_launcher.py +1 -0
- xinference/model/flexible/launchers/modelscope_launcher.py +5 -1
- xinference/model/flexible/launchers/transformers_launcher.py +15 -3
- xinference/model/flexible/launchers/yolo_launcher.py +5 -1
- xinference/model/image/__init__.py +20 -20
- xinference/model/image/cache_manager.py +62 -0
- xinference/model/image/core.py +70 -182
- xinference/model/image/custom.py +28 -72
- xinference/model/image/model_spec.json +402 -119
- xinference/model/image/ocr/got_ocr2.py +3 -2
- xinference/model/image/stable_diffusion/core.py +22 -7
- xinference/model/image/stable_diffusion/mlx.py +6 -6
- xinference/model/image/utils.py +2 -2
- xinference/model/llm/__init__.py +71 -94
- xinference/model/llm/cache_manager.py +292 -0
- xinference/model/llm/core.py +37 -111
- xinference/model/llm/custom.py +88 -0
- xinference/model/llm/llama_cpp/core.py +5 -7
- xinference/model/llm/llm_family.json +16260 -8151
- xinference/model/llm/llm_family.py +138 -839
- xinference/model/llm/lmdeploy/core.py +5 -7
- xinference/model/llm/memory.py +3 -4
- xinference/model/llm/mlx/core.py +6 -8
- xinference/model/llm/reasoning_parser.py +3 -1
- xinference/model/llm/sglang/core.py +32 -14
- xinference/model/llm/transformers/chatglm.py +3 -7
- xinference/model/llm/transformers/core.py +49 -27
- xinference/model/llm/transformers/deepseek_v2.py +2 -2
- xinference/model/llm/transformers/gemma3.py +2 -2
- xinference/model/llm/transformers/multimodal/cogagent.py +2 -2
- xinference/model/llm/transformers/multimodal/deepseek_vl2.py +2 -2
- xinference/model/llm/transformers/multimodal/gemma3.py +2 -2
- xinference/model/llm/transformers/multimodal/glm4_1v.py +167 -0
- xinference/model/llm/transformers/multimodal/glm4v.py +2 -2
- xinference/model/llm/transformers/multimodal/intern_vl.py +2 -2
- xinference/model/llm/transformers/multimodal/minicpmv26.py +3 -3
- xinference/model/llm/transformers/multimodal/ovis2.py +2 -2
- xinference/model/llm/transformers/multimodal/qwen-omni.py +2 -2
- xinference/model/llm/transformers/multimodal/qwen2_audio.py +2 -2
- xinference/model/llm/transformers/multimodal/qwen2_vl.py +2 -2
- xinference/model/llm/transformers/opt.py +3 -7
- xinference/model/llm/utils.py +34 -49
- xinference/model/llm/vllm/core.py +77 -27
- xinference/model/llm/vllm/xavier/engine.py +5 -3
- xinference/model/llm/vllm/xavier/scheduler.py +10 -6
- xinference/model/llm/vllm/xavier/transfer.py +1 -1
- xinference/model/rerank/__init__.py +26 -25
- xinference/model/rerank/core.py +47 -87
- xinference/model/rerank/custom.py +25 -71
- xinference/model/rerank/model_spec.json +158 -33
- xinference/model/rerank/utils.py +2 -2
- xinference/model/utils.py +115 -54
- xinference/model/video/__init__.py +13 -17
- xinference/model/video/core.py +44 -102
- xinference/model/video/diffusers.py +4 -3
- xinference/model/video/model_spec.json +90 -21
- xinference/types.py +5 -3
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.7d24df53.js +3 -0
- xinference/web/ui/build/static/js/main.7d24df53.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2704ff66a5f73ca78b341eb3edec60154369df9d87fbc8c6dd60121abc5e1b0a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/607dfef23d33e6b594518c0c6434567639f24f356b877c80c60575184ec50ed0.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9be3d56173aacc3efd0b497bcb13c4f6365de30069176ee9403b40e717542326.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9f9dd6c32c78a222d07da5987ae902effe16bcf20aac00774acdccc4de3c9ff2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b2ab5ee972c60d15eb9abf5845705f8ab7e1d125d324d9a9b1bcae5d6fd7ffb2.json +1 -0
- xinference/web/ui/src/locales/en.json +0 -1
- xinference/web/ui/src/locales/ja.json +0 -1
- xinference/web/ui/src/locales/ko.json +0 -1
- xinference/web/ui/src/locales/zh.json +0 -1
- {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/METADATA +9 -11
- {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/RECORD +119 -119
- xinference/model/audio/model_spec_modelscope.json +0 -231
- xinference/model/embedding/model_spec_modelscope.json +0 -293
- xinference/model/embedding/utils.py +0 -18
- xinference/model/image/model_spec_modelscope.json +0 -375
- xinference/model/llm/llama_cpp/memory.py +0 -457
- xinference/model/llm/llm_family_csghub.json +0 -56
- xinference/model/llm/llm_family_modelscope.json +0 -8700
- xinference/model/llm/llm_family_openmind_hub.json +0 -1019
- xinference/model/rerank/model_spec_modelscope.json +0 -85
- xinference/model/video/model_spec_modelscope.json +0 -184
- xinference/web/ui/build/static/js/main.9b12b7f9.js +0 -3
- xinference/web/ui/build/static/js/main.9b12b7f9.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/1460361af6975e63576708039f1cb732faf9c672d97c494d4055fc6331460be0.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/4efd8dda58fda83ed9546bf2f587df67f8d98e639117bee2d9326a9a1d9bebb2.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/55b9fb40b57fa926e8f05f31c2f96467e76e5ad62f033dca97c03f9e8c4eb4fe.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5b2dafe5aa9e1105e0244a2b6751807342fa86aa0144b4e84d947a1686102715.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +0 -1
- /xinference/web/ui/build/static/js/{main.9b12b7f9.js.LICENSE.txt → main.7d24df53.js.LICENSE.txt} +0 -0
- {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/WHEEL +0 -0
- {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/core.py
CHANGED
@@ -22,35 +22,32 @@ from abc import abstractmethod
 from collections import defaultdict
 from contextvars import ContextVar
 from functools import lru_cache
-from typing import TYPE_CHECKING, Dict, List, Literal, Optional,
+from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Union
 
 from ...core.utils import parse_replica_model_uid
 from ...types import PeftModelConfig
-from ..core import ModelDescription
 from .reasoning_parser import ReasoningParser
 
 if TYPE_CHECKING:
-    from .llm_family import
+    from .llm_family import LLMFamilyV2, LLMSpecV1
 
 logger = logging.getLogger(__name__)
 
 
-
+LLM_VERSION_INFOS: Dict[str, List[Dict]] = defaultdict(list)
 
 
-def
+def get_llm_version_infos():
     import copy
 
-    return copy.deepcopy(
+    return copy.deepcopy(LLM_VERSION_INFOS)
 
 
 class LLM(abc.ABC):
     def __init__(
         self,
         replica_model_uid: str,
-        model_family: "
-        model_spec: "LLMSpecV1",
-        quantization: str,
+        model_family: "LLMFamilyV2",
         model_path: str,
         *args,
         **kwargs,
@@ -58,8 +55,8 @@ class LLM(abc.ABC):
         self.model_uid, self.rep_id = parse_replica_model_uid(replica_model_uid)
         self.raw_model_uid = replica_model_uid
         self.model_family = model_family
-        self.model_spec =
-        self.quantization = quantization
+        self.model_spec = model_family.model_specs[0]
+        self.quantization = model_family.model_specs[0].quantization
         self.model_path = model_path
         self.reasoning_parser = None
         if args:
@@ -128,7 +125,7 @@ class LLM(abc.ABC):
 
     @classmethod
     def match(
-        cls, llm_family: "
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if not cls.check_lib():
             return False
@@ -137,7 +134,7 @@ class LLM(abc.ABC):
     @classmethod
     @abstractmethod
     def match_json(
-        cls, llm_family: "
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         raise NotImplementedError
 
@@ -169,89 +166,26 @@
     chat_context_var: ContextVar[dict] = ContextVar("chat_context_var", default={})
 
 
-
-    def __init__(
-        self,
-        address: Optional[str],
-        devices: Optional[List[str]],
-        llm_family: "LLMFamilyV1",
-        llm_spec: "LLMSpecV1",
-        quantization: Optional[str],
-        multimodal_projector: Optional[str] = None,
-        model_path: Optional[str] = None,
-    ):
-        super().__init__(address, devices, model_path=model_path)
-        self._llm_family = llm_family
-        self._llm_spec = llm_spec
-        self._quantization = quantization
-        self._multimodal_projector = multimodal_projector
-
-    @property
-    def spec(self):
-        return self._llm_family
-
-    def to_dict(self):
-        return {
-            "model_type": "LLM",
-            "address": self.address,
-            "accelerators": self.devices,
-            "model_name": self._llm_family.model_name,
-            "model_lang": self._llm_family.model_lang,
-            "model_ability": self._llm_family.model_ability,
-            "model_description": self._llm_family.model_description,
-            "model_format": self._llm_spec.model_format,
-            "model_size_in_billions": self._llm_spec.model_size_in_billions,
-            "model_family": self._llm_family.model_family
-            or self._llm_family.model_name,
-            "quantization": self._quantization,
-            "multimodal_projector": self._multimodal_projector,
-            "model_hub": self._llm_spec.model_hub,
-            "revision": self._llm_spec.model_revision,
-            "context_length": self._llm_family.context_length,
-        }
-
-    def to_version_info(self):
-        from .utils import get_file_location, get_model_version
-
-        model_file_location, cache_status = get_file_location(
-            self._llm_family, self._llm_spec, self._quantization
-        )
-
-        return {
-            "model_version": get_model_version(
-                self._llm_family, self._llm_spec, self._quantization
-            ),
-            "model_file_location": model_file_location,
-            "cache_status": cache_status,
-            "quantization": self._quantization,
-            "multimodal_projector": self._multimodal_projector,
-            "model_format": self._llm_spec.model_format,
-            "model_size_in_billions": self._llm_spec.model_size_in_billions,
-        }
-
-
-def generate_llm_description(llm_family: "LLMFamilyV1") -> Dict[str, List[Dict]]:
+def generate_llm_version_info(llm_family: "LLMFamilyV2") -> Dict[str, List[Dict]]:
     res = defaultdict(list)
-
+    # Use model_specs from huggingface, as HuggingFace is the most comprehensive.
+    hf_specs = [
+        spec for spec in llm_family.model_specs if spec.model_hub == "huggingface"
+    ]
+    for spec in hf_specs:
+        _llm_family = llm_family.copy()
+        _llm_family.model_specs = [spec]
        multimodal_projectors = getattr(spec, "multimodal_projectors", None)
-
-
-
-
-
-
-                    ).to_version_info()
-                )
-            else:
-                res[llm_family.model_name].append(
-                    LLMDescription(None, None, llm_family, spec, q).to_version_info()
-                )
+        if multimodal_projectors:
+            for mmproj in multimodal_projectors:
+                _llm_family.multimodal_projector = mmproj
+                res[_llm_family.model_name].append(_llm_family.to_version_info())
+        else:
+            res[_llm_family.model_name].append(_llm_family.to_version_info())
     return res
 
 
 def create_llm_model_instance(
-    subpool_addr: str,
-    devices: List[str],
     model_uid: str,
     model_name: str,
     model_engine: Optional[str],
@@ -264,35 +198,35 @@ def create_llm_model_instance(
     ] = None,
     model_path: Optional[str] = None,
     **kwargs,
-) ->
-    from .
+) -> LLM:
+    from .cache_manager import LLMCacheManager
+    from .llm_family import check_engine_by_spec_parameters, match_llm
 
     if model_engine is None:
         raise ValueError("model_engine is required for LLM model")
-
+    llm_family = match_llm(
         model_name, model_format, model_size_in_billions, quantization, download_hub
     )
 
-    if not
+    if not llm_family:
         raise ValueError(
             f"Model not found, name: {model_name}, format: {model_format},"
             f" size: {model_size_in_billions}, quantization: {quantization}"
         )
-    llm_family, llm_spec, quantization = match_result
-    assert quantization is not None
 
     llm_cls = check_engine_by_spec_parameters(
         model_engine,
         llm_family.model_name,
-
-
-        quantization,
+        llm_family.model_specs[0].model_format,
+        llm_family.model_specs[0].model_size_in_billions,
+        llm_family.model_specs[0].quantization,
     )
     logger.debug(f"Launching {model_uid} with {llm_cls.__name__}")
 
     multimodal_projector = kwargs.get("multimodal_projector")
     if not model_path:
-
+        cache_manager = LLMCacheManager(llm_family, multimodal_projector)
+        model_path = cache_manager.cache()
 
     peft_model = peft_model_config.peft_model if peft_model_config else None
     if peft_model is not None:
@@ -300,8 +234,6 @@ def create_llm_model_instance(
         model = llm_cls(
             model_uid,
             llm_family,
-            llm_spec,
-            quantization,
             model_path,
             kwargs,
             peft_model,
@@ -311,13 +243,7 @@ def create_llm_model_instance(
                 f"Model not supported with lora, name: {model_name}, format: {model_format}, engine: {model_engine}. "
                 f"Load this without lora."
             )
-            model = llm_cls(
-                model_uid, llm_family, llm_spec, quantization, model_path, kwargs
-            )
+            model = llm_cls(model_uid, llm_family, model_path, kwargs)
     else:
-        model = llm_cls(
-
-        )
-    return model, LLMDescription(
-        subpool_addr, devices, llm_family, llm_spec, quantization, multimodal_projector
-    )
+        model = llm_cls(model_uid, llm_family, model_path, kwargs)
+    return model
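Taken together, these hunks collapse the old (llm_family, llm_spec, quantization) triple into a single LLMFamilyV2 whose model_specs list carries exactly one spec, drop the LLMDescription class in favor of LLMFamilyV2.to_version_info(), and make create_llm_model_instance return the bare LLM rather than a (model, LLMDescription) pair. A minimal sketch of the new call shape; the argument values are illustrative, and the parameters not visible in these hunks (model_format, model_size_in_billions, download_hub) are assumed unchanged from 1.7.1:

from xinference.model.llm.core import create_llm_model_instance

# Hypothetical launch under the 1.8.0 signature: no subpool_addr/devices,
# and the return value is the model alone, not (model, LLMDescription).
model = create_llm_model_instance(
    model_uid="demo-uid",            # illustrative uid
    model_name="qwen2.5-instruct",   # illustrative model name
    model_engine="llama.cpp",
    model_format="ggufv2",
    model_size_in_billions=7,
    quantization="Q4_K_M",
)
# Per the __init__ hunk above, LLM now derives spec and quantization
# from the single-spec family itself:
assert model.model_spec is model.model_family.model_specs[0]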
xinference/model/llm/custom.py
ADDED
@@ -0,0 +1,88 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from typing import TYPE_CHECKING, List
+
+from ..custom import ModelRegistry
+
+if TYPE_CHECKING:
+    from .llm_family import LLMFamilyV2
+
+
+logger = logging.getLogger(__name__)
+
+
+UD_LLM_FAMILIES: List["LLMFamilyV2"] = []
+
+
+class LLMModelRegistry(ModelRegistry):
+    model_type = "llm"
+
+    def __init__(self):
+        from .llm_family import BUILTIN_LLM_FAMILIES
+
+        super().__init__()
+        self.models = UD_LLM_FAMILIES
+        self.builtin_models = [x.model_name for x in BUILTIN_LLM_FAMILIES]
+
+    def add_ud_model(self, model_spec):
+        from . import generate_engine_config_by_model_family
+
+        self.models.append(model_spec)
+        generate_engine_config_by_model_family(model_spec)
+
+    def check_model_uri(self, llm_family: "LLMFamilyV2"):
+        from ..utils import is_valid_model_uri
+
+        for spec in llm_family.model_specs:
+            model_uri = spec.model_uri
+            if model_uri and not is_valid_model_uri(model_uri):
+                raise ValueError(f"Invalid model URI {model_uri}.")
+
+    def remove_ud_model(self, llm_family: "LLMFamilyV2"):
+        from .llm_family import LLM_ENGINES
+
+        UD_LLM_FAMILIES.remove(llm_family)
+        del LLM_ENGINES[llm_family.model_name]
+
+    def remove_ud_model_files(self, llm_family: "LLMFamilyV2"):
+        from .cache_manager import LLMCacheManager
+
+        _llm_family = llm_family.copy()
+        for spec in llm_family.model_specs:
+            _llm_family.model_specs = [spec]
+            cache_manager = LLMCacheManager(_llm_family)
+            cache_manager.unregister_custom_model(self.model_type)
+
+
+def get_user_defined_llm_families():
+    from ..custom import RegistryManager
+
+    registry = RegistryManager.get_registry("llm")
+    return registry.get_custom_models()
+
+
+def register_llm(llm_family: "LLMFamilyV2", persist: bool):
+    from ..custom import RegistryManager
+
+    registry = RegistryManager.get_registry("llm")
+    registry.register(llm_family, persist)
+
+
+def unregister_llm(model_name: str, raise_error: bool = True):
+    from ..custom import RegistryManager
+
+    registry = RegistryManager.get_registry("llm")
+    registry.unregister(model_name, raise_error)
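This new custom.py is one instance of the registry pattern added in this release: the shared RegistryManager/ModelRegistry machinery lives in the new xinference/model/custom.py (+174 lines per the file list), and each model type contributes a thin subclass like LLMModelRegistry. A minimal usage sketch, assuming an LLMFamilyV2 instance constructed elsewhere (its construction is not part of this diff); only functions shown in the file above are called:

from xinference.model.llm.custom import (
    get_user_defined_llm_families,
    register_llm,
    unregister_llm,
)

def register_family(family) -> None:
    # family: an LLMFamilyV2 built elsewhere; persist=True asks the shared
    # registry to keep the definition rather than hold it in memory only.
    register_llm(family, persist=True)
    assert family in get_user_defined_llm_families()

def remove_family(model_name: str) -> None:
    # raise_error=False suppresses the error for unknown model names.
    unregister_llm(model_name, raise_error=False)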
xinference/model/llm/llama_cpp/core.py
CHANGED
@@ -23,9 +23,8 @@ import orjson
 
 from ....types import ChatCompletion, ChatCompletionChunk, Completion, CompletionChunk
 from ..core import LLM
-from ..llm_family import
+from ..llm_family import LLMFamilyV2, LLMSpecV1
 from ..utils import ChatModelMixin
-from .memory import estimate_gpu_layers
 
 logger = logging.getLogger(__name__)
 
@@ -43,13 +42,11 @@ class XllamaCppModel(LLM, ChatModelMixin):
     def __init__(
         self,
         model_uid: str,
-        model_family: "
-        model_spec: "LLMSpecV1",
-        quantization: str,
+        model_family: "LLMFamilyV2",
         model_path: str,
         llamacpp_model_config: Optional[dict] = None,
     ):
-        super().__init__(model_uid, model_family,
+        super().__init__(model_uid, model_family, model_path)
         self._llamacpp_model_config = self._sanitize_model_config(llamacpp_model_config)
         self._llm = None
         self._executor: Optional[concurrent.futures.ThreadPoolExecutor] = None
@@ -84,7 +81,7 @@ class XllamaCppModel(LLM, ChatModelMixin):
 
     @classmethod
     def match_json(
-        cls, llm_family:
+        cls, llm_family: LLMFamilyV2, llm_spec: LLMSpecV1, quantization: str
     ) -> bool:
         if llm_spec.model_format not in ["ggufv2"]:
             return False
@@ -100,6 +97,7 @@ class XllamaCppModel(LLM, ChatModelMixin):
         from xllamacpp import (
             CommonParams,
             Server,
+            estimate_gpu_layers,
             get_device_info,
             ggml_backend_dev_type,
         )
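Read together with the import hunk at the top of this file, the change swaps the local memory.py implementation of estimate_gpu_layers (deleted in this release; see the +0 -457 entry in the file list) for the function that ships in xllamacpp. A guarded-import sketch; only the xllamacpp names visible in this diff are assumed to exist, and the fallback behavior is illustrative:

# Sketch: in 1.8.0 estimate_gpu_layers comes from xllamacpp itself rather
# than the removed xinference.model.llm.llama_cpp.memory module.
try:
    from xllamacpp import (
        CommonParams,
        Server,
        estimate_gpu_layers,
        get_device_info,
        ggml_backend_dev_type,
    )
except ImportError:
    # xllamacpp absent: the engine is unavailable, and LLM.match() in
    # core.py above already guards on cls.check_lib() before use.
    estimate_gpu_layers = None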