xinference 1.6.0.post1__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference has been flagged as potentially problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +79 -2
- xinference/client/restful/restful_client.py +65 -3
- xinference/conftest.py +0 -7
- xinference/core/media_interface.py +132 -8
- xinference/core/model.py +44 -6
- xinference/core/scheduler.py +1 -10
- xinference/core/supervisor.py +8 -17
- xinference/core/worker.py +5 -27
- xinference/deploy/cmdline.py +6 -2
- xinference/model/audio/chattts.py +24 -39
- xinference/model/audio/cosyvoice.py +18 -30
- xinference/model/audio/funasr.py +42 -0
- xinference/model/audio/model_spec.json +71 -1
- xinference/model/audio/model_spec_modelscope.json +76 -2
- xinference/model/audio/utils.py +75 -0
- xinference/model/core.py +1 -0
- xinference/model/embedding/__init__.py +74 -18
- xinference/model/embedding/core.py +98 -589
- xinference/model/embedding/embed_family.py +133 -0
- xinference/{thirdparty/omnilmm/train → model/embedding/flag}/__init__.py +1 -1
- xinference/model/embedding/flag/core.py +282 -0
- xinference/model/embedding/model_spec.json +24 -0
- xinference/model/embedding/model_spec_modelscope.json +24 -0
- xinference/model/embedding/sentence_transformers/__init__.py +13 -0
- xinference/model/embedding/sentence_transformers/core.py +399 -0
- xinference/model/embedding/vllm/core.py +95 -0
- xinference/model/image/model_spec.json +30 -3
- xinference/model/image/model_spec_modelscope.json +41 -2
- xinference/model/image/stable_diffusion/core.py +144 -53
- xinference/model/llm/__init__.py +6 -54
- xinference/model/llm/core.py +19 -5
- xinference/model/llm/llama_cpp/core.py +59 -3
- xinference/model/llm/llama_cpp/memory.py +457 -0
- xinference/model/llm/llm_family.json +247 -402
- xinference/model/llm/llm_family.py +88 -16
- xinference/model/llm/llm_family_modelscope.json +260 -421
- xinference/model/llm/llm_family_openmind_hub.json +0 -34
- xinference/model/llm/sglang/core.py +8 -0
- xinference/model/llm/transformers/__init__.py +27 -6
- xinference/model/llm/transformers/chatglm.py +4 -2
- xinference/model/llm/transformers/core.py +49 -28
- xinference/model/llm/transformers/deepseek_v2.py +6 -49
- xinference/model/llm/transformers/gemma3.py +119 -164
- xinference/model/llm/transformers/multimodal/__init__.py +13 -0
- xinference/model/llm/transformers/{cogagent.py → multimodal/cogagent.py} +58 -95
- xinference/model/llm/transformers/multimodal/core.py +205 -0
- xinference/model/llm/transformers/{deepseek_vl2.py → multimodal/deepseek_vl2.py} +59 -120
- xinference/model/llm/transformers/multimodal/gemma3.py +117 -0
- xinference/model/llm/transformers/{glm4v.py → multimodal/glm4v.py} +57 -93
- xinference/model/llm/transformers/multimodal/intern_vl.py +412 -0
- xinference/model/llm/transformers/{minicpmv26.py → multimodal/minicpmv26.py} +55 -102
- xinference/model/llm/transformers/{ovis2.py → multimodal/ovis2.py} +114 -175
- xinference/model/llm/transformers/{qwen-omni.py → multimodal/qwen-omni.py} +82 -167
- xinference/model/llm/transformers/multimodal/qwen2_audio.py +131 -0
- xinference/model/llm/transformers/{qwen2_vl.py → multimodal/qwen2_vl.py} +224 -256
- xinference/model/llm/transformers/opt.py +4 -2
- xinference/model/llm/transformers/utils.py +6 -37
- xinference/model/llm/utils.py +11 -0
- xinference/model/llm/vllm/core.py +7 -0
- xinference/model/rerank/core.py +91 -3
- xinference/model/rerank/model_spec.json +24 -0
- xinference/model/rerank/model_spec_modelscope.json +24 -0
- xinference/model/rerank/utils.py +20 -2
- xinference/model/utils.py +38 -1
- xinference/model/video/diffusers.py +65 -3
- xinference/model/video/model_spec.json +31 -4
- xinference/model/video/model_spec_modelscope.json +32 -4
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/main.013f296b.css +2 -0
- xinference/web/ui/build/static/css/main.013f296b.css.map +1 -0
- xinference/web/ui/build/static/js/main.8a9e3ba0.js +3 -0
- xinference/web/ui/build/static/js/main.8a9e3ba0.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/34cfbfb7836e136ba3261cfd411cc554bf99ba24b35dcceebeaa4f008cb3c9dc.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/55b9fb40b57fa926e8f05f31c2f96467e76e5ad62f033dca97c03f9e8c4eb4fe.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/567e49df411efb24425d289bb484758cb57067ca54f8b5c67fe4505f698deb96.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/6595880facebca7ceace6f17cf21c3a5a9219a2f52fb0ba9f3cf1131eddbcf6b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/aa998bc2d9c11853add6b8a2e08f50327f56d8824ccaaec92d6dde1b305f0d85.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/c748246b1d7bcebc16153be69f37e955bb2145526c47dd425aeeff70d3004dbc.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e31234e95d60a5a7883fbcd70de2475dc1c88c90705df1a530abb68f86f80a51.json +1 -0
- xinference/web/ui/src/locales/en.json +21 -8
- xinference/web/ui/src/locales/ja.json +224 -0
- xinference/web/ui/src/locales/ko.json +224 -0
- xinference/web/ui/src/locales/zh.json +21 -8
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/METADATA +14 -11
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/RECORD +93 -100
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/WHEEL +1 -1
- xinference/model/llm/transformers/cogvlm2.py +0 -442
- xinference/model/llm/transformers/cogvlm2_video.py +0 -333
- xinference/model/llm/transformers/deepseek_vl.py +0 -280
- xinference/model/llm/transformers/glm_edge_v.py +0 -213
- xinference/model/llm/transformers/intern_vl.py +0 -526
- xinference/model/llm/transformers/internlm2.py +0 -94
- xinference/model/llm/transformers/minicpmv25.py +0 -193
- xinference/model/llm/transformers/omnilmm.py +0 -132
- xinference/model/llm/transformers/qwen2_audio.py +0 -179
- xinference/model/llm/transformers/qwen_vl.py +0 -360
- xinference/thirdparty/omnilmm/LICENSE +0 -201
- xinference/thirdparty/omnilmm/chat.py +0 -218
- xinference/thirdparty/omnilmm/constants.py +0 -4
- xinference/thirdparty/omnilmm/conversation.py +0 -332
- xinference/thirdparty/omnilmm/model/__init__.py +0 -1
- xinference/thirdparty/omnilmm/model/omnilmm.py +0 -595
- xinference/thirdparty/omnilmm/model/resampler.py +0 -166
- xinference/thirdparty/omnilmm/model/utils.py +0 -578
- xinference/thirdparty/omnilmm/train/train_utils.py +0 -150
- xinference/thirdparty/omnilmm/utils.py +0 -134
- xinference/web/ui/build/static/css/main.337afe76.css +0 -2
- xinference/web/ui/build/static/css/main.337afe76.css.map +0 -1
- xinference/web/ui/build/static/js/main.ae579a97.js +0 -3
- xinference/web/ui/build/static/js/main.ae579a97.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/12e02ee790dbf57ead09a241a93bb5f893393aa36628ca741d44390e836a103f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5c08e2cd07809ed3e41486b16652253404cbb63a3ff8d0366ee50f57e2413cea.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +0 -1
- /xinference/{thirdparty/omnilmm → model/embedding/vllm}/__init__.py +0 -0
- /xinference/web/ui/build/static/js/{main.ae579a97.js.LICENSE.txt → main.8a9e3ba0.js.LICENSE.txt} +0 -0
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/top_level.txt +0 -0

xinference/model/image/stable_diffusion/core.py CHANGED

@@ -22,6 +22,7 @@ import logging
 import os
 import re
 import sys
+import warnings
 from glob import glob
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 
@@ -197,8 +198,6 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         return getattr(module, class_name)
 
     def load(self):
-        from transformers import BitsAndBytesConfig, T5EncoderModel
-
         if "text2image" in self._abilities or "image2image" in self._abilities:
             from diffusers import AutoPipelineForText2Image as AutoPipelineModel
         elif "inpainting" in self._abilities:
@@ -227,58 +226,15 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
                 self._get_controlnet_model(*cn) for cn in controlnet
             ]
 
+        # quantizations
+        # text_encoder
         quantize_text_encoder = self._kwargs.pop("quantize_text_encoder", None)
-        if quantize_text_encoder:
-            try:
-                import bitsandbytes  # noqa: F401
-            except ImportError:
-                error_message = "Failed to import module 'bitsandbytes'"
-                installation_guide = [
-                    "Please make sure 'bitsandbytes' is installed. ",
-                    "You can install it by `pip install bitsandbytes`\n",
-                ]
-
-                raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
-
-            for text_encoder_name in quantize_text_encoder.split(","):
-                quantization_config = BitsAndBytesConfig(load_in_8bit=True)
-                quantization_kwargs = {}
-                if torch_dtype:
-                    quantization_kwargs["torch_dtype"] = torch_dtype
-                text_encoder = T5EncoderModel.from_pretrained(
-                    self._model_path,
-                    subfolder=text_encoder_name,
-                    quantization_config=quantization_config,
-                    **quantization_kwargs,
-                )
-                self._kwargs[text_encoder_name] = text_encoder
-                self._kwargs["device_map"] = "balanced"
-
+        self._quantize_text_encoder(quantize_text_encoder)
+        # transformer
         if self._gguf_model_path:
-            # GGUF transformer
-            from diffusers import GGUFQuantizationConfig
-
-            self._kwargs["transformer"] = self._get_layer_cls(
-                "transformer"
-            ).from_single_file(
-                self._gguf_model_path,
-                quantization_config=GGUFQuantizationConfig(compute_dtype=torch_dtype),
-                torch_dtype=torch_dtype,
-                config=os.path.join(self._model_path, "transformer"),
-            )
-        elif self._kwargs.get("transformer_nf4"):
-            nf4_config = BitsAndBytesConfig(
-                load_in_4bit=True,
-                bnb_4bit_quant_type="nf4",
-                bnb_4bit_compute_dtype=torch_dtype,
-            )
-            model_nf4 = self._get_layer_cls("transformer").from_pretrained(
-                self._model_path,
-                subfolder="transformer",
-                quantization_config=nf4_config,
-                torch_dtype=torch_dtype,
-            )
-            self._kwargs["transformer"] = model_nf4
+            self._quantize_transformer_gguf()
+        else:
+            self._quantize_transformer()
 
         logger.debug(
             "Loading model from %s, kwargs: %s", self._model_path, self._kwargs
@@ -308,6 +264,133 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
                 cache_branch_id=self._kwargs.get("deepcache_cache_branch_id", 0),
             )
 
+    def _get_quantize_config(self, method: str, quantization: str, module: str):
+        if method == "bnb":
+            try:
+                import bitsandbytes  # noqa: F401
+            except ImportError:
+                error_message = "Failed to import module 'bitsandbytes'"
+                installation_guide = [
+                    "Please make sure 'bitsandbytes' is installed. ",
+                    "You can install it by `pip install bitsandbytes`\n",
+                ]
+
+                raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+            if module.startswith("diffusers."):
+                from diffusers import BitsAndBytesConfig
+            else:
+                assert module.startswith("transformers.")
+                from transformers import BitsAndBytesConfig
+
+            if quantization == "4-bit":
+                return BitsAndBytesConfig(load_in_4bit=True)
+            elif quantization == "8-bit":
+                return BitsAndBytesConfig(load_in_8bit=True)
+            elif quantization == "nf4":
+                return BitsAndBytesConfig(
+                    load_in_4bit=True,
+                    bnb_4bit_quant_type="nf4",
+                    bnb_4bit_compute_dtype=self._torch_dtype,
+                )
+        elif method == "torchao":
+            try:
+                import torchao  # noqa: F401
+            except ImportError:
+                error_message = "Failed to import module 'torchao'"
+                installation_guide = [
+                    "Please make sure 'torchao' is installed. ",
+                    "You can install it by `pip install torchao`\n",
+                ]
+
+                raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+            if module.startswith("diffusers."):
+                from diffusers import TorchAoConfig
+            else:
+                assert module.startswith("transformers.")
+                from transformers import TorchAoConfig
+
+            return TorchAoConfig(quantization)
+        else:
+            raise ValueError(f"Unknown quantization method for image model: {method}")
+
+    def _quantize_text_encoder(self, quantize_text_encoder: Optional[str]):
+        if self._gguf_model_path:
+            # skip quantization when gguf applied to transformer
+            return
+
+        if not quantize_text_encoder:
+            return
+
+        quantization_method = self._kwargs.pop("text_encoder_quantize_method", "bnb")
+        quantization = self._kwargs.pop("text_encoder_quantization", "8-bit")
+
+        torch_dtype = self._torch_dtype
+        for text_encoder_name in quantize_text_encoder.split(","):
+            quantization_kwargs: Dict[str, Any] = {}
+            if torch_dtype:
+                quantization_kwargs["torch_dtype"] = torch_dtype
+            text_encoder_cls = self._get_layer_cls(text_encoder_name)
+            quantization_config = self._get_quantize_config(
+                quantization_method, quantization, text_encoder_cls.__module__
+            )
+            text_encoder = text_encoder_cls.from_pretrained(
+                self._model_path,
+                subfolder=text_encoder_name,
+                quantization_config=quantization_config,
+                **quantization_kwargs,
+            )
+            self._kwargs[text_encoder_name] = text_encoder
+        else:
+            if not self._kwargs.get("device_map"):
+                self._kwargs["device_map"] = "balanced"
+
+    def _quantize_transformer(self):
+        quantization = None
+        nf4 = self._kwargs.pop("transformer_nf4", None)
+        if nf4:
+            warnings.warn(
+                "`transformer_nf4` is deprecated, please use `transformer_quantization=nf4`",
+                category=DeprecationWarning,
+                stacklevel=2,
+            )
+            quantization = "nf4"
+        method = self._kwargs.pop("transformer_quantize_method", "bnb")
+        if not quantization:
+            quantization = self._kwargs.pop("transformer_quantization", None)
+
+        if not quantization:
+            # skip if no quantization specified
+            return
+
+        torch_dtype = self._torch_dtype
+        transformer_cls = self._get_layer_cls("transformer")
+        quantization_config = self._get_quantize_config(
+            method, quantization, transformer_cls.__module__
+        )
+        transformer_model = transformer_cls.from_pretrained(
+            self._model_path,
+            subfolder="transformer",
+            quantization_config=quantization_config,
+            torch_dtype=torch_dtype,
+        )
+        self._kwargs["transformer"] = transformer_model
+
+    def _quantize_transformer_gguf(self):
+        from diffusers import GGUFQuantizationConfig
+
+        # GGUF transformer
+        torch_dtype = self._torch_dtype
+        self._kwargs["transformer"] = self._get_layer_cls(
+            "transformer"
+        ).from_single_file(
+            self._gguf_model_path,
+            quantization_config=GGUFQuantizationConfig(compute_dtype=torch_dtype),
+            torch_dtype=torch_dtype,
+            config=os.path.join(self._model_path, "transformer"),
+        )
+
     def _load_to_device(self, model):
         if self._kwargs.get("cpu_offload", False):
             logger.debug("CPU offloading model")
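
The new _get_quantize_config / _quantize_text_encoder / _quantize_transformer helpers are driven entirely by kwargs popped from self._kwargs. A minimal usage sketch, assuming these kwargs are forwarded unchanged from the client launch call as in earlier releases; the model name and encoder subfolder below are illustrative, not taken from this diff:

from xinference.client import Client

client = Client("http://localhost:9997")

# Illustrative launch: quantize the named text encoder to 8-bit via bitsandbytes
# and the DiT transformer to nf4, matching the kwargs popped in load().
model_uid = client.launch_model(
    model_name="FLUX.1-dev",                 # assumed image model name
    model_type="image",
    quantize_text_encoder="text_encoder_2",  # comma-separated subfolder names
    text_encoder_quantize_method="bnb",      # "bnb" or "torchao"
    text_encoder_quantization="8-bit",       # "4-bit" / "8-bit" / "nf4" for bnb
    transformer_quantize_method="bnb",
    transformer_quantization="nf4",          # replaces the deprecated transformer_nf4=True
)
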
@@ -321,7 +404,15 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         if self._kwargs.get("attention_slicing", False):
             model.enable_attention_slicing()
         if self._kwargs.get("vae_tiling", False):
-            model.enable_vae_tiling()
+            try:
+                model.enable_vae_tiling()
+            except AttributeError:
+                model.vae.enable_tiling()
+        if self._kwargs.get("vae_slicing", False):
+            try:
+                model.enable_vae_slicing()
+            except AttributeError:
+                model.vae.enable_slicing()
 
     def get_max_num_images_for_batching(self):
         return self._kwargs.get("max_num_images", 16)

xinference/model/llm/__init__.py CHANGED

@@ -73,7 +73,7 @@ def generate_engine_config_by_model_family(model_family):
         model_size_in_billions = spec.model_size_in_billions
         quantizations = spec.quantizations
         for quantization in quantizations:
-            # traverse all supported engines to match the name, format, size in billions and
+            # traverse all supported engines to match the name, format, size in billions and quantization of model
            for engine in SUPPORTED_ENGINES:
                 if not check_format_with_engine(
                     model_format, engine
@@ -107,6 +107,10 @@ def generate_engine_config_by_model_family(model_family):
                         "llm_class": cls,
                     }
                 )
+                if hasattr(spec, "multimodal_projectors"):
+                    engine_params[-1][
+                        "multimodal_projectors"
+                    ] = spec.multimodal_projectors
                 engines[engine] = engine_params
                 break
     LLM_ENGINES[model_name] = engines
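
With this change, engine entries generated for GGUF specs that declare projector files also carry multimodal_projectors. A hedged inspection sketch; the model name and engine key are assumptions, and only the llm_class and multimodal_projectors keys are confirmed by this hunk:

from xinference.model.llm import LLM_ENGINES

# Assumed model/engine names for illustration only.
for params in LLM_ENGINES.get("gemma-3-it", {}).get("llama.cpp", []):
    print(params["llm_class"], params.get("multimodal_projectors"))
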
@@ -163,36 +167,9 @@ def _install():
     from .lmdeploy.core import LMDeployChatModel, LMDeployModel
     from .mlx.core import MLXChatModel, MLXModel, MLXVisionModel
     from .sglang.core import SGLANGChatModel, SGLANGModel, SGLANGVisionModel
-    from .transformers.chatglm import ChatglmPytorchChatModel
-    from .transformers.cogagent import CogAgentChatModel
-    from .transformers.cogvlm2 import CogVLM2Model
-    from .transformers.cogvlm2_video import CogVLM2VideoModel
     from .transformers.core import PytorchChatModel, PytorchModel
-    from .transformers.deepseek_v2 import (
-        DeepSeekV2PytorchChatModel,
-        DeepSeekV2PytorchModel,
-    )
-    from .transformers.deepseek_vl import DeepSeekVLChatModel
-    from .transformers.deepseek_vl2 import DeepSeekVL2ChatModel
-    from .transformers.gemma3 import Gemma3ChatModel, Gemma3TextChatModel
-    from .transformers.glm4v import Glm4VModel
-    from .transformers.glm_edge_v import GlmEdgeVModel
-    from .transformers.minicpmv25 import MiniCPMV25Model
-    from .transformers.minicpmv26 import MiniCPMV26Model
-    from .transformers.opt import OptPytorchModel
-    from .transformers.ovis2 import Ovis2ChatModel
-    from .transformers.qwen2_audio import Qwen2AudioChatModel
-    from .transformers.qwen_vl import QwenVLChatModel
     from .vllm.core import VLLMChatModel, VLLMModel, VLLMVisionModel
 
-    try:
-        from .transformers.omnilmm import OmniLMMModel
-    except ImportError as e:
-        # For quite old transformers version,
-        # import will generate error
-        OmniLMMModel = None
-        warnings.warn(f"Cannot import OmniLLMModel due to reason: {e}")
-
     # register llm classes.
     LLAMA_CLASSES.extend(
         [
@@ -203,32 +180,7 @@ def _install():
     VLLM_CLASSES.extend([VLLMModel, VLLMChatModel, VLLMVisionModel])
     MLX_CLASSES.extend([MLXModel, MLXChatModel, MLXVisionModel])
     LMDEPLOY_CLASSES.extend([LMDeployModel, LMDeployChatModel])
-    TRANSFORMERS_CLASSES.extend(
-        [
-            ChatglmPytorchChatModel,
-            PytorchChatModel,
-            QwenVLChatModel,
-            Qwen2AudioChatModel,
-            DeepSeekVLChatModel,
-            DeepSeekVL2ChatModel,
-            PytorchModel,
-            CogVLM2Model,
-            CogVLM2VideoModel,
-            MiniCPMV25Model,
-            MiniCPMV26Model,
-            Glm4VModel,
-            DeepSeekV2PytorchModel,
-            DeepSeekV2PytorchChatModel,
-            OptPytorchModel,
-            GlmEdgeVModel,
-            CogAgentChatModel,
-            Gemma3TextChatModel,
-            Gemma3ChatModel,
-            Ovis2ChatModel,
-        ]
-    )
-    if OmniLMMModel:  # type: ignore
-        TRANSFORMERS_CLASSES.append(OmniLMMModel)
+    TRANSFORMERS_CLASSES.extend([PytorchChatModel, PytorchModel])
 
     # support 4 engines for now
     SUPPORTED_ENGINES["vLLM"] = VLLM_CLASSES
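
After this cleanup only the generic Pytorch classes are registered here; the model-specific Transformers classes are expected to come from xinference.model.llm.transformers itself (see its __init__.py and the new multimodal package in the file list). A quick, hedged way to see what actually ends up registered:

from xinference.model.llm import TRANSFORMERS_CLASSES, _install

_install()  # idempotence is not guaranteed; run once in a fresh interpreter
print(sorted(cls.__name__ for cls in TRANSFORMERS_CLASSES))
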

xinference/model/llm/core.py CHANGED

@@ -160,12 +160,14 @@ class LLMDescription(ModelDescription):
         llm_family: "LLMFamilyV1",
         llm_spec: "LLMSpecV1",
         quantization: Optional[str],
+        multimodal_projector: Optional[str] = None,
         model_path: Optional[str] = None,
     ):
         super().__init__(address, devices, model_path=model_path)
         self._llm_family = llm_family
         self._llm_spec = llm_spec
         self._quantization = quantization
+        self._multimodal_projector = multimodal_projector
 
     @property
     def spec(self):
@@ -185,6 +187,7 @@ class LLMDescription(ModelDescription):
             "model_family": self._llm_family.model_family
             or self._llm_family.model_name,
             "quantization": self._quantization,
+            "multimodal_projector": self._multimodal_projector,
             "model_hub": self._llm_spec.model_hub,
             "revision": self._llm_spec.model_revision,
             "context_length": self._llm_family.context_length,
@@ -204,6 +207,7 @@ class LLMDescription(ModelDescription):
             "model_file_location": model_file_location,
             "cache_status": cache_status,
             "quantization": self._quantization,
+            "multimodal_projector": self._multimodal_projector,
             "model_format": self._llm_spec.model_format,
             "model_size_in_billions": self._llm_spec.model_size_in_billions,
         }
@@ -212,10 +216,19 @@ class LLMDescription(ModelDescription):
 def generate_llm_description(llm_family: "LLMFamilyV1") -> Dict[str, List[Dict]]:
     res = defaultdict(list)
     for spec in llm_family.model_specs:
+        multimodal_projectors = getattr(spec, "multimodal_projectors", None)
         for q in spec.quantizations:
-            res[llm_family.model_name].append(
-                LLMDescription(None, None, llm_family, spec, q).to_version_info()
-            )
+            if multimodal_projectors:
+                for mmproj in multimodal_projectors:
+                    res[llm_family.model_name].append(
+                        LLMDescription(
+                            None, None, llm_family, spec, q, mmproj
+                        ).to_version_info()
+                    )
+            else:
+                res[llm_family.model_name].append(
+                    LLMDescription(None, None, llm_family, spec, q).to_version_info()
+                )
     return res
 
 
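
The version listing now fans out over projector files as well as quantizations. When projectors are present the expansion is equivalent to a cartesian product; a standalone sketch with made-up values:

from itertools import product

quantizations = ["Q4_K_M", "Q8_0"]
multimodal_projectors = ["mmproj-model-f16.gguf"]  # None/absent for most specs

if multimodal_projectors:
    versions = list(product(quantizations, multimodal_projectors))
else:
    versions = [(q, None) for q in quantizations]

print(versions)
# [('Q4_K_M', 'mmproj-model-f16.gguf'), ('Q8_0', 'mmproj-model-f16.gguf')]
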
@@ -260,8 +273,9 @@ def create_llm_model_instance(
     )
     logger.debug(f"Launching {model_uid} with {llm_cls.__name__}")
 
+    multimodal_projector = kwargs.get("multimodal_projector")
     if not model_path:
-        model_path = cache(llm_family, llm_spec, quantization)
+        model_path = cache(llm_family, llm_spec, quantization, multimodal_projector)
 
     peft_model = peft_model_config.peft_model if peft_model_config else None
     if peft_model is not None:
@@ -288,5 +302,5 @@ def create_llm_model_instance(
         model_uid, llm_family, llm_spec, quantization, model_path, kwargs
     )
     return model, LLMDescription(
-        subpool_addr, devices, llm_family, llm_spec, quantization
+        subpool_addr, devices, llm_family, llm_spec, quantization, multimodal_projector
     )

xinference/model/llm/llama_cpp/core.py CHANGED

@@ -15,6 +15,7 @@ import concurrent.futures
 import importlib.util
 import logging
 import os
+import pprint
 import queue
 from typing import Iterator, List, Optional, Union
 
@@ -24,6 +25,7 @@ from ....types import ChatCompletion, ChatCompletionChunk, Completion, Completio
 from ..core import LLM
 from ..llm_family import LLMFamilyV1, LLMSpecV1
 from ..utils import ChatModelMixin
+from .memory import estimate_gpu_layers
 
 logger = logging.getLogger(__name__)
 
@@ -95,7 +97,12 @@ class XllamaCppModel(LLM, ChatModelMixin):
 
     def load(self):
         try:
-            from xllamacpp import CommonParams, Server
+            from xllamacpp import (
+                CommonParams,
+                Server,
+                get_device_info,
+                ggml_backend_dev_type,
+            )
         except ImportError:
             error_message = "Failed to import module 'xllamacpp'"
             installation_guide = ["Please make sure 'xllamacpp' is installed. "]
@@ -135,6 +142,15 @@ class XllamaCppModel(LLM, ChatModelMixin):
         if os.path.exists(legacy_model_file_path):
             model_path = legacy_model_file_path
 
+        multimodal_projector = self._llamacpp_model_config.get(
+            "multimodal_projector", ""
+        )
+        mmproj = (
+            os.path.join(self.model_path, multimodal_projector)
+            if multimodal_projector
+            else ""
+        )
+
         try:
             params = CommonParams()
             # Compatible with xllamacpp changes
@@ -142,6 +158,7 @@ class XllamaCppModel(LLM, ChatModelMixin):
                 params.model = model_path
             except Exception:
                 params.model.path = model_path
+            params.mmproj.path = mmproj
             if self.model_family.chat_template:
                 params.chat_template = self.model_family.chat_template
             # This is the default value, could be overwritten by _llamacpp_model_config
@@ -165,6 +182,41 @@ class XllamaCppModel(LLM, ChatModelMixin):
             # Number of layers to offload to GPU (-ngl). If -1, all layers are offloaded.
             # 0x7FFFFFFF is INT32 max, will be auto set to all layers
             params.n_gpu_layers = 0x7FFFFFFF
+            try:
+                device_info = get_device_info()
+                gpus = [
+                    info
+                    for info in device_info
+                    if info["type"]
+                    == ggml_backend_dev_type.GGML_BACKEND_DEVICE_TYPE_GPU
+                ]
+                if gpus:
+                    logger.info(
+                        "Try to estimate num gpu layers, n_ctx: %s, n_batch: %s, n_parallel: %s, gpus:\n%s",
+                        params.n_ctx,
+                        params.n_batch,
+                        params.n_parallel,
+                        pprint.pformat(gpus),
+                    )
+                    estimate = estimate_gpu_layers(
+                        gpus=gpus,
+                        model_path=model_path,
+                        projectors=[mmproj] if mmproj else [],
+                        context_length=params.n_ctx,
+                        batch_size=params.n_batch,
+                        num_parallel=params.n_parallel,
+                        kv_cache_type="",
+                    )
+                    logger.info("Estimate num gpu layers: %s", estimate)
+                    if estimate.tensor_split:
+                        params.tensor_split = estimate.tensor_split
+                    else:
+                        params.n_gpu_layers = estimate.layers
+            except Exception as e:
+                logger.exception(
+                    "Estimate num gpu layers for llama.cpp backend failed: %s", e
+                )
+
             self._llm = Server(params)
             self._executor = concurrent.futures.ThreadPoolExecutor(
                 max_workers=max(10, n_threads)
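
The estimator comes from the new llama_cpp/memory.py (+457 lines in the file list). The sketch below relies only on the call signature and the layers / tensor_split result fields visible in this hunk; the GGUF path is a placeholder:

from xllamacpp import get_device_info, ggml_backend_dev_type
from xinference.model.llm.llama_cpp.memory import estimate_gpu_layers

gpus = [
    info
    for info in get_device_info()
    if info["type"] == ggml_backend_dev_type.GGML_BACKEND_DEVICE_TYPE_GPU
]
estimate = estimate_gpu_layers(
    gpus=gpus,
    model_path="/path/to/model-Q4_K_M.gguf",  # placeholder path
    projectors=[],                            # e.g. [mmproj_path] for vision models
    context_length=8192,
    batch_size=512,
    num_parallel=1,
    kv_cache_type="",
)
print(estimate.layers, estimate.tensor_split)
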
@@ -207,11 +259,13 @@ class XllamaCppModel(LLM, ChatModelMixin):
                 q.put(res)
             except Exception as e:
                 logger.exception("handle_completions callback failed: %s", e)
+                q.put(_Error(str(e)))
 
         try:
             self._llm.handle_completions(prompt_json, _error_callback, _ok_callback)
         except Exception as ex:
             logger.exception("handle_completions failed: %s", ex)
+            q.put(_Error(str(ex)))
         q.put(_Done)
 
         assert self._executor
@@ -271,6 +325,7 @@ class XllamaCppModel(LLM, ChatModelMixin):
                 q.put(res)
             except Exception as e:
                 logger.exception("handle_chat_completions callback failed: %s", e)
+                q.put(_Error(str(e)))
 
         try:
             self._llm.handle_chat_completions(
@@ -278,6 +333,7 @@ class XllamaCppModel(LLM, ChatModelMixin):
             )
         except Exception as ex:
             logger.exception("handle_chat_completions failed: %s", ex)
+            q.put(_Error(str(ex)))
         q.put(_Done)
 
         assert self._executor
@@ -288,7 +344,7 @@ class XllamaCppModel(LLM, ChatModelMixin):
         def _to_iterator():
             while (r := q.get()) is not _Done:
                 if type(r) is _Error:
-                    raise Exception("Got error in chat stream:
+                    raise Exception(f"Got error in chat stream: {r.msg}")
                 # Get valid keys (O(1) lookup)
                 chunk_keys = ChatCompletionChunk.__annotations__
                 # The chunk may contain additional keys (e.g., system_fingerprint),
@@ -302,5 +358,5 @@ class XllamaCppModel(LLM, ChatModelMixin):
         else:
             r = q.get()
             if type(r) is _Error:
-                raise Exception("Got error in chat:
+                raise Exception(f"Got error in chat: {r.msg}")
             return self._to_chat_completion(r, self.reasoning_parser)