xinference 1.6.0.post1__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (124)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +79 -2
  3. xinference/client/restful/restful_client.py +65 -3
  4. xinference/conftest.py +0 -7
  5. xinference/core/media_interface.py +132 -8
  6. xinference/core/model.py +44 -6
  7. xinference/core/scheduler.py +1 -10
  8. xinference/core/supervisor.py +8 -17
  9. xinference/core/worker.py +5 -27
  10. xinference/deploy/cmdline.py +6 -2
  11. xinference/model/audio/chattts.py +24 -39
  12. xinference/model/audio/cosyvoice.py +18 -30
  13. xinference/model/audio/funasr.py +42 -0
  14. xinference/model/audio/model_spec.json +71 -1
  15. xinference/model/audio/model_spec_modelscope.json +76 -2
  16. xinference/model/audio/utils.py +75 -0
  17. xinference/model/core.py +1 -0
  18. xinference/model/embedding/__init__.py +74 -18
  19. xinference/model/embedding/core.py +98 -589
  20. xinference/model/embedding/embed_family.py +133 -0
  21. xinference/{thirdparty/omnilmm/train → model/embedding/flag}/__init__.py +1 -1
  22. xinference/model/embedding/flag/core.py +282 -0
  23. xinference/model/embedding/model_spec.json +24 -0
  24. xinference/model/embedding/model_spec_modelscope.json +24 -0
  25. xinference/model/embedding/sentence_transformers/__init__.py +13 -0
  26. xinference/model/embedding/sentence_transformers/core.py +399 -0
  27. xinference/model/embedding/vllm/core.py +95 -0
  28. xinference/model/image/model_spec.json +30 -3
  29. xinference/model/image/model_spec_modelscope.json +41 -2
  30. xinference/model/image/stable_diffusion/core.py +144 -53
  31. xinference/model/llm/__init__.py +6 -54
  32. xinference/model/llm/core.py +19 -5
  33. xinference/model/llm/llama_cpp/core.py +59 -3
  34. xinference/model/llm/llama_cpp/memory.py +457 -0
  35. xinference/model/llm/llm_family.json +247 -402
  36. xinference/model/llm/llm_family.py +88 -16
  37. xinference/model/llm/llm_family_modelscope.json +260 -421
  38. xinference/model/llm/llm_family_openmind_hub.json +0 -34
  39. xinference/model/llm/sglang/core.py +8 -0
  40. xinference/model/llm/transformers/__init__.py +27 -6
  41. xinference/model/llm/transformers/chatglm.py +4 -2
  42. xinference/model/llm/transformers/core.py +49 -28
  43. xinference/model/llm/transformers/deepseek_v2.py +6 -49
  44. xinference/model/llm/transformers/gemma3.py +119 -164
  45. xinference/model/llm/transformers/multimodal/__init__.py +13 -0
  46. xinference/model/llm/transformers/{cogagent.py → multimodal/cogagent.py} +58 -95
  47. xinference/model/llm/transformers/multimodal/core.py +205 -0
  48. xinference/model/llm/transformers/{deepseek_vl2.py → multimodal/deepseek_vl2.py} +59 -120
  49. xinference/model/llm/transformers/multimodal/gemma3.py +117 -0
  50. xinference/model/llm/transformers/{glm4v.py → multimodal/glm4v.py} +57 -93
  51. xinference/model/llm/transformers/multimodal/intern_vl.py +412 -0
  52. xinference/model/llm/transformers/{minicpmv26.py → multimodal/minicpmv26.py} +55 -102
  53. xinference/model/llm/transformers/{ovis2.py → multimodal/ovis2.py} +114 -175
  54. xinference/model/llm/transformers/{qwen-omni.py → multimodal/qwen-omni.py} +82 -167
  55. xinference/model/llm/transformers/multimodal/qwen2_audio.py +131 -0
  56. xinference/model/llm/transformers/{qwen2_vl.py → multimodal/qwen2_vl.py} +224 -256
  57. xinference/model/llm/transformers/opt.py +4 -2
  58. xinference/model/llm/transformers/utils.py +6 -37
  59. xinference/model/llm/utils.py +11 -0
  60. xinference/model/llm/vllm/core.py +7 -0
  61. xinference/model/rerank/core.py +91 -3
  62. xinference/model/rerank/model_spec.json +24 -0
  63. xinference/model/rerank/model_spec_modelscope.json +24 -0
  64. xinference/model/rerank/utils.py +20 -2
  65. xinference/model/utils.py +38 -1
  66. xinference/model/video/diffusers.py +65 -3
  67. xinference/model/video/model_spec.json +31 -4
  68. xinference/model/video/model_spec_modelscope.json +32 -4
  69. xinference/web/ui/build/asset-manifest.json +6 -6
  70. xinference/web/ui/build/index.html +1 -1
  71. xinference/web/ui/build/static/css/main.013f296b.css +2 -0
  72. xinference/web/ui/build/static/css/main.013f296b.css.map +1 -0
  73. xinference/web/ui/build/static/js/main.8a9e3ba0.js +3 -0
  74. xinference/web/ui/build/static/js/main.8a9e3ba0.js.map +1 -0
  75. xinference/web/ui/node_modules/.cache/babel-loader/34cfbfb7836e136ba3261cfd411cc554bf99ba24b35dcceebeaa4f008cb3c9dc.json +1 -0
  76. xinference/web/ui/node_modules/.cache/babel-loader/55b9fb40b57fa926e8f05f31c2f96467e76e5ad62f033dca97c03f9e8c4eb4fe.json +1 -0
  77. xinference/web/ui/node_modules/.cache/babel-loader/567e49df411efb24425d289bb484758cb57067ca54f8b5c67fe4505f698deb96.json +1 -0
  78. xinference/web/ui/node_modules/.cache/babel-loader/6595880facebca7ceace6f17cf21c3a5a9219a2f52fb0ba9f3cf1131eddbcf6b.json +1 -0
  79. xinference/web/ui/node_modules/.cache/babel-loader/aa998bc2d9c11853add6b8a2e08f50327f56d8824ccaaec92d6dde1b305f0d85.json +1 -0
  80. xinference/web/ui/node_modules/.cache/babel-loader/c748246b1d7bcebc16153be69f37e955bb2145526c47dd425aeeff70d3004dbc.json +1 -0
  81. xinference/web/ui/node_modules/.cache/babel-loader/e31234e95d60a5a7883fbcd70de2475dc1c88c90705df1a530abb68f86f80a51.json +1 -0
  82. xinference/web/ui/src/locales/en.json +21 -8
  83. xinference/web/ui/src/locales/ja.json +224 -0
  84. xinference/web/ui/src/locales/ko.json +224 -0
  85. xinference/web/ui/src/locales/zh.json +21 -8
  86. {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/METADATA +14 -11
  87. {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/RECORD +93 -100
  88. {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/WHEEL +1 -1
  89. xinference/model/llm/transformers/cogvlm2.py +0 -442
  90. xinference/model/llm/transformers/cogvlm2_video.py +0 -333
  91. xinference/model/llm/transformers/deepseek_vl.py +0 -280
  92. xinference/model/llm/transformers/glm_edge_v.py +0 -213
  93. xinference/model/llm/transformers/intern_vl.py +0 -526
  94. xinference/model/llm/transformers/internlm2.py +0 -94
  95. xinference/model/llm/transformers/minicpmv25.py +0 -193
  96. xinference/model/llm/transformers/omnilmm.py +0 -132
  97. xinference/model/llm/transformers/qwen2_audio.py +0 -179
  98. xinference/model/llm/transformers/qwen_vl.py +0 -360
  99. xinference/thirdparty/omnilmm/LICENSE +0 -201
  100. xinference/thirdparty/omnilmm/chat.py +0 -218
  101. xinference/thirdparty/omnilmm/constants.py +0 -4
  102. xinference/thirdparty/omnilmm/conversation.py +0 -332
  103. xinference/thirdparty/omnilmm/model/__init__.py +0 -1
  104. xinference/thirdparty/omnilmm/model/omnilmm.py +0 -595
  105. xinference/thirdparty/omnilmm/model/resampler.py +0 -166
  106. xinference/thirdparty/omnilmm/model/utils.py +0 -578
  107. xinference/thirdparty/omnilmm/train/train_utils.py +0 -150
  108. xinference/thirdparty/omnilmm/utils.py +0 -134
  109. xinference/web/ui/build/static/css/main.337afe76.css +0 -2
  110. xinference/web/ui/build/static/css/main.337afe76.css.map +0 -1
  111. xinference/web/ui/build/static/js/main.ae579a97.js +0 -3
  112. xinference/web/ui/build/static/js/main.ae579a97.js.map +0 -1
  113. xinference/web/ui/node_modules/.cache/babel-loader/12e02ee790dbf57ead09a241a93bb5f893393aa36628ca741d44390e836a103f.json +0 -1
  114. xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +0 -1
  115. xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +0 -1
  116. xinference/web/ui/node_modules/.cache/babel-loader/5c08e2cd07809ed3e41486b16652253404cbb63a3ff8d0366ee50f57e2413cea.json +0 -1
  117. xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +0 -1
  118. xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +0 -1
  119. xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +0 -1
  120. /xinference/{thirdparty/omnilmm → model/embedding/vllm}/__init__.py +0 -0
  121. /xinference/web/ui/build/static/js/{main.ae579a97.js.LICENSE.txt → main.8a9e3ba0.js.LICENSE.txt} +0 -0
  122. {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/entry_points.txt +0 -0
  123. {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/licenses/LICENSE +0 -0
  124. {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/top_level.txt +0 -0
@@ -22,6 +22,7 @@ import logging
 import os
 import re
 import sys
+import warnings
 from glob import glob
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 
@@ -197,8 +198,6 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         return getattr(module, class_name)
 
     def load(self):
-        from transformers import BitsAndBytesConfig, T5EncoderModel
-
         if "text2image" in self._abilities or "image2image" in self._abilities:
             from diffusers import AutoPipelineForText2Image as AutoPipelineModel
         elif "inpainting" in self._abilities:
@@ -227,58 +226,15 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
                 self._get_controlnet_model(*cn) for cn in controlnet
             ]
 
+        # quantizations
+        # text_encoder
         quantize_text_encoder = self._kwargs.pop("quantize_text_encoder", None)
-        if quantize_text_encoder and not self._gguf_model_path:
-            try:
-                import bitsandbytes  # noqa: F401
-            except ImportError:
-                error_message = "Failed to import module 'bitsandbytes'"
-                installation_guide = [
-                    "Please make sure 'bitsandbytes' is installed. ",
-                    "You can install it by `pip install bitsandbytes`\n",
-                ]
-
-                raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
-
-            for text_encoder_name in quantize_text_encoder.split(","):
-                quantization_config = BitsAndBytesConfig(load_in_8bit=True)
-                quantization_kwargs = {}
-                if torch_dtype:
-                    quantization_kwargs["torch_dtype"] = torch_dtype
-                text_encoder = T5EncoderModel.from_pretrained(
-                    self._model_path,
-                    subfolder=text_encoder_name,
-                    quantization_config=quantization_config,
-                    **quantization_kwargs,
-                )
-                self._kwargs[text_encoder_name] = text_encoder
-                self._kwargs["device_map"] = "balanced"
-
+        self._quantize_text_encoder(quantize_text_encoder)
+        # transformer
         if self._gguf_model_path:
-            from diffusers import GGUFQuantizationConfig
-
-            # GGUF transformer
-            self._kwargs["transformer"] = self._get_layer_cls(
-                "transformer"
-            ).from_single_file(
-                self._gguf_model_path,
-                quantization_config=GGUFQuantizationConfig(compute_dtype=torch_dtype),
-                torch_dtype=torch_dtype,
-                config=os.path.join(self._model_path, "transformer"),
-            )
-        elif self._kwargs.get("transformer_nf4"):
-            nf4_config = BitsAndBytesConfig(
-                load_in_4bit=True,
-                bnb_4bit_quant_type="nf4",
-                bnb_4bit_compute_dtype=torch_dtype,
-            )
-            model_nf4 = self._get_layer_cls("transformer").from_pretrained(
-                self._model_path,
-                subfolder="transformer",
-                quantization_config=nf4_config,
-                torch_dtype=torch_dtype,
-            )
-            self._kwargs["transformer"] = model_nf4
+            self._quantize_transformer_gguf()
+        else:
+            self._quantize_transformer()
 
         logger.debug(
             "Loading model from %s, kwargs: %s", self._model_path, self._kwargs
@@ -308,6 +264,133 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
                 cache_branch_id=self._kwargs.get("deepcache_cache_branch_id", 0),
             )
 
+    def _get_quantize_config(self, method: str, quantization: str, module: str):
+        if method == "bnb":
+            try:
+                import bitsandbytes  # noqa: F401
+            except ImportError:
+                error_message = "Failed to import module 'bitsandbytes'"
+                installation_guide = [
+                    "Please make sure 'bitsandbytes' is installed. ",
+                    "You can install it by `pip install bitsandbytes`\n",
+                ]
+
+                raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+            if module.startswith("diffusers."):
+                from diffusers import BitsAndBytesConfig
+            else:
+                assert module.startswith("transformers.")
+                from transformers import BitsAndBytesConfig
+
+            if quantization == "4-bit":
+                return BitsAndBytesConfig(load_in_4bit=True)
+            elif quantization == "8-bit":
+                return BitsAndBytesConfig(load_in_8bit=True)
+            elif quantization == "nf4":
+                return BitsAndBytesConfig(
+                    load_in_4bit=True,
+                    bnb_4bit_quant_type="nf4",
+                    bnb_4bit_compute_dtype=self._torch_dtype,
+                )
+        elif method == "torchao":
+            try:
+                import torchao  # noqa: F401
+            except ImportError:
+                error_message = "Failed to import module 'torchao'"
+                installation_guide = [
+                    "Please make sure 'torchao' is installed. ",
+                    "You can install it by `pip install torchao`\n",
+                ]
+
+                raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+            if module.startswith("diffusers."):
+                from diffusers import TorchAoConfig
+            else:
+                assert module.startswith("transformers.")
+                from transformers import TorchAoConfig
+
+            return TorchAoConfig(quantization)
+        else:
+            raise ValueError(f"Unknown quantization method for image model: {method}")
+
+    def _quantize_text_encoder(self, quantize_text_encoder: Optional[str]):
+        if self._gguf_model_path:
+            # skip quantization when gguf applied to transformer
+            return
+
+        if not quantize_text_encoder:
+            return
+
+        quantization_method = self._kwargs.pop("text_encoder_quantize_method", "bnb")
+        quantization = self._kwargs.pop("text_encoder_quantization", "8-bit")
+
+        torch_dtype = self._torch_dtype
+        for text_encoder_name in quantize_text_encoder.split(","):
+            quantization_kwargs: Dict[str, Any] = {}
+            if torch_dtype:
+                quantization_kwargs["torch_dtype"] = torch_dtype
+            text_encoder_cls = self._get_layer_cls(text_encoder_name)
+            quantization_config = self._get_quantize_config(
+                quantization_method, quantization, text_encoder_cls.__module__
+            )
+            text_encoder = text_encoder_cls.from_pretrained(
+                self._model_path,
+                subfolder=text_encoder_name,
+                quantization_config=quantization_config,
+                **quantization_kwargs,
+            )
+            self._kwargs[text_encoder_name] = text_encoder
+        else:
+            if not self._kwargs.get("device_map"):
+                self._kwargs["device_map"] = "balanced"
+
+    def _quantize_transformer(self):
+        quantization = None
+        nf4 = self._kwargs.pop("transformer_nf4", None)
+        if nf4:
+            warnings.warn(
+                "`transformer_nf4` is deprecated, please use `transformer_quantization=nf4`",
+                category=DeprecationWarning,
+                stacklevel=2,
+            )
+            quantization = "nf4"
+        method = self._kwargs.pop("transformer_quantize_method", "bnb")
+        if not quantization:
+            quantization = self._kwargs.pop("transformer_quantization", None)
+
+        if not quantization:
+            # skip if no quantization specified
+            return
+
+        torch_dtype = self._torch_dtype
+        transformer_cls = self._get_layer_cls("transformer")
+        quantization_config = self._get_quantize_config(
+            method, quantization, transformer_cls.__module__
+        )
+        transformer_model = transformer_cls.from_pretrained(
+            self._model_path,
+            subfolder="transformer",
+            quantization_config=quantization_config,
+            torch_dtype=torch_dtype,
+        )
+        self._kwargs["transformer"] = transformer_model
+
+    def _quantize_transformer_gguf(self):
+        from diffusers import GGUFQuantizationConfig
+
+        # GGUF transformer
+        torch_dtype = self._torch_dtype
+        self._kwargs["transformer"] = self._get_layer_cls(
+            "transformer"
+        ).from_single_file(
+            self._gguf_model_path,
+            quantization_config=GGUFQuantizationConfig(compute_dtype=torch_dtype),
+            torch_dtype=torch_dtype,
+            config=os.path.join(self._model_path, "transformer"),
+        )
+
     def _load_to_device(self, model):
         if self._kwargs.get("cpu_offload", False):
             logger.debug("CPU offloading model")
@@ -321,7 +404,15 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         if self._kwargs.get("attention_slicing", False):
             model.enable_attention_slicing()
         if self._kwargs.get("vae_tiling", False):
-            model.enable_vae_tiling()
+            try:
+                model.enable_vae_tiling()
+            except AttributeError:
+                model.vae.enable_tiling()
+        if self._kwargs.get("vae_slicing", False):
+            try:
+                model.enable_vae_slicing()
+            except AttributeError:
+                model.vae.enable_slicing()
 
     def get_max_num_images_for_batching(self):
         return self._kwargs.get("max_num_images", 16)
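The vae_tiling branch now falls back to the VAE submodule when a pipeline does not expose enable_vae_tiling() itself, and the new vae_slicing option gets the same treatment. A stand-alone sketch of that fallback against a plain diffusers pipeline; the helper name is invented and `pipe` stands for any loaded pipeline:

def enable_vae_memory_savings(pipe, tiling: bool = True, slicing: bool = True) -> None:
    # Prefer the pipeline-level helpers; some pipelines only expose the
    # equivalent methods on the VAE component, hence the AttributeError fallback.
    if tiling:
        try:
            pipe.enable_vae_tiling()
        except AttributeError:
            pipe.vae.enable_tiling()
    if slicing:
        try:
            pipe.enable_vae_slicing()
        except AttributeError:
            pipe.vae.enable_slicing()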
@@ -73,7 +73,7 @@ def generate_engine_config_by_model_family(model_family):
         model_size_in_billions = spec.model_size_in_billions
         quantizations = spec.quantizations
         for quantization in quantizations:
-            # traverse all supported engines to match the name, format, size in billions and quatization of model
+            # traverse all supported engines to match the name, format, size in billions and quantization of model
             for engine in SUPPORTED_ENGINES:
                 if not check_format_with_engine(
                     model_format, engine
@@ -107,6 +107,10 @@ def generate_engine_config_by_model_family(model_family):
                                     "llm_class": cls,
                                 }
                             )
+                            if hasattr(spec, "multimodal_projectors"):
+                                engine_params[-1][
+                                    "multimodal_projectors"
+                                ] = spec.multimodal_projectors
                         engines[engine] = engine_params
                         break
     LLM_ENGINES[model_name] = engines
@@ -163,36 +167,9 @@ def _install():
     from .lmdeploy.core import LMDeployChatModel, LMDeployModel
     from .mlx.core import MLXChatModel, MLXModel, MLXVisionModel
     from .sglang.core import SGLANGChatModel, SGLANGModel, SGLANGVisionModel
-    from .transformers.chatglm import ChatglmPytorchChatModel
-    from .transformers.cogagent import CogAgentChatModel
-    from .transformers.cogvlm2 import CogVLM2Model
-    from .transformers.cogvlm2_video import CogVLM2VideoModel
     from .transformers.core import PytorchChatModel, PytorchModel
-    from .transformers.deepseek_v2 import (
-        DeepSeekV2PytorchChatModel,
-        DeepSeekV2PytorchModel,
-    )
-    from .transformers.deepseek_vl import DeepSeekVLChatModel
-    from .transformers.deepseek_vl2 import DeepSeekVL2ChatModel
-    from .transformers.gemma3 import Gemma3ChatModel, Gemma3TextChatModel
-    from .transformers.glm4v import Glm4VModel
-    from .transformers.glm_edge_v import GlmEdgeVModel
-    from .transformers.minicpmv25 import MiniCPMV25Model
-    from .transformers.minicpmv26 import MiniCPMV26Model
-    from .transformers.opt import OptPytorchModel
-    from .transformers.ovis2 import Ovis2ChatModel
-    from .transformers.qwen2_audio import Qwen2AudioChatModel
-    from .transformers.qwen_vl import QwenVLChatModel
     from .vllm.core import VLLMChatModel, VLLMModel, VLLMVisionModel
 
-    try:
-        from .transformers.omnilmm import OmniLMMModel
-    except ImportError as e:
-        # For quite old transformers version,
-        # import will generate error
-        OmniLMMModel = None
-        warnings.warn(f"Cannot import OmniLLMModel due to reason: {e}")
-
     # register llm classes.
     LLAMA_CLASSES.extend(
         [
@@ -203,32 +180,7 @@ def _install():
     VLLM_CLASSES.extend([VLLMModel, VLLMChatModel, VLLMVisionModel])
     MLX_CLASSES.extend([MLXModel, MLXChatModel, MLXVisionModel])
     LMDEPLOY_CLASSES.extend([LMDeployModel, LMDeployChatModel])
-    TRANSFORMERS_CLASSES.extend(
-        [
-            ChatglmPytorchChatModel,
-            PytorchChatModel,
-            QwenVLChatModel,
-            Qwen2AudioChatModel,
-            DeepSeekVLChatModel,
-            DeepSeekVL2ChatModel,
-            PytorchModel,
-            CogVLM2Model,
-            CogVLM2VideoModel,
-            MiniCPMV25Model,
-            MiniCPMV26Model,
-            Glm4VModel,
-            DeepSeekV2PytorchModel,
-            DeepSeekV2PytorchChatModel,
-            OptPytorchModel,
-            GlmEdgeVModel,
-            CogAgentChatModel,
-            Gemma3TextChatModel,
-            Gemma3ChatModel,
-            Ovis2ChatModel,
-        ]
-    )
-    if OmniLMMModel:  # type: ignore
-        TRANSFORMERS_CLASSES.append(OmniLMMModel)
+    TRANSFORMERS_CLASSES.extend([PytorchChatModel, PytorchModel])
 
     # support 4 engines for now
     SUPPORTED_ENGINES["vLLM"] = VLLM_CLASSES
@@ -160,12 +160,14 @@ class LLMDescription(ModelDescription):
         llm_family: "LLMFamilyV1",
         llm_spec: "LLMSpecV1",
         quantization: Optional[str],
+        multimodal_projector: Optional[str] = None,
         model_path: Optional[str] = None,
     ):
         super().__init__(address, devices, model_path=model_path)
         self._llm_family = llm_family
         self._llm_spec = llm_spec
         self._quantization = quantization
+        self._multimodal_projector = multimodal_projector
 
     @property
     def spec(self):
@@ -185,6 +187,7 @@ class LLMDescription(ModelDescription):
             "model_family": self._llm_family.model_family
             or self._llm_family.model_name,
             "quantization": self._quantization,
+            "multimodal_projector": self._multimodal_projector,
             "model_hub": self._llm_spec.model_hub,
             "revision": self._llm_spec.model_revision,
             "context_length": self._llm_family.context_length,
@@ -204,6 +207,7 @@ class LLMDescription(ModelDescription):
             "model_file_location": model_file_location,
             "cache_status": cache_status,
             "quantization": self._quantization,
+            "multimodal_projector": self._multimodal_projector,
             "model_format": self._llm_spec.model_format,
             "model_size_in_billions": self._llm_spec.model_size_in_billions,
         }
@@ -212,10 +216,19 @@
 def generate_llm_description(llm_family: "LLMFamilyV1") -> Dict[str, List[Dict]]:
     res = defaultdict(list)
     for spec in llm_family.model_specs:
+        multimodal_projectors = getattr(spec, "multimodal_projectors", None)
         for q in spec.quantizations:
-            res[llm_family.model_name].append(
-                LLMDescription(None, None, llm_family, spec, q).to_version_info()
-            )
+            if multimodal_projectors:
+                for mmproj in multimodal_projectors:
+                    res[llm_family.model_name].append(
+                        LLMDescription(
+                            None, None, llm_family, spec, q, mmproj
+                        ).to_version_info()
+                    )
+            else:
+                res[llm_family.model_name].append(
+                    LLMDescription(None, None, llm_family, spec, q).to_version_info()
+                )
     return res
 
 
@@ -260,8 +273,9 @@ def create_llm_model_instance(
     )
     logger.debug(f"Launching {model_uid} with {llm_cls.__name__}")
 
+    multimodal_projector = kwargs.get("multimodal_projector")
     if not model_path:
-        model_path = cache(llm_family, llm_spec, quantization)
+        model_path = cache(llm_family, llm_spec, quantization, multimodal_projector)
 
     peft_model = peft_model_config.peft_model if peft_model_config else None
     if peft_model is not None:
@@ -288,5 +302,5 @@ def create_llm_model_instance(
         model_uid, llm_family, llm_spec, quantization, model_path, kwargs
     )
     return model, LLMDescription(
-        subpool_addr, devices, llm_family, llm_spec, quantization
+        subpool_addr, devices, llm_family, llm_spec, quantization, multimodal_projector
    )
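In xinference/model/llm/llm_family.py, LLMDescription now records an optional multimodal projector, and generate_llm_description advertises one version entry per (quantization, projector) pair when a spec ships projector files. A toy expansion that mirrors this fan-out; the quantizations and mmproj file names are made up for the example:

from itertools import product

quantizations = ["Q4_K_M", "Q8_0"]
multimodal_projectors = ["mmproj-F16.gguf", "mmproj-Q8_0.gguf"]  # hypothetical spec values

if multimodal_projectors:
    versions = [
        {"quantization": q, "multimodal_projector": p}
        for q, p in product(quantizations, multimodal_projectors)
    ]
else:
    versions = [{"quantization": q} for q in quantizations]

print(len(versions))  # 4 version entries instead of 2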
@@ -15,6 +15,7 @@ import concurrent.futures
 import importlib.util
 import logging
 import os
+import pprint
 import queue
 from typing import Iterator, List, Optional, Union
 
@@ -24,6 +25,7 @@ from ....types import ChatCompletion, ChatCompletionChunk, Completion, Completio
 from ..core import LLM
 from ..llm_family import LLMFamilyV1, LLMSpecV1
 from ..utils import ChatModelMixin
+from .memory import estimate_gpu_layers
 
 logger = logging.getLogger(__name__)
 
@@ -95,7 +97,12 @@ class XllamaCppModel(LLM, ChatModelMixin):
 
     def load(self):
         try:
-            from xllamacpp import CommonParams, Server
+            from xllamacpp import (
+                CommonParams,
+                Server,
+                get_device_info,
+                ggml_backend_dev_type,
+            )
         except ImportError:
             error_message = "Failed to import module 'xllamacpp'"
             installation_guide = ["Please make sure 'xllamacpp' is installed. "]
@@ -135,6 +142,15 @@ class XllamaCppModel(LLM, ChatModelMixin):
         if os.path.exists(legacy_model_file_path):
             model_path = legacy_model_file_path
 
+        multimodal_projector = self._llamacpp_model_config.get(
+            "multimodal_projector", ""
+        )
+        mmproj = (
+            os.path.join(self.model_path, multimodal_projector)
+            if multimodal_projector
+            else ""
+        )
+
         try:
             params = CommonParams()
             # Compatible with xllamacpp changes
@@ -142,6 +158,7 @@ class XllamaCppModel(LLM, ChatModelMixin):
             params.model = model_path
         except Exception:
             params.model.path = model_path
+        params.mmproj.path = mmproj
         if self.model_family.chat_template:
             params.chat_template = self.model_family.chat_template
         # This is the default value, could be overwritten by _llamacpp_model_config
@@ -165,6 +182,41 @@ class XllamaCppModel(LLM, ChatModelMixin):
             # Number of layers to offload to GPU (-ngl). If -1, all layers are offloaded.
             # 0x7FFFFFFF is INT32 max, will be auto set to all layers
             params.n_gpu_layers = 0x7FFFFFFF
+            try:
+                device_info = get_device_info()
+                gpus = [
+                    info
+                    for info in device_info
+                    if info["type"]
+                    == ggml_backend_dev_type.GGML_BACKEND_DEVICE_TYPE_GPU
+                ]
+                if gpus:
+                    logger.info(
+                        "Try to estimate num gpu layers, n_ctx: %s, n_batch: %s, n_parallel: %s, gpus:\n%s",
+                        params.n_ctx,
+                        params.n_batch,
+                        params.n_parallel,
+                        pprint.pformat(gpus),
+                    )
+                    estimate = estimate_gpu_layers(
+                        gpus=gpus,
+                        model_path=model_path,
+                        projectors=[mmproj] if mmproj else [],
+                        context_length=params.n_ctx,
+                        batch_size=params.n_batch,
+                        num_parallel=params.n_parallel,
+                        kv_cache_type="",
+                    )
+                    logger.info("Estimate num gpu layers: %s", estimate)
+                    if estimate.tensor_split:
+                        params.tensor_split = estimate.tensor_split
+                    else:
+                        params.n_gpu_layers = estimate.layers
+            except Exception as e:
+                logger.exception(
+                    "Estimate num gpu layers for llama.cpp backend failed: %s", e
+                )
+
         self._llm = Server(params)
         self._executor = concurrent.futures.ThreadPoolExecutor(
             max_workers=max(10, n_threads)
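With these hunks, the llama.cpp backend (xinference/model/llm/llama_cpp/core.py) resolves an optional multimodal projector file from the model config and, when xllamacpp reports GPU devices, estimates how many layers fit in VRAM via the new memory.py helper instead of unconditionally offloading everything. A hedged launch sketch; the model name, projector file name, and the assumption that explicitly passed n_ctx/n_gpu_layers values take precedence over the estimate are illustrative, not confirmed behavior:

from xinference.client import Client

client = Client("http://127.0.0.1:9997")
model_uid = client.launch_model(
    model_name="gemma-3-it",                 # assumed GGUF vision-capable model
    model_engine="llama.cpp",
    model_format="ggufv2",
    model_size_in_billions=4,
    quantization="Q4_K_M",
    multimodal_projector="mmproj-F16.gguf",  # read from _llamacpp_model_config in the hunk above
    n_ctx=8192,                              # context length also feeds the VRAM estimate
    n_gpu_layers=28,                         # pin offloading manually if the estimate misfires
)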
@@ -207,11 +259,13 @@ class XllamaCppModel(LLM, ChatModelMixin):
                 q.put(res)
             except Exception as e:
                 logger.exception("handle_completions callback failed: %s", e)
+                q.put(_Error(str(e)))
 
         try:
             self._llm.handle_completions(prompt_json, _error_callback, _ok_callback)
         except Exception as ex:
             logger.exception("handle_completions failed: %s", ex)
+            q.put(_Error(str(ex)))
         q.put(_Done)
 
         assert self._executor
@@ -271,6 +325,7 @@ class XllamaCppModel(LLM, ChatModelMixin):
                 q.put(res)
             except Exception as e:
                 logger.exception("handle_chat_completions callback failed: %s", e)
+                q.put(_Error(str(e)))
 
         try:
             self._llm.handle_chat_completions(
@@ -278,6 +333,7 @@ class XllamaCppModel(LLM, ChatModelMixin):
             )
         except Exception as ex:
             logger.exception("handle_chat_completions failed: %s", ex)
+            q.put(_Error(str(ex)))
         q.put(_Done)
 
         assert self._executor
@@ -288,7 +344,7 @@ class XllamaCppModel(LLM, ChatModelMixin):
         def _to_iterator():
             while (r := q.get()) is not _Done:
                 if type(r) is _Error:
-                    raise Exception("Got error in chat stream: %s", r.msg)
+                    raise Exception(f"Got error in chat stream: {r.msg}")
                 # Get valid keys (O(1) lookup)
                 chunk_keys = ChatCompletionChunk.__annotations__
                 # The chunk may contain additional keys (e.g., system_fingerprint),
@@ -302,5 +358,5 @@ class XllamaCppModel(LLM, ChatModelMixin):
         else:
             r = q.get()
             if type(r) is _Error:
-                raise Exception("Got error in chat: %s", r.msg)
+                raise Exception(f"Got error in chat: {r.msg}")
             return self._to_chat_completion(r, self.reasoning_parser)
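The remaining hunks harden error propagation in the completion and chat-completion callbacks: failures are now pushed onto the result queue as _Error before the _Done sentinel, and the raised messages use f-strings (the old Exception("...: %s", r.msg) form never interpolated the message). A simplified, self-contained sketch of that producer/consumer pattern; the _Done/_Error names mirror the diff, but the rest is an illustration, not xinference code:

import queue
import threading


class _Done:
    """Sentinel: the class object itself is pushed to mark end-of-stream."""


class _Error:
    def __init__(self, msg: str):
        self.msg = msg


def worker(q: queue.Queue) -> None:
    try:
        raise RuntimeError("backend blew up")  # stand-in for handle_completions failing
    except Exception as e:
        q.put(_Error(str(e)))  # the 1.7.0 change: surface the failure to the consumer
    q.put(_Done)               # always emit the sentinel so the reader loop terminates


q: queue.Queue = queue.Queue()
threading.Thread(target=worker, args=(q,)).start()
while (r := q.get()) is not _Done:
    if type(r) is _Error:
        # f-string interpolates the message; Exception("Got error: %s", r.msg) would not
        print(f"Got error in stream: {r.msg}")
    else:
        print(r)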