xinference 1.6.0__py3-none-any.whl → 1.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (87)
  1. xinference/_version.py +3 -3
  2. xinference/client/restful/restful_client.py +1 -1
  3. xinference/conftest.py +0 -7
  4. xinference/core/media_interface.py +9 -8
  5. xinference/core/model.py +13 -6
  6. xinference/core/scheduler.py +1 -10
  7. xinference/core/worker.py +0 -10
  8. xinference/model/audio/model_spec.json +53 -1
  9. xinference/model/audio/model_spec_modelscope.json +57 -1
  10. xinference/model/embedding/core.py +19 -11
  11. xinference/model/image/model_spec.json +10 -1
  12. xinference/model/image/model_spec_modelscope.json +20 -0
  13. xinference/model/llm/__init__.py +6 -54
  14. xinference/model/llm/core.py +19 -5
  15. xinference/model/llm/llama_cpp/core.py +59 -3
  16. xinference/model/llm/llama_cpp/memory.py +455 -0
  17. xinference/model/llm/llm_family.json +185 -397
  18. xinference/model/llm/llm_family.py +88 -16
  19. xinference/model/llm/llm_family_modelscope.json +199 -421
  20. xinference/model/llm/llm_family_openmind_hub.json +0 -34
  21. xinference/model/llm/sglang/core.py +4 -0
  22. xinference/model/llm/transformers/__init__.py +27 -6
  23. xinference/model/llm/transformers/chatglm.py +4 -2
  24. xinference/model/llm/transformers/core.py +49 -28
  25. xinference/model/llm/transformers/deepseek_v2.py +6 -49
  26. xinference/model/llm/transformers/gemma3.py +119 -164
  27. xinference/{thirdparty/omnilmm/train → model/llm/transformers/multimodal}/__init__.py +1 -1
  28. xinference/model/llm/transformers/{cogagent.py → multimodal/cogagent.py} +58 -95
  29. xinference/model/llm/transformers/multimodal/core.py +205 -0
  30. xinference/model/llm/transformers/{deepseek_vl2.py → multimodal/deepseek_vl2.py} +59 -120
  31. xinference/model/llm/transformers/multimodal/gemma3.py +117 -0
  32. xinference/model/llm/transformers/{glm4v.py → multimodal/glm4v.py} +57 -93
  33. xinference/model/llm/transformers/multimodal/intern_vl.py +412 -0
  34. xinference/model/llm/transformers/{minicpmv26.py → multimodal/minicpmv26.py} +55 -102
  35. xinference/model/llm/transformers/{ovis2.py → multimodal/ovis2.py} +114 -175
  36. xinference/model/llm/transformers/{qwen-omni.py → multimodal/qwen-omni.py} +82 -167
  37. xinference/model/llm/transformers/multimodal/qwen2_audio.py +131 -0
  38. xinference/model/llm/transformers/{qwen2_vl.py → multimodal/qwen2_vl.py} +224 -256
  39. xinference/model/llm/transformers/opt.py +4 -2
  40. xinference/model/llm/transformers/utils.py +6 -37
  41. xinference/model/llm/vllm/core.py +4 -0
  42. xinference/model/rerank/core.py +7 -1
  43. xinference/model/rerank/utils.py +17 -0
  44. xinference/web/ui/build/asset-manifest.json +3 -3
  45. xinference/web/ui/build/index.html +1 -1
  46. xinference/web/ui/build/static/js/main.ddf9eaee.js +3 -0
  47. xinference/web/ui/build/static/js/main.ddf9eaee.js.map +1 -0
  48. xinference/web/ui/node_modules/.cache/babel-loader/12e637ed5fa9ca6491b03892b6949c03afd4960fe36ac25744488e7e1982aa19.json +1 -0
  49. xinference/web/ui/node_modules/.cache/babel-loader/567e49df411efb24425d289bb484758cb57067ca54f8b5c67fe4505f698deb96.json +1 -0
  50. xinference/web/ui/node_modules/.cache/babel-loader/77ac2665a784e99501ae95d32ef5937837a0439a47e965d291b38e99cb619f5b.json +1 -0
  51. xinference/web/ui/node_modules/.cache/babel-loader/d4ed4e82bfe69915999ec83f5feaa4301c75ecc6bdf1c78f2d03e4671ecbefc8.json +1 -0
  52. xinference/web/ui/src/locales/en.json +3 -1
  53. xinference/web/ui/src/locales/zh.json +3 -1
  54. {xinference-1.6.0.dist-info → xinference-1.6.1.dist-info}/METADATA +16 -14
  55. {xinference-1.6.0.dist-info → xinference-1.6.1.dist-info}/RECORD +60 -76
  56. {xinference-1.6.0.dist-info → xinference-1.6.1.dist-info}/WHEEL +1 -1
  57. xinference/model/llm/transformers/cogvlm2.py +0 -442
  58. xinference/model/llm/transformers/cogvlm2_video.py +0 -333
  59. xinference/model/llm/transformers/deepseek_vl.py +0 -280
  60. xinference/model/llm/transformers/glm_edge_v.py +0 -213
  61. xinference/model/llm/transformers/intern_vl.py +0 -526
  62. xinference/model/llm/transformers/internlm2.py +0 -94
  63. xinference/model/llm/transformers/minicpmv25.py +0 -193
  64. xinference/model/llm/transformers/omnilmm.py +0 -132
  65. xinference/model/llm/transformers/qwen2_audio.py +0 -179
  66. xinference/model/llm/transformers/qwen_vl.py +0 -360
  67. xinference/thirdparty/omnilmm/LICENSE +0 -201
  68. xinference/thirdparty/omnilmm/__init__.py +0 -0
  69. xinference/thirdparty/omnilmm/chat.py +0 -218
  70. xinference/thirdparty/omnilmm/constants.py +0 -4
  71. xinference/thirdparty/omnilmm/conversation.py +0 -332
  72. xinference/thirdparty/omnilmm/model/__init__.py +0 -1
  73. xinference/thirdparty/omnilmm/model/omnilmm.py +0 -595
  74. xinference/thirdparty/omnilmm/model/resampler.py +0 -166
  75. xinference/thirdparty/omnilmm/model/utils.py +0 -578
  76. xinference/thirdparty/omnilmm/train/train_utils.py +0 -150
  77. xinference/thirdparty/omnilmm/utils.py +0 -134
  78. xinference/web/ui/build/static/js/main.ae579a97.js +0 -3
  79. xinference/web/ui/build/static/js/main.ae579a97.js.map +0 -1
  80. xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +0 -1
  81. xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +0 -1
  82. xinference/web/ui/node_modules/.cache/babel-loader/5c08e2cd07809ed3e41486b16652253404cbb63a3ff8d0366ee50f57e2413cea.json +0 -1
  83. xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +0 -1
  84. /xinference/web/ui/build/static/js/{main.ae579a97.js.LICENSE.txt → main.ddf9eaee.js.LICENSE.txt} +0 -0
  85. {xinference-1.6.0.dist-info → xinference-1.6.1.dist-info}/entry_points.txt +0 -0
  86. {xinference-1.6.0.dist-info → xinference-1.6.1.dist-info}/licenses/LICENSE +0 -0
  87. {xinference-1.6.0.dist-info → xinference-1.6.1.dist-info}/top_level.txt +0 -0
xinference/_version.py CHANGED
@@ -8,11 +8,11 @@ import json
 
 version_json = '''
 {
- "date": "2025-05-16T20:05:54+0800",
+ "date": "2025-05-30T19:36:43+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "81a24f4646ace8f41c85a810237491d9c0ad5282",
- "version": "1.6.0"
+ "full-revisionid": "72cc5e39040bdc49981b240c2b59af998554a75f",
+ "version": "1.6.1"
 }
 ''' # END VERSION_JSON
 
xinference/client/restful/restful_client.py CHANGED
@@ -1017,7 +1017,7 @@ class Client:
         model_path: Optional[str]
             Model path, if gguf format, should be the file path, otherwise, should be directory of the model.
         **kwargs:
-            Any other parameters been specified.
+            Any other parameters been specified. e.g. multimodal_projector for multimodal inference with the llama.cpp backend.
 
         Returns
         -------
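The only change to restful_client.py here is that docstring note: the extra keyword is forwarded at launch time so the llama.cpp backend can load a separate projector file for vision input. A minimal sketch of what that could look like from the client side, assuming a local supervisor on port 9997; the model name and projector file name below are placeholders, not values taken from this diff:

from xinference.client import Client

client = Client("http://localhost:9997")

# Hypothetical launch of a gguf vision model; multimodal_projector (new in
# 1.6.1) is passed through **kwargs and tells the llama.cpp backend which
# mmproj file to load alongside the main weights.
model_uid = client.launch_model(
    model_name="qwen2.5-vl-instruct",              # placeholder model name
    model_engine="llama.cpp",
    model_format="ggufv2",
    quantization="Q4_K_M",
    multimodal_projector="mmproj-model-f16.gguf",  # placeholder projector file
)
model = client.get_model(model_uid)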
xinference/conftest.py CHANGED
@@ -304,10 +304,3 @@ def setup_with_auth():
             os.remove(auth_file)
         except:
             pass
-
-
-@pytest.fixture
-def set_use_xllamacpp():
-    os.environ["USE_XLLAMACPP"] = "1"
-    yield
-    del os.environ["USE_XLLAMACPP"]
xinference/core/media_interface.py CHANGED
@@ -19,7 +19,7 @@ import os
 import threading
 import time
 import uuid
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import gradio as gr
 import PIL.Image
@@ -463,7 +463,7 @@ class MediaInterface:
 
     def image2video_interface(self) -> "gr.Blocks":
        def image_generate_video(
-            image: "PIL.Image",
+            image: "PIL.Image.Image",
            prompt: str,
            negative_prompt: str,
            num_frames: int,
@@ -653,13 +653,14 @@ class MediaInterface:
             with open(prompt_speech_file, "rb") as f:
                 prompt_speech_bytes = f.read()
 
+            kw: Dict[str, Any] = {}
+            if prompt_speech_bytes:
+                kw["prompt_speech"] = prompt_speech_bytes
+            if prompt_text:
+                kw["prompt_text"] = prompt_text
+
             response = model.speech(
-                input=input_text,
-                voice=voice,
-                speed=speed,
-                response_format="mp3",
-                prompt_speech=prompt_speech_bytes,
-                prompt_text=prompt_text,
+                input=input_text, voice=voice, speed=speed, response_format="mp3", **kw
             )
 
             # Write to a temp .mp3 file and return its path
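The refactor above forwards the zero-shot voice-cloning arguments only when the user actually supplied them. The same pattern from the caller's side, as a hedged sketch; the endpoint, model uid, reference audio path and transcript are all placeholders:

from typing import Any, Dict

from xinference.client import Client

client = Client("http://localhost:9997")           # placeholder endpoint
model = client.get_model("my-tts-model-uid")       # placeholder uid of a launched TTS model

kw: Dict[str, Any] = {}
try:
    with open("reference.wav", "rb") as f:         # optional zero-shot reference (placeholder path)
        kw["prompt_speech"] = f.read()
    kw["prompt_text"] = "transcript of the reference audio"  # placeholder
except FileNotFoundError:
    pass  # no reference audio: plain TTS without voice cloning

audio_bytes = model.speech(
    input="Hello from Xinference.",
    voice="",
    speed=1.0,
    response_format="mp3",
    **kw,
)
with open("out.mp3", "wb") as f:
    f.write(audio_bytes)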
xinference/core/model.py CHANGED
@@ -71,12 +71,8 @@ except ImportError:
     OutOfMemoryError = _OutOfMemoryError
 
 
-XINFERENCE_BATCHING_ALLOWED_VISION_MODELS = [
-    "qwen-vl-chat",
-    "cogvlm2",
-    "glm-4v",
-    "MiniCPM-V-2.6",
-]
+# !!!!! DO NOT add model_name to this list, using `register_batching_multimodal_models` below instead.
+XINFERENCE_BATCHING_ALLOWED_VISION_MODELS = []
 
 XINFERENCE_TEXT_TO_IMAGE_BATCHING_ALLOWED_MODELS = ["FLUX.1-dev", "FLUX.1-schnell"]
 XINFERENCE_TEST_OUT_OF_MEMORY_ERROR = bool(
@@ -84,6 +80,16 @@ XINFERENCE_TEST_OUT_OF_MEMORY_ERROR = bool(
 )
 
 
+def register_batching_multimodal_models(*model_names: str):
+    def decorator(cls):
+        for name in model_names:
+            if name not in XINFERENCE_BATCHING_ALLOWED_VISION_MODELS:
+                XINFERENCE_BATCHING_ALLOWED_VISION_MODELS.append(name)
+        return cls
+
+    return decorator
+
+
 def request_limit(fn):
     """
     Used by ModelActor.
@@ -977,6 +983,7 @@ class ModelActor(xo.StatelessActor, CancelMixin):
                 response_format,
                 temperature,
                 timestamp_granularities,
+                **kwargs,
             )
         raise AttributeError(
             f"Model {self._model.model_spec} is not for creating transcriptions."
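Registration of batchable vision models now happens at the class definition site instead of the hard-coded list. A minimal sketch of the intended usage; the class below is an illustrative stand-in rather than a class from this diff, and the model names come from the old hard-coded list:

from xinference.core.model import register_batching_multimodal_models

# Decorating the class appends its model names to
# XINFERENCE_BATCHING_ALLOWED_VISION_MODELS, enabling continuous batching
# for those models without editing the global list by hand.
@register_batching_multimodal_models("MiniCPM-V-2.6", "glm-4v")
class SomeVisionChatModel:
    """Illustrative placeholder for a transformers-based multimodal model class."""
    ...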
xinference/core/scheduler.py CHANGED
@@ -272,15 +272,6 @@ class InferenceRequest:
         )
 
 
-def _get_valid_batch_kv_cache(cache, skipped_indexes: Set[int]):
-    batch_size = cache.key_cache[0].shape[0]
-    batch_slices = [num for num in range(batch_size) if num not in skipped_indexes]
-    for idx in range(len(cache)):
-        cache.key_cache[idx] = cache.key_cache[idx][batch_slices, ::].contiguous()
-        cache.value_cache[idx] = cache.value_cache[idx][batch_slices, ::].contiguous()
-    return cache
-
-
 class SchedulerActor(xo.StatelessActor):
     @classmethod
     def gen_uid(cls, model_uid: str, replica_id: str):
@@ -409,7 +400,7 @@ class SchedulerActor(xo.StatelessActor):
             # Some requests have been completed. Batch size needs to be reduced for kv cache.
             if stopped_batch_indexes and len(self._running_queue) > 0:
                 kv_cache = self._running_queue[0].kv_cache
-                reduced_kv_cache = _get_valid_batch_kv_cache(
+                reduced_kv_cache = self._model.build_reduced_kv_cache(
                     kv_cache, stopped_batch_indexes
                 )
                 for r in self._running_queue:
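The scheduler now asks the model for a shrunken KV cache instead of slicing it with a module-level helper. A sketch of what such a model-side hook could look like, lifted from the helper removed above; the placement of the method is an assumption (per the file list, the real implementation presumably moves into the transformers backend, e.g. transformers/core.py):

from typing import Set

def build_reduced_kv_cache(cache, skipped_indexes: Set[int]):
    """Drop the rows of finished requests from a batched transformers KV cache."""
    batch_size = cache.key_cache[0].shape[0]
    batch_slices = [i for i in range(batch_size) if i not in skipped_indexes]
    for idx in range(len(cache)):
        cache.key_cache[idx] = cache.key_cache[idx][batch_slices, ::].contiguous()
        cache.value_cache[idx] = cache.value_cache[idx][batch_slices, ::].contiguous()
    return cache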
xinference/core/worker.py CHANGED
@@ -533,16 +533,6 @@ class WorkerActor(xo.StatelessActor):
                 existing_model_uids.append(rep_uid)
             if idx in self._gpu_to_embedding_model_uids:
                 existing_model_uids.extend(self._gpu_to_embedding_model_uids[idx])
-            # If user has run the vLLM model on the GPU that was forced to be specified,
-            # it is not possible to force this GPU to be allocated again
-            if idx in self._user_specified_gpu_to_model_uids:
-                for rep_uid, _ in self._user_specified_gpu_to_model_uids[idx]:
-                    is_vllm_model = await self.is_model_vllm_backend(rep_uid)
-                    if is_vllm_model:
-                        raise RuntimeError(
-                            f"User specified GPU index {idx} has been occupied with a vLLM model: {rep_uid}, "
-                            f"therefore cannot allocate GPU memory for a new model."
-                        )
 
         if existing_model_uids:
             logger.warning(
xinference/model/audio/model_spec.json CHANGED
@@ -218,13 +218,65 @@
       "batch_size_s": 300
     }
   },
+  {
+    "model_name": "paraformer-zh-hotword",
+    "model_family": "funasr",
+    "model_id": "JunHowie/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404",
+    "model_revision": "26d622993683d7b0c517ee5ec9c1c8bdde76e324",
+    "model_ability": ["audio2text"],
+    "multilingual": false,
+    "default_model_config": {
+      "vad_model": "fsmn-vad",
+      "punc_model": "ct-punc"
+    },
+    "default_transcription_config": {
+      "batch_size_s": 300
+    }
+  },
+  {
+    "model_name": "paraformer-zh-long",
+    "model_family": "funasr",
+    "model_id": "JunHowie/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
+    "model_revision": "b6d8cb81645e34056cd3dda41e5624a740587de3",
+    "model_ability": ["audio2text"],
+    "multilingual": false,
+    "default_model_config": {
+      "vad_model": "fsmn-vad",
+      "punc_model": "ct-punc"
+    },
+    "default_transcription_config": {
+      "batch_size_s": 300
+    }
+  },
+  {
+    "model_name": "paraformer-zh-spk",
+    "model_family": "funasr",
+    "model_id": "JunHowie/speech_paraformer-large-vad-punc-spk_asr_nat-zh-cn",
+    "model_revision": "36abd64af4392fe02bf76453bc86c081cf1ca6da",
+    "model_ability": ["audio2text"],
+    "multilingual": false,
+    "default_model_config": {
+      "vad_model": "fsmn-vad",
+      "punc_model": "ct-punc"
+    },
+    "default_transcription_config": {
+      "batch_size_s": 300
+    }
+  },
   {
     "model_name": "ChatTTS",
     "model_family": "ChatTTS",
     "model_id": "2Noise/ChatTTS",
     "model_revision": "1a3c04a8b0651689bd9242fbb55b1f4b5a9aef84",
     "model_ability": ["text2audio"],
-    "multilingual": true
+    "multilingual": true,
+    "virtualenv": {
+      "packages": [
+        "ChatTTS>=0.2.1",
+        "#system_torch#",
+        "#system_numpy#"
+      ]
+    }
   },
   {
     "model_name": "CosyVoice-300M",
xinference/model/audio/model_spec_modelscope.json CHANGED
@@ -51,6 +51,55 @@
     "model_name": "paraformer-zh",
     "model_family": "funasr",
     "model_hub": "modelscope",
+    "model_id": "iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
+    "model_revision": "master",
+    "model_ability": ["audio2text"],
+    "multilingual": false,
+    "default_model_config": {
+      "vad_model": "fsmn-vad",
+      "punc_model": "ct-punc"
+    },
+    "default_transcription_config": {
+      "batch_size_s": 300
+    }
+  },
+  {
+    "model_name": "paraformer-zh-hotword",
+    "model_family": "funasr",
+    "model_hub": "modelscope",
+    "model_id": "iic/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404",
+    "model_revision": "master",
+    "model_ability": ["audio2text"],
+    "multilingual": false,
+    "default_model_config": {
+      "vad_model": "fsmn-vad",
+      "punc_model": "ct-punc"
+    },
+    "default_transcription_config": {
+      "hotword": "",
+      "batch_size_s": 300
+    }
+  },
+  {
+    "model_name": "paraformer-zh-long",
+    "model_family": "funasr",
+    "model_hub": "modelscope",
+    "model_id": "iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
+    "model_revision": "master",
+    "model_ability": ["audio2text"],
+    "multilingual": false,
+    "default_model_config": {
+      "vad_model": "fsmn-vad",
+      "punc_model": "ct-punc"
+    },
+    "default_transcription_config": {
+      "batch_size_s": 300
+    }
+  },
+  {
+    "model_name": "paraformer-zh-spk",
+    "model_family": "funasr",
+    "model_hub": "modelscope",
     "model_id": "iic/speech_paraformer-large-vad-punc-spk_asr_nat-zh-cn",
     "model_revision": "master",
     "model_ability": ["audio2text"],
@@ -70,7 +119,14 @@
     "model_id": "AI-ModelScope/ChatTTS",
     "model_revision": "master",
     "model_ability": ["text2audio"],
-    "multilingual": true
+    "multilingual": true,
+    "virtualenv": {
+      "packages": [
+        "ChatTTS>=0.2.1",
+        "#system_torch#",
+        "#system_numpy#"
+      ]
+    }
   },
   {
     "model_name": "CosyVoice-300M",
xinference/model/embedding/core.py CHANGED
@@ -651,19 +651,27 @@ class EmbeddingModel:
             img = Image.open(image_data)
             return img
 
-        objs: list[dict[str, str]] = []
-        for item in sentences:
-            if isinstance(item, dict):
-                if item.get("text") is not None:
-                    objs.append(item["text"])
-                elif item.get("image") is not None:
-                    if re.match(r"^data:image/.+;base64,", item["image"]):
-                        image = base64_to_image(item["image"])
-                        objs.append(image)
+        objs: list[str] = []
+        if isinstance(sentences, str):
+            objs.append(sentences)
+        else:
+            for item in sentences:
+                if isinstance(item, dict):
+                    if item.get("text") is not None:
+                        objs.append(item["text"])
+                    elif item.get("image") is not None:
+                        if re.match(r"^data:image/.+;base64,", item["image"]):
+                            image = base64_to_image(item["image"])
+                            objs.append(image)
+                        else:
+                            objs.append(item["image"])
                     else:
-                    objs.append(item["image"])
+                        raise ValueError("Please check the input data.")
+                elif isinstance(item, str):
+                    objs.append(item)
                 else:
-                logger.error("Please check the input data.")
+                    raise ValueError("Please check the input data.")
+
         all_embeddings, all_token_nums = encode(
             self._model,
             objs,
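After this rewrite the multimodal embedding path accepts a bare string, a list of strings, or a list of dicts carrying a "text" or "image" entry (URL/path or base64 data URI), and raises ValueError for anything else instead of merely logging. Illustrative input shapes only; the values below are placeholders:

# All of these are valid `sentences` values for the rewritten normalization:
sentences_a = "a single query string"
sentences_b = ["first passage", "second passage"]
sentences_c = [
    {"text": "a photo of a cat"},
    {"image": "https://example.com/cat.png"},           # plain URL or local path
    {"image": "data:image/png;base64,iVBORw0KGgo..."},  # base64 data URI, decoded to a PIL image
]
# A dict without "text"/"image", or any other item type, now raises ValueError.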
xinference/model/image/model_spec.json CHANGED
@@ -303,7 +303,16 @@
     "model_ability": [
       "text2image",
       "image2image"
-    ]
+    ],
+    "default_model_config": {
+      "variant": "fp16"
+    },
+    "virtualenv": {
+      "packages": [
+        "diffusers>=0.30.0",
+        "#system_numpy#"
+      ]
+    }
   },
   {
     "model_name": "stable-diffusion-inpainting",
xinference/model/image/model_spec_modelscope.json CHANGED
@@ -307,6 +307,26 @@
       }
     ]
   },
+  {
+    "model_name": "kolors",
+    "model_family": "stable_diffusion",
+    "model_hub": "modelscope",
+    "model_id": "JunHowie/Kolors-diffusers",
+    "model_revision": "master",
+    "model_ability": [
+      "text2image",
+      "image2image"
+    ],
+    "default_model_config": {
+      "variant": "fp16"
+    },
+    "virtualenv": {
+      "packages": [
+        "diffusers>=0.30.0",
+        "#system_numpy#"
+      ]
+    }
+  },
   {
     "model_name": "GOT-OCR2_0",
     "model_family": "ocr",
xinference/model/llm/__init__.py CHANGED
@@ -73,7 +73,7 @@ def generate_engine_config_by_model_family(model_family):
         model_size_in_billions = spec.model_size_in_billions
         quantizations = spec.quantizations
         for quantization in quantizations:
-            # traverse all supported engines to match the name, format, size in billions and quatization of model
+            # traverse all supported engines to match the name, format, size in billions and quantization of model
             for engine in SUPPORTED_ENGINES:
                 if not check_format_with_engine(
                     model_format, engine
@@ -107,6 +107,10 @@ def generate_engine_config_by_model_family(model_family):
                             "llm_class": cls,
                         }
                     )
+                    if hasattr(spec, "multimodal_projectors"):
+                        engine_params[-1][
+                            "multimodal_projectors"
+                        ] = spec.multimodal_projectors
                     engines[engine] = engine_params
                     break
     LLM_ENGINES[model_name] = engines
@@ -163,36 +167,9 @@ def _install():
     from .lmdeploy.core import LMDeployChatModel, LMDeployModel
     from .mlx.core import MLXChatModel, MLXModel, MLXVisionModel
     from .sglang.core import SGLANGChatModel, SGLANGModel, SGLANGVisionModel
-    from .transformers.chatglm import ChatglmPytorchChatModel
-    from .transformers.cogagent import CogAgentChatModel
-    from .transformers.cogvlm2 import CogVLM2Model
-    from .transformers.cogvlm2_video import CogVLM2VideoModel
     from .transformers.core import PytorchChatModel, PytorchModel
-    from .transformers.deepseek_v2 import (
-        DeepSeekV2PytorchChatModel,
-        DeepSeekV2PytorchModel,
-    )
-    from .transformers.deepseek_vl import DeepSeekVLChatModel
-    from .transformers.deepseek_vl2 import DeepSeekVL2ChatModel
-    from .transformers.gemma3 import Gemma3ChatModel, Gemma3TextChatModel
-    from .transformers.glm4v import Glm4VModel
-    from .transformers.glm_edge_v import GlmEdgeVModel
-    from .transformers.minicpmv25 import MiniCPMV25Model
-    from .transformers.minicpmv26 import MiniCPMV26Model
-    from .transformers.opt import OptPytorchModel
-    from .transformers.ovis2 import Ovis2ChatModel
-    from .transformers.qwen2_audio import Qwen2AudioChatModel
-    from .transformers.qwen_vl import QwenVLChatModel
     from .vllm.core import VLLMChatModel, VLLMModel, VLLMVisionModel
 
-    try:
-        from .transformers.omnilmm import OmniLMMModel
-    except ImportError as e:
-        # For quite old transformers version,
-        # import will generate error
-        OmniLMMModel = None
-        warnings.warn(f"Cannot import OmniLLMModel due to reason: {e}")
-
     # register llm classes.
     LLAMA_CLASSES.extend(
         [
@@ -203,32 +180,7 @@ def _install():
     VLLM_CLASSES.extend([VLLMModel, VLLMChatModel, VLLMVisionModel])
     MLX_CLASSES.extend([MLXModel, MLXChatModel, MLXVisionModel])
     LMDEPLOY_CLASSES.extend([LMDeployModel, LMDeployChatModel])
-    TRANSFORMERS_CLASSES.extend(
-        [
-            ChatglmPytorchChatModel,
-            PytorchChatModel,
-            QwenVLChatModel,
-            Qwen2AudioChatModel,
-            DeepSeekVLChatModel,
-            DeepSeekVL2ChatModel,
-            PytorchModel,
-            CogVLM2Model,
-            CogVLM2VideoModel,
-            MiniCPMV25Model,
-            MiniCPMV26Model,
-            Glm4VModel,
-            DeepSeekV2PytorchModel,
-            DeepSeekV2PytorchChatModel,
-            OptPytorchModel,
-            GlmEdgeVModel,
-            CogAgentChatModel,
-            Gemma3TextChatModel,
-            Gemma3ChatModel,
-            Ovis2ChatModel,
-        ]
-    )
-    if OmniLMMModel:  # type: ignore
-        TRANSFORMERS_CLASSES.append(OmniLMMModel)
+    TRANSFORMERS_CLASSES.extend([PytorchChatModel, PytorchModel])
 
     # support 4 engines for now
     SUPPORTED_ENGINES["vLLM"] = VLLM_CLASSES
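With the hasattr guard above, a gguf spec that declares projector files now surfaces them in the per-engine parameters. An illustrative entry shape only: the diff confirms just the llm_class and multimodal_projectors keys, so every other key and value below is a placeholder:

engine_param = {
    "model_format": "ggufv2",                            # placeholder
    "model_size_in_billions": 7,                         # placeholder
    "quantization": "Q4_K_M",                            # placeholder
    "llm_class": "<LlamaCppChatModel>",                  # a class object in reality
    "multimodal_projectors": ["mmproj-model-f16.gguf"],  # copied from spec.multimodal_projectors
}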
xinference/model/llm/core.py CHANGED
@@ -160,12 +160,14 @@ class LLMDescription(ModelDescription):
         llm_family: "LLMFamilyV1",
         llm_spec: "LLMSpecV1",
         quantization: Optional[str],
+        multimodal_projector: Optional[str] = None,
         model_path: Optional[str] = None,
     ):
         super().__init__(address, devices, model_path=model_path)
         self._llm_family = llm_family
         self._llm_spec = llm_spec
         self._quantization = quantization
+        self._multimodal_projector = multimodal_projector
 
     @property
     def spec(self):
@@ -185,6 +187,7 @@ class LLMDescription(ModelDescription):
             "model_family": self._llm_family.model_family
             or self._llm_family.model_name,
             "quantization": self._quantization,
+            "multimodal_projector": self._multimodal_projector,
             "model_hub": self._llm_spec.model_hub,
             "revision": self._llm_spec.model_revision,
             "context_length": self._llm_family.context_length,
@@ -204,6 +207,7 @@ class LLMDescription(ModelDescription):
             "model_file_location": model_file_location,
             "cache_status": cache_status,
             "quantization": self._quantization,
+            "multimodal_projector": self._multimodal_projector,
             "model_format": self._llm_spec.model_format,
             "model_size_in_billions": self._llm_spec.model_size_in_billions,
         }
@@ -212,10 +216,19 @@
 def generate_llm_description(llm_family: "LLMFamilyV1") -> Dict[str, List[Dict]]:
     res = defaultdict(list)
     for spec in llm_family.model_specs:
+        multimodal_projectors = getattr(spec, "multimodal_projectors", None)
         for q in spec.quantizations:
-            res[llm_family.model_name].append(
-                LLMDescription(None, None, llm_family, spec, q).to_version_info()
-            )
+            if multimodal_projectors:
+                for mmproj in multimodal_projectors:
+                    res[llm_family.model_name].append(
+                        LLMDescription(
+                            None, None, llm_family, spec, q, mmproj
+                        ).to_version_info()
+                    )
+            else:
+                res[llm_family.model_name].append(
+                    LLMDescription(None, None, llm_family, spec, q).to_version_info()
+                )
     return res
 
 
@@ -260,8 +273,9 @@ def create_llm_model_instance(
     )
     logger.debug(f"Launching {model_uid} with {llm_cls.__name__}")
 
+    multimodal_projector = kwargs.get("multimodal_projector")
     if not model_path:
-        model_path = cache(llm_family, llm_spec, quantization)
+        model_path = cache(llm_family, llm_spec, quantization, multimodal_projector)
 
     peft_model = peft_model_config.peft_model if peft_model_config else None
     if peft_model is not None:
@@ -288,5 +302,5 @@ def create_llm_model_instance(
         model_uid, llm_family, llm_spec, quantization, model_path, kwargs
     )
     return model, LLMDescription(
-        subpool_addr, devices, llm_family, llm_spec, quantization
+        subpool_addr, devices, llm_family, llm_spec, quantization, multimodal_projector
    )
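Version enumeration now expands over projectors as well as quantizations, so a spec with N quantizations and M projectors yields N×M version entries, each tagged with its projector. An illustrative entry built only from the keys visible in to_version_info above; all values are placeholders:

version_info = {
    "model_file_location": "/path/to/cached/model",   # placeholder
    "cache_status": False,
    "quantization": "Q4_K_M",
    "multimodal_projector": "mmproj-model-f16.gguf",  # new field in 1.6.1
    "model_format": "ggufv2",
    "model_size_in_billions": 7,
}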