xinference 1.1.0__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (104)
  1. xinference/_compat.py +2 -0
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +23 -1
  4. xinference/core/model.py +1 -6
  5. xinference/core/utils.py +10 -6
  6. xinference/model/audio/core.py +5 -0
  7. xinference/model/audio/cosyvoice.py +25 -3
  8. xinference/model/audio/f5tts.py +15 -10
  9. xinference/model/audio/f5tts_mlx.py +260 -0
  10. xinference/model/audio/fish_speech.py +35 -111
  11. xinference/model/audio/model_spec.json +19 -3
  12. xinference/model/audio/model_spec_modelscope.json +9 -0
  13. xinference/model/audio/utils.py +32 -0
  14. xinference/model/image/core.py +69 -1
  15. xinference/model/image/model_spec.json +127 -4
  16. xinference/model/image/model_spec_modelscope.json +130 -4
  17. xinference/model/image/stable_diffusion/core.py +45 -13
  18. xinference/model/llm/llm_family.json +47 -0
  19. xinference/model/llm/llm_family.py +15 -36
  20. xinference/model/llm/llm_family_modelscope.json +49 -0
  21. xinference/model/llm/mlx/core.py +68 -13
  22. xinference/model/llm/transformers/core.py +1 -0
  23. xinference/model/llm/transformers/qwen2_vl.py +2 -0
  24. xinference/model/llm/utils.py +1 -0
  25. xinference/model/llm/vllm/core.py +11 -2
  26. xinference/thirdparty/cosyvoice/bin/average_model.py +92 -0
  27. xinference/thirdparty/cosyvoice/bin/export_jit.py +12 -2
  28. xinference/thirdparty/cosyvoice/bin/export_onnx.py +112 -0
  29. xinference/thirdparty/cosyvoice/bin/export_trt.sh +9 -0
  30. xinference/thirdparty/cosyvoice/bin/inference.py +5 -7
  31. xinference/thirdparty/cosyvoice/bin/train.py +42 -8
  32. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +96 -25
  33. xinference/thirdparty/cosyvoice/cli/frontend.py +77 -30
  34. xinference/thirdparty/cosyvoice/cli/model.py +330 -80
  35. xinference/thirdparty/cosyvoice/dataset/dataset.py +6 -2
  36. xinference/thirdparty/cosyvoice/dataset/processor.py +76 -14
  37. xinference/thirdparty/cosyvoice/flow/decoder.py +92 -13
  38. xinference/thirdparty/cosyvoice/flow/flow.py +99 -9
  39. xinference/thirdparty/cosyvoice/flow/flow_matching.py +110 -13
  40. xinference/thirdparty/cosyvoice/flow/length_regulator.py +5 -4
  41. xinference/thirdparty/cosyvoice/hifigan/discriminator.py +140 -0
  42. xinference/thirdparty/cosyvoice/hifigan/generator.py +58 -42
  43. xinference/thirdparty/cosyvoice/hifigan/hifigan.py +67 -0
  44. xinference/thirdparty/cosyvoice/llm/llm.py +139 -6
  45. xinference/thirdparty/cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +58836 -0
  46. xinference/thirdparty/cosyvoice/tokenizer/tokenizer.py +279 -0
  47. xinference/thirdparty/cosyvoice/transformer/embedding.py +2 -2
  48. xinference/thirdparty/cosyvoice/transformer/encoder_layer.py +7 -7
  49. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +318 -0
  50. xinference/thirdparty/cosyvoice/utils/common.py +28 -1
  51. xinference/thirdparty/cosyvoice/utils/executor.py +69 -7
  52. xinference/thirdparty/cosyvoice/utils/file_utils.py +2 -12
  53. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +9 -5
  54. xinference/thirdparty/cosyvoice/utils/losses.py +20 -0
  55. xinference/thirdparty/cosyvoice/utils/scheduler.py +1 -2
  56. xinference/thirdparty/cosyvoice/utils/train_utils.py +101 -45
  57. xinference/thirdparty/fish_speech/fish_speech/conversation.py +94 -83
  58. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +63 -20
  59. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +1 -26
  60. xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +1 -1
  61. xinference/thirdparty/fish_speech/fish_speech/tokenizer.py +152 -0
  62. xinference/thirdparty/fish_speech/fish_speech/train.py +2 -2
  63. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +1 -1
  64. xinference/thirdparty/fish_speech/tools/{post_api.py → api_client.py} +7 -13
  65. xinference/thirdparty/fish_speech/tools/api_server.py +98 -0
  66. xinference/thirdparty/fish_speech/tools/download_models.py +5 -5
  67. xinference/thirdparty/fish_speech/tools/fish_e2e.py +2 -2
  68. xinference/thirdparty/fish_speech/tools/inference_engine/__init__.py +192 -0
  69. xinference/thirdparty/fish_speech/tools/inference_engine/reference_loader.py +125 -0
  70. xinference/thirdparty/fish_speech/tools/inference_engine/utils.py +39 -0
  71. xinference/thirdparty/fish_speech/tools/inference_engine/vq_manager.py +57 -0
  72. xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +2 -2
  73. xinference/thirdparty/fish_speech/tools/llama/generate.py +117 -89
  74. xinference/thirdparty/fish_speech/tools/run_webui.py +104 -0
  75. xinference/thirdparty/fish_speech/tools/schema.py +11 -28
  76. xinference/thirdparty/fish_speech/tools/server/agent/__init__.py +57 -0
  77. xinference/thirdparty/fish_speech/tools/server/agent/generate.py +119 -0
  78. xinference/thirdparty/fish_speech/tools/server/agent/generation_utils.py +122 -0
  79. xinference/thirdparty/fish_speech/tools/server/agent/pre_generation_utils.py +72 -0
  80. xinference/thirdparty/fish_speech/tools/server/api_utils.py +75 -0
  81. xinference/thirdparty/fish_speech/tools/server/exception_handler.py +27 -0
  82. xinference/thirdparty/fish_speech/tools/server/inference.py +45 -0
  83. xinference/thirdparty/fish_speech/tools/server/model_manager.py +122 -0
  84. xinference/thirdparty/fish_speech/tools/server/model_utils.py +129 -0
  85. xinference/thirdparty/fish_speech/tools/server/views.py +246 -0
  86. xinference/thirdparty/fish_speech/tools/webui/__init__.py +173 -0
  87. xinference/thirdparty/fish_speech/tools/webui/inference.py +91 -0
  88. xinference/thirdparty/fish_speech/tools/webui/variables.py +14 -0
  89. xinference/thirdparty/matcha/utils/utils.py +2 -2
  90. {xinference-1.1.0.dist-info → xinference-1.1.1.dist-info}/METADATA +11 -6
  91. {xinference-1.1.0.dist-info → xinference-1.1.1.dist-info}/RECORD +95 -74
  92. xinference/thirdparty/cosyvoice/bin/__init__.py +0 -0
  93. xinference/thirdparty/cosyvoice/bin/export_trt.py +0 -8
  94. xinference/thirdparty/cosyvoice/flow/__init__.py +0 -0
  95. xinference/thirdparty/cosyvoice/hifigan/__init__.py +0 -0
  96. xinference/thirdparty/cosyvoice/llm/__init__.py +0 -0
  97. xinference/thirdparty/fish_speech/tools/__init__.py +0 -0
  98. xinference/thirdparty/fish_speech/tools/api.py +0 -943
  99. xinference/thirdparty/fish_speech/tools/msgpack_api.py +0 -95
  100. xinference/thirdparty/fish_speech/tools/webui.py +0 -548
  101. {xinference-1.1.0.dist-info → xinference-1.1.1.dist-info}/LICENSE +0 -0
  102. {xinference-1.1.0.dist-info → xinference-1.1.1.dist-info}/WHEEL +0 -0
  103. {xinference-1.1.0.dist-info → xinference-1.1.1.dist-info}/entry_points.txt +0 -0
  104. {xinference-1.1.0.dist-info → xinference-1.1.1.dist-info}/top_level.txt +0 -0
@@ -12,8 +12,24 @@
         ],
         "default_model_config": {
             "quantize": true,
-            "quantize_text_encoder": "text_encoder_2"
-        }
+            "quantize_text_encoder": "text_encoder_2",
+            "torch_dtype": "bfloat16"
+        },
+        "gguf_model_id": "Xorbits/FLUX.1-schnell-gguf",
+        "gguf_quantizations": [
+            "F16",
+            "Q2_K",
+            "Q3_K_S",
+            "Q4_0",
+            "Q4_1",
+            "Q4_K_S",
+            "Q5_0",
+            "Q5_1",
+            "Q5_K_S",
+            "Q6_K",
+            "Q8_0"
+        ],
+        "gguf_model_file_name_template": "flux1-schnell-{quantization}.gguf"
     },
     {
         "model_name": "FLUX.1-dev",
@@ -28,8 +44,24 @@
         ],
         "default_model_config": {
             "quantize": true,
-            "quantize_text_encoder": "text_encoder_2"
-        }
+            "quantize_text_encoder": "text_encoder_2",
+            "torch_dtype": "bfloat16"
+        },
+        "gguf_model_id": "AI-ModelScope/FLUX.1-dev-gguf",
+        "gguf_quantizations": [
+            "F16",
+            "Q2_K",
+            "Q3_K_S",
+            "Q4_0",
+            "Q4_1",
+            "Q4_K_S",
+            "Q5_0",
+            "Q5_1",
+            "Q5_K_S",
+            "Q6_K",
+            "Q8_0"
+        ],
+        "gguf_model_file_name_template": "flux1-dev-{quantization}.gguf"
     },
     {
         "model_name": "sd3-medium",
@@ -47,6 +79,100 @@
             "quantize_text_encoder": "text_encoder_3"
         }
     },
+    {
+        "model_name": "sd3.5-medium",
+        "model_family": "stable_diffusion",
+        "model_hub": "modelscope",
+        "model_id": "AI-ModelScope/stable-diffusion-3.5-medium",
+        "model_revision": "master",
+        "model_ability": [
+            "text2image",
+            "image2image",
+            "inpainting"
+        ],
+        "default_model_config": {
+            "quantize": true,
+            "quantize_text_encoder": "text_encoder_3",
+            "torch_dtype": "bfloat16"
+        },
+        "gguf_model_id": "Xorbits/stable-diffusion-3.5-medium-gguf",
+        "gguf_quantizations": [
+            "F16",
+            "Q3_K_M",
+            "Q3_K_S",
+            "Q4_0",
+            "Q4_1",
+            "Q4_K_M",
+            "Q4_K_S",
+            "Q5_0",
+            "Q5_1",
+            "Q5_K_M",
+            "Q5_K_S",
+            "Q6_K",
+            "Q8_0"
+        ],
+        "gguf_model_file_name_template": "sd3.5_medium-{quantization}.gguf"
+    },
+    {
+        "model_name": "sd3.5-large",
+        "model_family": "stable_diffusion",
+        "model_hub": "modelscope",
+        "model_id": "AI-ModelScope/stable-diffusion-3.5-large",
+        "model_revision": "master",
+        "model_ability": [
+            "text2image",
+            "image2image",
+            "inpainting"
+        ],
+        "default_model_config": {
+            "quantize": true,
+            "quantize_text_encoder": "text_encoder_3",
+            "torch_dtype": "bfloat16",
+            "transformer_nf4": true
+        },
+        "gguf_model_id": "Xorbits/stable-diffusion-3.5-large-gguf",
+        "gguf_quantizations": [
+            "F16",
+            "Q4_0",
+            "Q4_1",
+            "Q5_0",
+            "Q5_1",
+            "Q8_0"
+        ],
+        "gguf_model_file_name_template": "sd3.5_large-{quantization}.gguf"
+    },
+    {
+        "model_name": "sd3.5-large-turbo",
+        "model_family": "stable_diffusion",
+        "model_hub": "modelscope",
+        "model_id": "AI-ModelScope/stable-diffusion-3.5-large-turbo",
+        "model_revision": "master",
+        "model_ability": [
+            "text2image",
+            "image2image",
+            "inpainting"
+        ],
+        "default_model_config": {
+            "quantize": true,
+            "quantize_text_encoder": "text_encoder_3",
+            "torch_dtype": "bfloat16",
+            "transformer_nf4": true
+        },
+        "default_generate_config": {
+            "guidance_scale": 1.0,
+            "num_inference_steps": 4
+        },
+        "gguf_model_id": "Xorbits/stable-diffusion-3.5-large-turbo-gguf",
+        "gguf_quantizations": [
+            "F16",
+            "Q4_0",
+            "Q4_1",
+            "Q5_0",
+            "Q5_1",
+            "Q8_0"
+        ],
+        "gguf_model_file_name_template": "sd3.5_large_turbo-{quantization}.gguf"
+    },
     {
         "model_name": "sd-turbo",
         "model_family": "stable_diffusion",
@@ -14,8 +14,10 @@
 
 import contextlib
 import gc
+import importlib
 import inspect
 import itertools
+import json
 import logging
 import os
 import re
@@ -86,6 +88,7 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         lora_load_kwargs: Optional[Dict] = None,
         lora_fuse_kwargs: Optional[Dict] = None,
         model_spec: Optional["ImageModelFamilyV1"] = None,
+        gguf_model_path: Optional[str] = None,
         **kwargs,
     ):
         self._model_uid = model_uid
@@ -109,6 +112,8 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         self._model_spec = model_spec
         self._abilities = model_spec.model_ability or []  # type: ignore
         self._kwargs = kwargs
+        # gguf
+        self._gguf_model_path = gguf_model_path
 
     @property
     def model_ability(self):
@@ -184,7 +189,17 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
             self._model.fuse_lora(**self._lora_fuse_kwargs)
             logger.info(f"Successfully loaded the LoRA for model {self._model_uid}.")
 
+    def _get_layer_cls(self, layer: str):
+        with open(os.path.join(self._model_path, "model_index.json")) as f:  # type: ignore
+            model_index = json.load(f)
+            layer_info = model_index[layer]
+            module_name, class_name = layer_info
+            module = importlib.import_module(module_name)
+            return getattr(module, class_name)
+
     def load(self):
+        from transformers import BitsAndBytesConfig, T5EncoderModel
+
         if "text2image" in self._abilities or "image2image" in self._abilities:
             from diffusers import AutoPipelineForText2Image as AutoPipelineModel
         elif "inpainting" in self._abilities:
@@ -200,7 +215,9 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
             glob(os.path.join(self._model_path, "*/*.safetensors"))
         )
         if isinstance(torch_dtype, str):
-            self._kwargs["torch_dtype"] = getattr(torch, torch_dtype)
+            self._torch_dtype = torch_dtype = self._kwargs["torch_dtype"] = getattr(
+                torch, torch_dtype
+            )
 
         controlnet = self._kwargs.get("controlnet")
         if controlnet is not None:
@@ -212,18 +229,7 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
             ]
 
         quantize_text_encoder = self._kwargs.pop("quantize_text_encoder", None)
-        if quantize_text_encoder:
-            try:
-                from transformers import BitsAndBytesConfig, T5EncoderModel
-            except ImportError:
-                error_message = "Failed to import module 'transformers'"
-                installation_guide = [
-                    "Please make sure 'transformers' is installed. ",
-                    "You can install it by `pip install transformers`\n",
-                ]
-
-                raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
-
+        if quantize_text_encoder and not self._gguf_model_path:
             try:
                 import bitsandbytes  # noqa: F401
             except ImportError:
@@ -249,6 +255,32 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
                 self._kwargs[text_encoder_name] = text_encoder
                 self._kwargs["device_map"] = "balanced"
 
+        if self._gguf_model_path:
+            from diffusers import GGUFQuantizationConfig
+
+            # GGUF transformer
+            self._kwargs["transformer"] = self._get_layer_cls(
+                "transformer"
+            ).from_single_file(
+                self._gguf_model_path,
+                quantization_config=GGUFQuantizationConfig(compute_dtype=torch_dtype),
+                torch_dtype=torch_dtype,
+                config=os.path.join(self._model_path, "transformer"),
+            )
+        elif self._kwargs.get("transformer_nf4"):
+            nf4_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_quant_type="nf4",
+                bnb_4bit_compute_dtype=torch_dtype,
+            )
+            model_nf4 = self._get_layer_cls("transformer").from_pretrained(
+                self._model_path,
+                subfolder="transformer",
+                quantization_config=nf4_config,
+                torch_dtype=torch_dtype,
+            )
+            self._kwargs["transformer"] = model_nf4
+
         logger.debug(
             "Loading model from %s, kwargs: %s", self._model_path, self._kwargs
         )
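Note: the load() changes above add two quantized-transformer paths: a GGUF checkpoint loaded via diffusers' GGUFQuantizationConfig, and an NF4 fallback via bitsandbytes. A hedged standalone sketch of the GGUF path outside xinference (a recent diffusers release with GGUF support is assumed; the file name and repo id are placeholders):

    import torch
    from diffusers import FluxPipeline, FluxTransformer2DModel, GGUFQuantizationConfig

    # Load only the transformer from a GGUF file, then plug it into the full pipeline.
    transformer = FluxTransformer2DModel.from_single_file(
        "flux1-schnell-Q4_K_S.gguf",
        quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
        torch_dtype=torch.bfloat16,
    )
    pipe = FluxPipeline.from_pretrained(
        "black-forest-labs/FLUX.1-schnell",
        transformer=transformer,
        torch_dtype=torch.bfloat16,
    )
    image = pipe("a lighthouse at dusk", num_inference_steps=4).images[0]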
@@ -8942,5 +8942,52 @@
             "<|user|>",
             "<|observation|>"
         ]
+    },
+    {
+        "version": 1,
+        "context_length": 32768,
+        "model_name": "QvQ-72B-Preview",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "chat",
+            "vision"
+        ],
+        "model_description": "QVQ-72B-Preview is an experimental research model developed by the Qwen team, focusing on enhancing visual reasoning capabilities.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 72,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "Qwen/QVQ-72B-Preview"
+            },
+            {
+                "model_format": "mlx",
+                "model_size_in_billions": 72,
+                "quantizations": [
+                    "3bit",
+                    "4bit",
+                    "6bit",
+                    "8bit",
+                    "bf16"
+                ],
+                "model_id": "mlx-community/QVQ-72B-Preview-{quantization}"
+            }
+        ],
+        "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
+        "stop_token_ids": [
+            151645,
+            151643
+        ],
+        "stop": [
+            "<|im_end|>",
+            "<|endoftext|>"
+        ]
     }
 ]
@@ -972,46 +972,25 @@ def match_llm(
             return spec
 
     # priority: download_hub > download_from_modelscope() and download_from_csghub()
-    if download_hub == "modelscope":
-        all_families = (
-            BUILTIN_MODELSCOPE_LLM_FAMILIES
-            + BUILTIN_LLM_FAMILIES
-            + user_defined_llm_families
-        )
-    elif download_hub == "openmind_hub":
-        all_families = (
-            BUILTIN_OPENMIND_HUB_LLM_FAMILIES
-            + BUILTIN_LLM_FAMILIES
-            + user_defined_llm_families
-        )
-    elif download_hub == "csghub":
-        all_families = (
-            BUILTIN_CSGHUB_LLM_FAMILIES
-            + BUILTIN_LLM_FAMILIES
-            + user_defined_llm_families
-        )
-    elif download_hub == "huggingface":
-        all_families = BUILTIN_LLM_FAMILIES + user_defined_llm_families
+    # set base model
+    base_families = BUILTIN_LLM_FAMILIES + user_defined_llm_families
+    hub_families_map = {
+        "modelscope": BUILTIN_MODELSCOPE_LLM_FAMILIES,
+        "openmind_hub": BUILTIN_OPENMIND_HUB_LLM_FAMILIES,
+        "csghub": BUILTIN_CSGHUB_LLM_FAMILIES,
+    }
+    if download_hub == "huggingface":
+        all_families = base_families
+    elif download_hub in hub_families_map:
+        all_families = hub_families_map[download_hub] + base_families
     elif download_from_modelscope():
-        all_families = (
-            BUILTIN_MODELSCOPE_LLM_FAMILIES
-            + BUILTIN_LLM_FAMILIES
-            + user_defined_llm_families
-        )
+        all_families = BUILTIN_MODELSCOPE_LLM_FAMILIES + base_families
     elif download_from_openmind_hub():
-        all_families = (
-            BUILTIN_OPENMIND_HUB_LLM_FAMILIES
-            + BUILTIN_LLM_FAMILIES
-            + user_defined_llm_families
-        )
+        all_families = BUILTIN_OPENMIND_HUB_LLM_FAMILIES + base_families
     elif download_from_csghub():
-        all_families = (
-            BUILTIN_CSGHUB_LLM_FAMILIES
-            + BUILTIN_LLM_FAMILIES
-            + user_defined_llm_families
-        )
+        all_families = BUILTIN_CSGHUB_LLM_FAMILIES + base_families
     else:
-        all_families = BUILTIN_LLM_FAMILIES + user_defined_llm_families
+        all_families = base_families
 
     for family in all_families:
         if model_name != family.model_name:
@@ -6673,5 +6673,54 @@
             "<|user|>",
             "<|observation|>"
         ]
+    },
+    {
+        "version": 1,
+        "context_length": 32768,
+        "model_name": "QvQ-72B-Preview",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "chat",
+            "vision"
+        ],
+        "model_description": "QVQ-72B-Preview is an experimental research model developed by the Qwen team, focusing on enhancing visual reasoning capabilities.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 72,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "Qwen/QVQ-72B-Preview",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "mlx",
+                "model_size_in_billions": 72,
+                "quantizations": [
+                    "3bit",
+                    "4bit",
+                    "6bit",
+                    "8bit",
+                    "bf16"
+                ],
+                "model_id": "mlx-community/QVQ-72B-Preview-{quantization}",
+                "model_hub": "modelscope"
+            }
+        ],
+        "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
+        "stop_token_ids": [
+            151645,
+            151643
+        ],
+        "stop": [
+            "<|im_end|>",
+            "<|endoftext|>"
+        ]
     }
 ]
@@ -173,7 +173,9 @@ class MLXModel(LLM):
             return False
         return True
 
-    def _get_prompt_cache(self, prompt, lora_name: Optional[str] = None):
+    def _get_prompt_cache(
+        self, prompt, lora_name: Optional[str] = None, model: Any = None
+    ):
         from mlx_lm.models.cache import make_prompt_cache
 
         assert self._prompt_cache is not None
@@ -185,7 +187,9 @@ class MLXModel(LLM):
             or self._prompt_cache.tokens != prompt[:cache_len]
         ):
             self._prompt_cache.model_key = model_key
-            self._prompt_cache.cache = make_prompt_cache(self._model, self._max_kv_size)
+            self._prompt_cache.cache = make_prompt_cache(
+                model or self._model, self._max_kv_size
+            )
             self._prompt_cache.tokens = []
             logger.debug("Making new prompt cache for %s", self.model_uid)
         else:
@@ -458,6 +462,8 @@ class MLXVisionModel(MLXModel, ChatModelMixin):
 
             raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
 
+        self._prompt_cache = PromptCache()
+
         return load(self.model_path)
 
     def load(self):
@@ -471,13 +477,52 @@ class MLXVisionModel(MLXModel, ChatModelMixin):
         self._model, self._processor = self._load_model(**kwargs)
         self._tokenizer = self._processor.tokenizer
 
+    def _generate_stream_inner_no_image(self, **kwargs):
+        import mlx.nn as nn
+        from mlx_lm.utils import make_sampler, stream_generate
+
+        # For mlx-lm, the model(inputs) will return logits,
+        # but the language model in mlx-vlm will return an object
+        # https://github.com/Blaizzy/mlx-vlm/blob/3f5e1620072440afb7496940f67ac1c7fc64056f/mlx_vlm/models/base.py#L260
+        # so we cannot pass the language model to stream_generate directly
+        # we wrap here to just let model(inputs) return logits to pass stream_generate
+        class ModelWrapper(nn.Module):
+            def __init__(self, model):
+                super().__init__()
+                self._model = model.language_model
+
+            @property
+            def layers(self):
+                return self._model.layers
+
+            def __call__(self, *args, **kwargs):
+                return self._model(*args, **kwargs).logits
+
+        sampler = make_sampler(
+            temp=kwargs.pop("temperature"), top_p=kwargs.pop("top_p")
+        )
+        prompt_token_ids = kwargs.pop("prompt_token_ids")
+        yield from stream_generate(
+            ModelWrapper(self._model),
+            self._tokenizer,
+            prompt_token_ids,
+            sampler=sampler,
+            **kwargs,
+        )
+
     def _generate_stream_inner(self, **kwargs):
         import mlx.core as mx
         from mlx_lm.utils import GenerationResponse
         from mlx_vlm.utils import generate_step
 
-        max_tokens = kwargs.pop("max_tokens")
         inputs = kwargs["prompt_token_ids"]
+
+        if not isinstance(inputs, tuple):
+            # no images
+            yield from self._generate_stream_inner_no_image(**kwargs)
+            return
+
+        max_tokens = kwargs.pop("max_tokens")
         input_ids, pixel_values, mask = inputs[:3]
 
         kwargs = {
@@ -549,16 +594,26 @@ class MLXVisionModel(MLXModel, ChatModelMixin):
         else:
             image_token_index = None
 
-        inputs = prepare_inputs(
-            None,
-            self._processor,
-            images,
-            prompt_str,
-            image_token_index,
-            kwargs.get("resize_shape"),
-        )
-        input_ids = inputs[0]
-        return inputs, len(input_ids)
+        if not images:
+            prompt = prompt["prompt"]  # type: ignore
+            prompt_token_ids = self._tokenizer.encode(prompt)
+            prompt_token_ids = self._get_prompt_cache(
+                prompt_token_ids,
+                kwargs.get("lora_name"),
+                model=self._model.language_model,
+            )
+            return prompt_token_ids, len(prompt_token_ids)
+        else:
+            inputs = prepare_inputs(
+                None,
+                self._processor,
+                images,
+                prompt_str,
+                image_token_index,
+                kwargs.get("resize_shape"),
+            )
+            input_ids = inputs[0]
+            return inputs, len(input_ids)
 
     def chat(
         self,
@@ -69,6 +69,7 @@ NON_DEFAULT_MODEL_LIST: List[str] = [
     "deepseek-v2.5",
    "deepseek-v2-chat-0628",
     "glm-edge-v",
+    "QvQ-72B-Preview",
 ]
 
 
@@ -47,6 +47,8 @@ class Qwen2VLChatModel(PytorchChatModel):
         llm_family = model_family.model_family or model_family.model_name
         if "qwen2-vl-instruct".lower() in llm_family.lower():
             return True
+        if "qvq-72b-preview".lower() in llm_family.lower():
+            return True
         return False
 
     def load(self):
@@ -52,6 +52,7 @@ QWEN_TOOL_CALL_FAMILY = [
     "qwen2-instruct",
     "qwen2-moe-instruct",
     "qwen2.5-instruct",
+    "qwen2.5-coder-instruct",
 ]
 
 GLM4_TOOL_CALL_FAMILY = [
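Note: adding "qwen2.5-coder-instruct" to QWEN_TOOL_CALL_FAMILY enables Qwen-style tool-call handling for that family. A hedged usage sketch against xinference's OpenAI-compatible endpoint (the endpoint URL, model uid, and tool definition below are placeholders):

    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:9997/v1", api_key="not-needed")
    resp = client.chat.completions.create(
        model="qwen2.5-coder-instruct",  # uid of a launched model, placeholder
        messages=[{"role": "user", "content": "What's the weather in Paris?"}],
        tools=[{
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Get the current weather for a city",
                "parameters": {
                    "type": "object",
                    "properties": {"city": {"type": "string"}},
                    "required": ["city"],
                },
            },
        }],
    )
    print(resp.choices[0].message.tool_calls)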
@@ -70,6 +70,7 @@ class VLLMModelConfig(TypedDict, total=False):
     max_model_len: Optional[int]
     limit_mm_per_prompt: Optional[Dict[str, int]]
     guided_decoding_backend: Optional[str]
+    scheduling_policy: Optional[str]
 
 
 class VLLMGenerateConfig(TypedDict, total=False):
@@ -187,10 +188,14 @@ if VLLM_INSTALLED and vllm.__version__ > "0.5.3":
 if VLLM_INSTALLED and vllm.__version__ >= "0.6.1":
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("internvl2")
 
+if VLLM_INSTALLED and vllm.__version__ >= "0.6.2":
+    VLLM_SUPPORTED_CHAT_MODELS.append("minicpm3-4b")
+
 if VLLM_INSTALLED and vllm.__version__ >= "0.6.3":
     VLLM_SUPPORTED_MODELS.append("llama-3.2-vision")
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("llama-3.2-vision-instruct")
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("qwen2-vl-instruct")
+    VLLM_SUPPORTED_VISION_MODEL_LIST.append("QvQ-72B-Preview")
 
 
 class VLLMModel(LLM):
@@ -244,7 +249,6 @@ class VLLMModel(LLM):
             multiprocessing.set_start_method("fork", force=True)
 
         self._model_config = self._sanitize_model_config(self._model_config)
-
         if self.lora_modules is None:
             self.lora_requests = []
         else:
@@ -327,7 +331,9 @@ class VLLMModel(LLM):
         model_config.setdefault("quantization", None)
         model_config.setdefault("max_model_len", None)
         model_config.setdefault("guided_decoding_backend", "outlines")
-
+        # Add scheduling policy if vLLM version is 0.6.3 or higher
+        if vllm.__version__ >= "0.6.3":
+            model_config.setdefault("scheduling_policy", "fcfs")
         return model_config
 
     @staticmethod
@@ -859,6 +865,9 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
                 "image": 2,  # default 2 images all chat
             }
         )
+        # Add scheduling policy if vLLM version is 0.6.3 or higher
+        if vllm.__version__ >= "0.6.3":
+            model_config.setdefault("scheduling_policy", "fcfs")
 
         return model_config
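Note: both hunks above default scheduling_policy to "fcfs" when vLLM >= 0.6.3 is installed, and the VLLMModelConfig change earlier lets callers override it. A hedged launch sketch, assuming extra launch kwargs flow into the vLLM model config as the setdefault calls imply (endpoint and model parameters are placeholders):

    from xinference.client import Client

    client = Client("http://localhost:9997")
    uid = client.launch_model(
        model_name="qwen2.5-instruct",
        model_engine="vLLM",
        model_size_in_billions=7,
        model_format="pytorch",
        quantization="none",
        scheduling_policy="priority",  # vLLM >= 0.6.3 accepts "fcfs" or "priority"
    )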