xinference-1.1.0-py3-none-any.whl → xinference-1.1.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xinference/_compat.py +2 -0
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +23 -1
- xinference/core/model.py +1 -6
- xinference/core/utils.py +10 -6
- xinference/model/audio/core.py +5 -0
- xinference/model/audio/cosyvoice.py +25 -3
- xinference/model/audio/f5tts.py +15 -10
- xinference/model/audio/f5tts_mlx.py +260 -0
- xinference/model/audio/fish_speech.py +35 -111
- xinference/model/audio/model_spec.json +19 -3
- xinference/model/audio/model_spec_modelscope.json +9 -0
- xinference/model/audio/utils.py +32 -0
- xinference/model/image/core.py +69 -1
- xinference/model/image/model_spec.json +127 -4
- xinference/model/image/model_spec_modelscope.json +130 -4
- xinference/model/image/stable_diffusion/core.py +45 -13
- xinference/model/llm/llm_family.json +47 -0
- xinference/model/llm/llm_family.py +15 -36
- xinference/model/llm/llm_family_modelscope.json +49 -0
- xinference/model/llm/mlx/core.py +68 -13
- xinference/model/llm/transformers/core.py +1 -0
- xinference/model/llm/transformers/qwen2_vl.py +2 -0
- xinference/model/llm/utils.py +1 -0
- xinference/model/llm/vllm/core.py +11 -2
- xinference/thirdparty/cosyvoice/bin/average_model.py +92 -0
- xinference/thirdparty/cosyvoice/bin/export_jit.py +12 -2
- xinference/thirdparty/cosyvoice/bin/export_onnx.py +112 -0
- xinference/thirdparty/cosyvoice/bin/export_trt.sh +9 -0
- xinference/thirdparty/cosyvoice/bin/inference.py +5 -7
- xinference/thirdparty/cosyvoice/bin/train.py +42 -8
- xinference/thirdparty/cosyvoice/cli/cosyvoice.py +96 -25
- xinference/thirdparty/cosyvoice/cli/frontend.py +77 -30
- xinference/thirdparty/cosyvoice/cli/model.py +330 -80
- xinference/thirdparty/cosyvoice/dataset/dataset.py +6 -2
- xinference/thirdparty/cosyvoice/dataset/processor.py +76 -14
- xinference/thirdparty/cosyvoice/flow/decoder.py +92 -13
- xinference/thirdparty/cosyvoice/flow/flow.py +99 -9
- xinference/thirdparty/cosyvoice/flow/flow_matching.py +110 -13
- xinference/thirdparty/cosyvoice/flow/length_regulator.py +5 -4
- xinference/thirdparty/cosyvoice/hifigan/discriminator.py +140 -0
- xinference/thirdparty/cosyvoice/hifigan/generator.py +58 -42
- xinference/thirdparty/cosyvoice/hifigan/hifigan.py +67 -0
- xinference/thirdparty/cosyvoice/llm/llm.py +139 -6
- xinference/thirdparty/cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +58836 -0
- xinference/thirdparty/cosyvoice/tokenizer/tokenizer.py +279 -0
- xinference/thirdparty/cosyvoice/transformer/embedding.py +2 -2
- xinference/thirdparty/cosyvoice/transformer/encoder_layer.py +7 -7
- xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +318 -0
- xinference/thirdparty/cosyvoice/utils/common.py +28 -1
- xinference/thirdparty/cosyvoice/utils/executor.py +69 -7
- xinference/thirdparty/cosyvoice/utils/file_utils.py +2 -12
- xinference/thirdparty/cosyvoice/utils/frontend_utils.py +9 -5
- xinference/thirdparty/cosyvoice/utils/losses.py +20 -0
- xinference/thirdparty/cosyvoice/utils/scheduler.py +1 -2
- xinference/thirdparty/cosyvoice/utils/train_utils.py +101 -45
- xinference/thirdparty/fish_speech/fish_speech/conversation.py +94 -83
- xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +63 -20
- xinference/thirdparty/fish_speech/fish_speech/text/clean.py +1 -26
- xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +1 -1
- xinference/thirdparty/fish_speech/fish_speech/tokenizer.py +152 -0
- xinference/thirdparty/fish_speech/fish_speech/train.py +2 -2
- xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +1 -1
- xinference/thirdparty/fish_speech/tools/{post_api.py → api_client.py} +7 -13
- xinference/thirdparty/fish_speech/tools/api_server.py +98 -0
- xinference/thirdparty/fish_speech/tools/download_models.py +5 -5
- xinference/thirdparty/fish_speech/tools/fish_e2e.py +2 -2
- xinference/thirdparty/fish_speech/tools/inference_engine/__init__.py +192 -0
- xinference/thirdparty/fish_speech/tools/inference_engine/reference_loader.py +125 -0
- xinference/thirdparty/fish_speech/tools/inference_engine/utils.py +39 -0
- xinference/thirdparty/fish_speech/tools/inference_engine/vq_manager.py +57 -0
- xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +2 -2
- xinference/thirdparty/fish_speech/tools/llama/generate.py +117 -89
- xinference/thirdparty/fish_speech/tools/run_webui.py +104 -0
- xinference/thirdparty/fish_speech/tools/schema.py +11 -28
- xinference/thirdparty/fish_speech/tools/server/agent/__init__.py +57 -0
- xinference/thirdparty/fish_speech/tools/server/agent/generate.py +119 -0
- xinference/thirdparty/fish_speech/tools/server/agent/generation_utils.py +122 -0
- xinference/thirdparty/fish_speech/tools/server/agent/pre_generation_utils.py +72 -0
- xinference/thirdparty/fish_speech/tools/server/api_utils.py +75 -0
- xinference/thirdparty/fish_speech/tools/server/exception_handler.py +27 -0
- xinference/thirdparty/fish_speech/tools/server/inference.py +45 -0
- xinference/thirdparty/fish_speech/tools/server/model_manager.py +122 -0
- xinference/thirdparty/fish_speech/tools/server/model_utils.py +129 -0
- xinference/thirdparty/fish_speech/tools/server/views.py +246 -0
- xinference/thirdparty/fish_speech/tools/webui/__init__.py +173 -0
- xinference/thirdparty/fish_speech/tools/webui/inference.py +91 -0
- xinference/thirdparty/fish_speech/tools/webui/variables.py +14 -0
- xinference/thirdparty/matcha/utils/utils.py +2 -2
- {xinference-1.1.0.dist-info → xinference-1.1.1.dist-info}/METADATA +11 -6
- {xinference-1.1.0.dist-info → xinference-1.1.1.dist-info}/RECORD +95 -74
- xinference/thirdparty/cosyvoice/bin/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/bin/export_trt.py +0 -8
- xinference/thirdparty/cosyvoice/flow/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/hifigan/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/llm/__init__.py +0 -0
- xinference/thirdparty/fish_speech/tools/__init__.py +0 -0
- xinference/thirdparty/fish_speech/tools/api.py +0 -943
- xinference/thirdparty/fish_speech/tools/msgpack_api.py +0 -95
- xinference/thirdparty/fish_speech/tools/webui.py +0 -548
- {xinference-1.1.0.dist-info → xinference-1.1.1.dist-info}/LICENSE +0 -0
- {xinference-1.1.0.dist-info → xinference-1.1.1.dist-info}/WHEEL +0 -0
- {xinference-1.1.0.dist-info → xinference-1.1.1.dist-info}/entry_points.txt +0 -0
- {xinference-1.1.0.dist-info → xinference-1.1.1.dist-info}/top_level.txt +0 -0
xinference/model/image/model_spec_modelscope.json CHANGED

@@ -12,8 +12,24 @@
         ],
         "default_model_config": {
             "quantize": true,
-            "quantize_text_encoder": "text_encoder_2"
-        }
+            "quantize_text_encoder": "text_encoder_2",
+            "torch_dtype": "bfloat16"
+        },
+        "gguf_model_id": "Xorbits/FLUX.1-schnell-gguf",
+        "gguf_quantizations": [
+            "F16",
+            "Q2_K",
+            "Q3_K_S",
+            "Q4_0",
+            "Q4_1",
+            "Q4_K_S",
+            "Q5_0",
+            "Q5_1",
+            "Q5_K_S",
+            "Q6_K",
+            "Q8_0"
+        ],
+        "gguf_model_file_name_template": "flux1-schnell-{quantization}.gguf"
     },
     {
         "model_name": "FLUX.1-dev",
@@ -28,8 +44,24 @@
         ],
         "default_model_config": {
             "quantize": true,
-            "quantize_text_encoder": "text_encoder_2"
-        }
+            "quantize_text_encoder": "text_encoder_2",
+            "torch_dtype": "bfloat16"
+        },
+        "gguf_model_id": "AI-ModelScope/FLUX.1-dev-gguf",
+        "gguf_quantizations": [
+            "F16",
+            "Q2_K",
+            "Q3_K_S",
+            "Q4_0",
+            "Q4_1",
+            "Q4_K_S",
+            "Q5_0",
+            "Q5_1",
+            "Q5_K_S",
+            "Q6_K",
+            "Q8_0"
+        ],
+        "gguf_model_file_name_template": "flux1-dev-{quantization}.gguf"
     },
     {
         "model_name": "sd3-medium",
@@ -47,6 +79,100 @@
             "quantize_text_encoder": "text_encoder_3"
         }
     },
+    {
+        "model_name": "sd3.5-medium",
+        "model_family": "stable_diffusion",
+        "model_hub": "modelscope",
+        "model_id": "AI-ModelScope/stable-diffusion-3.5-medium",
+        "model_revision": "master",
+        "model_ability": [
+            "text2image",
+            "image2image",
+            "inpainting"
+        ],
+        "default_model_config": {
+            "quantize": true,
+            "quantize_text_encoder": "text_encoder_3",
+            "torch_dtype": "bfloat16"
+        },
+        "gguf_model_id": "Xorbits/stable-diffusion-3.5-medium-gguf",
+        "gguf_quantizations": [
+            "F16",
+            "Q3_K_M",
+            "Q3_K_S",
+            "Q4_0",
+            "Q4_1",
+            "Q4_K_M",
+            "Q4_K_S",
+            "Q5_0",
+            "Q5_1",
+            "Q5_K_M",
+            "Q5_K_S",
+            "Q6_K",
+            "Q8_0"
+        ],
+        "gguf_model_file_name_template": "sd3.5_medium-{quantization}.gguf"
+    },
+    {
+        "model_name": "sd3.5-large",
+        "model_family": "stable_diffusion",
+        "model_hub": "modelscope",
+        "model_id": "AI-ModelScope/stable-diffusion-3.5-large",
+        "model_revision": "master",
+        "model_ability": [
+            "text2image",
+            "image2image",
+            "inpainting"
+        ],
+        "default_model_config": {
+            "quantize": true,
+            "quantize_text_encoder": "text_encoder_3",
+            "torch_dtype": "bfloat16",
+            "transformer_nf4": true
+        },
+        "gguf_model_id": "Xorbits/stable-diffusion-3.5-large-gguf",
+        "gguf_quantizations": [
+            "F16",
+            "Q4_0",
+            "Q4_1",
+            "Q5_0",
+            "Q5_1",
+            "Q8_0"
+        ],
+        "gguf_model_file_name_template": "sd3.5_large-{quantization}.gguf"
+    },
+    {
+        "model_name": "sd3.5-large-turbo",
+        "model_family": "stable_diffusion",
+        "model_hub": "modelscope",
+        "model_id": "AI-ModelScope/stable-diffusion-3.5-large-turbo",
+        "model_revision": "master",
+        "model_ability": [
+            "text2image",
+            "image2image",
+            "inpainting"
+        ],
+        "default_model_config": {
+            "quantize": true,
+            "quantize_text_encoder": "text_encoder_3",
+            "torch_dtype": "bfloat16",
+            "transformer_nf4": true
+        },
+        "default_generate_config": {
+            "guidance_scale": 1.0,
+            "num_inference_steps": 4
+        },
+        "gguf_model_id": "Xorbits/stable-diffusion-3.5-large-turbo-gguf",
+        "gguf_quantizations": [
+            "F16",
+            "Q4_0",
+            "Q4_1",
+            "Q5_0",
+            "Q5_1",
+            "Q8_0"
+        ],
+        "gguf_model_file_name_template": "sd3.5_large_turbo-{quantization}.gguf"
+    },
     {
         "model_name": "sd-turbo",
         "model_family": "stable_diffusion",
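The new `gguf_model_id`, `gguf_quantizations`, and `gguf_model_file_name_template` fields describe where a pre-quantized single-file transformer lives and how its filename is derived from the chosen quantization. A minimal sketch of how those three fields fit together (the helper below is illustrative, not xinference's actual download code):

```python
# Illustrative only: shows how the spec fields above combine into a concrete
# GGUF filename. xinference's real resolution/download logic lives elsewhere.
spec = {
    "gguf_model_id": "Xorbits/stable-diffusion-3.5-medium-gguf",
    "gguf_quantizations": ["F16", "Q4_K_S", "Q8_0"],
    "gguf_model_file_name_template": "sd3.5_medium-{quantization}.gguf",
}


def resolve_gguf_file(spec: dict, quantization: str) -> str:
    # Reject quantizations the spec does not list, then fill in the template.
    if quantization not in spec["gguf_quantizations"]:
        raise ValueError(f"unsupported GGUF quantization: {quantization}")
    return spec["gguf_model_file_name_template"].format(quantization=quantization)


print(resolve_gguf_file(spec, "Q4_K_S"))  # sd3.5_medium-Q4_K_S.gguf
```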
xinference/model/image/stable_diffusion/core.py CHANGED

@@ -14,8 +14,10 @@
 
 import contextlib
 import gc
+import importlib
 import inspect
 import itertools
+import json
 import logging
 import os
 import re
@@ -86,6 +88,7 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         lora_load_kwargs: Optional[Dict] = None,
         lora_fuse_kwargs: Optional[Dict] = None,
         model_spec: Optional["ImageModelFamilyV1"] = None,
+        gguf_model_path: Optional[str] = None,
         **kwargs,
     ):
         self._model_uid = model_uid
@@ -109,6 +112,8 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         self._model_spec = model_spec
         self._abilities = model_spec.model_ability or []  # type: ignore
         self._kwargs = kwargs
+        # gguf
+        self._gguf_model_path = gguf_model_path
 
     @property
     def model_ability(self):
@@ -184,7 +189,17 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
             self._model.fuse_lora(**self._lora_fuse_kwargs)
             logger.info(f"Successfully loaded the LoRA for model {self._model_uid}.")
 
+    def _get_layer_cls(self, layer: str):
+        with open(os.path.join(self._model_path, "model_index.json")) as f:  # type: ignore
+            model_index = json.load(f)
+        layer_info = model_index[layer]
+        module_name, class_name = layer_info
+        module = importlib.import_module(module_name)
+        return getattr(module, class_name)
+
     def load(self):
+        from transformers import BitsAndBytesConfig, T5EncoderModel
+
         if "text2image" in self._abilities or "image2image" in self._abilities:
             from diffusers import AutoPipelineForText2Image as AutoPipelineModel
         elif "inpainting" in self._abilities:
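`_get_layer_cls` resolves a pipeline component class from the pipeline's `model_index.json`, where diffusers records each component as a `[module, class]` pair (for SD3-family pipelines the `transformer` entry is `["diffusers", "SD3Transformer2DModel"]`). A standalone sketch of the same lookup, with a placeholder model path:

```python
# Standalone restatement of the _get_layer_cls lookup added above.
# The model path is a placeholder for a locally downloaded diffusers pipeline.
import importlib
import json
import os


def get_layer_cls(model_path: str, layer: str):
    with open(os.path.join(model_path, "model_index.json")) as f:
        module_name, class_name = json.load(f)[layer]
    return getattr(importlib.import_module(module_name), class_name)


# e.g. get_layer_cls("/models/stable-diffusion-3.5-medium", "transformer")
```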
@@ -200,7 +215,9 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
                 glob(os.path.join(self._model_path, "*/*.safetensors"))
             )
         if isinstance(torch_dtype, str):
-            self._kwargs["torch_dtype"] = getattr(torch, torch_dtype)
+            self._torch_dtype = torch_dtype = self._kwargs["torch_dtype"] = getattr(
+                torch, torch_dtype
+            )
 
         controlnet = self._kwargs.get("controlnet")
         if controlnet is not None:
@@ -212,18 +229,7 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
             ]
 
         quantize_text_encoder = self._kwargs.pop("quantize_text_encoder", None)
-        if quantize_text_encoder:
-            try:
-                from transformers import BitsAndBytesConfig, T5EncoderModel
-            except ImportError:
-                error_message = "Failed to import module 'transformers'"
-                installation_guide = [
-                    "Please make sure 'transformers' is installed. ",
-                    "You can install it by `pip install transformers`\n",
-                ]
-
-                raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
-
+        if quantize_text_encoder and not self._gguf_model_path:
             try:
                 import bitsandbytes  # noqa: F401
             except ImportError:
@@ -249,6 +255,32 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
                 self._kwargs[text_encoder_name] = text_encoder
                 self._kwargs["device_map"] = "balanced"
 
+        if self._gguf_model_path:
+            from diffusers import GGUFQuantizationConfig
+
+            # GGUF transformer
+            self._kwargs["transformer"] = self._get_layer_cls(
+                "transformer"
+            ).from_single_file(
+                self._gguf_model_path,
+                quantization_config=GGUFQuantizationConfig(compute_dtype=torch_dtype),
+                torch_dtype=torch_dtype,
+                config=os.path.join(self._model_path, "transformer"),
+            )
+        elif self._kwargs.get("transformer_nf4"):
+            nf4_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_quant_type="nf4",
+                bnb_4bit_compute_dtype=torch_dtype,
+            )
+            model_nf4 = self._get_layer_cls("transformer").from_pretrained(
+                self._model_path,
+                subfolder="transformer",
+                quantization_config=nf4_config,
+                torch_dtype=torch_dtype,
+            )
+            self._kwargs["transformer"] = model_nf4
+
         logger.debug(
             "Loading model from %s, kwargs: %s", self._model_path, self._kwargs
         )
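For reference, both quantized-transformer paths added above can be reproduced directly with diffusers (GGUF single-file loading needs a recent diffusers release). The file paths, the concrete `SD3Transformer2DModel` class, and the use of diffusers' own `BitsAndBytesConfig` (the hunk above imports the transformers one) are assumptions for illustration, not what xinference ships:

```python
# Hedged sketch of the two loading paths: GGUF single-file vs. NF4 via bitsandbytes.
# File paths and the transformer class are placeholders, not shipped defaults.
import torch
from diffusers import BitsAndBytesConfig, GGUFQuantizationConfig, SD3Transformer2DModel

dtype = torch.bfloat16

# Path 1: load a GGUF-quantized transformer from a single file.
gguf_transformer = SD3Transformer2DModel.from_single_file(
    "/models/sd3.5_medium-Q4_K_S.gguf",
    quantization_config=GGUFQuantizationConfig(compute_dtype=dtype),
    torch_dtype=dtype,
    config="/models/stable-diffusion-3.5-medium/transformer",
)

# Path 2: quantize the regular transformer weights to NF4 at load time.
nf4_transformer = SD3Transformer2DModel.from_pretrained(
    "/models/stable-diffusion-3.5-medium",
    subfolder="transformer",
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=dtype,
    ),
    torch_dtype=dtype,
)
```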
xinference/model/llm/llm_family.json CHANGED

@@ -8942,5 +8942,52 @@
             "<|user|>",
             "<|observation|>"
         ]
+    },
+    {
+        "version": 1,
+        "context_length": 32768,
+        "model_name": "QvQ-72B-Preview",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "chat",
+            "vision"
+        ],
+        "model_description": "QVQ-72B-Preview is an experimental research model developed by the Qwen team, focusing on enhancing visual reasoning capabilities.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 72,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "Qwen/QVQ-72B-Preview"
+            },
+            {
+                "model_format": "mlx",
+                "model_size_in_billions": 72,
+                "quantizations": [
+                    "3bit",
+                    "4bit",
+                    "6bit",
+                    "8bit",
+                    "bf16"
+                ],
+                "model_id": "mlx-community/QVQ-72B-Preview-{quantization}"
+            }
+        ],
+        "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
+        "stop_token_ids": [
+            151645,
+            151643
+        ],
+        "stop": [
+            "<|im_end|>",
+            "<|endoftext|>"
+        ]
     }
 ]
xinference/model/llm/llm_family.py CHANGED

@@ -972,46 +972,25 @@ def match_llm(
             return spec
 
     # priority: download_hub > download_from_modelscope() and download_from_csghub()
-    if download_hub == "modelscope":
-        all_families = (
-            BUILTIN_MODELSCOPE_LLM_FAMILIES
-            + BUILTIN_LLM_FAMILIES
-            + user_defined_llm_families
-        )
-    elif download_hub == "openmind_hub":
-        all_families = (
-            BUILTIN_OPENMIND_HUB_LLM_FAMILIES
-            + BUILTIN_LLM_FAMILIES
-            + user_defined_llm_families
-        )
-    elif download_hub == "csghub":
-        all_families = (
-            BUILTIN_CSGHUB_LLM_FAMILIES
-            + BUILTIN_LLM_FAMILIES
-            + user_defined_llm_families
-        )
-    elif download_hub == "huggingface":
-        all_families = BUILTIN_LLM_FAMILIES + user_defined_llm_families
+    # set base model
+    base_families = BUILTIN_LLM_FAMILIES + user_defined_llm_families
+    hub_families_map = {
+        "modelscope": BUILTIN_MODELSCOPE_LLM_FAMILIES,
+        "openmind_hub": BUILTIN_OPENMIND_HUB_LLM_FAMILIES,
+        "csghub": BUILTIN_CSGHUB_LLM_FAMILIES,
+    }
+    if download_hub == "huggingface":
+        all_families = base_families
+    elif download_hub in hub_families_map:
+        all_families = hub_families_map[download_hub] + base_families
     elif download_from_modelscope():
-        all_families = (
-            BUILTIN_MODELSCOPE_LLM_FAMILIES
-            + BUILTIN_LLM_FAMILIES
-            + user_defined_llm_families
-        )
+        all_families = BUILTIN_MODELSCOPE_LLM_FAMILIES + base_families
     elif download_from_openmind_hub():
-        all_families = (
-            BUILTIN_OPENMIND_HUB_LLM_FAMILIES
-            + BUILTIN_LLM_FAMILIES
-            + user_defined_llm_families
-        )
+        all_families = BUILTIN_OPENMIND_HUB_LLM_FAMILIES + base_families
     elif download_from_csghub():
-        all_families = (
-            BUILTIN_CSGHUB_LLM_FAMILIES
-            + BUILTIN_LLM_FAMILIES
-            + user_defined_llm_families
-        )
+        all_families = BUILTIN_CSGHUB_LLM_FAMILIES + base_families
     else:
-        all_families = BUILTIN_LLM_FAMILIES + user_defined_llm_families
+        all_families = base_families
 
     for family in all_families:
         if model_name != family.model_name:
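The refactor collapses the per-hub `if/elif` branches into a shared base list plus a hub lookup table. A self-contained sketch of that pattern, with placeholder family lists and the `download_from_*()` environment checks elided:

```python
# Self-contained sketch of the lookup-table selection the refactor adopts.
# Family lists are placeholders; in xinference they are module-level globals.
BUILTIN_LLM_FAMILIES = ["llama-3", "qwen2.5-instruct"]
BUILTIN_MODELSCOPE_LLM_FAMILIES = ["qwen2.5-instruct"]
BUILTIN_OPENMIND_HUB_LLM_FAMILIES = []
BUILTIN_CSGHUB_LLM_FAMILIES = []
user_defined_llm_families = []


def candidate_families(download_hub=None):
    base_families = BUILTIN_LLM_FAMILIES + user_defined_llm_families
    hub_families_map = {
        "modelscope": BUILTIN_MODELSCOPE_LLM_FAMILIES,
        "openmind_hub": BUILTIN_OPENMIND_HUB_LLM_FAMILIES,
        "csghub": BUILTIN_CSGHUB_LLM_FAMILIES,
    }
    if download_hub == "huggingface":
        return base_families
    if download_hub in hub_families_map:
        return hub_families_map[download_hub] + base_families
    return base_families


print(candidate_families("modelscope"))
```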
xinference/model/llm/llm_family_modelscope.json CHANGED

@@ -6673,5 +6673,54 @@
             "<|user|>",
             "<|observation|>"
         ]
+    },
+    {
+        "version": 1,
+        "context_length": 32768,
+        "model_name": "QvQ-72B-Preview",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "chat",
+            "vision"
+        ],
+        "model_description": "QVQ-72B-Preview is an experimental research model developed by the Qwen team, focusing on enhancing visual reasoning capabilities.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 72,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "Qwen/QVQ-72B-Preview",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "mlx",
+                "model_size_in_billions": 72,
+                "quantizations": [
+                    "3bit",
+                    "4bit",
+                    "6bit",
+                    "8bit",
+                    "bf16"
+                ],
+                "model_id": "mlx-community/QVQ-72B-Preview-{quantization}",
+                "model_hub": "modelscope"
+            }
+        ],
+        "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
+        "stop_token_ids": [
+            151645,
+            151643
+        ],
+        "stop": [
+            "<|im_end|>",
+            "<|endoftext|>"
+        ]
     }
 ]
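With the family registered for both Hugging Face and ModelScope, QvQ-72B-Preview should be launchable through the usual client call. A hedged sketch; the endpoint, engine, format, and quantization below are illustrative choices, not values mandated by the diff:

```python
# Hedged sketch: launching the newly registered QvQ-72B-Preview family.
# Endpoint URL, engine, format, and quantization are assumptions for illustration.
from xinference.client import Client

client = Client("http://127.0.0.1:9997")
model_uid = client.launch_model(
    model_name="QvQ-72B-Preview",
    model_engine="transformers",
    model_format="pytorch",
    model_size_in_billions=72,
    quantization="4-bit",
)
model = client.get_model(model_uid)
# model.chat(...) then accepts OpenAI-style messages, including image parts,
# matching the "chat" and "vision" abilities declared in the spec.
```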
xinference/model/llm/mlx/core.py CHANGED

@@ -173,7 +173,9 @@ class MLXModel(LLM):
             return False
         return True
 
-    def _get_prompt_cache(self, prompt, lora_name: Optional[str] = None):
+    def _get_prompt_cache(
+        self, prompt, lora_name: Optional[str] = None, model: Any = None
+    ):
         from mlx_lm.models.cache import make_prompt_cache
 
         assert self._prompt_cache is not None
@@ -185,7 +187,9 @@ class MLXModel(LLM):
             or self._prompt_cache.tokens != prompt[:cache_len]
         ):
             self._prompt_cache.model_key = model_key
-            self._prompt_cache.cache = make_prompt_cache(self._model, self._max_kv_size)
+            self._prompt_cache.cache = make_prompt_cache(
+                model or self._model, self._max_kv_size
+            )
             self._prompt_cache.tokens = []
             logger.debug("Making new prompt cache for %s", self.model_uid)
         else:
@@ -458,6 +462,8 @@ class MLXVisionModel(MLXModel, ChatModelMixin):
 
             raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
 
+        self._prompt_cache = PromptCache()
+
         return load(self.model_path)
 
     def load(self):
@@ -471,13 +477,52 @@ class MLXVisionModel(MLXModel, ChatModelMixin):
         self._model, self._processor = self._load_model(**kwargs)
         self._tokenizer = self._processor.tokenizer
 
+    def _generate_stream_inner_no_image(self, **kwargs):
+        import mlx.nn as nn
+        from mlx_lm.utils import make_sampler, stream_generate
+
+        # For mlx-lm, the model(inputs) will return logits,
+        # but the language model in mlx-vlm will return an object
+        # https://github.com/Blaizzy/mlx-vlm/blob/3f5e1620072440afb7496940f67ac1c7fc64056f/mlx_vlm/models/base.py#L260
+        # so we cannot pass the language model to stream_generate directly
+        # we wrap here to just let model(inputs) return logits to pass stream_generate
+        class ModelWrapper(nn.Module):
+            def __init__(self, model):
+                super().__init__()
+                self._model = model.language_model
+
+            @property
+            def layers(self):
+                return self._model.layers
+
+            def __call__(self, *args, **kwargs):
+                return self._model(*args, **kwargs).logits
+
+        sampler = make_sampler(
+            temp=kwargs.pop("temperature"), top_p=kwargs.pop("top_p")
+        )
+        prompt_token_ids = kwargs.pop("prompt_token_ids")
+        yield from stream_generate(
+            ModelWrapper(self._model),
+            self._tokenizer,
+            prompt_token_ids,
+            sampler=sampler,
+            **kwargs,
+        )
+
     def _generate_stream_inner(self, **kwargs):
         import mlx.core as mx
         from mlx_lm.utils import GenerationResponse
         from mlx_vlm.utils import generate_step
 
-        max_tokens = kwargs.pop("max_tokens")
         inputs = kwargs["prompt_token_ids"]
+
+        if not isinstance(inputs, tuple):
+            # no images
+            yield from self._generate_stream_inner_no_image(**kwargs)
+            return
+
+        max_tokens = kwargs.pop("max_tokens")
         input_ids, pixel_values, mask = inputs[:3]
 
         kwargs = {
@@ -549,16 +594,26 @@ class MLXVisionModel(MLXModel, ChatModelMixin):
         else:
             image_token_index = None
 
-        inputs = prepare_inputs(
-            None,
-            self._processor,
-            images,
-            prompt_str,
-            image_token_index,
-            kwargs.get("resize_shape"),
-        )
-        input_ids = inputs[0]
-        return inputs, len(input_ids)
+        if not images:
+            prompt = prompt["prompt"]  # type: ignore
+            prompt_token_ids = self._tokenizer.encode(prompt)
+            prompt_token_ids = self._get_prompt_cache(
+                prompt_token_ids,
+                kwargs.get("lora_name"),
+                model=self._model.language_model,
+            )
+            return prompt_token_ids, len(prompt_token_ids)
+        else:
+            inputs = prepare_inputs(
+                None,
+                self._processor,
+                images,
+                prompt_str,
+                image_token_index,
+                kwargs.get("resize_shape"),
+            )
+            input_ids = inputs[0]
+            return inputs, len(input_ids)
 
     def chat(
         self,
xinference/model/llm/transformers/qwen2_vl.py CHANGED

@@ -47,6 +47,8 @@ class Qwen2VLChatModel(PytorchChatModel):
         llm_family = model_family.model_family or model_family.model_name
         if "qwen2-vl-instruct".lower() in llm_family.lower():
             return True
+        if "qvq-72b-preview".lower() in llm_family.lower():
+            return True
         return False
 
     def load(self):
xinference/model/llm/utils.py CHANGED

xinference/model/llm/vllm/core.py CHANGED

@@ -70,6 +70,7 @@ class VLLMModelConfig(TypedDict, total=False):
     max_model_len: Optional[int]
     limit_mm_per_prompt: Optional[Dict[str, int]]
     guided_decoding_backend: Optional[str]
+    scheduling_policy: Optional[str]
 
 
 class VLLMGenerateConfig(TypedDict, total=False):
@@ -187,10 +188,14 @@ if VLLM_INSTALLED and vllm.__version__ > "0.5.3":
|
|
|
187
188
|
if VLLM_INSTALLED and vllm.__version__ >= "0.6.1":
|
|
188
189
|
VLLM_SUPPORTED_VISION_MODEL_LIST.append("internvl2")
|
|
189
190
|
|
|
191
|
+
if VLLM_INSTALLED and vllm.__version__ >= "0.6.2":
|
|
192
|
+
VLLM_SUPPORTED_CHAT_MODELS.append("minicpm3-4b")
|
|
193
|
+
|
|
190
194
|
if VLLM_INSTALLED and vllm.__version__ >= "0.6.3":
|
|
191
195
|
VLLM_SUPPORTED_MODELS.append("llama-3.2-vision")
|
|
192
196
|
VLLM_SUPPORTED_VISION_MODEL_LIST.append("llama-3.2-vision-instruct")
|
|
193
197
|
VLLM_SUPPORTED_VISION_MODEL_LIST.append("qwen2-vl-instruct")
|
|
198
|
+
VLLM_SUPPORTED_VISION_MODEL_LIST.append("QvQ-72B-Preview")
|
|
194
199
|
|
|
195
200
|
|
|
196
201
|
class VLLMModel(LLM):
|
|
@@ -244,7 +249,6 @@ class VLLMModel(LLM):
             multiprocessing.set_start_method("fork", force=True)
 
         self._model_config = self._sanitize_model_config(self._model_config)
-
         if self.lora_modules is None:
             self.lora_requests = []
         else:
@@ -327,7 +331,9 @@ class VLLMModel(LLM):
         model_config.setdefault("quantization", None)
         model_config.setdefault("max_model_len", None)
         model_config.setdefault("guided_decoding_backend", "outlines")
-
+        # Add scheduling policy if vLLM version is 0.6.3 or higher
+        if vllm.__version__ >= "0.6.3":
+            model_config.setdefault("scheduling_policy", "fcfs")
         return model_config
 
     @staticmethod
@@ -859,6 +865,9 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
                 "image": 2,  # default 2 images all chat
             }
         )
+        # Add scheduling policy if vLLM version is 0.6.3 or higher
+        if vllm.__version__ >= "0.6.3":
+            model_config.setdefault("scheduling_policy", "fcfs")
 
         return model_config
 
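`scheduling_policy` is only defaulted when the installed vLLM is at least 0.6.3 (the release that introduced priority scheduling), so older engines never receive an unknown argument. A hedged sketch of overriding the default at launch time; the endpoint and model choice are illustrative:

```python
# Hedged sketch: extra launch kwargs are forwarded into VLLMModelConfig, so the
# "fcfs" default added above can be overridden per model. Requires vLLM >= 0.6.3.
from xinference.client import Client

client = Client("http://127.0.0.1:9997")
client.launch_model(
    model_name="qwen2.5-instruct",
    model_engine="vLLM",
    model_format="pytorch",
    model_size_in_billions=7,
    scheduling_policy="priority",  # default is "fcfs"
)
```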