xinference 1.6.1__py3-none-any.whl → 1.7.0.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (76)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +79 -2
  3. xinference/client/restful/restful_client.py +64 -2
  4. xinference/core/media_interface.py +123 -0
  5. xinference/core/model.py +31 -0
  6. xinference/core/supervisor.py +8 -17
  7. xinference/core/worker.py +5 -17
  8. xinference/deploy/cmdline.py +6 -2
  9. xinference/model/audio/chattts.py +24 -39
  10. xinference/model/audio/cosyvoice.py +18 -30
  11. xinference/model/audio/funasr.py +42 -0
  12. xinference/model/audio/model_spec.json +18 -0
  13. xinference/model/audio/model_spec_modelscope.json +19 -1
  14. xinference/model/audio/utils.py +75 -0
  15. xinference/model/core.py +1 -0
  16. xinference/model/embedding/__init__.py +74 -18
  17. xinference/model/embedding/core.py +98 -597
  18. xinference/model/embedding/embed_family.py +133 -0
  19. xinference/model/embedding/flag/__init__.py +13 -0
  20. xinference/model/embedding/flag/core.py +282 -0
  21. xinference/model/embedding/model_spec.json +24 -0
  22. xinference/model/embedding/model_spec_modelscope.json +24 -0
  23. xinference/model/embedding/sentence_transformers/__init__.py +13 -0
  24. xinference/model/embedding/sentence_transformers/core.py +399 -0
  25. xinference/model/embedding/vllm/__init__.py +0 -0
  26. xinference/model/embedding/vllm/core.py +95 -0
  27. xinference/model/image/model_spec.json +20 -2
  28. xinference/model/image/model_spec_modelscope.json +21 -2
  29. xinference/model/image/stable_diffusion/core.py +144 -53
  30. xinference/model/llm/llama_cpp/memory.py +4 -2
  31. xinference/model/llm/llm_family.json +57 -0
  32. xinference/model/llm/llm_family_modelscope.json +61 -0
  33. xinference/model/llm/sglang/core.py +4 -0
  34. xinference/model/llm/utils.py +11 -0
  35. xinference/model/llm/vllm/core.py +3 -0
  36. xinference/model/rerank/core.py +96 -4
  37. xinference/model/rerank/model_spec.json +24 -0
  38. xinference/model/rerank/model_spec_modelscope.json +24 -0
  39. xinference/model/rerank/utils.py +4 -3
  40. xinference/model/utils.py +38 -1
  41. xinference/model/video/diffusers.py +65 -3
  42. xinference/model/video/model_spec.json +31 -4
  43. xinference/model/video/model_spec_modelscope.json +32 -4
  44. xinference/web/ui/build/asset-manifest.json +6 -6
  45. xinference/web/ui/build/index.html +1 -1
  46. xinference/web/ui/build/static/css/main.013f296b.css +2 -0
  47. xinference/web/ui/build/static/css/main.013f296b.css.map +1 -0
  48. xinference/web/ui/build/static/js/main.8a9e3ba0.js +3 -0
  49. xinference/web/ui/build/static/js/main.8a9e3ba0.js.map +1 -0
  50. xinference/web/ui/node_modules/.cache/babel-loader/34cfbfb7836e136ba3261cfd411cc554bf99ba24b35dcceebeaa4f008cb3c9dc.json +1 -0
  51. xinference/web/ui/node_modules/.cache/babel-loader/55b9fb40b57fa926e8f05f31c2f96467e76e5ad62f033dca97c03f9e8c4eb4fe.json +1 -0
  52. xinference/web/ui/node_modules/.cache/babel-loader/6595880facebca7ceace6f17cf21c3a5a9219a2f52fb0ba9f3cf1131eddbcf6b.json +1 -0
  53. xinference/web/ui/node_modules/.cache/babel-loader/aa998bc2d9c11853add6b8a2e08f50327f56d8824ccaaec92d6dde1b305f0d85.json +1 -0
  54. xinference/web/ui/node_modules/.cache/babel-loader/c748246b1d7bcebc16153be69f37e955bb2145526c47dd425aeeff70d3004dbc.json +1 -0
  55. xinference/web/ui/node_modules/.cache/babel-loader/e31234e95d60a5a7883fbcd70de2475dc1c88c90705df1a530abb68f86f80a51.json +1 -0
  56. xinference/web/ui/src/locales/en.json +18 -7
  57. xinference/web/ui/src/locales/ja.json +224 -0
  58. xinference/web/ui/src/locales/ko.json +224 -0
  59. xinference/web/ui/src/locales/zh.json +18 -7
  60. {xinference-1.6.1.dist-info → xinference-1.7.0.post1.dist-info}/METADATA +9 -8
  61. {xinference-1.6.1.dist-info → xinference-1.7.0.post1.dist-info}/RECORD +66 -57
  62. xinference/web/ui/build/static/css/main.337afe76.css +0 -2
  63. xinference/web/ui/build/static/css/main.337afe76.css.map +0 -1
  64. xinference/web/ui/build/static/js/main.ddf9eaee.js +0 -3
  65. xinference/web/ui/build/static/js/main.ddf9eaee.js.map +0 -1
  66. xinference/web/ui/node_modules/.cache/babel-loader/12e02ee790dbf57ead09a241a93bb5f893393aa36628ca741d44390e836a103f.json +0 -1
  67. xinference/web/ui/node_modules/.cache/babel-loader/12e637ed5fa9ca6491b03892b6949c03afd4960fe36ac25744488e7e1982aa19.json +0 -1
  68. xinference/web/ui/node_modules/.cache/babel-loader/77ac2665a784e99501ae95d32ef5937837a0439a47e965d291b38e99cb619f5b.json +0 -1
  69. xinference/web/ui/node_modules/.cache/babel-loader/d4ed4e82bfe69915999ec83f5feaa4301c75ecc6bdf1c78f2d03e4671ecbefc8.json +0 -1
  70. xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +0 -1
  71. xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +0 -1
  72. /xinference/web/ui/build/static/js/{main.ddf9eaee.js.LICENSE.txt → main.8a9e3ba0.js.LICENSE.txt} +0 -0
  73. {xinference-1.6.1.dist-info → xinference-1.7.0.post1.dist-info}/WHEEL +0 -0
  74. {xinference-1.6.1.dist-info → xinference-1.7.0.post1.dist-info}/entry_points.txt +0 -0
  75. {xinference-1.6.1.dist-info → xinference-1.7.0.post1.dist-info}/licenses/LICENSE +0 -0
  76. {xinference-1.6.1.dist-info → xinference-1.7.0.post1.dist-info}/top_level.txt +0 -0
@@ -22,6 +22,7 @@ import logging
 import os
 import re
 import sys
+import warnings
 from glob import glob
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 
@@ -197,8 +198,6 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         return getattr(module, class_name)
 
     def load(self):
-        from transformers import BitsAndBytesConfig, T5EncoderModel
-
         if "text2image" in self._abilities or "image2image" in self._abilities:
             from diffusers import AutoPipelineForText2Image as AutoPipelineModel
         elif "inpainting" in self._abilities:
@@ -227,58 +226,15 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
                 self._get_controlnet_model(*cn) for cn in controlnet
             ]
 
+        # quantizations
+        # text_encoder
         quantize_text_encoder = self._kwargs.pop("quantize_text_encoder", None)
-        if quantize_text_encoder and not self._gguf_model_path:
-            try:
-                import bitsandbytes  # noqa: F401
-            except ImportError:
-                error_message = "Failed to import module 'bitsandbytes'"
-                installation_guide = [
-                    "Please make sure 'bitsandbytes' is installed. ",
-                    "You can install it by `pip install bitsandbytes`\n",
-                ]
-
-                raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
-
-            for text_encoder_name in quantize_text_encoder.split(","):
-                quantization_config = BitsAndBytesConfig(load_in_8bit=True)
-                quantization_kwargs = {}
-                if torch_dtype:
-                    quantization_kwargs["torch_dtype"] = torch_dtype
-                text_encoder = T5EncoderModel.from_pretrained(
-                    self._model_path,
-                    subfolder=text_encoder_name,
-                    quantization_config=quantization_config,
-                    **quantization_kwargs,
-                )
-                self._kwargs[text_encoder_name] = text_encoder
-                self._kwargs["device_map"] = "balanced"
-
+        self._quantize_text_encoder(quantize_text_encoder)
+        # transformer
         if self._gguf_model_path:
-            from diffusers import GGUFQuantizationConfig
-
-            # GGUF transformer
-            self._kwargs["transformer"] = self._get_layer_cls(
-                "transformer"
-            ).from_single_file(
-                self._gguf_model_path,
-                quantization_config=GGUFQuantizationConfig(compute_dtype=torch_dtype),
-                torch_dtype=torch_dtype,
-                config=os.path.join(self._model_path, "transformer"),
-            )
-        elif self._kwargs.get("transformer_nf4"):
-            nf4_config = BitsAndBytesConfig(
-                load_in_4bit=True,
-                bnb_4bit_quant_type="nf4",
-                bnb_4bit_compute_dtype=torch_dtype,
-            )
-            model_nf4 = self._get_layer_cls("transformer").from_pretrained(
-                self._model_path,
-                subfolder="transformer",
-                quantization_config=nf4_config,
-                torch_dtype=torch_dtype,
-            )
-            self._kwargs["transformer"] = model_nf4
+            self._quantize_transformer_gguf()
+        else:
+            self._quantize_transformer()
 
         logger.debug(
             "Loading model from %s, kwargs: %s", self._model_path, self._kwargs
@@ -308,6 +264,133 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
                 cache_branch_id=self._kwargs.get("deepcache_cache_branch_id", 0),
             )
 
+    def _get_quantize_config(self, method: str, quantization: str, module: str):
+        if method == "bnb":
+            try:
+                import bitsandbytes  # noqa: F401
+            except ImportError:
+                error_message = "Failed to import module 'bitsandbytes'"
+                installation_guide = [
+                    "Please make sure 'bitsandbytes' is installed. ",
+                    "You can install it by `pip install bitsandbytes`\n",
+                ]
+
+                raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+            if module.startswith("diffusers."):
+                from diffusers import BitsAndBytesConfig
+            else:
+                assert module.startswith("transformers.")
+                from transformers import BitsAndBytesConfig
+
+            if quantization == "4-bit":
+                return BitsAndBytesConfig(load_in_4bit=True)
+            elif quantization == "8-bit":
+                return BitsAndBytesConfig(load_in_8bit=True)
+            elif quantization == "nf4":
+                return BitsAndBytesConfig(
+                    load_in_4bit=True,
+                    bnb_4bit_quant_type="nf4",
+                    bnb_4bit_compute_dtype=self._torch_dtype,
+                )
+        elif method == "torchao":
+            try:
+                import torchao  # noqa: F401
+            except ImportError:
+                error_message = "Failed to import module 'torchao'"
+                installation_guide = [
+                    "Please make sure 'torchao' is installed. ",
+                    "You can install it by `pip install torchao`\n",
+                ]
+
+                raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+            if module.startswith("diffusers."):
+                from diffusers import TorchAoConfig
+            else:
+                assert module.startswith("transformers.")
+                from transformers import TorchAoConfig
+
+            return TorchAoConfig(quantization)
+        else:
+            raise ValueError(f"Unknown quantization method for image model: {method}")
+
+    def _quantize_text_encoder(self, quantize_text_encoder: Optional[str]):
+        if self._gguf_model_path:
+            # skip quantization when gguf applied to transformer
+            return
+
+        if not quantize_text_encoder:
+            return
+
+        quantization_method = self._kwargs.pop("text_encoder_quantize_method", "bnb")
+        quantization = self._kwargs.pop("text_encoder_quantization", "8-bit")
+
+        torch_dtype = self._torch_dtype
+        for text_encoder_name in quantize_text_encoder.split(","):
+            quantization_kwargs: Dict[str, Any] = {}
+            if torch_dtype:
+                quantization_kwargs["torch_dtype"] = torch_dtype
+            text_encoder_cls = self._get_layer_cls(text_encoder_name)
+            quantization_config = self._get_quantize_config(
+                quantization_method, quantization, text_encoder_cls.__module__
+            )
+            text_encoder = text_encoder_cls.from_pretrained(
+                self._model_path,
+                subfolder=text_encoder_name,
+                quantization_config=quantization_config,
+                **quantization_kwargs,
+            )
+            self._kwargs[text_encoder_name] = text_encoder
+        else:
+            if not self._kwargs.get("device_map"):
+                self._kwargs["device_map"] = "balanced"
+
+    def _quantize_transformer(self):
+        quantization = None
+        nf4 = self._kwargs.pop("transformer_nf4", None)
+        if nf4:
+            warnings.warn(
+                "`transformer_nf4` is deprecated, please use `transformer_quantization=nf4`",
+                category=DeprecationWarning,
+                stacklevel=2,
+            )
+            quantization = "nf4"
+        method = self._kwargs.pop("transformer_quantize_method", "bnb")
+        if not quantization:
+            quantization = self._kwargs.pop("transformer_quantization", None)
+
+        if not quantization:
+            # skip if no quantization specified
+            return
+
+        torch_dtype = self._torch_dtype
+        transformer_cls = self._get_layer_cls("transformer")
+        quantization_config = self._get_quantize_config(
+            method, quantization, transformer_cls.__module__
+        )
+        transformer_model = transformer_cls.from_pretrained(
+            self._model_path,
+            subfolder="transformer",
+            quantization_config=quantization_config,
+            torch_dtype=torch_dtype,
+        )
+        self._kwargs["transformer"] = transformer_model
+
+    def _quantize_transformer_gguf(self):
+        from diffusers import GGUFQuantizationConfig
+
+        # GGUF transformer
+        torch_dtype = self._torch_dtype
+        self._kwargs["transformer"] = self._get_layer_cls(
+            "transformer"
+        ).from_single_file(
+            self._gguf_model_path,
+            quantization_config=GGUFQuantizationConfig(compute_dtype=torch_dtype),
+            torch_dtype=torch_dtype,
+            config=os.path.join(self._model_path, "transformer"),
+        )
+
     def _load_to_device(self, model):
         if self._kwargs.get("cpu_offload", False):
             logger.debug("CPU offloading model")
@@ -321,7 +404,15 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         if self._kwargs.get("attention_slicing", False):
             model.enable_attention_slicing()
         if self._kwargs.get("vae_tiling", False):
-            model.enable_vae_tiling()
+            try:
+                model.enable_vae_tiling()
+            except AttributeError:
+                model.vae.enable_tiling()
+        if self._kwargs.get("vae_slicing", False):
+            try:
+                model.enable_vae_slicing()
+            except AttributeError:
+                model.vae.enable_slicing()
 
     def get_max_num_images_for_batching(self):
         return self._kwargs.get("max_num_images", 16)
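
Note on usage: the refactor above replaces the hard-coded 8-bit T5 text-encoder path and the `transformer_nf4` flag with configurable kwargs (`text_encoder_quantize_method`, `text_encoder_quantization`, `transformer_quantize_method`, `transformer_quantization`, plus the new `vae_slicing` switch). Below is a minimal sketch of how these might be passed when launching an image model; only the kwarg names come from the diff, while the client call, endpoint, and model name are assumptions.

# Sketch only: assumes extra launch kwargs end up in DiffusionModel._kwargs.
from xinference.client import Client

client = Client("http://127.0.0.1:9997")  # assumed endpoint
model_uid = client.launch_model(
    model_name="FLUX.1-dev",                 # assumed diffusers-based image model
    model_type="image",
    quantize_text_encoder="text_encoder_2",  # which text encoder(s) to quantize
    text_encoder_quantize_method="bnb",      # "bnb" or "torchao" per _get_quantize_config
    text_encoder_quantization="8-bit",       # "4-bit" / "8-bit" / "nf4" with bnb
    transformer_quantize_method="bnb",
    transformer_quantization="nf4",          # replaces the deprecated transformer_nf4=True
    cpu_offload=True,
    vae_tiling=True,
    vae_slicing=True,                        # new switch added in this release
)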
@@ -17,8 +17,10 @@ from collections.abc import Sequence
 from dataclasses import dataclass
 from typing import Any
 
-from gguf import GGUFReader, GGUFValueType  # noqa: E402
-
+try:
+    from gguf import GGUFReader, GGUFValueType  # noqa: E402
+except ImportError:
+    GGUFReader = GGUFValueType = None
 logger = logging.getLogger(__name__)
 
 
@@ -6142,6 +6142,53 @@
       "</s>"
     ]
   },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "minicpm4",
+    "model_lang": [
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "MiniCPM4 series are highly efficient large language models (LLMs) designed explicitly for end-side devices, which achieves this efficiency through systematic innovation in four key dimensions: model architecture, training data, training algorithms, and inference systems.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "JunHowie/MiniCPM4-0.5B"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "JunHowie/MiniCPM4-8B"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "4bit"
+        ],
+        "model_id": "mlx-community/MiniCPM4-8B-4bit"
+      }
+    ],
+    "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+    "stop_token_ids": [
+      2,
+      73440
+    ],
+    "stop": [
+      "</s>",
+      "<|im_end|>"
+    ]
+  },
   {
     "version": 1,
     "context_length": 32768,
@@ -6737,6 +6784,16 @@
           "none"
         ],
         "model_id": "deepseek-ai/DeepSeek-R1-0528"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 671,
+        "quantizations": [
+          "Int4-Int8Mix-Lite",
+          "Int4-Int8Mix-Compact",
+          "Int4-Int8Mix-Medium"
+        ],
+        "model_id": "QuantTrio/DeepSeek-R1-0528-GPTQ-{quantization}"
       }
     ],
  "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\\n\\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{ bos_token }}{{ ns.system_prompt }}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' in message %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls'] %}{%- if not ns.is_first %}{%- if message['content'] is none %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- else %}{{'<|Assistant|>' + message['content'] + '<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- endfor %}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' not in message %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}",
@@ -4277,6 +4277,56 @@
       "</s>"
     ]
   },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "minicpm4",
+    "model_lang": [
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "MiniCPM4 series are highly efficient large language models (LLMs) designed explicitly for end-side devices, which achieves this efficiency through systematic innovation in four key dimensions: model architecture, training data, training algorithms, and inference systems.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "JunHowie/MiniCPM4-0.5B",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "JunHowie/MiniCPM4-8B",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "4bit"
+        ],
+        "model_id": "mlx-community/MiniCPM4-8B-4bit",
+        "model_hub": "modelscope"
+      }
+    ],
+    "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+    "stop_token_ids": [
+      2,
+      73440
+    ],
+    "stop": [
+      "</s>",
+      "<|im_end|>"
+    ]
+  },
   {
     "version": 1,
     "context_length": 32768,
@@ -4883,6 +4933,17 @@
         ],
         "model_id": "deepseek-ai/DeepSeek-R1-0528",
        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 671,
+        "quantizations": [
+          "Int4-Int8Mix-Lite",
+          "Int4-Int8Mix-Compact",
+          "Int4-Int8Mix-Medium"
+        ],
+        "model_id": "tclf90/DeepSeek-R1-0528-GPTQ-{quantization}",
+        "model_hub": "modelscope"
       }
     ],
  "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\\n\\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{ bos_token }}{{ ns.system_prompt }}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' in message %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls'] %}{%- if not ns.is_first %}{%- if message['content'] is none %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- else %}{{'<|Assistant|>' + message['content'] + '<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- endfor %}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' not in message %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}",
@@ -14,6 +14,7 @@
 import importlib.util
 import json
 import logging
+import multiprocessing
 import sys
 import threading
 import time
@@ -188,6 +189,9 @@ class SGLANGModel(LLM):
         if sgl_port is None:
             raise ValueError("Failed to find a port for sglang")
 
+        # fork may cause sglang stuck, force set to spawn
+        multiprocessing.set_start_method("spawn")
+
         if self._n_worker > 1:
             # distributed inference
             self._model_config["nnodes"] = self._n_worker
@@ -709,6 +709,12 @@ class ChatModelMixin:
         finish_reason = "tool_calls" if tool_calls else "stop"
 
         content = ". ".join(failed_contents) if failed_contents else None
+
+        # fix: qwen tool_call content field return null
+        family = model_family.model_family or model_family.model_name
+        if tool_calls and family in QWEN_TOOL_CALL_FAMILY and content is None:
+            content = ""
+
         d = {
             "role": "assistant",
             "content": content,
@@ -779,6 +785,11 @@ class ChatModelMixin:
             failed_contents.append(content)
         finish_reason = "tool_calls" if tool_calls else "stop"
 
+        # fix: qwen tool_call content field return null
+        family = model_family.model_family or model_family.model_name
+        if tool_calls and family in QWEN_TOOL_CALL_FAMILY and content is None:
+            content = ""
+
         m = {
             "role": "assistant",
             "content": content,
@@ -252,6 +252,9 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.8.4":
 if VLLM_INSTALLED and vllm.__version__ >= "0.8.5":
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen3")
 
+if VLLM_INSTALLED and vllm.__version__ >= "0.9.1":
+    VLLM_SUPPORTED_CHAT_MODELS.append("minicpm4")
+
 
 class VLLMModel(LLM):
     def __init__(
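
With the JSON specs above and this vLLM gate (vllm >= 0.9.1), the new minicpm4 family can be launched like any other built-in LLM. A hedged sketch follows; the endpoint and prompt are assumptions, while the spec values come from the diff.

from xinference.client import Client

client = Client("http://127.0.0.1:9997")  # assumed endpoint
uid = client.launch_model(
    model_name="minicpm4",
    model_type="LLM",
    model_format="pytorch",
    model_size_in_billions=8,
    quantization="none",
)
model = client.get_model(uid)
print(model.chat(messages=[{"role": "user", "content": "Hello"}]))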
@@ -14,6 +14,7 @@
 
 import gc
 import importlib
+import importlib.util
 import logging
 import os
 import threading
@@ -31,6 +32,7 @@ from ...device_utils import empty_cache
 from ...types import Document, DocumentObj, Rerank, RerankTokens
 from ..core import CacheableModelSpec, ModelDescription, VirtualEnvSettings
 from ..utils import is_model_cached
+from .utils import preprocess_sentence
 
 logger = logging.getLogger(__name__)
 
@@ -201,7 +203,10 @@ class RerankModel:
             )
             self._use_fp16 = True
 
-        if self._model_spec.type == "normal":
+        if (
+            self._model_spec.type == "normal"
+            and "qwen3" not in self._model_spec.model_name.lower()
+        ):
             try:
                 import sentence_transformers
                 from sentence_transformers.cross_encoder import CrossEncoder
@@ -229,6 +234,74 @@ class RerankModel:
                 )
             if self._use_fp16:
                 self._model.model.half()
+        elif "qwen3" in self._model_spec.model_name.lower():
+            # qwen3-reranker
+            # now we use transformers
+            # TODO: support engines for rerank models
+            try:
+                from transformers import AutoModelForCausalLM, AutoTokenizer
+            except ImportError:
+                error_message = "Failed to import module 'transformers'"
+                installation_guide = [
+                    "Please make sure 'transformers' is installed. ",
+                    "You can install it by `pip install transformers`\n",
+                ]
+
+                raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+            tokenizer = AutoTokenizer.from_pretrained(
+                self._model_path, padding_side="left"
+            )
+            flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
+            model_kwargs = {"device_map": "auto"}
+            if flash_attn_installed:
+                model_kwargs["attn_implementation"] = "flash_attention_2"
+                model_kwargs["torch_dtype"] = torch.float16
+            model = self._model = AutoModelForCausalLM.from_pretrained(
+                self._model_path, **model_kwargs
+            ).eval()
+            max_length = getattr(self._model_spec, "max_tokens")
+
+            prefix = (
+                "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query "
+                'and the Instruct provided. Note that the answer can only be "yes" or "no".'
+                "<|im_end|>\n<|im_start|>user\n"
+            )
+            suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
+            prefix_tokens = tokenizer.encode(prefix, add_special_tokens=False)
+            suffix_tokens = tokenizer.encode(suffix, add_special_tokens=False)
+
+            def process_inputs(pairs):
+                inputs = tokenizer(
+                    pairs,
+                    padding=False,
+                    truncation="longest_first",
+                    return_attention_mask=False,
+                    max_length=max_length - len(prefix_tokens) - len(suffix_tokens),
+                )
+                for i, ele in enumerate(inputs["input_ids"]):
+                    inputs["input_ids"][i] = prefix_tokens + ele + suffix_tokens
+                inputs = tokenizer.pad(
+                    inputs, padding=True, return_tensors="pt", max_length=max_length
+                )
+                for key in inputs:
+                    inputs[key] = inputs[key].to(model.device)
+                return inputs
+
+            token_false_id = tokenizer.convert_tokens_to_ids("no")
+            token_true_id = tokenizer.convert_tokens_to_ids("yes")
+
+            def compute_logits(inputs, **kwargs):
+                batch_scores = model(**inputs).logits[:, -1, :]
+                true_vector = batch_scores[:, token_true_id]
+                false_vector = batch_scores[:, token_false_id]
+                batch_scores = torch.stack([false_vector, true_vector], dim=1)
+                batch_scores = torch.nn.functional.log_softmax(batch_scores, dim=1)
+                scores = batch_scores[:, 1].exp().tolist()
+                return scores
+
+            self.process_inputs = process_inputs
+            self.compute_logits = compute_logits
         else:
             try:
                 if self._model_spec.type == "LLM-based":
@@ -266,15 +339,17 @@ class RerankModel:
             raise ValueError("rerank hasn't support `max_chunks_per_doc` parameter.")
         logger.info("Rerank with kwargs: %s, model: %s", kwargs, self._model)
 
-        from .utils import preprocess_sentence
-
         pre_query = preprocess_sentence(
             query, kwargs.get("instruction", None), self._model_spec.model_name
         )
         sentence_combinations = [[pre_query, doc] for doc in documents]
         # reset n tokens
         self._model.model.n_tokens = 0
-        if self._model_spec.type == "normal":
+        if (
+            self._model_spec.type == "normal"
+            and "qwen3" not in self._model_spec.model_name.lower()
+        ):
+            logger.debug("Passing processed sentences: %s", sentence_combinations)
             similarity_scores = self._model.predict(
                 sentence_combinations,
                 convert_to_numpy=False,
@@ -283,6 +358,23 @@ class RerankModel:
             ).cpu()
             if similarity_scores.dtype == torch.bfloat16:
                 similarity_scores = similarity_scores.float()
+        elif "qwen3" in self._model_spec.model_name.lower():
+
+            def format_instruction(instruction, query, doc):
+                if instruction is None:
+                    instruction = "Given a web search query, retrieve relevant passages that answer the query"
+                output = "<Instruct>: {instruction}\n<Query>: {query}\n<Document>: {doc}".format(
+                    instruction=instruction, query=query, doc=doc
+                )
+                return output
+
+            pairs = [
+                format_instruction(kwargs.get("instruction", None), query, doc)
+                for doc in documents
+            ]
+            # Tokenize the input texts
+            inputs = self.process_inputs(pairs)
+            similarity_scores = self.compute_logits(inputs)
         else:
             # Related issue: https://github.com/xorbitsai/inference/issues/1775
             similarity_scores = self._model.compute_score(
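
For the Qwen3 rerankers registered in the specs below, scores come from the "yes"/"no" token logits at the last position rather than a CrossEncoder, and an optional `instruction` kwarg feeds the <Instruct> field. A usage sketch follows; the endpoint, documents, and query are assumptions, and whether extra kwargs such as `instruction` are forwarded through the REST client is also an assumption.

from xinference.client import Client

client = Client("http://127.0.0.1:9997")  # assumed endpoint
uid = client.launch_model(model_name="Qwen3-Reranker-0.6B", model_type="rerank")
reranker = client.get_model(uid)
result = reranker.rerank(
    documents=["Paris is the capital of France.", "Berlin is in Germany."],
    query="What is the capital of France?",
)
print(result["results"])  # per-document relevance, P("yes") per compute_logits above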
@@ -62,5 +62,29 @@
     "max_tokens": 1024,
     "model_id": "openbmb/MiniCPM-Reranker",
     "model_revision": "5d2fd7345b6444c89d4c0fa59c92272888f3f2d0"
+  },
+  {
+    "model_name": "Qwen3-Reranker-0.6B",
+    "type": "normal",
+    "language": ["en", "zh"],
+    "max_tokens": 40960,
+    "model_id": "Qwen/Qwen3-Reranker-0.6B",
+    "model_revision": "6e9e69830b95c52b5fd889b7690dda3329508de3"
+  },
+  {
+    "model_name": "Qwen3-Reranker-4B",
+    "type": "normal",
+    "language": ["en", "zh"],
+    "max_tokens": 40960,
+    "model_id": "Qwen/Qwen3-Reranker-4B",
+    "model_revision": "f16fc5d5d2b9b1d0db8280929242745d79794ef5"
+  },
+  {
+    "model_name": "Qwen3-Reranker-8B",
+    "type": "normal",
+    "language": ["en", "zh"],
+    "max_tokens": 40960,
+    "model_id": "Qwen/Qwen3-Reranker-8B",
+    "model_revision": "5fa94080caafeaa45a15d11f969d7978e087a3db"
   }
 ]
@@ -57,5 +57,29 @@
     "max_tokens": 1024,
     "model_id": "OpenBMB/MiniCPM-Reranker",
     "model_hub": "modelscope"
+  },
+  {
+    "model_name": "Qwen3-Reranker-0.6B",
+    "type": "normal",
+    "language": ["en", "zh"],
+    "max_tokens": 40960,
+    "model_id": "Qwen/Qwen3-Reranker-0.6B",
+    "model_hub": "modelscope"
+  },
+  {
+    "model_name": "Qwen3-Reranker-4B",
+    "type": "normal",
+    "language": ["en", "zh"],
+    "max_tokens": 40960,
+    "model_id": "Qwen/Qwen3-Reranker-4B",
+    "model_hub": "modelscope"
+  },
+  {
+    "model_name": "Qwen3-Reranker-8B",
+    "type": "normal",
+    "language": ["en", "zh"],
+    "max_tokens": 40960,
+    "model_id": "Qwen/Qwen3-Reranker-8B",
+    "model_hub": "modelscope"
   }
 ]