xinference-0.15.1-py3-none-any.whl → xinference-0.15.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (34)
  1. xinference/_version.py +3 -3
  2. xinference/core/model.py +2 -2
  3. xinference/model/audio/cosyvoice.py +3 -3
  4. xinference/model/embedding/core.py +14 -5
  5. xinference/model/embedding/model_spec.json +7 -0
  6. xinference/model/embedding/model_spec_modelscope.json +9 -1
  7. xinference/model/image/stable_diffusion/core.py +42 -19
  8. xinference/model/llm/__init__.py +1 -1
  9. xinference/model/llm/llm_family.json +862 -26
  10. xinference/model/llm/llm_family_modelscope.json +895 -10
  11. xinference/model/llm/sglang/core.py +4 -0
  12. xinference/model/llm/utils.py +14 -3
  13. xinference/model/llm/vllm/core.py +27 -6
  14. xinference/model/llm/vllm/utils.py +42 -0
  15. xinference/model/rerank/core.py +19 -0
  16. xinference/model/rerank/model_spec.json +8 -0
  17. xinference/model/rerank/model_spec_modelscope.json +8 -0
  18. xinference/model/utils.py +0 -25
  19. xinference/web/ui/build/asset-manifest.json +3 -3
  20. xinference/web/ui/build/index.html +1 -1
  21. xinference/web/ui/build/static/js/{main.754740c0.js → main.e51a356d.js} +3 -3
  22. xinference/web/ui/build/static/js/main.e51a356d.js.map +1 -0
  23. xinference/web/ui/node_modules/.cache/babel-loader/4385c1095eefbff0a8ec3b2964ba6e5a66a05ab31be721483ca2f43e2a91f6ff.json +1 -0
  24. xinference/web/ui/node_modules/.cache/babel-loader/c7bf40bab396765f67d0fed627ed3665890608b2d0edaa3e8cb7cfc96310db45.json +1 -0
  25. {xinference-0.15.1.dist-info → xinference-0.15.3.dist-info}/METADATA +8 -7
  26. {xinference-0.15.1.dist-info → xinference-0.15.3.dist-info}/RECORD +31 -30
  27. xinference/web/ui/build/static/js/main.754740c0.js.map +0 -1
  28. xinference/web/ui/node_modules/.cache/babel-loader/68bede6d95bb5ef0b35bbb3ec5b8c937eaf6862c6cdbddb5ef222a7776aaf336.json +0 -1
  29. xinference/web/ui/node_modules/.cache/babel-loader/cd90b08d177025dfe84209596fc51878f8a86bcaa6a240848a3d2e5fd4c7ff24.json +0 -1
  30. /xinference/web/ui/build/static/js/{main.754740c0.js.LICENSE.txt → main.e51a356d.js.LICENSE.txt} +0 -0
  31. {xinference-0.15.1.dist-info → xinference-0.15.3.dist-info}/LICENSE +0 -0
  32. {xinference-0.15.1.dist-info → xinference-0.15.3.dist-info}/WHEEL +0 -0
  33. {xinference-0.15.1.dist-info → xinference-0.15.3.dist-info}/entry_points.txt +0 -0
  34. {xinference-0.15.1.dist-info → xinference-0.15.3.dist-info}/top_level.txt +0 -0

xinference/model/llm/sglang/core.py CHANGED
@@ -68,6 +68,8 @@ SGLANG_SUPPORTED_MODELS = [
     "llama-3.1",
     "mistral-v0.1",
     "mixtral-v0.1",
+    "qwen2.5",
+    "qwen2.5-coder",
 ]
 SGLANG_SUPPORTED_CHAT_MODELS = [
     "llama-2-chat",
@@ -85,6 +87,8 @@ SGLANG_SUPPORTED_CHAT_MODELS = [
     "deepseek-v2.5",
     "deepseek-v2-chat",
     "deepseek-v2-chat-0628",
+    "qwen2.5-instruct",
+    "qwen2.5-coder-instruct",
 ]

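For reference, these SGLang entries line up with the qwen2.5 additions to the vLLM lists further down. A minimal usage sketch against a locally running Xinference server, assuming the standard Python client; the endpoint, model size, format and the exact chat signature are illustrative and not taken from this diff:

from xinference.client import Client

# Hypothetical values: adjust endpoint, size and format to your deployment.
client = Client("http://127.0.0.1:9997")
model_uid = client.launch_model(
    model_name="qwen2.5-instruct",   # newly added to SGLANG_SUPPORTED_CHAT_MODELS
    model_engine="sglang",
    model_size_in_billions=7,
    model_format="pytorch",
)
model = client.get_model(model_uid)
# Chat call shown for illustration; the handle's signature may differ between client versions.
print(model.chat(messages=[{"role": "user", "content": "Hello"}]))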

xinference/model/llm/utils.py CHANGED
@@ -159,14 +159,25 @@ class ChatModelMixin:
                 for image_url in image_urls:
                     fut = executor.submit(_decode_image, image_url)
                     image_futures.append(fut)
-                images = [fut.result() for fut in image_futures]
+                images.extend([fut.result() for fut in image_futures])
                 if len(image_futures) == 0:
                     ret += role + "\n" + text + intra_message_sep + "\n"
                 else:
+                    placeholders = "\n".join(
+                        f"Image-{i+1}: <image>\n"
+                        for i in range(
+                            len(images) - len(image_futures), len(images)
+                        )
+                    )
                     ret += (
-                        role + "\n" + f"<image>\n{text}" + intra_message_sep + "\n"
+                        role
+                        + "\n"
+                        + f"{placeholders}\n{text}"
+                        + intra_message_sep
+                        + "\n"
                     )
-
+            if len(images) == 1:
+                ret = ret.replace("Image-1: <image>\n", "<image>\n")
             return ret, images
         else:
             raise ValueError(f"Invalid model family: {model_family}")
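The placeholder arithmetic above is easier to follow in isolation. A standalone sketch (simplified names, not the actual ChatModelMixin method) of how the new code numbers only the images contributed by the current message:

# Sketch of the numbering introduced above: images from earlier messages keep
# their indices, and the images decoded for the current message get fresh slots.
def build_placeholders(total_images: int, new_images: int) -> str:
    return "\n".join(
        f"Image-{i + 1}: <image>\n"
        for i in range(total_images - new_images, total_images)
    )

print(build_placeholders(3, 1))  # -> "Image-3: <image>\n"
# When only one image exists in the whole conversation, the final
# ret.replace("Image-1: <image>\n", "<image>\n") restores the bare <image> tag.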

xinference/model/llm/vllm/core.py CHANGED
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import asyncio
+import json
 import logging
 import multiprocessing
 import os
@@ -47,6 +48,7 @@ from ..utils import (
     ChatModelMixin,
     generate_completion_chunk,
 )
+from .utils import vllm_check
 
 logger = logging.getLogger(__name__)
 
@@ -65,6 +67,7 @@ class VLLMModelConfig(TypedDict, total=False):
     max_num_seqs: int
     quantization: Optional[str]
     max_model_len: Optional[int]
+    limit_mm_per_prompt: Optional[Dict[str, int]]
 
 
 class VLLMGenerateConfig(TypedDict, total=False):
@@ -90,9 +93,7 @@ try:
 except ImportError:
     VLLM_INSTALLED = False
 
-VLLM_SUPPORTED_VISION_MODEL_LIST: List[str] = [
-    "internvl2",
-]
+VLLM_SUPPORTED_VISION_MODEL_LIST: List[str] = []
 VLLM_SUPPORTED_MODELS = [
     "llama-2",
     "llama-3",
@@ -138,6 +139,11 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.3.0":
     VLLM_SUPPORTED_MODELS.append("codeqwen1.5")
     VLLM_SUPPORTED_CHAT_MODELS.append("codeqwen1.5-chat")
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen2-instruct")
+    VLLM_SUPPORTED_MODELS.append("qwen2.5")
+    VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-instruct")
+    VLLM_SUPPORTED_MODELS.append("qwen2.5-coder")
+    VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-coder-instruct")
+
 
 if VLLM_INSTALLED and vllm.__version__ >= "0.3.2":
     VLLM_SUPPORTED_CHAT_MODELS.append("gemma-it")
@@ -166,6 +172,9 @@ if VLLM_INSTALLED and vllm.__version__ > "0.5.3":
     VLLM_SUPPORTED_MODELS.append("llama-3.1")
     VLLM_SUPPORTED_CHAT_MODELS.append("llama-3.1-instruct")
 
+    if VLLM_INSTALLED and vllm.__version__ >= "0.6.1":
+        VLLM_SUPPORTED_VISION_MODEL_LIST.append("internvl2")
+
 
 class VLLMModel(LLM):
     def __init__(
@@ -299,7 +308,12 @@ class VLLMModel(LLM):
         model_config.setdefault("gpu_memory_utilization", 0.90)
         model_config.setdefault("max_num_seqs", 256)
         model_config.setdefault("quantization", None)
-        model_config.setdefault("max_model_len", 4096)
+        model_config.setdefault("max_model_len", None)
+        model_config["limit_mm_per_prompt"] = (
+            json.loads(model_config.get("limit_mm_per_prompt"))  # type: ignore
+            if model_config.get("limit_mm_per_prompt")
+            else None
+        )
 
         return model_config
 
@@ -429,6 +443,7 @@ class VLLMModel(LLM):
             usage=usage,
         )
 
+    @vllm_check
     async def async_generate(
         self,
         prompt: Union[str, Dict[str, Any]],
@@ -660,6 +675,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
                 yield self._to_chat_completion_chunk(chunk)
                 i += 1
 
+    @vllm_check
     async def async_chat(
         self,
         messages: List[Dict],
@@ -736,13 +752,13 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
             )
         return generate_config
 
+    @vllm_check
     async def async_chat(
         self,
         messages: List[Dict],
         generate_config: Optional[Dict] = None,
         request_id: Optional[str] = None,
     ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]:
-        # only support single image, waiting vllm support multi images
         model_family = self.model_family.model_family or self.model_family.model_name
         prompt, images = self.get_specific_prompt(model_family, messages)
 
@@ -750,11 +766,16 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
             inputs = {
                 "prompt": prompt,
             }
-        else:
+        elif len(images) == 1:
             inputs = {
                 "prompt": prompt,
                 "multi_modal_data": {"image": images[-1]},  # type: ignore
             }
+        else:
+            inputs = {
+                "prompt": prompt,
+                "multi_modal_data": {"image": images},  # type: ignore
+            }
         generate_config = self._sanitize_chat_config(generate_config)
 
         stream = generate_config.get("stream", None)
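Two of the changes above work together: limit_mm_per_prompt is accepted as a JSON string and parsed in _sanitize_model_config, which is what lets the new multi-image branch of the vision async_chat hand a list of images to vLLM. A minimal sketch of the parsing step, with an illustrative input value:

import json

# The value typically arrives as a string (e.g. from a CLI flag or HTTP payload).
model_config = {"limit_mm_per_prompt": '{"image": 4}'}
model_config["limit_mm_per_prompt"] = (
    json.loads(model_config["limit_mm_per_prompt"])
    if model_config.get("limit_mm_per_prompt")
    else None
)
print(model_config["limit_mm_per_prompt"])  # {'image': 4}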

xinference/model/llm/vllm/utils.py ADDED
@@ -0,0 +1,42 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import functools
+import logging
+import os
+
+logger = logging.getLogger(__name__)
+
+
+def vllm_check(fn):
+    try:
+        from vllm.engine.async_llm_engine import AsyncEngineDeadError
+    except:
+        return fn
+
+    @functools.wraps(fn)
+    async def _async_wrapper(self, *args, **kwargs):
+        logger.info("vllm_check")
+        try:
+            return await fn(self, *args, **kwargs)
+        except AsyncEngineDeadError:
+            logger.info("Detecting vLLM is not health, prepare to quit the process")
+            try:
+                self.stop()
+            except:
+                # ignore error when stop
+                pass
+            # Just kill the process and let xinference auto-recover the model
+            os._exit(1)
+
+    return _async_wrapper
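This module is what the @vllm_check markers in vllm/core.py refer to. A rough usage sketch; the class and method below are illustrative, only vllm_check itself comes from the new file:

from xinference.model.llm.vllm.utils import vllm_check

class FakeEngine:
    def stop(self):
        # Release engine resources; called by the wrapper before the process exits.
        print("stopping engine")

    @vllm_check
    async def async_generate(self, prompt: str):
        # If vLLM's AsyncEngineDeadError escapes from here, the wrapper calls
        # self.stop() and then os._exit(1) so the dead worker can be replaced.
        return {"text": prompt}

When vLLM is not importable the decorator returns the function unchanged, so the same code path still works on installations without vLLM.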

xinference/model/rerank/core.py CHANGED
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import gc
+import importlib
 import logging
 import os
 import threading
@@ -178,9 +179,27 @@ class RerankModel:
         return rerank_type
 
     def load(self):
+        flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
+        if (
+            self._auto_detect_type(self._model_path) != "normal"
+            and flash_attn_installed
+        ):
+            logger.warning(
+                "flash_attn can only support fp16 and bf16, "
+                "will force set `use_fp16` to True"
+            )
+            self._use_fp16 = True
         if self._model_spec.type == "normal":
             try:
+                import sentence_transformers
                 from sentence_transformers.cross_encoder import CrossEncoder
+
+                if sentence_transformers.__version__ < "3.1.0":
+                    raise ValueError(
+                        "The sentence_transformers version must be greater than 3.1.0. "
+                        "Please upgrade your version via `pip install -U sentence_transformers` or refer to "
+                        "https://github.com/UKPLab/sentence-transformers"
+                    )
             except ImportError:
                 error_message = "Failed to import module 'sentence-transformers'"
                 installation_guide = [
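The new guard probes the environment before constructing the CrossEncoder. A standalone sketch of the two checks, whose results depend entirely on the local installation:

import importlib.util

# Detect flash_attn without importing it (find_spec avoids the heavy import).
flash_attn_installed = importlib.util.find_spec("flash_attn") is not None

try:
    import sentence_transformers
    st_version = sentence_transformers.__version__
except ImportError:
    st_version = None

print(f"flash_attn installed: {flash_attn_installed}")
print(f"sentence-transformers: {st_version}")  # load() requires >= 3.1.0 for "normal" rerankers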

xinference/model/rerank/model_spec.json CHANGED
@@ -54,5 +54,13 @@
     "max_tokens": 1024,
     "model_id": "jinaai/jina-reranker-v2-base-multilingual",
     "model_revision": "298e48cada4a9318650d7fbd795f63827f884087"
+  },
+  {
+    "model_name": "minicpm-reranker",
+    "type": "normal",
+    "language": ["en", "zh"],
+    "max_tokens": 1024,
+    "model_id": "openbmb/MiniCPM-Reranker",
+    "model_revision": "5d2fd7345b6444c89d4c0fa59c92272888f3f2d0"
   }
 ]

xinference/model/rerank/model_spec_modelscope.json CHANGED
@@ -49,5 +49,13 @@
     "max_tokens": 2048,
     "model_id": "mirror013/bge-reranker-v2-minicpm-layerwise",
     "model_hub": "modelscope"
+  },
+  {
+    "model_name": "minicpm-reranker",
+    "type": "normal",
+    "language": ["en", "zh"],
+    "max_tokens": 1024,
+    "model_id": "OpenBMB/MiniCPM-Reranker",
+    "model_hub": "modelscope"
   }
 ]
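Both hubs now register minicpm-reranker as a plain cross-encoder ("type": "normal"). A hedged client sketch; the endpoint and documents are placeholders, and the rerank() call follows the public xinference client but its signature may differ slightly between releases:

from xinference.client import Client

client = Client("http://127.0.0.1:9997")
model_uid = client.launch_model(model_name="minicpm-reranker", model_type="rerank")
model = client.get_model(model_uid)

result = model.rerank(
    documents=["MiniCPM is a small language model family.", "Bananas are yellow."],
    query="What is MiniCPM?",
)
print(result)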
xinference/model/utils.py CHANGED
@@ -300,31 +300,6 @@ def cache(model_spec: CacheableModelSpec, model_description_type: type):
     return cache_dir
 
 
-def patch_trust_remote_code():
-    """sentence-transformers calls transformers without the trust_remote_code=True, some embedding
-    models will fail to load, e.g. jina-embeddings-v2-base-en
-
-    :return:
-    """
-    try:
-        from transformers.dynamic_module_utils import resolve_trust_remote_code
-    except ImportError:
-        logger.error("Patch transformers trust_remote_code failed.")
-    else:
-
-        def _patched_resolve_trust_remote_code(*args, **kwargs):
-            logger.info("Patched resolve_trust_remote_code: %s %s", args, kwargs)
-            return True
-
-        if (
-            resolve_trust_remote_code.__code__
-            != _patched_resolve_trust_remote_code.__code__
-        ):
-            resolve_trust_remote_code.__code__ = (
-                _patched_resolve_trust_remote_code.__code__
-            )
-
-
 def select_device(device):
     try:
         import torch  # noqa: F401

xinference/web/ui/build/asset-manifest.json CHANGED
@@ -1,14 +1,14 @@
 {
   "files": {
     "main.css": "./static/css/main.5061c4c3.css",
-    "main.js": "./static/js/main.754740c0.js",
+    "main.js": "./static/js/main.e51a356d.js",
     "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
     "index.html": "./index.html",
     "main.5061c4c3.css.map": "./static/css/main.5061c4c3.css.map",
-    "main.754740c0.js.map": "./static/js/main.754740c0.js.map"
+    "main.e51a356d.js.map": "./static/js/main.e51a356d.js.map"
   },
   "entrypoints": [
     "static/css/main.5061c4c3.css",
-    "static/js/main.754740c0.js"
+    "static/js/main.e51a356d.js"
   ]
 }

xinference/web/ui/build/index.html CHANGED
@@ -1 +1 @@
-<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.754740c0.js"></script><link href="./static/css/main.5061c4c3.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
+<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.e51a356d.js"></script><link href="./static/css/main.5061c4c3.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>