xinference 0.15.3__py3-none-any.whl → 0.16.0__py3-none-any.whl

This diff shows the content of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.


Files changed (65)
  1. xinference/__init__.py +0 -4
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +29 -2
  4. xinference/client/restful/restful_client.py +10 -0
  5. xinference/constants.py +7 -3
  6. xinference/core/image_interface.py +76 -23
  7. xinference/core/model.py +158 -46
  8. xinference/core/progress_tracker.py +187 -0
  9. xinference/core/scheduler.py +10 -7
  10. xinference/core/supervisor.py +11 -0
  11. xinference/core/utils.py +9 -0
  12. xinference/core/worker.py +1 -0
  13. xinference/deploy/supervisor.py +4 -0
  14. xinference/model/__init__.py +4 -0
  15. xinference/model/audio/chattts.py +2 -1
  16. xinference/model/audio/core.py +0 -2
  17. xinference/model/audio/model_spec.json +8 -0
  18. xinference/model/audio/model_spec_modelscope.json +9 -0
  19. xinference/model/image/core.py +6 -7
  20. xinference/model/image/scheduler/__init__.py +13 -0
  21. xinference/model/image/scheduler/flux.py +533 -0
  22. xinference/model/image/sdapi.py +35 -4
  23. xinference/model/image/stable_diffusion/core.py +215 -110
  24. xinference/model/image/utils.py +39 -3
  25. xinference/model/llm/__init__.py +2 -0
  26. xinference/model/llm/llm_family.json +185 -17
  27. xinference/model/llm/llm_family_modelscope.json +124 -12
  28. xinference/model/llm/transformers/chatglm.py +104 -0
  29. xinference/model/llm/transformers/cogvlm2.py +2 -1
  30. xinference/model/llm/transformers/cogvlm2_video.py +2 -0
  31. xinference/model/llm/transformers/core.py +43 -113
  32. xinference/model/llm/transformers/deepseek_v2.py +0 -226
  33. xinference/model/llm/transformers/deepseek_vl.py +2 -0
  34. xinference/model/llm/transformers/glm4v.py +2 -1
  35. xinference/model/llm/transformers/intern_vl.py +2 -0
  36. xinference/model/llm/transformers/internlm2.py +3 -95
  37. xinference/model/llm/transformers/minicpmv25.py +2 -0
  38. xinference/model/llm/transformers/minicpmv26.py +2 -0
  39. xinference/model/llm/transformers/omnilmm.py +2 -0
  40. xinference/model/llm/transformers/opt.py +68 -0
  41. xinference/model/llm/transformers/qwen2_audio.py +11 -4
  42. xinference/model/llm/transformers/qwen2_vl.py +2 -28
  43. xinference/model/llm/transformers/qwen_vl.py +2 -1
  44. xinference/model/llm/transformers/utils.py +36 -283
  45. xinference/model/llm/transformers/yi_vl.py +2 -0
  46. xinference/model/llm/utils.py +60 -16
  47. xinference/model/llm/vllm/core.py +68 -9
  48. xinference/model/llm/vllm/utils.py +0 -1
  49. xinference/model/utils.py +7 -4
  50. xinference/model/video/core.py +0 -2
  51. xinference/utils.py +2 -3
  52. xinference/web/ui/build/asset-manifest.json +3 -3
  53. xinference/web/ui/build/index.html +1 -1
  54. xinference/web/ui/build/static/js/{main.e51a356d.js → main.f7da0140.js} +3 -3
  55. xinference/web/ui/build/static/js/main.f7da0140.js.map +1 -0
  56. xinference/web/ui/node_modules/.cache/babel-loader/331312668fa8bd3d7401818f4a25fa98135d7f61371cd6bfff78b18cf4fbdd92.json +1 -0
  57. {xinference-0.15.3.dist-info → xinference-0.16.0.dist-info}/METADATA +38 -6
  58. {xinference-0.15.3.dist-info → xinference-0.16.0.dist-info}/RECORD +63 -59
  59. xinference/web/ui/build/static/js/main.e51a356d.js.map +0 -1
  60. xinference/web/ui/node_modules/.cache/babel-loader/4385c1095eefbff0a8ec3b2964ba6e5a66a05ab31be721483ca2f43e2a91f6ff.json +0 -1
  61. /xinference/web/ui/build/static/js/{main.e51a356d.js.LICENSE.txt → main.f7da0140.js.LICENSE.txt} +0 -0
  62. {xinference-0.15.3.dist-info → xinference-0.16.0.dist-info}/LICENSE +0 -0
  63. {xinference-0.15.3.dist-info → xinference-0.16.0.dist-info}/WHEEL +0 -0
  64. {xinference-0.15.3.dist-info → xinference-0.16.0.dist-info}/entry_points.txt +0 -0
  65. {xinference-0.15.3.dist-info → xinference-0.16.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/utils.py CHANGED
@@ -29,6 +29,7 @@ from ...types import (
     ChatCompletion,
     ChatCompletionChoice,
     ChatCompletionChunk,
+    ChatCompletionMessage,
     Completion,
     CompletionChoice,
     CompletionChunk,
@@ -50,6 +51,7 @@ QWEN_TOOL_CALL_FAMILY = [
     "qwen1.5-moe-chat",
     "qwen2-instruct",
     "qwen2-moe-instruct",
+    "qwen2.5-instruct",
 ]
 
 GLM4_TOOL_CALL_FAMILY = [
@@ -57,6 +59,10 @@ GLM4_TOOL_CALL_FAMILY = [
     "glm4-chat-1m",
 ]
 
+LLAMA3_TOOL_CALL_FAMILY = [
+    "llama-3.1-instruct",
+]
+
 QWEN_TOOL_CALL_SYMBOLS = ["<tool_call>", "</tool_call>"]
 
 
@@ -113,7 +119,7 @@ class ChatModelMixin:
         return self._build_from_raw_template(messages, chat_template, **kwargs)
 
     @staticmethod
-    def get_specific_prompt(model_family: str, messages: List[Dict]):
+    def get_specific_prompt(model_family: str, messages: List[ChatCompletionMessage]):
         """
         Inspired by FastChat. Format chat history into a prompt according to the prompty style of
         different models.
@@ -129,7 +135,7 @@ class ChatModelMixin:
             ret = (
                 "<s>"
                 if system_prompt == ""
-                else "<s><|im_start|>system\n"
+                else "<s><|im_start|>system\n"  # type: ignore
                 + system_prompt
                 + intra_message_sep
                 + "\n"
@@ -333,8 +339,9 @@ class ChatModelMixin:
         for content in contents:
             content = content.strip()
             if content:
-                if content.startswith(QWEN_TOOL_CALL_SYMBOLS[0]):
-                    content = content[len(QWEN_TOOL_CALL_SYMBOLS[0]) :]
+                pos = content.find(QWEN_TOOL_CALL_SYMBOLS[0])
+                if pos != -1:
+                    content = content[pos + len(QWEN_TOOL_CALL_SYMBOLS[0]) :]
                 content = content.strip()
                 try:
                     res = json.loads(content)
@@ -353,6 +360,15 @@ class ChatModelMixin:
         text = c["choices"][0]["text"]
         return cls._handle_qwen_tool_result(text)
 
+    @classmethod
+    def _eval_llama3_chat_arguments(cls, c) -> List[Tuple]:
+        text = c["choices"][0]["text"]
+        try:
+            data = eval(text, {}, {})
+            return [(None, data["name"], data["parameters"])]
+        except Exception:
+            return [(text, None, None)]
+
     @classmethod
     def _eval_tool_arguments(cls, model_family, c):
         family = model_family.model_family or model_family.model_name
@@ -360,6 +376,8 @@ class ChatModelMixin:
             result = cls._eval_glm_chat_arguments(c)
         elif family in QWEN_TOOL_CALL_FAMILY:
             result = cls._eval_qwen_chat_arguments(c)
+        elif family in LLAMA3_TOOL_CALL_FAMILY:
+            result = cls._eval_llama3_chat_arguments(c)
         else:
             raise Exception(
                 f"Model {model_family.model_name} is not support tool calls."
@@ -368,24 +386,22 @@ class ChatModelMixin:
         return result
 
     @classmethod
-    def _tool_calls_completion_chunk(cls, model_family, model_uid, c):
-        _id = str(uuid.uuid4())
+    def _tool_calls_completion_chunk(cls, model_family, model_uid, c, chunk_id=None):
+        _id = chunk_id if chunk_id is not None else str(uuid.uuid4())
         tool_result = cls._eval_tool_arguments(model_family, c)
         tool_calls = []
         failed_contents = []
         for content, func, args in tool_result:
             if func:
                 tool_calls.append(
-                    [
-                        {
-                            "id": f"call_{_id}",
-                            "type": "function",
-                            "function": {
-                                "name": func,
-                                "arguments": json.dumps(args, ensure_ascii=False),
-                            },
-                        }
-                    ]
+                    {
+                        "id": f"call_{_id}",
+                        "type": "function",
+                        "function": {
+                            "name": func,
+                            "arguments": json.dumps(args, ensure_ascii=False),
+                        },
+                    }
                 )
             else:
                 failed_contents.append(content)
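With this change each entry appended to tool_calls is a flat dict rather than a single-element list, and callers can pass a chunk_id so every streamed chunk reuses the same call id. The resulting entry looks roughly like this; the function name and arguments are made-up examples:

import json
import uuid

chunk_id = str(uuid.uuid4())  # shared across streamed chunks via the new chunk_id argument
tool_call = {
    "id": f"call_{chunk_id}",
    "type": "function",
    "function": {
        "name": "get_weather",
        "arguments": json.dumps({"city": "Beijing"}, ensure_ascii=False),
    },
}
print(json.dumps([tool_call], indent=2))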
@@ -471,6 +487,34 @@
             "usage": usage,
         }
 
+    def _transform_messages(
+        self,
+        messages: List[ChatCompletionMessage],
+    ):
+        transformed_messages = []
+        for msg in messages:
+            new_content = []
+            role = msg["role"]
+            content = msg["content"]
+            if isinstance(content, str):
+                new_content.append({"type": "text", "text": content})
+            elif isinstance(content, List):
+                for item in content:  # type: ignore
+                    if "text" in item:
+                        new_content.append({"type": "text", "text": item["text"]})
+                    elif "image_url" in item:
+                        new_content.append(
+                            {"type": "image", "image": item["image_url"]["url"]}
+                        )
+                    elif "video_url" in item:
+                        new_content.append(
+                            {"type": "video", "video": item["video_url"]["url"]}
+                        )
+            new_message = {"role": role, "content": new_content}
+            transformed_messages.append(new_message)
+
+        return transformed_messages
+
 
 def get_file_location(
     llm_family: LLMFamilyV1, spec: LLMSpecV1, quantization: str
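The new _transform_messages helper rewrites OpenAI-style content parts (image_url / video_url) into the flat image / video form consumed by the Qwen-VL processing utilities. A rough before/after illustration with a placeholder URL:

openai_style = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this picture."},
            {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
        ],
    }
]

# What _transform_messages would return for the message above:
transformed = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this picture."},
            {"type": "image", "image": "https://example.com/cat.png"},
        ],
    }
]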
xinference/model/llm/vllm/core.py CHANGED
@@ -34,6 +34,7 @@ from typing import (
 from ....types import (
     ChatCompletion,
     ChatCompletionChunk,
+    ChatCompletionMessage,
     Completion,
     CompletionChoice,
     CompletionChunk,
@@ -175,6 +176,9 @@ if VLLM_INSTALLED and vllm.__version__ > "0.5.3":
 if VLLM_INSTALLED and vllm.__version__ >= "0.6.1":
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("internvl2")
 
+if VLLM_INSTALLED and vllm.__version__ >= "0.6.3":
+    VLLM_SUPPORTED_VISION_MODEL_LIST.append("qwen2-vl-instruct")
+
 
 class VLLMModel(LLM):
     def __init__(
@@ -309,11 +313,6 @@ class VLLMModel(LLM):
         model_config.setdefault("max_num_seqs", 256)
         model_config.setdefault("quantization", None)
         model_config.setdefault("max_model_len", None)
-        model_config["limit_mm_per_prompt"] = (
-            json.loads(model_config.get("limit_mm_per_prompt"))  # type: ignore
-            if model_config.get("limit_mm_per_prompt")
-            else None
-        )
 
         return model_config
 
@@ -718,11 +717,26 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
     def match(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if llm_spec.model_format != "pytorch":
+        if not cls._has_cuda_device():
+            return False
+        if not cls._is_linux():
+            return False
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8"]:
             return False
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):
                 return False
+        if llm_spec.model_format == "awq":
+            # Currently, only 4-bit weight quantization is supported for AWQ, but got 8 bits.
+            if "4" not in quantization:
+                return False
+        if llm_spec.model_format == "gptq":
+            if VLLM_INSTALLED and vllm.__version__ >= "0.3.3":
+                if not any(q in quantization for q in ("3", "4", "8")):
+                    return False
+            else:
+                if "4" not in quantization:
+                    return False
         if isinstance(llm_family, CustomLLMFamilyV1):
            if llm_family.model_family not in VLLM_SUPPORTED_VISION_MODEL_LIST:
                 return False
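The widened match now accepts gptq, awq and fp8 vision checkpoints subject to quantization limits. A condensed, self-contained restatement of just the format/quantization gating (not the full implementation, which also checks CUDA, Linux and the supported-model list):

def vision_format_supported(model_format: str, quantization: str, vllm_version: str) -> bool:
    if model_format not in ["pytorch", "gptq", "awq", "fp8"]:
        return False
    if model_format == "pytorch":
        return quantization in (None, "none")
    if model_format == "awq":
        return "4" in quantization  # AWQ is only accepted with 4-bit weights
    if model_format == "gptq":
        if vllm_version >= "0.3.3":
            return any(q in quantization for q in ("3", "4", "8"))
        return "4" in quantization
    return True  # fp8 needs no extra quantization check here


assert vision_format_supported("awq", "Int4", "0.6.3")
assert not vision_format_supported("awq", "Int8", "0.6.3")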
@@ -733,6 +747,33 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
                 return False
         return VLLM_INSTALLED
 
+    def _sanitize_model_config(
+        self, model_config: Optional[VLLMModelConfig]
+    ) -> VLLMModelConfig:
+        if model_config is None:
+            model_config = VLLMModelConfig()
+
+        cuda_count = self._get_cuda_count()
+
+        model_config.setdefault("tokenizer_mode", "auto")
+        model_config.setdefault("trust_remote_code", True)
+        model_config.setdefault("tensor_parallel_size", cuda_count)
+        model_config.setdefault("block_size", 16)
+        model_config.setdefault("swap_space", 4)
+        model_config.setdefault("gpu_memory_utilization", 0.90)
+        model_config.setdefault("max_num_seqs", 256)
+        model_config.setdefault("quantization", None)
+        model_config.setdefault("max_model_len", None)
+        model_config["limit_mm_per_prompt"] = (
+            json.loads(model_config.get("limit_mm_per_prompt"))  # type: ignore
+            if model_config.get("limit_mm_per_prompt")
+            else {
+                "image": 2,  # default 2 images all chat
+            }
+        )
+
+        return model_config
+
     def _sanitize_chat_config(
         self,
         generate_config: Optional[Dict] = None,
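limit_mm_per_prompt can still be supplied as a JSON string, but the vision model now falls back to allowing two images per prompt instead of None. A small sketch of how an override is interpreted; the override value is a made-up example:

import json

model_config = {"limit_mm_per_prompt": '{"image": 4}'}  # hypothetical user override

limit = (
    json.loads(model_config["limit_mm_per_prompt"])
    if model_config.get("limit_mm_per_prompt")
    else {"image": 2}  # new default: up to 2 images per chat prompt
)
print(limit)  # {'image': 4}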
@@ -755,14 +796,32 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
     @vllm_check
     async def async_chat(
         self,
-        messages: List[Dict],
+        messages: List[ChatCompletionMessage],  # type: ignore
         generate_config: Optional[Dict] = None,
         request_id: Optional[str] = None,
     ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]:
+        messages = self._transform_messages(messages)
+        tools = generate_config.pop("tools", []) if generate_config else None
+
         model_family = self.model_family.model_family or self.model_family.model_name
-        prompt, images = self.get_specific_prompt(model_family, messages)
 
-        if len(images) == 0:
+        if "internvl2" not in model_family.lower():
+            from qwen_vl_utils import process_vision_info
+
+            full_context_kwargs = {}
+            if tools and model_family in QWEN_TOOL_CALL_FAMILY:
+                full_context_kwargs["tools"] = tools
+            assert self.model_family.chat_template is not None
+            prompt = self.get_full_context(
+                messages, self.model_family.chat_template, **full_context_kwargs
+            )
+            images, video_inputs = process_vision_info(messages)
+            if video_inputs:
+                raise ValueError("Not support video input now.")
+        else:
+            prompt, images = self.get_specific_prompt(model_family, messages)
+
+        if not images:
             inputs = {
                 "prompt": prompt,
             }
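For families other than InternVL2 the chat path now renders the prompt through the model's chat template and pulls images out of the transformed messages with qwen_vl_utils.process_vision_info. A hedged sketch of that flow; the prompt string is a placeholder and the multi_modal_data shape is an assumption based on vLLM's multi-modal input format rather than something shown in this hunk:

from qwen_vl_utils import process_vision_info  # provided by the qwen-vl-utils package

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "https://example.com/cat.png"},
            {"type": "text", "text": "What is in this picture?"},
        ],
    }
]

images, video_inputs = process_vision_info(messages)
if video_inputs:
    raise ValueError("Not support video input now.")

prompt = "<prompt rendered from the model's chat template>"  # placeholder
inputs = {"prompt": prompt}
if images:
    inputs["multi_modal_data"] = {"image": images}  # assumption: vLLM multi-modal input dict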
xinference/model/llm/vllm/utils.py CHANGED
@@ -26,7 +26,6 @@ def vllm_check(fn):
 
     @functools.wraps(fn)
     async def _async_wrapper(self, *args, **kwargs):
-        logger.info("vllm_check")
         try:
             return await fn(self, *args, **kwargs)
         except AsyncEngineDeadError:
xinference/model/utils.py CHANGED
@@ -23,12 +23,15 @@ import huggingface_hub
 import numpy as np
 import torch
 
-from ..constants import XINFERENCE_CACHE_DIR, XINFERENCE_ENV_MODEL_SRC
+from ..constants import (
+    XINFERENCE_CACHE_DIR,
+    XINFERENCE_DOWNLOAD_MAX_ATTEMPTS,
+    XINFERENCE_ENV_MODEL_SRC,
+)
 from ..device_utils import get_available_device, is_device_available
 from .core import CacheableModelSpec
 
 logger = logging.getLogger(__name__)
-MAX_ATTEMPTS = 3
 IS_NEW_HUGGINGFACE_HUB: bool = huggingface_hub.__version__ >= "0.23.0"
 
 
@@ -100,11 +103,11 @@
     **kwargs,
 ):
     last_ex = None
-    for current_attempt in range(1, MAX_ATTEMPTS + 1):
+    for current_attempt in range(1, XINFERENCE_DOWNLOAD_MAX_ATTEMPTS + 1):
         try:
             return download_func(*args, **kwargs)
         except Exception as e:
-            remaining_attempts = MAX_ATTEMPTS - current_attempt
+            remaining_attempts = XINFERENCE_DOWNLOAD_MAX_ATTEMPTS - current_attempt
             last_ex = e
             logger.debug(
                 "Download failed: %s, download func: %s, download args: %s, kwargs: %s",
xinference/model/video/core.py CHANGED
@@ -21,8 +21,6 @@ from ..core import CacheableModelSpec, ModelDescription
 from ..utils import valid_model_revision
 from .diffusers import DiffUsersVideoModel
 
-MAX_ATTEMPTS = 3
-
 logger = logging.getLogger(__name__)
 
 MODEL_NAME_TO_REVISION: Dict[str, List[str]] = defaultdict(list)
xinference/utils.py CHANGED
@@ -13,9 +13,8 @@
 # limitations under the License.
 
 
-import torch
-
-
 def cuda_count():
+    import torch
+
     # even if install torch cpu, this interface would return 0.
     return torch.cuda.device_count()
xinference/web/ui/build/asset-manifest.json CHANGED
@@ -1,14 +1,14 @@
 {
   "files": {
     "main.css": "./static/css/main.5061c4c3.css",
-    "main.js": "./static/js/main.e51a356d.js",
+    "main.js": "./static/js/main.f7da0140.js",
     "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
     "index.html": "./index.html",
     "main.5061c4c3.css.map": "./static/css/main.5061c4c3.css.map",
-    "main.e51a356d.js.map": "./static/js/main.e51a356d.js.map"
+    "main.f7da0140.js.map": "./static/js/main.f7da0140.js.map"
   },
   "entrypoints": [
     "static/css/main.5061c4c3.css",
-    "static/js/main.e51a356d.js"
+    "static/js/main.f7da0140.js"
   ]
 }
xinference/web/ui/build/index.html CHANGED
@@ -1 +1 @@
- <!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.e51a356d.js"></script><link href="./static/css/main.5061c4c3.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
+ <!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.f7da0140.js"></script><link href="./static/css/main.5061c4c3.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>