xinference 1.2.0__py3-none-any.whl → 1.2.2__py3-none-any.whl

This diff represents the changes between publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.

This version of xinference might be problematic.

Files changed (124)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +4 -7
  3. xinference/client/handlers.py +3 -0
  4. xinference/core/chat_interface.py +6 -1
  5. xinference/core/model.py +2 -0
  6. xinference/core/scheduler.py +4 -7
  7. xinference/core/supervisor.py +114 -23
  8. xinference/core/worker.py +70 -4
  9. xinference/deploy/local.py +2 -1
  10. xinference/model/audio/core.py +11 -0
  11. xinference/model/audio/cosyvoice.py +16 -5
  12. xinference/model/audio/kokoro.py +139 -0
  13. xinference/model/audio/melotts.py +110 -0
  14. xinference/model/audio/model_spec.json +80 -0
  15. xinference/model/audio/model_spec_modelscope.json +18 -0
  16. xinference/model/audio/whisper.py +35 -10
  17. xinference/model/llm/llama_cpp/core.py +21 -14
  18. xinference/model/llm/llm_family.json +527 -1
  19. xinference/model/llm/llm_family.py +4 -1
  20. xinference/model/llm/llm_family_modelscope.json +495 -3
  21. xinference/model/llm/memory.py +1 -1
  22. xinference/model/llm/mlx/core.py +24 -6
  23. xinference/model/llm/transformers/core.py +9 -1
  24. xinference/model/llm/transformers/qwen2_audio.py +3 -1
  25. xinference/model/llm/transformers/qwen2_vl.py +20 -3
  26. xinference/model/llm/transformers/utils.py +22 -11
  27. xinference/model/llm/utils.py +115 -1
  28. xinference/model/llm/vllm/core.py +14 -4
  29. xinference/model/llm/vllm/xavier/block.py +3 -4
  30. xinference/model/llm/vllm/xavier/block_tracker.py +71 -58
  31. xinference/model/llm/vllm/xavier/collective.py +74 -0
  32. xinference/model/llm/vllm/xavier/collective_manager.py +147 -0
  33. xinference/model/llm/vllm/xavier/executor.py +18 -16
  34. xinference/model/llm/vllm/xavier/scheduler.py +79 -63
  35. xinference/model/llm/vllm/xavier/test/test_xavier.py +60 -35
  36. xinference/model/llm/vllm/xavier/transfer.py +53 -32
  37. xinference/thirdparty/cosyvoice/bin/spk2info.pt +0 -0
  38. xinference/thirdparty/melo/__init__.py +0 -0
  39. xinference/thirdparty/melo/api.py +135 -0
  40. xinference/thirdparty/melo/app.py +61 -0
  41. xinference/thirdparty/melo/attentions.py +459 -0
  42. xinference/thirdparty/melo/commons.py +160 -0
  43. xinference/thirdparty/melo/configs/config.json +94 -0
  44. xinference/thirdparty/melo/data/example/metadata.list +20 -0
  45. xinference/thirdparty/melo/data_utils.py +413 -0
  46. xinference/thirdparty/melo/download_utils.py +67 -0
  47. xinference/thirdparty/melo/infer.py +25 -0
  48. xinference/thirdparty/melo/init_downloads.py +14 -0
  49. xinference/thirdparty/melo/losses.py +58 -0
  50. xinference/thirdparty/melo/main.py +36 -0
  51. xinference/thirdparty/melo/mel_processing.py +174 -0
  52. xinference/thirdparty/melo/models.py +1030 -0
  53. xinference/thirdparty/melo/modules.py +598 -0
  54. xinference/thirdparty/melo/monotonic_align/__init__.py +16 -0
  55. xinference/thirdparty/melo/monotonic_align/core.py +46 -0
  56. xinference/thirdparty/melo/preprocess_text.py +135 -0
  57. xinference/thirdparty/melo/split_utils.py +174 -0
  58. xinference/thirdparty/melo/text/__init__.py +35 -0
  59. xinference/thirdparty/melo/text/chinese.py +199 -0
  60. xinference/thirdparty/melo/text/chinese_bert.py +107 -0
  61. xinference/thirdparty/melo/text/chinese_mix.py +253 -0
  62. xinference/thirdparty/melo/text/cleaner.py +36 -0
  63. xinference/thirdparty/melo/text/cleaner_multiling.py +110 -0
  64. xinference/thirdparty/melo/text/cmudict.rep +129530 -0
  65. xinference/thirdparty/melo/text/cmudict_cache.pickle +0 -0
  66. xinference/thirdparty/melo/text/english.py +284 -0
  67. xinference/thirdparty/melo/text/english_bert.py +39 -0
  68. xinference/thirdparty/melo/text/english_utils/__init__.py +0 -0
  69. xinference/thirdparty/melo/text/english_utils/abbreviations.py +35 -0
  70. xinference/thirdparty/melo/text/english_utils/number_norm.py +97 -0
  71. xinference/thirdparty/melo/text/english_utils/time_norm.py +47 -0
  72. xinference/thirdparty/melo/text/es_phonemizer/__init__.py +0 -0
  73. xinference/thirdparty/melo/text/es_phonemizer/base.py +140 -0
  74. xinference/thirdparty/melo/text/es_phonemizer/cleaner.py +109 -0
  75. xinference/thirdparty/melo/text/es_phonemizer/es_symbols.json +79 -0
  76. xinference/thirdparty/melo/text/es_phonemizer/es_symbols.txt +1 -0
  77. xinference/thirdparty/melo/text/es_phonemizer/es_symbols_v2.json +83 -0
  78. xinference/thirdparty/melo/text/es_phonemizer/es_to_ipa.py +12 -0
  79. xinference/thirdparty/melo/text/es_phonemizer/example_ipa.txt +400 -0
  80. xinference/thirdparty/melo/text/es_phonemizer/gruut_wrapper.py +253 -0
  81. xinference/thirdparty/melo/text/es_phonemizer/punctuation.py +174 -0
  82. xinference/thirdparty/melo/text/es_phonemizer/spanish_symbols.txt +1 -0
  83. xinference/thirdparty/melo/text/es_phonemizer/test.ipynb +124 -0
  84. xinference/thirdparty/melo/text/fr_phonemizer/__init__.py +0 -0
  85. xinference/thirdparty/melo/text/fr_phonemizer/base.py +140 -0
  86. xinference/thirdparty/melo/text/fr_phonemizer/cleaner.py +122 -0
  87. xinference/thirdparty/melo/text/fr_phonemizer/en_symbols.json +78 -0
  88. xinference/thirdparty/melo/text/fr_phonemizer/example_ipa.txt +1 -0
  89. xinference/thirdparty/melo/text/fr_phonemizer/fr_symbols.json +89 -0
  90. xinference/thirdparty/melo/text/fr_phonemizer/fr_to_ipa.py +30 -0
  91. xinference/thirdparty/melo/text/fr_phonemizer/french_abbreviations.py +48 -0
  92. xinference/thirdparty/melo/text/fr_phonemizer/french_symbols.txt +1 -0
  93. xinference/thirdparty/melo/text/fr_phonemizer/gruut_wrapper.py +258 -0
  94. xinference/thirdparty/melo/text/fr_phonemizer/punctuation.py +172 -0
  95. xinference/thirdparty/melo/text/french.py +94 -0
  96. xinference/thirdparty/melo/text/french_bert.py +39 -0
  97. xinference/thirdparty/melo/text/japanese.py +647 -0
  98. xinference/thirdparty/melo/text/japanese_bert.py +49 -0
  99. xinference/thirdparty/melo/text/ko_dictionary.py +44 -0
  100. xinference/thirdparty/melo/text/korean.py +192 -0
  101. xinference/thirdparty/melo/text/opencpop-strict.txt +429 -0
  102. xinference/thirdparty/melo/text/spanish.py +122 -0
  103. xinference/thirdparty/melo/text/spanish_bert.py +39 -0
  104. xinference/thirdparty/melo/text/symbols.py +290 -0
  105. xinference/thirdparty/melo/text/tone_sandhi.py +769 -0
  106. xinference/thirdparty/melo/train.py +635 -0
  107. xinference/thirdparty/melo/train.sh +19 -0
  108. xinference/thirdparty/melo/transforms.py +209 -0
  109. xinference/thirdparty/melo/utils.py +424 -0
  110. xinference/types.py +2 -0
  111. xinference/web/ui/build/asset-manifest.json +3 -3
  112. xinference/web/ui/build/index.html +1 -1
  113. xinference/web/ui/build/static/js/{main.1eb206d1.js → main.b0936c54.js} +3 -3
  114. xinference/web/ui/build/static/js/main.b0936c54.js.map +1 -0
  115. xinference/web/ui/node_modules/.cache/babel-loader/a3ff866acddf34917a7ee399e0e571a4dfd8ba66d5057db885f243e16a6eb17d.json +1 -0
  116. {xinference-1.2.0.dist-info → xinference-1.2.2.dist-info}/METADATA +37 -27
  117. {xinference-1.2.0.dist-info → xinference-1.2.2.dist-info}/RECORD +122 -45
  118. xinference/web/ui/build/static/js/main.1eb206d1.js.map +0 -1
  119. xinference/web/ui/node_modules/.cache/babel-loader/2213d49de260e1f67c888081b18f120f5225462b829ae57c9e05a05cec83689d.json +0 -1
  120. /xinference/web/ui/build/static/js/{main.1eb206d1.js.LICENSE.txt → main.b0936c54.js.LICENSE.txt} +0 -0
  121. {xinference-1.2.0.dist-info → xinference-1.2.2.dist-info}/LICENSE +0 -0
  122. {xinference-1.2.0.dist-info → xinference-1.2.2.dist-info}/WHEEL +0 -0
  123. {xinference-1.2.0.dist-info → xinference-1.2.2.dist-info}/entry_points.txt +0 -0
  124. {xinference-1.2.0.dist-info → xinference-1.2.2.dist-info}/top_level.txt +0 -0
xinference/model/llm/transformers/qwen2_vl.py

```diff
@@ -45,9 +45,13 @@ class Qwen2VLChatModel(PytorchChatModel):
     def match(
         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
+        if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
+            return False
         llm_family = model_family.model_family or model_family.model_name
         if "qwen2-vl-instruct".lower() in llm_family.lower():
             return True
+        if "qwen2.5-vl-instruct".lower() in llm_family.lower():
+            return True
         if "qvq-72b-preview".lower() in llm_family.lower():
             return True
         return False
@@ -55,6 +59,11 @@ class Qwen2VLChatModel(PytorchChatModel):
     def load(self):
         from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

+        try:
+            from transformers import Qwen2_5_VLForConditionalGeneration
+        except ImportError:
+            Qwen2_5_VLForConditionalGeneration = None
+
         device = self._pytorch_model_config.get("device", "auto")
         device = select_device(device)
         self._device = device
@@ -66,8 +75,16 @@ class Qwen2VLChatModel(PytorchChatModel):
         )
         self._tokenizer = self._processor.tokenizer
         flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
+        llm_family = self.model_family.model_family or self.model_family.model_name
+        model_cls = (
+            Qwen2_5_VLForConditionalGeneration
+            if "qwen2.5" in llm_family
+            else Qwen2VLForConditionalGeneration
+        )
+        if model_cls is None:
+            raise ImportError("`transformers` version is too old, please upgrade it")
         if flash_attn_installed:
-            self._model = Qwen2VLForConditionalGeneration.from_pretrained(
+            self._model = model_cls.from_pretrained(
                 self.model_path,
                 torch_dtype="bfloat16",
                 device_map=device,
@@ -76,14 +93,14 @@ class Qwen2VLChatModel(PytorchChatModel):
             ).eval()
         elif is_npu_available():
             # Ascend do not support bf16
-            self._model = Qwen2VLForConditionalGeneration.from_pretrained(
+            self._model = model_cls.from_pretrained(
                 self.model_path,
                 device_map="auto",
                 trust_remote_code=True,
                 torch_dtype="float16",
             ).eval()
         else:
-            self._model = Qwen2VLForConditionalGeneration.from_pretrained(
+            self._model = model_cls.from_pretrained(
                 self.model_path, device_map=device, trust_remote_code=True
             ).eval()
```
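The qwen2_vl.py hunks gate `match()` on the supported weight formats and route Qwen2.5-VL families to `Qwen2_5_VLForConditionalGeneration`, a class only recent transformers releases ship. A minimal standalone sketch of the guarded-import pattern the loader uses (the version bound is approximate and `pick_model_class` is an illustrative name):

```python
# Sketch of the deferred-import pattern: the ImportError surfaces only when a
# Qwen2.5-VL model is actually requested, not at module import time.
from transformers import Qwen2VLForConditionalGeneration

try:
    # Available only in recent transformers releases (roughly >= 4.49).
    from transformers import Qwen2_5_VLForConditionalGeneration
except ImportError:
    Qwen2_5_VLForConditionalGeneration = None


def pick_model_class(llm_family: str):
    model_cls = (
        Qwen2_5_VLForConditionalGeneration
        if "qwen2.5" in llm_family
        else Qwen2VLForConditionalGeneration
    )
    if model_cls is None:
        raise ImportError("`transformers` version is too old, please upgrade it")
    return model_cls
```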
xinference/model/llm/transformers/utils.py

```diff
@@ -193,16 +193,14 @@ def _get_pad_param(seq_len_idx: int, pad_len: int) -> Tuple:

 def _merge_kv_cache(
     xinf_model_obj: "PytorchModel",
-    past_kv: Tuple[Tuple[torch.Tensor]],
-    new_kv: Tuple[Tuple[torch.Tensor]],
-):
+    past_cache: DynamicCache,
+    new_cache: DynamicCache,
+) -> DynamicCache:
     from torch.nn.functional import pad

     _, seq_len_idx = xinf_model_obj.get_batch_size_and_seq_len_indexes_from_kv()
-    past_cache = DynamicCache.from_legacy_cache(past_kv)
-    new_cache = DynamicCache.from_legacy_cache(new_kv)
-    past_seq_len = past_kv[0][0].shape[seq_len_idx]
-    new_seq_len = new_kv[0][0].shape[seq_len_idx]
+    past_seq_len = past_cache[0][0].shape[seq_len_idx]
+    new_seq_len = new_cache[0][0].shape[seq_len_idx]
     if past_seq_len != new_seq_len:
         padding_target = new_cache if past_seq_len > new_seq_len else past_cache
         padding_len = abs(past_seq_len - new_seq_len)
@@ -219,8 +217,12 @@ def _merge_kv_cache(
     for idx in range(len(past_cache)):
         k1, k2 = new_cache.key_cache[idx], past_cache.key_cache[idx]
         v1, v2 = new_cache.value_cache[idx], past_cache.value_cache[idx]
-        ret_kv.update(torch.cat((k1, k2), 0), torch.cat((v1, v2), 0), idx)
-    return ret_kv.to_legacy_cache()
+        ret_kv.update(
+            torch.cat((k1, k2), 0).contiguous(),
+            torch.cat((v1, v2), 0).contiguous(),
+            idx,
+        )
+    return ret_kv


 def get_batch_size_and_seq_len_from_kv_cache(kv, xinf_model_obj: "PytorchModel"):
@@ -228,6 +230,15 @@ def get_batch_size_and_seq_len_from_kv_cache(kv, xinf_model_obj: "PytorchModel")
     return kv[0][0].shape[bs_idx], kv[0][0].shape[seq_len_idx] + 1


+def convert_to_cache_cls(cache) -> DynamicCache:
+    """
+    Compatible with some old models
+    """
+    if isinstance(cache, tuple):
+        return DynamicCache.from_legacy_cache(cache)
+    return cache
+
+
 @torch.inference_mode()
 def _batch_inference_one_step_internal(
     xinf_model_obj: "PytorchModel",
@@ -269,7 +280,7 @@ def _batch_inference_one_step_internal(
     out = model(**prefill_kws, use_cache=True)

     logits = out.logits
-    past_key_values = out.past_key_values
+    past_key_values = convert_to_cache_cls(out.past_key_values)

     for i, r in enumerate(prefill_reqs):
         (
@@ -317,7 +328,7 @@ def _batch_inference_one_step_internal(
     )
     out = model(**inf_kws, use_cache=True, past_key_values=past_key_values)
     logits = out.logits
-    past_key_values = out.past_key_values
+    past_key_values = convert_to_cache_cls(out.past_key_values)

     for i, r in enumerate(valid_req_list):
         (
```
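These hunks migrate the batched-inference KV-cache plumbing from the legacy tuple-of-tuples format to transformers' `DynamicCache`, with `convert_to_cache_cls` normalizing outputs from older model classes that still return tuples. A minimal runnable sketch of that normalization, assuming transformers >= 4.36 where `DynamicCache` is available (shapes illustrative):

```python
import torch
from transformers import DynamicCache


def convert_to_cache_cls(cache) -> DynamicCache:
    # Older model classes still return the legacy tuple format from forward().
    if isinstance(cache, tuple):
        return DynamicCache.from_legacy_cache(cache)
    return cache


# Legacy format: one (key, value) pair per layer; here 2 layers of
# [batch=1, heads=2, seq_len=4, head_dim=8] tensors.
legacy = tuple(
    (torch.zeros(1, 2, 4, 8), torch.zeros(1, 2, 4, 8)) for _ in range(2)
)
cache = convert_to_cache_cls(legacy)
assert isinstance(cache, DynamicCache)
# DynamicCache keeps tuple-style indexing, which is why past_cache[0][0]
# in _merge_kv_cache works on both formats: cache[0] -> (key, value).
key, value = cache[0]
print(key.shape)  # torch.Size([1, 2, 4, 8])
```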
xinference/model/llm/utils.py

```diff
@@ -11,16 +11,28 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 import base64
 import functools
 import json
 import logging
 import os
+import re
 import time
 import typing
 import uuid
 from io import BytesIO
-from typing import AsyncGenerator, Dict, Iterator, List, Optional, Tuple, cast
+from typing import (
+    Any,
+    AsyncGenerator,
+    Dict,
+    Iterable,
+    Iterator,
+    List,
+    Optional,
+    Tuple,
+    cast,
+)

 import requests
 from PIL import Image
@@ -64,6 +76,18 @@ LLAMA3_TOOL_CALL_FAMILY = [
     "llama-3.1-instruct",
 ]

+DEEPSEEK_TOOL_CALL_FAMILY = [
+    "deepseek-r1-distill-qwen",
+    "deepseek-r1-distill-llama",
+]
+
+TOOL_CALL_FAMILY = (
+    QWEN_TOOL_CALL_FAMILY
+    + GLM4_TOOL_CALL_FAMILY
+    + LLAMA3_TOOL_CALL_FAMILY
+    + DEEPSEEK_TOOL_CALL_FAMILY
+)
+
 QWEN_TOOL_CALL_SYMBOLS = ["<tool_call>", "</tool_call>"]


@@ -104,6 +128,10 @@ class ChatModelMixin:
         tokenize=False,
         **kwargs,
     ):
+        if "vision" not in self.model_family.model_ability:  # type: ignore
+            messages = self.convert_messages_with_content_list_to_str_conversion(
+                messages
+            )
         if tokenizer is not None:
             try:
                 full_context = tokenizer.apply_chat_template(
```
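`get_full_context` now flattens OpenAI-style content lists into plain strings for non-vision models before applying the chat template. The helper's body is not part of this diff; the sketch below is an assumption of what such a conversion does, based on the method name:

```python
# Simplified sketch (assumption): flatten OpenAI-style content lists into plain
# strings for non-vision models, whose chat templates expect str content.
from typing import Dict, List


def content_list_to_str(messages: List[Dict]) -> List[Dict]:
    converted = []
    for message in messages:
        content = message.get("content")
        if isinstance(content, list):
            # Keep only the text parts; a non-vision model cannot consume
            # image_url parts anyway.
            texts = [part["text"] for part in content if part.get("type") == "text"]
            message = {**message, "content": "\n".join(texts)}
        converted.append(message)
    return converted


messages = [{"role": "user", "content": [{"type": "text", "text": "Hello!"}]}]
print(content_list_to_str(messages))
# [{'role': 'user', 'content': 'Hello!'}]
```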
xinference/model/llm/utils.py (continued)

```diff
@@ -304,6 +332,35 @@ class ChatModelMixin:
         else:
             yield cls._to_chat_completion_chunk(chunk)

+    @classmethod
+    def _tools_to_messages_for_deepseek(
+        cls, messages: List[dict], tools: Iterable[dict]
+    ):
+        # deepseek integrates tool calls into messages
+        # we follow the chat template rule to integrate tools into messages
+        tool_call_message: Dict[str, Any] = {
+            "role": "assistant",
+            "content": None,
+            "tool_calls": [],
+        }
+
+        for tool in tools:
+            function_name = tool["function"]["name"]
+            parameters = tool["function"].get("parameters", {}).get("properties", {})
+            function_args_json = json.dumps(parameters)
+
+            tool_call_message["tool_calls"].append(
+                {
+                    "type": "function",
+                    "function": {
+                        "name": function_name,
+                        "arguments": function_args_json,
+                    },
+                }
+            )
+
+        messages.append(tool_call_message)
+
     @classmethod
     async def _async_to_chat_completion_chunks(
         cls,
```
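DeepSeek's chat template has no dedicated `tools` variable, so the schema is injected as a synthetic assistant message carrying `tool_calls`. Running one OpenAI-style tool definition through the same steps appends a message like this (standalone rework of the method above, output shown as a comment):

```python
import json

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "parameters": {
                "type": "object",
                "properties": {"location": {"type": "string"}},
            },
        },
    }
]

messages = [{"role": "user", "content": "What is the weather in Tokyo?"}]

# Mirrors _tools_to_messages_for_deepseek above.
tool_call_message = {"role": "assistant", "content": None, "tool_calls": []}
for tool in tools:
    tool_call_message["tool_calls"].append(
        {
            "type": "function",
            "function": {
                "name": tool["function"]["name"],
                "arguments": json.dumps(
                    tool["function"].get("parameters", {}).get("properties", {})
                ),
            },
        }
    )
messages.append(tool_call_message)
print(messages[-1])
# {'role': 'assistant', 'content': None,
#  'tool_calls': [{'type': 'function',
#                  'function': {'name': 'get_current_weather',
#                               'arguments': '{"location": {"type": "string"}}'}}]}
```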
xinference/model/llm/utils.py (continued)

```diff
@@ -397,6 +454,61 @@ class ChatModelMixin:
         except Exception:
             return [(text, None, None)]

+    @classmethod
+    def _eval_deepseek_chat_arguments(cls, c) -> List[Tuple]:
+        """
+        Parses tool calls from deepseek-r1 format and removes duplicates.
+
+        Returns:
+            List[Tuple[Optional[str], Optional[str], Optional[dict]]]
+            - (None, function_name, arguments) if successfully parsed.
+            - (content, None, None) if parsing failed (content is raw JSON text).
+
+        Example input:
+            <|tool▁call|>get_current_weather
+            ```json
+            {"location": "tokyo", "unit": "fahrenheit"}
+            ```
+
+        Output:
+            [
+                (None, "get_current_weather", {"location": "tokyo", "unit": "fahrenheit"})
+            ]
+        """
+
+        text = c["choices"][0]["text"]
+
+        pattern = r"<|tool▁call|>(\w+)\s*```json\s*(.*?)\s*```"
+        matches = re.findall(pattern, text, re.DOTALL)
+
+        if not matches:
+            return [(text, None, None)]
+
+        tool_calls = set()  # Used for deduplication
+        results = []
+
+        for function_name, args_json in matches:
+            try:
+                arguments = json.loads(args_json)
+                # Convert dictionary to frozenset for deduplication
+                arguments_hashable = frozenset(arguments.items())
+                tool_call_tuple = (None, function_name, arguments)
+            except json.JSONDecodeError:
+                tool_call_tuple = (
+                    args_json,
+                    None,
+                    None,
+                )  # If parsing fails, treat as raw content
+                arguments_hashable = None  # No need for hashing
+
+            # Avoid duplicate entries
+            dedup_key = (function_name, arguments_hashable)
+            if dedup_key not in tool_calls:
+                tool_calls.add(dedup_key)
+                results.append(tool_call_tuple)
+
+        return results
+
     @classmethod
     def _eval_tool_arguments(cls, model_family, c):
         family = model_family.model_family or model_family.model_name
```
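The expected completion text wraps each call in a `<|tool▁call|>` marker followed by a fenced JSON block, as the docstring shows. Below is an illustrative standalone parser for that shape; note this sketch passes the delimiter through `re.escape` before embedding it in the pattern, whereas the pattern in the diff embeds it verbatim:

```python
import json
import re

DELIM = "<|tool▁call|>"
FENCE = "`" * 3  # built programmatically to avoid literal triple backticks here

text = (
    f"{DELIM}get_current_weather\n"
    f"{FENCE}json\n"
    '{"location": "tokyo", "unit": "fahrenheit"}\n'
    f"{FENCE}"
)

# Escape the delimiter so "|" and "<"/">" are matched literally.
pattern = re.escape(DELIM) + r"(\w+)\s*" + FENCE + r"json\s*(.*?)\s*" + FENCE
for name, args_json in re.findall(pattern, text, re.DOTALL):
    print(name, json.loads(args_json))
# get_current_weather {'location': 'tokyo', 'unit': 'fahrenheit'}
```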
xinference/model/llm/utils.py (continued)

```diff
@@ -406,6 +518,8 @@ class ChatModelMixin:
             result = cls._eval_qwen_chat_arguments(c)
         elif family in LLAMA3_TOOL_CALL_FAMILY:
             result = cls._eval_llama3_chat_arguments(c)
+        elif family in DEEPSEEK_TOOL_CALL_FAMILY:
+            result = cls._eval_deepseek_chat_arguments(c)
         else:
             raise Exception(
                 f"Model {model_family.model_name} is not support tool calls."
```
xinference/model/llm/vllm/core.py

```diff
@@ -44,6 +44,7 @@ from ....types import (
 from .. import LLM, LLMFamilyV1, LLMSpecV1
 from ..llm_family import CustomLLMFamilyV1
 from ..utils import (
+    DEEPSEEK_TOOL_CALL_FAMILY,
     QWEN_TOOL_CALL_FAMILY,
     QWEN_TOOL_CALL_SYMBOLS,
     ChatModelMixin,
@@ -157,7 +158,7 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.3.0":
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-coder-instruct")
     VLLM_SUPPORTED_CHAT_MODELS.append("QwQ-32B-Preview")
     VLLM_SUPPORTED_CHAT_MODELS.append("marco-o1")
-
+    VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-r1-distill-qwen")

 if VLLM_INSTALLED and vllm.__version__ >= "0.3.2":
     VLLM_SUPPORTED_CHAT_MODELS.append("gemma-it")
@@ -185,6 +186,7 @@ if VLLM_INSTALLED and vllm.__version__ > "0.5.3":
     VLLM_SUPPORTED_MODELS.append("llama-3.1")
     VLLM_SUPPORTED_CHAT_MODELS.append("llama-3.1-instruct")
     VLLM_SUPPORTED_CHAT_MODELS.append("llama-3.3-instruct")
+    VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-r1-distill-llama")

 if VLLM_INSTALLED and vllm.__version__ >= "0.6.1":
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("internvl2")
@@ -198,6 +200,12 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.6.3":
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("qwen2-vl-instruct")
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("QvQ-72B-Preview")

+if VLLM_INSTALLED and vllm.__version__ >= "0.7.0":
+    VLLM_SUPPORTED_CHAT_MODELS.append("internlm3-instruct")
+
+if VLLM_INSTALLED and vllm.__version__ >= "0.7.2":
+    VLLM_SUPPORTED_VISION_MODEL_LIST.append("qwen2.5-vl-instruct")
+

 class VLLMModel(LLM):
     def __init__(
@@ -804,12 +812,14 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         generate_config: Optional[Dict] = None,
         request_id: Optional[str] = None,
     ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]:
-        messages = self.convert_messages_with_content_list_to_str_conversion(messages)
         tools = generate_config.pop("tools", []) if generate_config else None
         model_family = self.model_family.model_family or self.model_family.model_name
         full_context_kwargs = {}
-        if tools and model_family in QWEN_TOOL_CALL_FAMILY:
-            full_context_kwargs["tools"] = tools
+        if tools:
+            if model_family in QWEN_TOOL_CALL_FAMILY:
+                full_context_kwargs["tools"] = tools
+            elif model_family in DEEPSEEK_TOOL_CALL_FAMILY:
+                self._tools_to_messages_for_deepseek(messages, tools)
         assert self.model_family.chat_template is not None
         full_prompt = self.get_full_context(
             messages, self.model_family.chat_template, **full_context_kwargs
```
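With the registrations above, the distills can serve tool calls through Xinference's OpenAI-compatible API. A hedged usage sketch, assuming a local supervisor on the default port 9997 and a launched model whose uid is `deepseek-r1-distill-qwen`:

```python
from openai import OpenAI

client = OpenAI(base_url="http://127.0.0.1:9997/v1", api_key="not-needed")
response = client.chat.completions.create(
    model="deepseek-r1-distill-qwen",  # assumed model uid
    messages=[{"role": "user", "content": "What is the weather in Tokyo?"}],
    tools=[
        {
            "type": "function",
            "function": {
                "name": "get_current_weather",
                "parameters": {
                    "type": "object",
                    "properties": {"location": {"type": "string"}},
                },
            },
        }
    ],
)
# Populated when the model actually emits a tool call.
print(response.choices[0].message.tool_calls)
```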
xinference/model/llm/vllm/xavier/block.py

```diff
@@ -76,12 +76,11 @@ class XavierPrefixCachingBlockAllocator(PrefixCachingBlockAllocator):
         self._xavier_config = v

     async def _get_block_tracker_ref(self):
-        from .block_tracker import VLLMBlockTracker
-
         if self._block_tracker_ref is None:
             block_tracker_address = self.xavier_config.get("block_tracker_address")
+            block_tracker_uid = self.xavier_config.get("block_tracker_uid")
             self._block_tracker_ref = await xo.actor_ref(
-                address=block_tracker_address, uid=VLLMBlockTracker.default_uid()
+                address=block_tracker_address, uid=block_tracker_uid
             )
         return self._block_tracker_ref

@@ -90,7 +89,7 @@ class XavierPrefixCachingBlockAllocator(PrefixCachingBlockAllocator):
         tracker_ref = await self._get_block_tracker_ref()
         await tracker_ref.unregister_block(
             self.xavier_config.get("virtual_engine"),
-            self.xavier_config.get("rank_address"),
+            self.xavier_config.get("rank"),
             block_id,
         )
```
xinference/model/llm/vllm/xavier/block_tracker.py

```diff
@@ -24,81 +24,75 @@ class VLLMBlockTracker(xo.StatelessActor):

     def __init__(self):
         super().__init__()
-        # engine -> hash_to_address_and_block_id
-        self._hash_to_address_and_block_id: Dict[
-            int, Dict[int, Set[Tuple[str, int]]]
-        ] = {}
-        # engine -> address_to_hash_and_block_id
-        self._address_to_hash_and_block_id: Dict[
-            int, Dict[str, Set[Tuple[int, int]]]
-        ] = {}
+        # engine -> hash -> (rank, block_id)
+        self._hash_to_rank_and_block_id: Dict[int, Dict[int, Set[Tuple[int, int]]]] = {}
+        # engine -> rank -> (hash, block_id)
+        self._rank_to_hash_and_block_id: Dict[int, Dict[int, Set[Tuple[int, int]]]] = {}
+        self._unavailable_ranks: Set[int] = set()

     def register_blocks(
-        self, virtual_engine: int, block_infos: List[Tuple[int, int]], address: str
+        self, virtual_engine: int, block_infos: List[Tuple[int, int]], rank: int
     ):
         # Update query meta
-        if virtual_engine not in self._hash_to_address_and_block_id:
-            self._hash_to_address_and_block_id[virtual_engine] = {}
-        hash_to_address_and_block_id = self._hash_to_address_and_block_id[
-            virtual_engine
-        ]
+        if virtual_engine not in self._hash_to_rank_and_block_id:
+            self._hash_to_rank_and_block_id[virtual_engine] = {}
+        hash_to_rank_and_block_id = self._hash_to_rank_and_block_id[virtual_engine]
         for hash_content, block_id in block_infos:
-            if hash_content not in hash_to_address_and_block_id:
-                hash_to_address_and_block_id[hash_content] = {
-                    (address, block_id),
+            if hash_content not in hash_to_rank_and_block_id:
+                hash_to_rank_and_block_id[hash_content] = {
+                    (rank, block_id),
                 }
             else:
-                hash_to_address_and_block_id[hash_content].add((address, block_id))
+                hash_to_rank_and_block_id[hash_content].add((rank, block_id))

         # Update remove meta
-        if virtual_engine not in self._address_to_hash_and_block_id:
-            self._address_to_hash_and_block_id[virtual_engine] = {}
-        address_to_hash_and_block_id = self._address_to_hash_and_block_id[
-            virtual_engine
-        ]
-        if address not in address_to_hash_and_block_id:
-            address_to_hash_and_block_id[address] = set()
-        address_to_hash_and_block_id[address].update(block_infos)
+        if virtual_engine not in self._rank_to_hash_and_block_id:
+            self._rank_to_hash_and_block_id[virtual_engine] = {}
+        rank_to_hash_and_block_id = self._rank_to_hash_and_block_id[virtual_engine]
+        if rank not in rank_to_hash_and_block_id:
+            rank_to_hash_and_block_id[rank] = set()
+        rank_to_hash_and_block_id[rank].update(block_infos)

     def query_blocks(
         self, virtual_engine: int, hash_contents: List[Tuple[int, int]]
-    ) -> Dict[str, Set[Tuple[int, int, int]]]:
-        if virtual_engine not in self._hash_to_address_and_block_id:
+    ) -> Dict[int, Set[Tuple[int, int, int]]]:
+        if virtual_engine not in self._hash_to_rank_and_block_id:
             return {}
-        hash_to_address_and_block_id = self._hash_to_address_and_block_id[
-            virtual_engine
-        ]
-        remote: Dict[str, Set[Tuple[int, int, int]]] = {}
+        hash_to_rank_and_block_id = self._hash_to_rank_and_block_id[virtual_engine]
+        remote: Dict[int, Set[Tuple[int, int, int]]] = {}
         for hash_content, _id in hash_contents:
             if (
-                hash_content in hash_to_address_and_block_id
-            ) and hash_to_address_and_block_id[hash_content]:
-                # TODO: Randomly select here, and try to distribute requests as evenly as possible.
-                # There may be better methods in the future.
-                address, block_id = random.choice(
-                    list(hash_to_address_and_block_id[hash_content])
-                )
-                if address not in remote:
-                    remote[address] = {
-                        (hash_content, block_id, _id),
-                    }
-                else:
-                    remote[address].add((hash_content, block_id, _id))
+                hash_content in hash_to_rank_and_block_id
+            ) and hash_to_rank_and_block_id[hash_content]:
+                # exclude ranks that are in the recovery process
+                rank_and_block_id = [
+                    (r, b)
+                    for r, b in hash_to_rank_and_block_id[hash_content]
+                    if r not in self._unavailable_ranks
+                ]
+                if rank_and_block_id:
+                    # TODO: Randomly select here, and try to distribute requests as evenly as possible.
+                    # There may be better methods in the future.
+                    rank, block_id = random.choice(rank_and_block_id)
+                    if rank not in remote:
+                        remote[rank] = {
+                            (hash_content, block_id, _id),
+                        }
+                    else:
+                        remote[rank].add((hash_content, block_id, _id))
         return remote

-    def unregister_block(self, virtual_engine: int, address: str, block_id: int):
-        if (virtual_engine not in self._address_to_hash_and_block_id) or (
-            virtual_engine not in self._hash_to_address_and_block_id
+    def unregister_block(self, virtual_engine: int, rank: int, block_id: int):
+        if (virtual_engine not in self._rank_to_hash_and_block_id) or (
+            virtual_engine not in self._hash_to_rank_and_block_id
         ):
             return

         # Update remove meta
-        address_to_hash_and_block_id = self._address_to_hash_and_block_id[
-            virtual_engine
-        ]
-        if address not in address_to_hash_and_block_id:
+        rank_to_hash_and_block_id = self._rank_to_hash_and_block_id[virtual_engine]
+        if rank not in rank_to_hash_and_block_id:
             return
-        hash_and_block_id = address_to_hash_and_block_id[address]
+        hash_and_block_id = rank_to_hash_and_block_id[rank]
         detail: Optional[Tuple[int, int]] = None
         for hash_content, _id in hash_and_block_id.copy():
             if _id == block_id:
@@ -108,9 +102,28 @@ class VLLMBlockTracker(xo.StatelessActor):

         # Update query meta
         if detail is not None:
-            hash_to_address_and_block_id = self._hash_to_address_and_block_id[
-                virtual_engine
-            ]
+            hash_to_rank_and_block_id = self._hash_to_rank_and_block_id[virtual_engine]
             _hash = detail[0]
-            if _hash in hash_to_address_and_block_id:
-                hash_to_address_and_block_id[_hash].discard((address, detail[1]))
+            if _hash in hash_to_rank_and_block_id:
+                hash_to_rank_and_block_id[_hash].discard((rank, detail[1]))
+
+    def unregister_rank(self, rank: int):
+        """
+        This rank is in the recovery process, and its query results will be excluded.
+        """
+        self._unavailable_ranks.add(rank)
+
+    def register_rank(self, rank: int):
+        """
+        After recovery is successful, clear all stale data of the rank and mark the rank as available.
+        """
+        for _, rank_to_hash_and_block_id in self._rank_to_hash_and_block_id.items():
+            rank_to_hash_and_block_id.pop(rank, None)
+
+        for _, hash_to_rank_and_block_id in self._hash_to_rank_and_block_id.items():
+            for _, rank_and_block_id in hash_to_rank_and_block_id.items():
+                to_delete = [(r, b) for r, b in rank_and_block_id if r == rank]
+                if to_delete:
+                    rank_and_block_id.difference_update(to_delete)
+
+        self._unavailable_ranks.discard(rank)
```
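Together, `unregister_rank` and `register_rank` let a rank drop out of prefix-cache queries while it recovers, then rejoin with a clean slate. A minimal actor-free sketch of those semantics (values illustrative):

```python
import random
from typing import Dict, Set, Tuple

hash_to_rank_and_block_id: Dict[int, Set[Tuple[int, int]]] = {
    0xBEEF: {(0, 11), (1, 42)},  # block hash 0xBEEF is cached on ranks 0 and 1
}
unavailable_ranks: Set[int] = set()


def query(hash_content: int):
    candidates = [
        (r, b)
        for r, b in hash_to_rank_and_block_id.get(hash_content, set())
        if r not in unavailable_ranks  # skip ranks that are recovering
    ]
    return random.choice(candidates) if candidates else None


unavailable_ranks.add(1)           # unregister_rank(1): rank 1 is recovering
assert query(0xBEEF) == (0, 11)    # only rank 0 is eligible now

# register_rank(1): drop rank 1's stale block metadata, then mark it available.
hash_to_rank_and_block_id[0xBEEF] = {
    (r, b) for r, b in hash_to_rank_and_block_id[0xBEEF] if r != 1
}
unavailable_ranks.discard(1)
assert query(0xBEEF) == (0, 11)    # rank 1 re-registers blocks as its cache refills
```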
xinference/model/llm/vllm/xavier/collective.py (new file)

```diff
@@ -0,0 +1,74 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+from typing import List, Optional
+
+logger = logging.getLogger(__name__)
+
+
+class CollectiveRank:
+    def __init__(
+        self,
+        rank: int,
+        world_size: int,
+        rank_address: str,
+        store_address: str,
+        store_port: int,
+        world_addresses: List[str],
+    ):
+        self._rank = rank
+        self._world_size = world_size
+        self._rank_address = rank_address
+        self._world_addresses = world_addresses
+        self._store_address = store_address
+        self._store_port = store_port
+        self._device = None
+        self._tcp_store = None
+        self._context = None
+
+    def init_rank(self):
+        from xoscar.collective import xoscar_pygloo as xp
+
+        self._context = xp.rendezvous.Context(self._rank, self._world_size)
+
+        attr = xp.transport.tcp.attr(self._rank_address.split(":")[0])
+        self._device = xp.transport.tcp.CreateDevice(attr)
+
+        opt = xp.rendezvous.TCPStoreOptions()
+        opt.port = self._store_port
+        opt.numWorkers = self._world_size
+        opt.isServer = self._rank == 0
+        opt.waitWorkers = False
+
+        self._tcp_store = xp.rendezvous.TCPStore(self._store_address, opt)
+        if self._world_addresses:
+            self.connect_full_mesh()
+
+    def connect_full_mesh(
+        self, prefix: Optional[str] = None, world_addresses: Optional[List[str]] = None
+    ):
+        from xoscar.collective import xoscar_pygloo as xp
+
+        assert self._device is not None
+        assert self._tcp_store is not None
+        assert self._context is not None
+        if world_addresses is not None:
+            self._world_addresses = world_addresses
+        prefix_store = xp.rendezvous.PrefixStore(
+            prefix or str(self._world_size), self._tcp_store
+        )
+        self._context.connectFullMesh(prefix_store, self._device)
+        logger.debug(
+            f"Rank {self._rank} arrives successfully, world addresses: {self._world_addresses}"
+        )
```
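A hedged usage sketch for the new `CollectiveRank` helper: two processes forming a Gloo full mesh, with rank 0 hosting the TCP store. It assumes xoscar is installed with its pygloo collective bindings; addresses and ports are illustrative:

```python
# Run one of these per process; only rank 0 serves the rendezvous TCP store.
from xinference.model.llm.vllm.xavier.collective import CollectiveRank

world_addresses = ["192.168.0.10:34567", "192.168.0.11:34567"]

# Process for rank 0:
rank0 = CollectiveRank(
    rank=0,
    world_size=2,
    rank_address=world_addresses[0],   # host part is used for the TCP device
    store_address="192.168.0.10",
    store_port=25001,
    world_addresses=world_addresses,
)
rank0.init_rank()  # creates the store, then connect_full_mesh() joins the mesh

# The process for rank 1 makes the same call with rank=1 and its own address;
# connectFullMesh does not complete until all world_size ranks have arrived.
```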