xinference 1.2.0__py3-none-any.whl → 1.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +4 -7
- xinference/client/handlers.py +3 -0
- xinference/core/chat_interface.py +6 -1
- xinference/core/model.py +2 -0
- xinference/core/scheduler.py +4 -7
- xinference/core/supervisor.py +114 -23
- xinference/core/worker.py +70 -4
- xinference/deploy/local.py +2 -1
- xinference/model/audio/core.py +11 -0
- xinference/model/audio/cosyvoice.py +16 -5
- xinference/model/audio/kokoro.py +139 -0
- xinference/model/audio/melotts.py +110 -0
- xinference/model/audio/model_spec.json +80 -0
- xinference/model/audio/model_spec_modelscope.json +18 -0
- xinference/model/audio/whisper.py +35 -10
- xinference/model/llm/llama_cpp/core.py +21 -14
- xinference/model/llm/llm_family.json +527 -1
- xinference/model/llm/llm_family.py +4 -1
- xinference/model/llm/llm_family_modelscope.json +495 -3
- xinference/model/llm/memory.py +1 -1
- xinference/model/llm/mlx/core.py +24 -6
- xinference/model/llm/transformers/core.py +9 -1
- xinference/model/llm/transformers/qwen2_audio.py +3 -1
- xinference/model/llm/transformers/qwen2_vl.py +20 -3
- xinference/model/llm/transformers/utils.py +22 -11
- xinference/model/llm/utils.py +115 -1
- xinference/model/llm/vllm/core.py +14 -4
- xinference/model/llm/vllm/xavier/block.py +3 -4
- xinference/model/llm/vllm/xavier/block_tracker.py +71 -58
- xinference/model/llm/vllm/xavier/collective.py +74 -0
- xinference/model/llm/vllm/xavier/collective_manager.py +147 -0
- xinference/model/llm/vllm/xavier/executor.py +18 -16
- xinference/model/llm/vllm/xavier/scheduler.py +79 -63
- xinference/model/llm/vllm/xavier/test/test_xavier.py +60 -35
- xinference/model/llm/vllm/xavier/transfer.py +53 -32
- xinference/thirdparty/cosyvoice/bin/spk2info.pt +0 -0
- xinference/thirdparty/melo/__init__.py +0 -0
- xinference/thirdparty/melo/api.py +135 -0
- xinference/thirdparty/melo/app.py +61 -0
- xinference/thirdparty/melo/attentions.py +459 -0
- xinference/thirdparty/melo/commons.py +160 -0
- xinference/thirdparty/melo/configs/config.json +94 -0
- xinference/thirdparty/melo/data/example/metadata.list +20 -0
- xinference/thirdparty/melo/data_utils.py +413 -0
- xinference/thirdparty/melo/download_utils.py +67 -0
- xinference/thirdparty/melo/infer.py +25 -0
- xinference/thirdparty/melo/init_downloads.py +14 -0
- xinference/thirdparty/melo/losses.py +58 -0
- xinference/thirdparty/melo/main.py +36 -0
- xinference/thirdparty/melo/mel_processing.py +174 -0
- xinference/thirdparty/melo/models.py +1030 -0
- xinference/thirdparty/melo/modules.py +598 -0
- xinference/thirdparty/melo/monotonic_align/__init__.py +16 -0
- xinference/thirdparty/melo/monotonic_align/core.py +46 -0
- xinference/thirdparty/melo/preprocess_text.py +135 -0
- xinference/thirdparty/melo/split_utils.py +174 -0
- xinference/thirdparty/melo/text/__init__.py +35 -0
- xinference/thirdparty/melo/text/chinese.py +199 -0
- xinference/thirdparty/melo/text/chinese_bert.py +107 -0
- xinference/thirdparty/melo/text/chinese_mix.py +253 -0
- xinference/thirdparty/melo/text/cleaner.py +36 -0
- xinference/thirdparty/melo/text/cleaner_multiling.py +110 -0
- xinference/thirdparty/melo/text/cmudict.rep +129530 -0
- xinference/thirdparty/melo/text/cmudict_cache.pickle +0 -0
- xinference/thirdparty/melo/text/english.py +284 -0
- xinference/thirdparty/melo/text/english_bert.py +39 -0
- xinference/thirdparty/melo/text/english_utils/__init__.py +0 -0
- xinference/thirdparty/melo/text/english_utils/abbreviations.py +35 -0
- xinference/thirdparty/melo/text/english_utils/number_norm.py +97 -0
- xinference/thirdparty/melo/text/english_utils/time_norm.py +47 -0
- xinference/thirdparty/melo/text/es_phonemizer/__init__.py +0 -0
- xinference/thirdparty/melo/text/es_phonemizer/base.py +140 -0
- xinference/thirdparty/melo/text/es_phonemizer/cleaner.py +109 -0
- xinference/thirdparty/melo/text/es_phonemizer/es_symbols.json +79 -0
- xinference/thirdparty/melo/text/es_phonemizer/es_symbols.txt +1 -0
- xinference/thirdparty/melo/text/es_phonemizer/es_symbols_v2.json +83 -0
- xinference/thirdparty/melo/text/es_phonemizer/es_to_ipa.py +12 -0
- xinference/thirdparty/melo/text/es_phonemizer/example_ipa.txt +400 -0
- xinference/thirdparty/melo/text/es_phonemizer/gruut_wrapper.py +253 -0
- xinference/thirdparty/melo/text/es_phonemizer/punctuation.py +174 -0
- xinference/thirdparty/melo/text/es_phonemizer/spanish_symbols.txt +1 -0
- xinference/thirdparty/melo/text/es_phonemizer/test.ipynb +124 -0
- xinference/thirdparty/melo/text/fr_phonemizer/__init__.py +0 -0
- xinference/thirdparty/melo/text/fr_phonemizer/base.py +140 -0
- xinference/thirdparty/melo/text/fr_phonemizer/cleaner.py +122 -0
- xinference/thirdparty/melo/text/fr_phonemizer/en_symbols.json +78 -0
- xinference/thirdparty/melo/text/fr_phonemizer/example_ipa.txt +1 -0
- xinference/thirdparty/melo/text/fr_phonemizer/fr_symbols.json +89 -0
- xinference/thirdparty/melo/text/fr_phonemizer/fr_to_ipa.py +30 -0
- xinference/thirdparty/melo/text/fr_phonemizer/french_abbreviations.py +48 -0
- xinference/thirdparty/melo/text/fr_phonemizer/french_symbols.txt +1 -0
- xinference/thirdparty/melo/text/fr_phonemizer/gruut_wrapper.py +258 -0
- xinference/thirdparty/melo/text/fr_phonemizer/punctuation.py +172 -0
- xinference/thirdparty/melo/text/french.py +94 -0
- xinference/thirdparty/melo/text/french_bert.py +39 -0
- xinference/thirdparty/melo/text/japanese.py +647 -0
- xinference/thirdparty/melo/text/japanese_bert.py +49 -0
- xinference/thirdparty/melo/text/ko_dictionary.py +44 -0
- xinference/thirdparty/melo/text/korean.py +192 -0
- xinference/thirdparty/melo/text/opencpop-strict.txt +429 -0
- xinference/thirdparty/melo/text/spanish.py +122 -0
- xinference/thirdparty/melo/text/spanish_bert.py +39 -0
- xinference/thirdparty/melo/text/symbols.py +290 -0
- xinference/thirdparty/melo/text/tone_sandhi.py +769 -0
- xinference/thirdparty/melo/train.py +635 -0
- xinference/thirdparty/melo/train.sh +19 -0
- xinference/thirdparty/melo/transforms.py +209 -0
- xinference/thirdparty/melo/utils.py +424 -0
- xinference/types.py +2 -0
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/{main.1eb206d1.js → main.b0936c54.js} +3 -3
- xinference/web/ui/build/static/js/main.b0936c54.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a3ff866acddf34917a7ee399e0e571a4dfd8ba66d5057db885f243e16a6eb17d.json +1 -0
- {xinference-1.2.0.dist-info → xinference-1.2.2.dist-info}/METADATA +37 -27
- {xinference-1.2.0.dist-info → xinference-1.2.2.dist-info}/RECORD +122 -45
- xinference/web/ui/build/static/js/main.1eb206d1.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/2213d49de260e1f67c888081b18f120f5225462b829ae57c9e05a05cec83689d.json +0 -1
- /xinference/web/ui/build/static/js/{main.1eb206d1.js.LICENSE.txt → main.b0936c54.js.LICENSE.txt} +0 -0
- {xinference-1.2.0.dist-info → xinference-1.2.2.dist-info}/LICENSE +0 -0
- {xinference-1.2.0.dist-info → xinference-1.2.2.dist-info}/WHEEL +0 -0
- {xinference-1.2.0.dist-info → xinference-1.2.2.dist-info}/entry_points.txt +0 -0
- {xinference-1.2.0.dist-info → xinference-1.2.2.dist-info}/top_level.txt +0 -0
xinference/model/llm/transformers/qwen2_vl.py
CHANGED
@@ -45,9 +45,13 @@ class Qwen2VLChatModel(PytorchChatModel):
     def match(
         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
+        if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
+            return False
         llm_family = model_family.model_family or model_family.model_name
         if "qwen2-vl-instruct".lower() in llm_family.lower():
            return True
+        if "qwen2.5-vl-instruct".lower() in llm_family.lower():
+            return True
         if "qvq-72b-preview".lower() in llm_family.lower():
             return True
         return False
@@ -55,6 +59,11 @@ class Qwen2VLChatModel(PytorchChatModel):
     def load(self):
         from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
 
+        try:
+            from transformers import Qwen2_5_VLForConditionalGeneration
+        except ImportError:
+            Qwen2_5_VLForConditionalGeneration = None
+
         device = self._pytorch_model_config.get("device", "auto")
         device = select_device(device)
         self._device = device
@@ -66,8 +75,16 @@ class Qwen2VLChatModel(PytorchChatModel):
         )
         self._tokenizer = self._processor.tokenizer
         flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
+        llm_family = self.model_family.model_family or self.model_family.model_name
+        model_cls = (
+            Qwen2_5_VLForConditionalGeneration
+            if "qwen2.5" in llm_family
+            else Qwen2VLForConditionalGeneration
+        )
+        if model_cls is None:
+            raise ImportError("`transformers` version is too old, please upgrade it")
         if flash_attn_installed:
-            self._model = Qwen2VLForConditionalGeneration.from_pretrained(
+            self._model = model_cls.from_pretrained(
                 self.model_path,
                 torch_dtype="bfloat16",
                 device_map=device,
@@ -76,14 +93,14 @@ class Qwen2VLChatModel(PytorchChatModel):
             ).eval()
         elif is_npu_available():
             # Ascend do not support bf16
-            self._model = Qwen2VLForConditionalGeneration.from_pretrained(
+            self._model = model_cls.from_pretrained(
                 self.model_path,
                 device_map="auto",
                 trust_remote_code=True,
                 torch_dtype="float16",
             ).eval()
         else:
-            self._model = Qwen2VLForConditionalGeneration.from_pretrained(
+            self._model = model_cls.from_pretrained(
                 self.model_path, device_map=device, trust_remote_code=True
             ).eval()
 
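The load() change above resolves the model class once and reuses it across all three from_pretrained branches. A minimal standalone sketch of the same optional-import pattern (names mirror the hunk; this snippet is illustrative, not package API, and assumes transformers is installed):

import importlib.util

# Qwen2_5_VLForConditionalGeneration only exists in newer transformers
# releases, so fall back to None and fail fast only when it is selected.
try:
    from transformers import Qwen2_5_VLForConditionalGeneration
except ImportError:
    Qwen2_5_VLForConditionalGeneration = None

def resolve_model_cls(llm_family: str):
    from transformers import Qwen2VLForConditionalGeneration

    model_cls = (
        Qwen2_5_VLForConditionalGeneration
        if "qwen2.5" in llm_family
        else Qwen2VLForConditionalGeneration
    )
    if model_cls is None:
        raise ImportError("`transformers` version is too old, please upgrade it")
    return model_cls

# flash-attention is detected the same way the hunk does:
flash_attn_installed = importlib.util.find_spec("flash_attn") is not None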
xinference/model/llm/transformers/utils.py
CHANGED
@@ -193,16 +193,14 @@ def _get_pad_param(seq_len_idx: int, pad_len: int) -> Tuple:
 
 def _merge_kv_cache(
     xinf_model_obj: "PytorchModel",
-    past_kv,
-    new_kv,
-):
+    past_cache: DynamicCache,
+    new_cache: DynamicCache,
+) -> DynamicCache:
     from torch.nn.functional import pad
 
     _, seq_len_idx = xinf_model_obj.get_batch_size_and_seq_len_indexes_from_kv()
-
-
-    past_seq_len = past_kv[0][0].shape[seq_len_idx]
-    new_seq_len = new_kv[0][0].shape[seq_len_idx]
+    past_seq_len = past_cache[0][0].shape[seq_len_idx]
+    new_seq_len = new_cache[0][0].shape[seq_len_idx]
     if past_seq_len != new_seq_len:
         padding_target = new_cache if past_seq_len > new_seq_len else past_cache
         padding_len = abs(past_seq_len - new_seq_len)
@@ -219,8 +217,12 @@ def _merge_kv_cache(
     for idx in range(len(past_cache)):
         k1, k2 = new_cache.key_cache[idx], past_cache.key_cache[idx]
         v1, v2 = new_cache.value_cache[idx], past_cache.value_cache[idx]
-        ret_kv.update(
-            …
+        ret_kv.update(
+            torch.cat((k1, k2), 0).contiguous(),
+            torch.cat((v1, v2), 0).contiguous(),
+            idx,
+        )
+    return ret_kv
 
 
 def get_batch_size_and_seq_len_from_kv_cache(kv, xinf_model_obj: "PytorchModel"):
@@ -228,6 +230,15 @@ def get_batch_size_and_seq_len_from_kv_cache(kv, xinf_model_obj: "PytorchModel")
     return kv[0][0].shape[bs_idx], kv[0][0].shape[seq_len_idx] + 1
 
 
+def convert_to_cache_cls(cache) -> DynamicCache:
+    """
+    Compatible with some old models
+    """
+    if isinstance(cache, tuple):
+        return DynamicCache.from_legacy_cache(cache)
+    return cache
+
+
 @torch.inference_mode()
 def _batch_inference_one_step_internal(
     xinf_model_obj: "PytorchModel",
@@ -269,7 +280,7 @@ def _batch_inference_one_step_internal(
         out = model(**prefill_kws, use_cache=True)
 
         logits = out.logits
-        past_key_values = out.past_key_values
+        past_key_values = convert_to_cache_cls(out.past_key_values)
 
         for i, r in enumerate(prefill_reqs):
             (
@@ -317,7 +328,7 @@ def _batch_inference_one_step_internal(
         )
         out = model(**inf_kws, use_cache=True, past_key_values=past_key_values)
         logits = out.logits
-        past_key_values = out.past_key_values
+        past_key_values = convert_to_cache_cls(out.past_key_values)
 
         for i, r in enumerate(valid_req_list):
             (
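For context on convert_to_cache_cls above: transformers moved past_key_values from a legacy tuple of per-layer (key, value) pairs to cache classes, and older model forwards may still return the tuple form. A small standalone check of the conversion the helper relies on (assumes a recent transformers; shapes are arbitrary):

import torch
from transformers.cache_utils import DynamicCache

# Legacy format: one (key, value) pair per layer, each [batch, heads, seq, dim].
legacy = tuple((torch.zeros(1, 2, 4, 8), torch.zeros(1, 2, 4, 8)) for _ in range(2))
cache = DynamicCache.from_legacy_cache(legacy)

# DynamicCache keeps tuple-style indexing, so cache[0][0] is layer 0's keys,
# exactly what _merge_kv_cache reads with past_cache[0][0].shape[seq_len_idx].
assert cache[0][0].shape == (1, 2, 4, 8)
assert len(cache) == 2  # one entry per layer, as range(len(past_cache)) expects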
xinference/model/llm/utils.py
CHANGED
@@ -11,16 +11,28 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 import base64
 import functools
 import json
 import logging
 import os
+import re
 import time
 import typing
 import uuid
 from io import BytesIO
-from typing import Any, AsyncGenerator, Dict, Iterator, List, Optional, Tuple, cast
+from typing import (
+    Any,
+    AsyncGenerator,
+    Dict,
+    Iterable,
+    Iterator,
+    List,
+    Optional,
+    Tuple,
+    cast,
+)
 
 import requests
 from PIL import Image
@@ -64,6 +76,18 @@ LLAMA3_TOOL_CALL_FAMILY = [
     "llama-3.1-instruct",
 ]
 
+DEEPSEEK_TOOL_CALL_FAMILY = [
+    "deepseek-r1-distill-qwen",
+    "deepseek-r1-distill-llama",
+]
+
+TOOL_CALL_FAMILY = (
+    QWEN_TOOL_CALL_FAMILY
+    + GLM4_TOOL_CALL_FAMILY
+    + LLAMA3_TOOL_CALL_FAMILY
+    + DEEPSEEK_TOOL_CALL_FAMILY
+)
+
 QWEN_TOOL_CALL_SYMBOLS = ["<tool_call>", "</tool_call>"]
 
 
@@ -104,6 +128,10 @@ class ChatModelMixin:
         tokenize=False,
         **kwargs,
     ):
+        if "vision" not in self.model_family.model_ability:  # type: ignore
+            messages = self.convert_messages_with_content_list_to_str_conversion(
+                messages
+            )
         if tokenizer is not None:
             try:
                 full_context = tokenizer.apply_chat_template(
@@ -304,6 +332,35 @@ class ChatModelMixin:
         else:
             yield cls._to_chat_completion_chunk(chunk)
 
+    @classmethod
+    def _tools_to_messages_for_deepseek(
+        cls, messages: List[dict], tools: Iterable[dict]
+    ):
+        # deepseek integrates tool calls into messages
+        # we follow the chat template rule to integrate tools into messages
+        tool_call_message: Dict[str, Any] = {
+            "role": "assistant",
+            "content": None,
+            "tool_calls": [],
+        }
+
+        for tool in tools:
+            function_name = tool["function"]["name"]
+            parameters = tool["function"].get("parameters", {}).get("properties", {})
+            function_args_json = json.dumps(parameters)
+
+            tool_call_message["tool_calls"].append(
+                {
+                    "type": "function",
+                    "function": {
+                        "name": function_name,
+                        "arguments": function_args_json,
+                    },
+                }
+            )
+
+        messages.append(tool_call_message)
+
     @classmethod
     async def _async_to_chat_completion_chunks(
         cls,
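To see what _tools_to_messages_for_deepseek produces, here is a hypothetical input/output pair (the weather tool is an example schema, not from the package):

messages = [{"role": "user", "content": "What's the weather in Tokyo?"}]
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "parameters": {"properties": {"location": {"type": "string"}}},
        },
    }
]

# After ChatModelMixin._tools_to_messages_for_deepseek(messages, tools),
# messages ends with a synthetic assistant turn describing the tool:
# {
#     "role": "assistant",
#     "content": None,
#     "tool_calls": [
#         {
#             "type": "function",
#             "function": {
#                 "name": "get_current_weather",
#                 "arguments": '{"location": {"type": "string"}}',
#             },
#         }
#     ],
# }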
@@ -397,6 +454,61 @@ class ChatModelMixin:
         except Exception:
             return [(text, None, None)]
 
+    @classmethod
+    def _eval_deepseek_chat_arguments(cls, c) -> List[Tuple]:
+        """
+        Parses tool calls from deepseek-r1 format and removes duplicates.
+
+        Returns:
+            List[Tuple[Optional[str], Optional[str], Optional[dict]]]
+            - (None, function_name, arguments) if successfully parsed.
+            - (content, None, None) if parsing failed (content is raw JSON text).
+
+        Example input:
+        <|tool▁call|>get_current_weather
+        ```json
+        {"location": "tokyo", "unit": "fahrenheit"}
+        ```
+
+        Output:
+        [
+            (None, "get_current_weather", {"location": "tokyo", "unit": "fahrenheit"})
+        ]
+        """
+
+        text = c["choices"][0]["text"]
+
+        pattern = r"<|tool▁call|>(\w+)\s*```json\s*(.*?)\s*```"
+        matches = re.findall(pattern, text, re.DOTALL)
+
+        if not matches:
+            return [(text, None, None)]
+
+        tool_calls = set()  # Used for deduplication
+        results = []
+
+        for function_name, args_json in matches:
+            try:
+                arguments = json.loads(args_json)
+                # Convert dictionary to frozenset for deduplication
+                arguments_hashable = frozenset(arguments.items())
+                tool_call_tuple = (None, function_name, arguments)
+            except json.JSONDecodeError:
+                tool_call_tuple = (
+                    args_json,
+                    None,
+                    None,
+                )  # If parsing fails, treat as raw content
+                arguments_hashable = None  # No need for hashing
+
+            # Avoid duplicate entries
+            dedup_key = (function_name, arguments_hashable)
+            if dedup_key not in tool_calls:
+                tool_calls.add(dedup_key)
+                results.append(tool_call_tuple)
+
+        return results
+
     @classmethod
     def _eval_tool_arguments(cls, model_family, c):
         family = model_family.model_family or model_family.model_name
@@ -406,6 +518,8 @@ class ChatModelMixin:
             result = cls._eval_qwen_chat_arguments(c)
         elif family in LLAMA3_TOOL_CALL_FAMILY:
             result = cls._eval_llama3_chat_arguments(c)
+        elif family in DEEPSEEK_TOOL_CALL_FAMILY:
+            result = cls._eval_deepseek_chat_arguments(c)
         else:
             raise Exception(
                 f"Model {model_family.model_name} is not support tool calls."
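One subtlety in _eval_deepseek_chat_arguments worth noting: the <|tool▁call|> delimiter is not regex-escaped, so the two | characters split the pattern into three alternatives. A standalone check (illustrative, not package code) of how re.findall behaves on the docstring's example input:

import re

pattern = r"<|tool▁call|>(\w+)\s*```json\s*(.*?)\s*```"
text = (
    '<|tool▁call|>get_current_weather\n'
    '```json\n{"location": "tokyo", "unit": "fahrenheit"}\n```'
)

# The alternatives are "<", "tool▁call", and ">(\w+)...```"; the first two
# match the bare delimiter characters with empty capture groups, and only
# the third captures the function name and JSON arguments.
print(re.findall(pattern, text, re.DOTALL))
# [('', ''), ('', ''), ('get_current_weather', '{"location": "tokyo", "unit": "fahrenheit"}')]

The empty matches hit the json.JSONDecodeError branch, so after deduplication a single ('', None, None) entry can precede the parsed call in the returned list.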
xinference/model/llm/vllm/core.py
CHANGED
@@ -44,6 +44,7 @@ from ....types import (
 from .. import LLM, LLMFamilyV1, LLMSpecV1
 from ..llm_family import CustomLLMFamilyV1
 from ..utils import (
+    DEEPSEEK_TOOL_CALL_FAMILY,
     QWEN_TOOL_CALL_FAMILY,
     QWEN_TOOL_CALL_SYMBOLS,
     ChatModelMixin,
@@ -157,7 +158,7 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.3.0":
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-coder-instruct")
     VLLM_SUPPORTED_CHAT_MODELS.append("QwQ-32B-Preview")
     VLLM_SUPPORTED_CHAT_MODELS.append("marco-o1")
-
+    VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-r1-distill-qwen")
 
 if VLLM_INSTALLED and vllm.__version__ >= "0.3.2":
     VLLM_SUPPORTED_CHAT_MODELS.append("gemma-it")
@@ -185,6 +186,7 @@ if VLLM_INSTALLED and vllm.__version__ > "0.5.3":
     VLLM_SUPPORTED_MODELS.append("llama-3.1")
     VLLM_SUPPORTED_CHAT_MODELS.append("llama-3.1-instruct")
     VLLM_SUPPORTED_CHAT_MODELS.append("llama-3.3-instruct")
+    VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-r1-distill-llama")
 
 if VLLM_INSTALLED and vllm.__version__ >= "0.6.1":
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("internvl2")
@@ -198,6 +200,12 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.6.3":
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("qwen2-vl-instruct")
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("QvQ-72B-Preview")
 
+if VLLM_INSTALLED and vllm.__version__ >= "0.7.0":
+    VLLM_SUPPORTED_CHAT_MODELS.append("internlm3-instruct")
+
+if VLLM_INSTALLED and vllm.__version__ >= "0.7.2":
+    VLLM_SUPPORTED_VISION_MODEL_LIST.append("qwen2.5-vl-instruct")
+
 
 class VLLMModel(LLM):
     def __init__(
@@ -804,12 +812,14 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         generate_config: Optional[Dict] = None,
         request_id: Optional[str] = None,
     ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]:
-        messages = self.convert_messages_with_content_list_to_str_conversion(messages)
         tools = generate_config.pop("tools", []) if generate_config else None
         model_family = self.model_family.model_family or self.model_family.model_name
         full_context_kwargs = {}
-        if tools and model_family in QWEN_TOOL_CALL_FAMILY:
-            full_context_kwargs["tools"] = tools
+        if tools:
+            if model_family in QWEN_TOOL_CALL_FAMILY:
+                full_context_kwargs["tools"] = tools
+            elif model_family in DEEPSEEK_TOOL_CALL_FAMILY:
+                self._tools_to_messages_for_deepseek(messages, tools)
         assert self.model_family.chat_template is not None
         full_prompt = self.get_full_context(
             messages, self.model_family.chat_template, **full_context_kwargs
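A side note on the version gates above: vllm.__version__ is compared as a plain string, which is lexicographic rather than numeric, so such gates need care once a version component reaches two digits. A quick illustration (observation only, not from the package):

# String comparison orders character by character:
print("0.7.2" >= "0.7.0")    # True, as intended
print("0.10.0" >= "0.7.2")   # False, although 0.10.0 is the newer release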
xinference/model/llm/vllm/xavier/block.py
CHANGED
@@ -76,12 +76,11 @@ class XavierPrefixCachingBlockAllocator(PrefixCachingBlockAllocator):
         self._xavier_config = v
 
     async def _get_block_tracker_ref(self):
-        from .block_tracker import VLLMBlockTracker
-
         if self._block_tracker_ref is None:
             block_tracker_address = self.xavier_config.get("block_tracker_address")
+            block_tracker_uid = self.xavier_config.get("block_tracker_uid")
             self._block_tracker_ref = await xo.actor_ref(
-                address=block_tracker_address, uid=VLLMBlockTracker.default_uid()
+                address=block_tracker_address, uid=block_tracker_uid
             )
         return self._block_tracker_ref
 
@@ -90,7 +89,7 @@ class XavierPrefixCachingBlockAllocator(PrefixCachingBlockAllocator):
         tracker_ref = await self._get_block_tracker_ref()
         await tracker_ref.unregister_block(
             self.xavier_config.get("virtual_engine"),
-            self.xavier_config.get("…"),
+            self.xavier_config.get("rank"),
             block_id,
         )
 
xinference/model/llm/vllm/xavier/block_tracker.py
CHANGED
@@ -24,81 +24,75 @@ class VLLMBlockTracker(xo.StatelessActor):
 
     def __init__(self):
         super().__init__()
-        # engine -> hash -> (address, block_id)
-        self._hash_to_address_and_block_id: Dict[
-            int, Dict[int, Set[Tuple[str, int]]]
-        ] = {}
-
-        self._address_to_hash_and_block_id: Dict[
-            int, Dict[str, Set[Tuple[int, int]]]
-        ] = {}
+        # engine -> hash -> (rank, block_id)
+        self._hash_to_rank_and_block_id: Dict[int, Dict[int, Set[Tuple[int, int]]]] = {}
+        # engine -> rank -> (hash, block_id)
+        self._rank_to_hash_and_block_id: Dict[int, Dict[int, Set[Tuple[int, int]]]] = {}
+        self._unavailable_ranks: Set[int] = set()
 
     def register_blocks(
-        self, virtual_engine: int, block_infos: List[Tuple[int, int]], address: str
+        self, virtual_engine: int, block_infos: List[Tuple[int, int]], rank: int
     ):
         # Update query meta
-        if virtual_engine not in self._hash_to_address_and_block_id:
-            self._hash_to_address_and_block_id[virtual_engine] = {}
-        hash_to_address_and_block_id = self._hash_to_address_and_block_id[
-            virtual_engine
-        ]
+        if virtual_engine not in self._hash_to_rank_and_block_id:
+            self._hash_to_rank_and_block_id[virtual_engine] = {}
+        hash_to_rank_and_block_id = self._hash_to_rank_and_block_id[virtual_engine]
         for hash_content, block_id in block_infos:
-            if hash_content not in hash_to_address_and_block_id:
-                hash_to_address_and_block_id[hash_content] = {
-                    (address, block_id),
+            if hash_content not in hash_to_rank_and_block_id:
+                hash_to_rank_and_block_id[hash_content] = {
+                    (rank, block_id),
                 }
             else:
-                hash_to_address_and_block_id[hash_content].add((address, block_id))
+                hash_to_rank_and_block_id[hash_content].add((rank, block_id))
 
         # Update remove meta
-        if virtual_engine not in self._address_to_hash_and_block_id:
-            self._address_to_hash_and_block_id[virtual_engine] = {}
-        address_to_hash_and_block_id = self._address_to_hash_and_block_id[
-            virtual_engine
-        ]
-        if address not in address_to_hash_and_block_id:
-            address_to_hash_and_block_id[address] = set()
-        address_to_hash_and_block_id[address].update(block_infos)
+        if virtual_engine not in self._rank_to_hash_and_block_id:
+            self._rank_to_hash_and_block_id[virtual_engine] = {}
+        rank_to_hash_and_block_id = self._rank_to_hash_and_block_id[virtual_engine]
+        if rank not in rank_to_hash_and_block_id:
+            rank_to_hash_and_block_id[rank] = set()
+        rank_to_hash_and_block_id[rank].update(block_infos)
 
     def query_blocks(
         self, virtual_engine: int, hash_contents: List[Tuple[int, int]]
-    ) -> Dict[str, Set[Tuple[int, int, int]]]:
-        if virtual_engine not in self._hash_to_address_and_block_id:
+    ) -> Dict[int, Set[Tuple[int, int, int]]]:
+        if virtual_engine not in self._hash_to_rank_and_block_id:
             return {}
-        hash_to_address_and_block_id = self._hash_to_address_and_block_id[
-            virtual_engine
-        ]
-        remote: Dict[str, Set[Tuple[int, int, int]]] = {}
+        hash_to_rank_and_block_id = self._hash_to_rank_and_block_id[virtual_engine]
+        remote: Dict[int, Set[Tuple[int, int, int]]] = {}
         for hash_content, _id in hash_contents:
             if (
-                hash_content in hash_to_address_and_block_id
-            ) and hash_to_address_and_block_id[hash_content]:
-                # …
-                address, block_id = random.choice(
-                    list(hash_to_address_and_block_id[hash_content])
-                )
-                if address not in remote:
-                    remote[address] = {
-                        (hash_content, block_id, _id),
-                    }
-                else:
-                    remote[address].add((hash_content, block_id, _id))
+                hash_content in hash_to_rank_and_block_id
+            ) and hash_to_rank_and_block_id[hash_content]:
+                # exclude ranks that are in the recovery process
+                rank_and_block_id = [
+                    (r, b)
+                    for r, b in hash_to_rank_and_block_id[hash_content]
+                    if r not in self._unavailable_ranks
+                ]
+                if rank_and_block_id:
+                    # TODO: Randomly select here, and try to distribute requests as evenly as possible.
+                    # There may be better methods in the future.
+                    rank, block_id = random.choice(rank_and_block_id)
+                    if rank not in remote:
+                        remote[rank] = {
+                            (hash_content, block_id, _id),
+                        }
+                    else:
+                        remote[rank].add((hash_content, block_id, _id))
         return remote
 
-    def unregister_block(self, virtual_engine: int, address: str, block_id: int):
-        if (virtual_engine not in self._address_to_hash_and_block_id) or (
-            virtual_engine not in self._hash_to_address_and_block_id
+    def unregister_block(self, virtual_engine: int, rank: int, block_id: int):
+        if (virtual_engine not in self._rank_to_hash_and_block_id) or (
+            virtual_engine not in self._hash_to_rank_and_block_id
         ):
             return
 
         # Update remove meta
-        address_to_hash_and_block_id = self._address_to_hash_and_block_id[
-            virtual_engine
-        ]
-        if address not in address_to_hash_and_block_id:
+        rank_to_hash_and_block_id = self._rank_to_hash_and_block_id[virtual_engine]
+        if rank not in rank_to_hash_and_block_id:
             return
-        hash_and_block_id = address_to_hash_and_block_id[address]
+        hash_and_block_id = rank_to_hash_and_block_id[rank]
         detail: Optional[Tuple[int, int]] = None
         for hash_content, _id in hash_and_block_id.copy():
             if _id == block_id:
@@ -108,9 +102,28 @@ class VLLMBlockTracker(xo.StatelessActor):
 
         # Update query meta
         if detail is not None:
-            hash_to_address_and_block_id = self._hash_to_address_and_block_id[
-                virtual_engine
-            ]
+            hash_to_rank_and_block_id = self._hash_to_rank_and_block_id[virtual_engine]
             _hash = detail[0]
-            if _hash in hash_to_address_and_block_id:
-                hash_to_address_and_block_id[_hash].discard((address, detail[1]))
+            if _hash in hash_to_rank_and_block_id:
+                hash_to_rank_and_block_id[_hash].discard((rank, detail[1]))
+
+    def unregister_rank(self, rank: int):
+        """
+        This rank is in the recovery process, and its query results will be excluded.
+        """
+        self._unavailable_ranks.add(rank)
+
+    def register_rank(self, rank: int):
+        """
+        After recovery is successful, clear all stale data of the rank and mark the rank as available.
+        """
+        for _, rank_to_hash_and_block_id in self._rank_to_hash_and_block_id.items():
+            rank_to_hash_and_block_id.pop(rank, None)
+
+        for _, hash_to_rank_and_block_id in self._hash_to_rank_and_block_id.items():
+            for _, rank_and_block_id in hash_to_rank_and_block_id.items():
+                to_delete = [(r, b) for r, b in rank_and_block_id if r == rank]
+                if to_delete:
+                    rank_and_block_id.difference_update(to_delete)
+
+        self._unavailable_ranks.discard(rank)
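A hypothetical walk-through of the rank-keyed bookkeeping (direct instantiation outside an xoscar actor pool, purely illustrative; hash and block numbers are made up):

tracker = VLLMBlockTracker()
tracker.register_blocks(virtual_engine=0, block_infos=[(1234, 7)], rank=0)

# hash 1234 lives in rank 0's block 7; the caller wants it in local block 42.
assert tracker.query_blocks(0, [(1234, 42)]) == {0: {(1234, 7, 42)}}

tracker.unregister_rank(0)                           # rank 0 enters recovery
assert tracker.query_blocks(0, [(1234, 42)]) == {}   # its blocks are hidden

tracker.register_rank(0)                             # recovery done: stale meta purged
assert tracker.query_blocks(0, [(1234, 42)]) == {}   # rank 0 must re-register blocks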
xinference/model/llm/vllm/xavier/collective.py
ADDED
@@ -0,0 +1,74 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+from typing import List, Optional
+
+logger = logging.getLogger(__name__)
+
+
+class CollectiveRank:
+    def __init__(
+        self,
+        rank: int,
+        world_size: int,
+        rank_address: str,
+        store_address: str,
+        store_port: int,
+        world_addresses: List[str],
+    ):
+        self._rank = rank
+        self._world_size = world_size
+        self._rank_address = rank_address
+        self._world_addresses = world_addresses
+        self._store_address = store_address
+        self._store_port = store_port
+        self._device = None
+        self._tcp_store = None
+        self._context = None
+
+    def init_rank(self):
+        from xoscar.collective import xoscar_pygloo as xp
+
+        self._context = xp.rendezvous.Context(self._rank, self._world_size)
+
+        attr = xp.transport.tcp.attr(self._rank_address.split(":")[0])
+        self._device = xp.transport.tcp.CreateDevice(attr)
+
+        opt = xp.rendezvous.TCPStoreOptions()
+        opt.port = self._store_port
+        opt.numWorkers = self._world_size
+        opt.isServer = self._rank == 0
+        opt.waitWorkers = False
+
+        self._tcp_store = xp.rendezvous.TCPStore(self._store_address, opt)
+        if self._world_addresses:
+            self.connect_full_mesh()
+
+    def connect_full_mesh(
+        self, prefix: Optional[str] = None, world_addresses: Optional[List[str]] = None
+    ):
+        from xoscar.collective import xoscar_pygloo as xp
+
+        assert self._device is not None
+        assert self._tcp_store is not None
+        assert self._context is not None
+        if world_addresses is not None:
+            self._world_addresses = world_addresses
+        prefix_store = xp.rendezvous.PrefixStore(
+            prefix or str(self._world_size), self._tcp_store
+        )
+        self._context.connectFullMesh(prefix_store, self._device)
+        logger.debug(
+            f"Rank {self._rank} arrives successfully, world addresses: {self._world_addresses}"
+        )