xinference 1.7.0.post1__py3-none-any.whl → 1.7.1.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +3 -4
- xinference/client/__init__.py +2 -0
- xinference/client/common.py +49 -2
- xinference/client/handlers.py +18 -0
- xinference/client/restful/async_restful_client.py +1760 -0
- xinference/client/restful/restful_client.py +74 -78
- xinference/core/media_interface.py +3 -1
- xinference/core/model.py +5 -4
- xinference/core/supervisor.py +10 -5
- xinference/core/worker.py +15 -14
- xinference/deploy/local.py +51 -9
- xinference/deploy/worker.py +5 -3
- xinference/device_utils.py +22 -3
- xinference/model/audio/fish_speech.py +23 -34
- xinference/model/audio/model_spec.json +4 -2
- xinference/model/audio/model_spec_modelscope.json +4 -2
- xinference/model/audio/utils.py +2 -2
- xinference/model/core.py +1 -0
- xinference/model/embedding/__init__.py +8 -8
- xinference/model/embedding/custom.py +6 -1
- xinference/model/embedding/embed_family.py +0 -41
- xinference/model/embedding/model_spec.json +10 -1
- xinference/model/embedding/model_spec_modelscope.json +10 -1
- xinference/model/embedding/sentence_transformers/core.py +30 -15
- xinference/model/flexible/core.py +1 -1
- xinference/model/flexible/launchers/__init__.py +2 -0
- xinference/model/flexible/launchers/image_process_launcher.py +1 -1
- xinference/model/flexible/launchers/modelscope_launcher.py +47 -0
- xinference/model/flexible/launchers/transformers_launcher.py +5 -5
- xinference/model/flexible/launchers/yolo_launcher.py +62 -0
- xinference/model/llm/__init__.py +7 -0
- xinference/model/llm/core.py +18 -1
- xinference/model/llm/llama_cpp/core.py +1 -1
- xinference/model/llm/llm_family.json +41 -1
- xinference/model/llm/llm_family.py +6 -0
- xinference/model/llm/llm_family_modelscope.json +43 -1
- xinference/model/llm/mlx/core.py +271 -18
- xinference/model/llm/mlx/distributed_models/__init__.py +13 -0
- xinference/model/llm/mlx/distributed_models/core.py +164 -0
- xinference/model/llm/mlx/distributed_models/deepseek_v3.py +75 -0
- xinference/model/llm/mlx/distributed_models/qwen2.py +82 -0
- xinference/model/llm/mlx/distributed_models/qwen3.py +82 -0
- xinference/model/llm/mlx/distributed_models/qwen3_moe.py +76 -0
- xinference/model/llm/reasoning_parser.py +12 -6
- xinference/model/llm/sglang/core.py +8 -4
- xinference/model/llm/transformers/chatglm.py +4 -1
- xinference/model/llm/transformers/core.py +4 -2
- xinference/model/llm/transformers/multimodal/cogagent.py +10 -4
- xinference/model/llm/transformers/multimodal/intern_vl.py +1 -1
- xinference/model/llm/utils.py +36 -17
- xinference/model/llm/vllm/core.py +142 -34
- xinference/model/llm/vllm/distributed_executor.py +96 -21
- xinference/model/llm/vllm/xavier/transfer.py +2 -2
- xinference/model/rerank/core.py +16 -9
- xinference/model/rerank/model_spec.json +3 -3
- xinference/model/rerank/model_spec_modelscope.json +3 -3
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.9b12b7f9.js +3 -0
- xinference/web/ui/build/static/js/main.9b12b7f9.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0fd4820d93f99509e80d8702dc3f6f8272424acab5608fa7c0e82cb1d3250a87.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/1460361af6975e63576708039f1cb732faf9c672d97c494d4055fc6331460be0.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4efd8dda58fda83ed9546bf2f587df67f8d98e639117bee2d9326a9a1d9bebb2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5b2dafe5aa9e1105e0244a2b6751807342fa86aa0144b4e84d947a1686102715.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f75545479c17fdfe2a00235fa4a0e9da1ae95e6b3caafba87ded92de6b0240e4.json +1 -0
- xinference/web/ui/src/locales/en.json +3 -0
- xinference/web/ui/src/locales/ja.json +3 -0
- xinference/web/ui/src/locales/ko.json +3 -0
- xinference/web/ui/src/locales/zh.json +3 -0
- {xinference-1.7.0.post1.dist-info → xinference-1.7.1.post1.dist-info}/METADATA +4 -3
- {xinference-1.7.0.post1.dist-info → xinference-1.7.1.post1.dist-info}/RECORD +77 -67
- xinference/web/ui/build/static/js/main.8a9e3ba0.js +0 -3
- xinference/web/ui/build/static/js/main.8a9e3ba0.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/26b8c9f34b0bed789b3a833767672e39302d1e0c09b4276f4d58d1df7b6bd93b.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/34cfbfb7836e136ba3261cfd411cc554bf99ba24b35dcceebeaa4f008cb3c9dc.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/c5c7c2cd1b863ce41adff2c4737bba06eef3a1acf28288cb83d992060f6b8923.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/cc97b49285d7717c63374766c789141a4329a04582ab32756d7e0e614d4c5c7f.json +0 -1
- /xinference/web/ui/build/static/js/{main.8a9e3ba0.js.LICENSE.txt → main.9b12b7f9.js.LICENSE.txt} +0 -0
- {xinference-1.7.0.post1.dist-info → xinference-1.7.1.post1.dist-info}/WHEEL +0 -0
- {xinference-1.7.0.post1.dist-info → xinference-1.7.1.post1.dist-info}/entry_points.txt +0 -0
- {xinference-1.7.0.post1.dist-info → xinference-1.7.1.post1.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.7.0.post1.dist-info → xinference-1.7.1.post1.dist-info}/top_level.txt +0 -0
xinference/model/llm/vllm/core.py
CHANGED
@@ -51,6 +51,7 @@ from ....types import (
     LoRA,
 )
 from .. import LLM, LLMFamilyV1, LLMSpecV1
+from ..core import chat_context_var
 from ..llm_family import CustomLLMFamilyV1, cache_model_tokenizer_and_config
 from ..utils import (
     DEEPSEEK_TOOL_CALL_FAMILY,
@@ -241,6 +242,7 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.7.2":
 
 if VLLM_INSTALLED and vllm.__version__ >= "0.7.3":
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-instruct-1m")
+    VLLM_SUPPORTED_CHAT_MODELS.append("qwenLong-l1")
 
 if VLLM_INSTALLED and vllm.__version__ >= "0.8.0":
     VLLM_SUPPORTED_CHAT_MODELS.append("gemma-3-1b-it")
@@ -332,6 +334,7 @@ class VLLMModel(LLM):
     def load(self):
         try:
             import vllm
+            from vllm import envs
             from vllm.config import VllmConfig
             from vllm.engine.arg_utils import AsyncEngineArgs
             from vllm.engine.async_llm_engine import AsyncLLMEngine
@@ -414,8 +417,6 @@ class VLLMModel(LLM):
             elif self._n_worker > 1 or (
                 self._device_count > 1 and vllm.__version__ >= "0.7.0"
             ):
-                from .distributed_executor import XinferenceDistributedExecutor
-
                 # model across multiple workers or GPUs
                 engine_args = AsyncEngineArgs(
                     model=self.model_path,
@@ -423,6 +424,7 @@ class VLLMModel(LLM):
                     max_loras=max_loras,
                     **self._model_config,
                 )
+                self._enable_v1_if_supported(engine_args)
 
                 assert self._loop is not None
                 self._worker_addresses = {}
@@ -464,21 +466,47 @@ class VLLMModel(LLM):
                 assert worker_addresses
                 loop = self._loop
 
-
-
-
-
-                )
-
-
-
-
-
-                )
-
-
-
-
+                if not (envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1):
+                    # vLLM v0
+                    from .distributed_executor import (
+                        XinferenceDistributedExecutor,
+                    )
+
+                    class XinferenceAsyncLLMEngine(AsyncLLMEngine):
+                        @classmethod
+                        def _get_executor_cls(
+                            cls, engine_config: VllmConfig
+                        ) -> Type[ExecutorBase]:
+                            return partial(  # type: ignore
+                                XinferenceDistributedExecutor,
+                                pool_addresses=worker_addresses,
+                                n_worker=self._n_worker,
+                                loop=loop,
+                            )
+
+                    self._engine = XinferenceAsyncLLMEngine.from_engine_args(
+                        engine_args
+                    )
+                else:
+                    from vllm.v1.executor.abstract import Executor
+
+                    from .distributed_executor import (
+                        XinferenceDistributedExecutorV1,
+                    )
+
+                    # vLLM V1
+                    # NOTE: loop has to be None for vLLM v1
+                    # in v1, a new process called EngineCore will be created via fork by default
+                    # in which executor is initialized, we cannot pass loop, or it will be stuck,
+                    # instead, a new loop will be created inside executor
+                    executor_cls = partial(  # type: ignore
+                        XinferenceDistributedExecutorV1,
+                        pool_addresses=worker_addresses,
+                        n_worker=self._n_worker,
+                    )
+                    # patch vllm Executor.get_class
+                    Executor.get_class = lambda vllm_config: executor_cls
+                    self._engine = AsyncLLMEngine.from_engine_args(engine_args)
             except:
                 logger.exception("Creating vllm engine failed")
                 self._loading_error = sys.exc_info()
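For context on the branch above: the load path now chooses a distributed executor factory depending on whether vLLM v1 is enabled. A minimal sketch of that selection pattern, using hypothetical ExecutorV0/ExecutorV1 stand-ins rather than the package's actual classes:

```python
# Hypothetical sketch; ExecutorV0/ExecutorV1 are stand-ins, not the package's classes.
import os
from functools import partial


class ExecutorV0:
    def __init__(self, vllm_config, pool_addresses, n_worker, loop):
        self.loop = loop  # v0 executors receive the caller's event loop


class ExecutorV1:
    def __init__(self, vllm_config, pool_addresses, n_worker):
        pass  # v1 executors create their own loop inside the engine process


def pick_executor_factory(worker_addresses, n_worker, loop):
    # Mirror the v0/v1 split: only the v0 factory is bound to the caller's loop.
    if os.getenv("VLLM_USE_V1") == "1":
        return partial(ExecutorV1, pool_addresses=worker_addresses, n_worker=n_worker)
    return partial(
        ExecutorV0, pool_addresses=worker_addresses, n_worker=n_worker, loop=loop
    )
```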
@@ -495,6 +523,7 @@ class VLLMModel(LLM):
                 max_loras=max_loras,
                 **self._model_config,
             )
+            self._enable_v1_if_supported(engine_args)
             self._engine = AsyncLLMEngine.from_engine_args(engine_args)
 
         self._check_health_task = None
@@ -509,6 +538,46 @@ class VLLMModel(LLM):
             _, err, tb = self._loading_error
             raise err.with_traceback(tb)
 
+    def _enable_v1_if_supported(self, engine_args: "vllm.AsyncEngineArgs"):
+        from vllm import __version__ as vllm_version
+
+        if os.getenv("VLLM_USE_V1") is not None:
+            logger.debug(
+                "Setting vLLM v1 via environment variable already, skip checking"
+            )
+            return
+
+        try:
+            supported_func = engine_args._is_v1_supported_oracle
+        except AttributeError:
+            logger.debug(
+                "Cannot get `EngineArgs._is_v1_supported_oracle` "
+                "to decide enabling vLLM v1, perhaps vllm version is too old, "
+                "version: %s",
+                vllm_version,
+            )
+            return
+
+        model_config = engine_args.create_model_config()
+        old_main_thread = threading.main_thread()
+        try:
+            # HACK: patch main thread to let vllm pass check
+            # vllm do some signal handling when on main thread
+            # but they will skip registering signal if not on main thread,
+            # however, the _is_v1_supported_oracle will return False
+            # when not on main thread, we patched the main thread temporially,
+            # It's OK because Xinference will take care of all processes
+            threading.main_thread = lambda: threading.current_thread()
+
+            if supported_func(model_config):
+                logger.debug("Setting vLLM v1 by checking model config")
+                os.environ["VLLM_USE_V1"] = "1"
+            else:
+                logger.debug("Use vLLM v0 due to not supported config")
+        finally:
+            # patch back
+            threading.main_thread = lambda: old_main_thread
+
     def _preprocess_load_gguf(self):
         # check if it is multi gguf files
         if (
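The HACK comment above relies on temporarily replacing threading.main_thread so a check that only passes on the main thread can run from a worker thread. A minimal, self-contained sketch of that trick (check_fn is a hypothetical callable, not a package API):

```python
import threading
from contextlib import contextmanager


@contextmanager
def pretend_main_thread():
    """Temporarily make threading.main_thread() report the current thread."""
    original = threading.main_thread
    threading.main_thread = lambda: threading.current_thread()
    try:
        yield
    finally:
        threading.main_thread = original


def run_on_worker(check_fn):
    # check_fn is a hypothetical callable that refuses to run off the main thread
    with pretend_main_thread():
        return check_fn()
```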
@@ -549,6 +618,8 @@ class VLLMModel(LLM):
         )
 
     def stop(self):
+        from vllm import envs
+
         # though the vLLM engine will shutdown when deleted,
         # but some issue e.g. GH#1682 reported
         # when deleting, the engine exists still
@@ -556,9 +627,17 @@ class VLLMModel(LLM):
         if self._check_health_task:
             self._check_health_task.cancel()
         if self._engine:
-            if
-
-
+            if not (envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1):
+                # v0
+                if model_executor := getattr(
+                    self._engine.engine, "model_executor", None
+                ):
+                    model_executor.shutdown()
+                self._engine = None
+            else:
+                # v1
+                self._engine.shutdown()
+                self._engine = None
 
     async def init_xavier(self):
         await self._engine.init_xavier()
@@ -602,7 +681,6 @@ class VLLMModel(LLM):
         else:
             model_config.setdefault("quantization", None)
             model_config.setdefault("max_model_len", None)
-            model_config.setdefault("guided_decoding_backend", "outlines")
             model_config.setdefault("reasoning_content", False)
             # Add scheduling policy if vLLM version is 0.6.3 or higher
             if vllm.__version__ >= "0.6.3":
@@ -960,6 +1038,16 @@ class VLLMModel(LLM):
             assert chunk is not None
             yield chunk
 
+        logger.info(
+            "Generate finished, request_id: %s, stop reason: %s, prompt tokens: %s, "
+            "completion tokens: %s, all tokens: %s",
+            request_id,
+            finish_reason,
+            prompt_tokens,
+            completion_tokens,
+            total_tokens,
+        )
+
         # match OpenAI API stream
         yield generate_completion_chunk(
             chunk_text="",
@@ -1055,17 +1143,23 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         return generate_config
 
     @staticmethod
-    def
+    def is_tool_call_chunk_start(chunk):
         return chunk["choices"][0]["text"].startswith(QWEN_TOOL_CALL_SYMBOLS[0])
 
+    @staticmethod
+    def is_tool_call_chunk_end(chunk):
+        return chunk["choices"][0]["text"].endswith(QWEN_TOOL_CALL_SYMBOLS[1])
+
     async def _async_to_tool_completion_chunks(
         self,
         chunks: AsyncGenerator[CompletionChunk, None],
     ) -> AsyncGenerator[ChatCompletionChunk, None]:
         i = 0
         previous_texts = [""]
+        tool_call = False
+        tool_call_texts = [""]
         if self.reasoning_parser:
-            chunks = self.reasoning_parser.
+            chunks = self.reasoning_parser.prepare_reasoning_content_streaming(chunks)
         async for chunk in chunks:
             if i == 0:
                 for first_chunk in self._get_first_chat_completion_chunk(
@@ -1077,13 +1171,22 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
                 if not choices:
                     yield self._get_final_chat_completion_chunk(chunk)
                 else:
-                    if self.
-
-
-
-
-
-                        )
+                    if self.is_tool_call_chunk_start(chunk):
+                        tool_call = True
+                    if tool_call:
+                        tool_call_text = tool_call_texts[-1]
+                        tool_call_text += chunk["choices"][0]["text"]
+                        tool_call_texts.append(tool_call_text)
+                        if self.is_tool_call_chunk_end(chunk):
+                            yield self._post_process_completion_chunk(
+                                self.model_family,
+                                self.model_uid,
+                                chunk,
+                                reasoning_parser=self.reasoning_parser,
+                                tool_call_text=tool_call_text,
+                            )
+                            tool_call = False
+                            tool_call_texts = [""]
                     else:
                         yield self._to_chat_completion_chunk(
                             chunk, self.reasoning_parser, previous_texts
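The streaming change above buffers chunk text between the tool-call start and end markers before post-processing it as one tool call. A rough sketch of that buffering in isolation, with made-up markers standing in for QWEN_TOOL_CALL_SYMBOLS:

```python
START, END = "<tool_call>", "</tool_call>"  # assumed markers, not the real symbols


def collapse_tool_calls(chunks):
    """Pass plain text through; merge marker-delimited spans into one item."""
    buffering, buffer = False, ""
    for text in chunks:
        if text.startswith(START):
            buffering = True
        if buffering:
            buffer += text
            if text.endswith(END):
                yield buffer  # one combined tool-call payload
                buffering, buffer = False, ""
        else:
            yield text  # plain text passes through unchanged


# list(collapse_tool_calls(["hi ", '<tool_call>{"name"', "}</tool_call>"]))
# -> ["hi ", '<tool_call>{"name"}</tool_call>']
```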
@@ -1099,12 +1202,14 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
     ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]:
         tools = generate_config.pop("tools", []) if generate_config else None
         model_family = self.model_family.model_family or self.model_family.model_name
-
+        chat_template_kwargs = (
             self._get_chat_template_kwargs_from_generate_config(
                 generate_config, self.reasoning_parser
             )
             or {}
         )
+        chat_context_var.set(chat_template_kwargs)
+        full_context_kwargs = chat_template_kwargs.copy()
         if tools:
             if (
                 model_family in QWEN_TOOL_CALL_FAMILY
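chat_context_var is a ContextVar, so setting the chat-template kwargs here makes them visible to downstream code in the same context without threading them through every call signature. A small illustration of the mechanism (the enable_thinking key is only an example value):

```python
from contextvars import ContextVar

chat_context_var: ContextVar[dict] = ContextVar("chat_context", default={})


def downstream_renderer() -> bool:
    # Reads whatever the caller stored for this request's context.
    return chat_context_var.get().get("enable_thinking", True)


chat_context_var.set({"enable_thinking": False})
assert downstream_renderer() is False
```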
@@ -1214,20 +1319,23 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
         generate_config: Optional[Dict] = None,
         request_id: Optional[str] = None,
     ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]:
-        messages = self._transform_messages(messages)
         tools = generate_config.pop("tools", []) if generate_config else None
 
         model_family = self.model_family.model_family or self.model_family.model_name
 
-        if "
+        if "internvl" not in model_family.lower():
             from qwen_vl_utils import process_vision_info
 
-
+            messages = self._transform_messages(messages)
+
+        chat_template_kwargs = (
             self._get_chat_template_kwargs_from_generate_config(
                 generate_config, self.reasoning_parser
             )
             or {}
         )
+        chat_context_var.set(chat_template_kwargs)
+        full_context_kwargs = chat_template_kwargs.copy()
         if tools and model_family in QWEN_TOOL_CALL_FAMILY:
             full_context_kwargs["tools"] = tools
         assert self.model_family.chat_template is not None
xinference/model/llm/vllm/distributed_executor.py
CHANGED
@@ -19,11 +19,19 @@ from functools import partial
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
 
 import xoscar as xo
+from vllm import envs
 from vllm.executor.executor_base import DistributedExecutorBase
 from vllm.utils import _run_task_with_lock, get_distributed_init_method
 from vllm.worker.worker_base import WorkerWrapperBase
 from xoscar.utils import get_next_port
 
+try:
+    from vllm.v1.executor.abstract import Executor as ExecutorV1
+except ImportError:
+    ExecutorV1 = None
+
+from ....isolation import Isolation
+
 if TYPE_CHECKING:
     from vllm.config import VllmConfig
     from vllm.model_executor.layers.sampler import SamplerOutput
@@ -31,6 +39,8 @@ if TYPE_CHECKING:
 
 logger = logging.getLogger(__name__)
 
+DEBUG_EXECUTOR = bool(int(os.getenv("XINFERENCE_DEBUG_VLLM_EXECUTOR", "0")))
+
 
 class WorkerActor(xo.StatelessActor):
     def __init__(self, vllm_config: "VllmConfig", rpc_rank: int = 0, **kwargs):
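XINFERENCE_DEBUG_VLLM_EXECUTOR gates very verbose per-call logging in the worker actors. A sketch of the same environment-flag pattern in isolation (call_worker is a hypothetical helper, not a package function):

```python
import logging
import os

logger = logging.getLogger(__name__)
# "0"/"1" string -> int -> bool; an unset variable behaves like disabled
DEBUG_EXECUTOR = bool(int(os.getenv("XINFERENCE_DEBUG_VLLM_EXECUTOR", "0")))


def call_worker(worker, method: str, *args, **kwargs):
    if DEBUG_EXECUTOR:
        logger.debug("Calling %s, args: %s, kwargs: %s", method, args, kwargs)
    return getattr(worker, method)(*args, **kwargs)
```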
@@ -54,14 +64,15 @@ class WorkerActor(xo.StatelessActor):
         return f"VllmWorker_{rank}"
 
     def execute_method(self, method: Union[str, Callable], *args, **kwargs):
-
-
-
-
-
-
-
-
+        if DEBUG_EXECUTOR:
+            # NOTE: too many logs, but useful for debug
+            logger.debug(
+                "Calling method %s in vllm worker %s, args: %s, kwargs: %s",
+                method,
+                self.uid,
+                args,
+                kwargs,
+            )
         if isinstance(method, str):
             return getattr(self._worker, method)(*args, **kwargs)
         else:
@@ -92,7 +103,7 @@ class WorkerWrapper:
 class XinferenceDistributedExecutor(DistributedExecutorBase):
     """Xoscar based distributed executor"""
 
-
+    uses_ray: bool = False
     _loop: asyncio.AbstractEventLoop
     _pool_addresses: List[str]
     _n_worker: int
@@ -112,15 +123,27 @@ class XinferenceDistributedExecutor(DistributedExecutorBase):
         self._is_shutdown = False
         super().__init__(vllm_config, *args, **kwargs)
 
+    def _create_workers(self, refs: xo.ActorRefType[WorkerActor]) -> None:
+        self.driver_worker: Optional[WorkerActor] = None
+        # The remaining workers are Xoscar actors
+        self.workers: List[WorkerWrapper] = []
+
+        self.workers = [WorkerWrapper(self._loop, ref) for ref in refs[1:]]
+
+        # driver worker only for vllm v0
+        self.driver_worker = WorkerActor(self.vllm_config, rpc_rank=0)
+
+        def driver_execute_method(*args, **kwargs):
+            func = partial(self.driver_worker.execute_method, *args, **kwargs)
+            return self._loop.run_in_executor(None, func)
+
+        self.driver_exec_method = driver_execute_method
+
     def _init_executor(self) -> None:
         # Create the parallel GPU workers.
         world_size = self.parallel_config.world_size
         tensor_parallel_size = self.parallel_config.tensor_parallel_size
 
-        self.driver_worker: Optional[WorkerActor] = None
-        # The remaining workers are Xoscar actors
-        self.workers: List[WorkerWrapper] = []
-
         assert (
             self._pool_addresses and len(self._pool_addresses) == world_size
         ), f"Pool addresses(#{len(self._pool_addresses or [])} must be equal to worldsize(#{world_size})"
@@ -135,15 +158,10 @@ class XinferenceDistributedExecutor(DistributedExecutorBase):
                 uid=WorkerActor.gen_uid(rank),
             )
             futures.append(asyncio.run_coroutine_threadsafe(coro, self._loop))
-        refs = [fut.result() for fut in futures]
-        self.workers = [WorkerWrapper(self._loop, ref) for ref in refs[1:]]
-        self.driver_worker = WorkerActor(self.vllm_config, rpc_rank=0)
-
-        def driver_execute_method(*args, **kwargs):
-            func = partial(self.driver_worker.execute_method, *args, **kwargs)
-            return self._loop.run_in_executor(None, func)
+        refs: List[xo.ActorRefType[WorkerActor]] = [fut.result() for fut in futures]
 
-
+        # create workers
+        self._create_workers(refs)
 
         # Set environment variables for the driver and workers.
         all_args_to_update_environment_variables: List[Dict[str, str]] = [
@@ -319,3 +337,60 @@ class XinferenceDistributedExecutor(DistributedExecutorBase):
             for worker in self.non_driver_workers
         ]
         return await asyncio.gather(*coros)
+
+
+if ExecutorV1:
+
+    class XinferenceDistributedExecutorV1(XinferenceDistributedExecutor, ExecutorV1):
+        def __init__(
+            self,
+            vllm_config: "VllmConfig",
+            pool_addresses: List[str],
+            n_worker: int,
+            *args,
+            **kwargs,
+        ):
+            assert envs.VLLM_USE_V1
+
+            isolation = Isolation(asyncio.new_event_loop())
+            isolation.start()
+            loop = isolation.loop
+
+            XinferenceDistributedExecutor.__init__(
+                self, vllm_config, pool_addresses, n_worker, loop, *args, **kwargs
+            )
+
+        def _create_workers(self, refs: xo.ActorRefType[WorkerActor]) -> None:
+            self.workers = [WorkerWrapper(self._loop, ref) for ref in refs]
+
+        def execute_model(
+            self,
+            execute_model_req: "ExecuteModelRequest",
+        ) -> List["SamplerOutput"]:
+            outputs = self._run_workers("execute_model", execute_model_req)
+            return outputs[0]
+
+        def _run_workers(
+            self,
+            method: Union[str, Callable],
+            *args,
+            async_run_tensor_parallel_workers_only: bool = False,
+            max_concurrent_workers: Optional[int] = None,
+            **kwargs,
+        ) -> Any:
+            if max_concurrent_workers:
+                raise NotImplementedError(
+                    "max_concurrent_workers is not supported yet."
+                )
+
+            workers = self.workers
+            if async_run_tensor_parallel_workers_only:
+                workers = self.non_driver_workers
+            worker_outputs = [
+                worker.execute_method(method, *args, **kwargs) for worker in workers
+            ]
+
+            if async_run_tensor_parallel_workers_only:
+                return worker_outputs
+
+            return [output.result() for output in worker_outputs]
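The new V1 executor class is only defined when the installed vllm build ships the v1 Executor base class, which the try/except import at the top of the file makes optional. A stripped-down sketch of that guard, with an illustrative subclass body rather than the package's implementation:

```python
# Sketch of the optional-base-class guard; the subclass body is illustrative only.
try:
    from vllm.v1.executor.abstract import Executor as ExecutorV1
except ImportError:  # older vllm builds without the v1 executor
    ExecutorV1 = None


class MyExecutorBase:
    """Placeholder standing in for the v0 executor."""


if ExecutorV1 is not None:

    class MyExecutorV1(MyExecutorBase, ExecutorV1):  # type: ignore[misc]
        """Only exists when the host vllm provides the v1 Executor."""
```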
xinference/model/llm/vllm/xavier/transfer.py
CHANGED
@@ -31,8 +31,8 @@ logger = logging.getLogger(__name__)
 class BufferTransferMixin:
     def __init__(self):
         self.num_buffer: int = 0
-        self.buffers: List[torch.Tensor] = []
-        self.buffer_queue: Optional[Queue] = None
+        self.buffers: List[torch.Tensor] = []  # type: ignore
+        self.buffer_queue: Optional[Queue] = None  # type: ignore
         self.transfer_block_num = 0
         self.num_attn_layers = 0
 
xinference/model/rerank/core.py
CHANGED
@@ -252,11 +252,13 @@ class RerankModel:
             tokenizer = AutoTokenizer.from_pretrained(
                 self._model_path, padding_side="left"
             )
-
+            enable_flash_attn = self._model_config.get("enable_flash_attn", True)
             model_kwargs = {"device_map": "auto"}
-            if flash_attn_installed:
+            if flash_attn_installed and enable_flash_attn:
                 model_kwargs["attn_implementation"] = "flash_attention_2"
                 model_kwargs["torch_dtype"] = torch.float16
+            model_kwargs.update(self._model_config)
+            logger.debug("Loading qwen3 rerank with kwargs %s", model_kwargs)
             model = self._model = AutoModelForCausalLM.from_pretrained(
                 self._model_path, **model_kwargs
             ).eval()
@@ -368,13 +370,18 @@ class RerankModel:
                 )
                 return output
 
-
-
-
-
-
-
-
+            # reduce memory usage.
+            micro_bs = 4
+            similarity_scores = []
+            for i in range(0, len(documents), micro_bs):
+                sub_docs = documents[i : i + micro_bs]
+                pairs = [
+                    format_instruction(kwargs.get("instruction", None), query, doc)
+                    for doc in sub_docs
+                ]
+                # Tokenize the input texts
+                inputs = self.process_inputs(pairs)
+                similarity_scores.extend(self.compute_logits(inputs))
         else:
             # Related issue: https://github.com/xorbitsai/inference/issues/1775
             similarity_scores = self._model.compute_score(
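The rerank change above scores documents in micro-batches of 4 to cap peak memory. The same chunking pattern in isolation (score_batch is a hypothetical scoring callable, not a package API):

```python
from typing import Callable, List


def score_in_micro_batches(
    documents: List[str],
    score_batch: Callable[[List[str]], List[float]],
    micro_bs: int = 4,
) -> List[float]:
    """Score documents a few at a time to reduce peak memory usage."""
    scores: List[float] = []
    for start in range(0, len(documents), micro_bs):
        scores.extend(score_batch(documents[start : start + micro_bs]))
    return scores


# score_in_micro_batches(["d1", "d2", "d3", "d4", "d5"], lambda b: [0.0] * len(b))
```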
xinference/model/rerank/model_spec.json
CHANGED
@@ -67,7 +67,7 @@
     "model_name": "Qwen3-Reranker-0.6B",
     "type": "normal",
     "language": ["en", "zh"],
-    "max_tokens":
+    "max_tokens": 32768,
     "model_id": "Qwen/Qwen3-Reranker-0.6B",
     "model_revision": "6e9e69830b95c52b5fd889b7690dda3329508de3"
   },
@@ -75,7 +75,7 @@
     "model_name": "Qwen3-Reranker-4B",
     "type": "normal",
     "language": ["en", "zh"],
-    "max_tokens":
+    "max_tokens": 32768,
     "model_id": "Qwen/Qwen3-Reranker-4B",
     "model_revision": "f16fc5d5d2b9b1d0db8280929242745d79794ef5"
   },
@@ -83,7 +83,7 @@
     "model_name": "Qwen3-Reranker-8B",
     "type": "normal",
     "language": ["en", "zh"],
-    "max_tokens":
+    "max_tokens": 32768,
     "model_id": "Qwen/Qwen3-Reranker-8B",
     "model_revision": "5fa94080caafeaa45a15d11f969d7978e087a3db"
   }
xinference/model/rerank/model_spec_modelscope.json
CHANGED
@@ -62,7 +62,7 @@
     "model_name": "Qwen3-Reranker-0.6B",
     "type": "normal",
     "language": ["en", "zh"],
-    "max_tokens":
+    "max_tokens": 32768,
     "model_id": "Qwen/Qwen3-Reranker-0.6B",
     "model_hub": "modelscope"
   },
@@ -70,7 +70,7 @@
     "model_name": "Qwen3-Reranker-4B",
     "type": "normal",
     "language": ["en", "zh"],
-    "max_tokens":
+    "max_tokens": 32768,
     "model_id": "Qwen/Qwen3-Reranker-4B",
     "model_hub": "modelscope"
   },
@@ -78,7 +78,7 @@
     "model_name": "Qwen3-Reranker-8B",
     "type": "normal",
     "language": ["en", "zh"],
-    "max_tokens":
+    "max_tokens": 32768,
     "model_id": "Qwen/Qwen3-Reranker-8B",
     "model_hub": "modelscope"
   }
xinference/web/ui/build/asset-manifest.json
CHANGED
@@ -1,14 +1,14 @@
 {
   "files": {
     "main.css": "./static/css/main.013f296b.css",
-    "main.js": "./static/js/main.8a9e3ba0.js",
+    "main.js": "./static/js/main.9b12b7f9.js",
     "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
     "index.html": "./index.html",
     "main.013f296b.css.map": "./static/css/main.013f296b.css.map",
-    "main.8a9e3ba0.js.map": "./static/js/main.8a9e3ba0.js.map"
+    "main.9b12b7f9.js.map": "./static/js/main.9b12b7f9.js.map"
   },
   "entrypoints": [
     "static/css/main.013f296b.css",
-    "static/js/main.8a9e3ba0.js"
+    "static/js/main.9b12b7f9.js"
   ]
 }
|
|
@@ -1 +1 @@
|
|
|
1
|
-
<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.
|
|
1
|
+
<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.9b12b7f9.js"></script><link href="./static/css/main.013f296b.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
|