xinference 1.7.0.post1__py3-none-any.whl → 1.7.1.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (83)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +3 -4
  3. xinference/client/__init__.py +2 -0
  4. xinference/client/common.py +49 -2
  5. xinference/client/handlers.py +18 -0
  6. xinference/client/restful/async_restful_client.py +1760 -0
  7. xinference/client/restful/restful_client.py +74 -78
  8. xinference/core/media_interface.py +3 -1
  9. xinference/core/model.py +5 -4
  10. xinference/core/supervisor.py +10 -5
  11. xinference/core/worker.py +15 -14
  12. xinference/deploy/local.py +51 -9
  13. xinference/deploy/worker.py +5 -3
  14. xinference/device_utils.py +22 -3
  15. xinference/model/audio/fish_speech.py +23 -34
  16. xinference/model/audio/model_spec.json +4 -2
  17. xinference/model/audio/model_spec_modelscope.json +4 -2
  18. xinference/model/audio/utils.py +2 -2
  19. xinference/model/core.py +1 -0
  20. xinference/model/embedding/__init__.py +8 -8
  21. xinference/model/embedding/custom.py +6 -1
  22. xinference/model/embedding/embed_family.py +0 -41
  23. xinference/model/embedding/model_spec.json +10 -1
  24. xinference/model/embedding/model_spec_modelscope.json +10 -1
  25. xinference/model/embedding/sentence_transformers/core.py +30 -15
  26. xinference/model/flexible/core.py +1 -1
  27. xinference/model/flexible/launchers/__init__.py +2 -0
  28. xinference/model/flexible/launchers/image_process_launcher.py +1 -1
  29. xinference/model/flexible/launchers/modelscope_launcher.py +47 -0
  30. xinference/model/flexible/launchers/transformers_launcher.py +5 -5
  31. xinference/model/flexible/launchers/yolo_launcher.py +62 -0
  32. xinference/model/llm/__init__.py +7 -0
  33. xinference/model/llm/core.py +18 -1
  34. xinference/model/llm/llama_cpp/core.py +1 -1
  35. xinference/model/llm/llm_family.json +41 -1
  36. xinference/model/llm/llm_family.py +6 -0
  37. xinference/model/llm/llm_family_modelscope.json +43 -1
  38. xinference/model/llm/mlx/core.py +271 -18
  39. xinference/model/llm/mlx/distributed_models/__init__.py +13 -0
  40. xinference/model/llm/mlx/distributed_models/core.py +164 -0
  41. xinference/model/llm/mlx/distributed_models/deepseek_v3.py +75 -0
  42. xinference/model/llm/mlx/distributed_models/qwen2.py +82 -0
  43. xinference/model/llm/mlx/distributed_models/qwen3.py +82 -0
  44. xinference/model/llm/mlx/distributed_models/qwen3_moe.py +76 -0
  45. xinference/model/llm/reasoning_parser.py +12 -6
  46. xinference/model/llm/sglang/core.py +8 -4
  47. xinference/model/llm/transformers/chatglm.py +4 -1
  48. xinference/model/llm/transformers/core.py +4 -2
  49. xinference/model/llm/transformers/multimodal/cogagent.py +10 -4
  50. xinference/model/llm/transformers/multimodal/intern_vl.py +1 -1
  51. xinference/model/llm/utils.py +36 -17
  52. xinference/model/llm/vllm/core.py +142 -34
  53. xinference/model/llm/vllm/distributed_executor.py +96 -21
  54. xinference/model/llm/vllm/xavier/transfer.py +2 -2
  55. xinference/model/rerank/core.py +16 -9
  56. xinference/model/rerank/model_spec.json +3 -3
  57. xinference/model/rerank/model_spec_modelscope.json +3 -3
  58. xinference/web/ui/build/asset-manifest.json +3 -3
  59. xinference/web/ui/build/index.html +1 -1
  60. xinference/web/ui/build/static/js/main.9b12b7f9.js +3 -0
  61. xinference/web/ui/build/static/js/main.9b12b7f9.js.map +1 -0
  62. xinference/web/ui/node_modules/.cache/babel-loader/0fd4820d93f99509e80d8702dc3f6f8272424acab5608fa7c0e82cb1d3250a87.json +1 -0
  63. xinference/web/ui/node_modules/.cache/babel-loader/1460361af6975e63576708039f1cb732faf9c672d97c494d4055fc6331460be0.json +1 -0
  64. xinference/web/ui/node_modules/.cache/babel-loader/4efd8dda58fda83ed9546bf2f587df67f8d98e639117bee2d9326a9a1d9bebb2.json +1 -0
  65. xinference/web/ui/node_modules/.cache/babel-loader/5b2dafe5aa9e1105e0244a2b6751807342fa86aa0144b4e84d947a1686102715.json +1 -0
  66. xinference/web/ui/node_modules/.cache/babel-loader/f75545479c17fdfe2a00235fa4a0e9da1ae95e6b3caafba87ded92de6b0240e4.json +1 -0
  67. xinference/web/ui/src/locales/en.json +3 -0
  68. xinference/web/ui/src/locales/ja.json +3 -0
  69. xinference/web/ui/src/locales/ko.json +3 -0
  70. xinference/web/ui/src/locales/zh.json +3 -0
  71. {xinference-1.7.0.post1.dist-info → xinference-1.7.1.post1.dist-info}/METADATA +4 -3
  72. {xinference-1.7.0.post1.dist-info → xinference-1.7.1.post1.dist-info}/RECORD +77 -67
  73. xinference/web/ui/build/static/js/main.8a9e3ba0.js +0 -3
  74. xinference/web/ui/build/static/js/main.8a9e3ba0.js.map +0 -1
  75. xinference/web/ui/node_modules/.cache/babel-loader/26b8c9f34b0bed789b3a833767672e39302d1e0c09b4276f4d58d1df7b6bd93b.json +0 -1
  76. xinference/web/ui/node_modules/.cache/babel-loader/34cfbfb7836e136ba3261cfd411cc554bf99ba24b35dcceebeaa4f008cb3c9dc.json +0 -1
  77. xinference/web/ui/node_modules/.cache/babel-loader/c5c7c2cd1b863ce41adff2c4737bba06eef3a1acf28288cb83d992060f6b8923.json +0 -1
  78. xinference/web/ui/node_modules/.cache/babel-loader/cc97b49285d7717c63374766c789141a4329a04582ab32756d7e0e614d4c5c7f.json +0 -1
  79. /xinference/web/ui/build/static/js/{main.8a9e3ba0.js.LICENSE.txt → main.9b12b7f9.js.LICENSE.txt} +0 -0
  80. {xinference-1.7.0.post1.dist-info → xinference-1.7.1.post1.dist-info}/WHEEL +0 -0
  81. {xinference-1.7.0.post1.dist-info → xinference-1.7.1.post1.dist-info}/entry_points.txt +0 -0
  82. {xinference-1.7.0.post1.dist-info → xinference-1.7.1.post1.dist-info}/licenses/LICENSE +0 -0
  83. {xinference-1.7.0.post1.dist-info → xinference-1.7.1.post1.dist-info}/top_level.txt +0 -0

xinference/model/llm/vllm/core.py
@@ -51,6 +51,7 @@ from ....types import (
     LoRA,
 )
 from .. import LLM, LLMFamilyV1, LLMSpecV1
+from ..core import chat_context_var
 from ..llm_family import CustomLLMFamilyV1, cache_model_tokenizer_and_config
 from ..utils import (
     DEEPSEEK_TOOL_CALL_FAMILY,
@@ -241,6 +242,7 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.7.2":
 
 if VLLM_INSTALLED and vllm.__version__ >= "0.7.3":
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-instruct-1m")
+    VLLM_SUPPORTED_CHAT_MODELS.append("qwenLong-l1")
 
 if VLLM_INSTALLED and vllm.__version__ >= "0.8.0":
     VLLM_SUPPORTED_CHAT_MODELS.append("gemma-3-1b-it")
@@ -332,6 +334,7 @@ class VLLMModel(LLM):
     def load(self):
         try:
             import vllm
+            from vllm import envs
             from vllm.config import VllmConfig
             from vllm.engine.arg_utils import AsyncEngineArgs
             from vllm.engine.async_llm_engine import AsyncLLMEngine
@@ -414,8 +417,6 @@ class VLLMModel(LLM):
             elif self._n_worker > 1 or (
                 self._device_count > 1 and vllm.__version__ >= "0.7.0"
             ):
-                from .distributed_executor import XinferenceDistributedExecutor
-
                 # model across multiple workers or GPUs
                 engine_args = AsyncEngineArgs(
                     model=self.model_path,
@@ -423,6 +424,7 @@ class VLLMModel(LLM):
                     max_loras=max_loras,
                     **self._model_config,
                 )
+                self._enable_v1_if_supported(engine_args)
 
                 assert self._loop is not None
                 self._worker_addresses = {}
@@ -464,21 +466,47 @@ class VLLMModel(LLM):
                 assert worker_addresses
                 loop = self._loop
 
-                class XinferenceAsyncLLMEngine(AsyncLLMEngine):
-                    @classmethod
-                    def _get_executor_cls(
-                        cls, engine_config: VllmConfig
-                    ) -> Type[ExecutorBase]:
-                        return partial(  # type: ignore
-                            XinferenceDistributedExecutor,
-                            pool_addresses=worker_addresses,
-                            n_worker=self._n_worker,
-                            loop=loop,
-                        )
-
-                self._engine = XinferenceAsyncLLMEngine.from_engine_args(
-                    engine_args
-                )
+                if not (envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1):
+                    # vLLM v0
+                    from .distributed_executor import (
+                        XinferenceDistributedExecutor,
+                    )
+
+                    class XinferenceAsyncLLMEngine(AsyncLLMEngine):
+                        @classmethod
+                        def _get_executor_cls(
+                            cls, engine_config: VllmConfig
+                        ) -> Type[ExecutorBase]:
+                            return partial(  # type: ignore
+                                XinferenceDistributedExecutor,
+                                pool_addresses=worker_addresses,
+                                n_worker=self._n_worker,
+                                loop=loop,
+                            )
+
+                    self._engine = XinferenceAsyncLLMEngine.from_engine_args(
+                        engine_args
+                    )
+                else:
+                    from vllm.v1.executor.abstract import Executor
+
+                    from .distributed_executor import (
+                        XinferenceDistributedExecutorV1,
+                    )
+
+                    # vLLM V1
+                    # NOTE: loop has to be None for vLLM v1
+                    # in v1, a new process called EngineCore will be created via fork by default
+                    # in which executor is initialized, we cannot pass loop, or it will be stuck,
+                    # instead, a new loop will be created inside executor
+                    executor_cls = partial(  # type: ignore
+                        XinferenceDistributedExecutorV1,
+                        pool_addresses=worker_addresses,
+                        n_worker=self._n_worker,
+                    )
+                    # patch vllm Executor.get_class
+                    Executor.get_class = lambda vllm_config: executor_cls
+                    self._engine = AsyncLLMEngine.from_engine_args(engine_args)
 
         except:
             logger.exception("Creating vllm engine failed")
             self._loading_error = sys.exc_info()
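
Note on the hunk above: when a model spans multiple workers or GPUs, the executor wired into AsyncLLMEngine now depends on vLLM's VLLM_USE_V1 switch; the v0 path keeps the subclassed engine with _get_executor_cls, while the V1 path patches Executor.get_class and passes no event loop. A standalone sketch of that selection pattern, with placeholder classes and a hypothetical pick_executor_factory helper (not Xinference or vLLM code):

import os
from functools import partial


class FakeExecutorV0:
    """Placeholder for the v0 executor path (accepts a loop)."""

    def __init__(self, pool_addresses, n_worker, loop=None):
        self.mode = "v0"


class FakeExecutorV1:
    """Placeholder for the V1 executor path (no loop; it builds its own)."""

    def __init__(self, pool_addresses, n_worker):
        self.mode = "v1"


def pick_executor_factory(pool_addresses, n_worker, loop):
    # Same decision the diff makes with envs.is_set("VLLM_USE_V1") / envs.VLLM_USE_V1.
    if os.getenv("VLLM_USE_V1") != "1":
        return partial(FakeExecutorV0, pool_addresses, n_worker, loop=loop)
    # For V1 the factory is patched in via Executor.get_class in the real code.
    return partial(FakeExecutorV1, pool_addresses, n_worker)


factory = pick_executor_factory(["addr-0", "addr-1"], n_worker=2, loop=None)
print(factory().mode)  # "v0" unless VLLM_USE_V1=1 is exported

The real code additionally avoids handing a loop to the V1 executor because the V1 engine constructs it inside the forked EngineCore process.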
@@ -495,6 +523,7 @@ class VLLMModel(LLM):
                     max_loras=max_loras,
                     **self._model_config,
                 )
+                self._enable_v1_if_supported(engine_args)
                 self._engine = AsyncLLMEngine.from_engine_args(engine_args)
 
             self._check_health_task = None
@@ -509,6 +538,46 @@ class VLLMModel(LLM):
             _, err, tb = self._loading_error
             raise err.with_traceback(tb)
 
+    def _enable_v1_if_supported(self, engine_args: "vllm.AsyncEngineArgs"):
+        from vllm import __version__ as vllm_version
+
+        if os.getenv("VLLM_USE_V1") is not None:
+            logger.debug(
+                "Setting vLLM v1 via environment variable already, skip checking"
+            )
+            return
+
+        try:
+            supported_func = engine_args._is_v1_supported_oracle
+        except AttributeError:
+            logger.debug(
+                "Cannot get `EngineArgs._is_v1_supported_oracle` "
+                "to decide enabling vLLM v1, perhaps vllm version is too old, "
+                "version: %s",
+                vllm_version,
+            )
+            return
+
+        model_config = engine_args.create_model_config()
+        old_main_thread = threading.main_thread()
+        try:
+            # HACK: patch main thread to let vllm pass check
+            # vllm do some signal handling when on main thread
+            # but they will skip registering signal if not on main thread,
+            # however, the _is_v1_supported_oracle will return False
+            # when not on main thread, we patched the main thread temporially,
+            # It's OK because Xinference will take care of all processes
+            threading.main_thread = lambda: threading.current_thread()
+
+            if supported_func(model_config):
+                logger.debug("Setting vLLM v1 by checking model config")
+                os.environ["VLLM_USE_V1"] = "1"
+            else:
+                logger.debug("Use vLLM v0 due to not supported config")
+        finally:
+            # patch back
+            threading.main_thread = lambda: old_main_thread
+
     def _preprocess_load_gguf(self):
         # check if it is multi gguf files
         if (
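
The _is_v1_supported_oracle call above only reports a usable result from the main thread, while Xinference loads models off the main thread, hence the temporary threading.main_thread monkey patch. A stdlib-only toy (the check function is a stand-in, not the vLLM oracle) showing why the patch flips the outcome when run from a worker thread:

import threading


def passes_main_thread_check() -> bool:
    # Stand-in for a check that only succeeds on the main thread,
    # similar in spirit to what the vLLM oracle does internally.
    return threading.current_thread() is threading.main_thread()


def run_from_worker(results):
    results.append(("unpatched", passes_main_thread_check()))
    original = threading.main_thread
    try:
        # Temporarily pretend the current thread is the main thread.
        threading.main_thread = lambda: threading.current_thread()
        results.append(("patched", passes_main_thread_check()))
    finally:
        threading.main_thread = original  # always restore


results = []
worker = threading.Thread(target=run_from_worker, args=(results,))
worker.start()
worker.join()
print(results)  # [('unpatched', False), ('patched', True)]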
@@ -549,6 +618,8 @@ class VLLMModel(LLM):
         )
 
     def stop(self):
+        from vllm import envs
+
         # though the vLLM engine will shutdown when deleted,
         # but some issue e.g. GH#1682 reported
         # when deleting, the engine exists still
@@ -556,9 +627,17 @@ class VLLMModel(LLM):
         if self._check_health_task:
             self._check_health_task.cancel()
         if self._engine:
-            if model_executor := getattr(self._engine.engine, "model_executor", None):
-                model_executor.shutdown()
-            self._engine = None
+            if not (envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1):
+                # v0
+                if model_executor := getattr(
+                    self._engine.engine, "model_executor", None
+                ):
+                    model_executor.shutdown()
+                self._engine = None
+            else:
+                # v1
+                self._engine.shutdown()
+                self._engine = None
 
     async def init_xavier(self):
         await self._engine.init_xavier()
@@ -602,7 +681,6 @@ class VLLMModel(LLM):
         else:
             model_config.setdefault("quantization", None)
            model_config.setdefault("max_model_len", None)
-            model_config.setdefault("guided_decoding_backend", "outlines")
             model_config.setdefault("reasoning_content", False)
         # Add scheduling policy if vLLM version is 0.6.3 or higher
         if vllm.__version__ >= "0.6.3":
@@ -960,6 +1038,16 @@ class VLLMModel(LLM):
                 assert chunk is not None
                 yield chunk
 
+            logger.info(
+                "Generate finished, request_id: %s, stop reason: %s, prompt tokens: %s, "
+                "completion tokens: %s, all tokens: %s",
+                request_id,
+                finish_reason,
+                prompt_tokens,
+                completion_tokens,
+                total_tokens,
+            )
+
             # match OpenAI API stream
             yield generate_completion_chunk(
                 chunk_text="",
@@ -1055,17 +1143,23 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         return generate_config
 
     @staticmethod
-    def is_tool_call_chunk(chunk):
+    def is_tool_call_chunk_start(chunk):
         return chunk["choices"][0]["text"].startswith(QWEN_TOOL_CALL_SYMBOLS[0])
 
+    @staticmethod
+    def is_tool_call_chunk_end(chunk):
+        return chunk["choices"][0]["text"].endswith(QWEN_TOOL_CALL_SYMBOLS[1])
+
     async def _async_to_tool_completion_chunks(
         self,
         chunks: AsyncGenerator[CompletionChunk, None],
     ) -> AsyncGenerator[ChatCompletionChunk, None]:
         i = 0
         previous_texts = [""]
+        tool_call = False
+        tool_call_texts = [""]
         if self.reasoning_parser:
-            chunks = self.reasoning_parser.prepare_reasoning_content(chunks)
+            chunks = self.reasoning_parser.prepare_reasoning_content_streaming(chunks)
         async for chunk in chunks:
             if i == 0:
                 for first_chunk in self._get_first_chat_completion_chunk(
@@ -1077,13 +1171,22 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
                 if not choices:
                     yield self._get_final_chat_completion_chunk(chunk)
                 else:
-                    if self.is_tool_call_chunk(chunk):
-                        yield self._post_process_completion_chunk(
-                            self.model_family,
-                            self.model_uid,
-                            chunk,
-                            reasoning_parser=self.reasoning_parser,
-                        )
+                    if self.is_tool_call_chunk_start(chunk):
+                        tool_call = True
+                    if tool_call:
+                        tool_call_text = tool_call_texts[-1]
+                        tool_call_text += chunk["choices"][0]["text"]
+                        tool_call_texts.append(tool_call_text)
+                        if self.is_tool_call_chunk_end(chunk):
+                            yield self._post_process_completion_chunk(
+                                self.model_family,
+                                self.model_uid,
+                                chunk,
+                                reasoning_parser=self.reasoning_parser,
+                                tool_call_text=tool_call_text,
+                            )
+                            tool_call = False
+                            tool_call_texts = [""]
                     else:
                         yield self._to_chat_completion_chunk(
                             chunk, self.reasoning_parser, previous_texts
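
The streaming change above stops treating the first chunk that starts with the tool-call marker as a complete call; instead, every chunk from the start marker through the end marker is accumulated into tool_call_text and post-processed once. A self-contained asyncio sketch of that buffering loop, with toy markers and chunk shapes standing in for QWEN_TOOL_CALL_SYMBOLS and vLLM completion chunks:

import asyncio

# Toy markers; QWEN_TOOL_CALL_SYMBOLS plays this role in the real code.
START, END = "<tool_call>", "</tool_call>"


async def chunks():
    for text in ["Hello ", START + '{"name":', ' "search"}', END, " done"]:
        yield {"choices": [{"text": text}]}


async def collect_tool_calls(stream):
    buffering, buf = False, ""
    async for chunk in stream:
        text = chunk["choices"][0]["text"]
        if text.startswith(START):
            buffering = True
        if buffering:
            buf += text
            if text.endswith(END):
                yield ("tool_call", buf)  # emit one complete, buffered tool call
                buffering, buf = False, ""
        else:
            yield ("text", text)


async def main():
    async for kind, payload in collect_tool_calls(chunks()):
        print(kind, payload)


asyncio.run(main())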
@@ -1099,12 +1202,14 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
     ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]:
         tools = generate_config.pop("tools", []) if generate_config else None
         model_family = self.model_family.model_family or self.model_family.model_name
-        full_context_kwargs = (
+        chat_template_kwargs = (
             self._get_chat_template_kwargs_from_generate_config(
                 generate_config, self.reasoning_parser
             )
             or {}
         )
+        chat_context_var.set(chat_template_kwargs)
+        full_context_kwargs = chat_template_kwargs.copy()
         if tools:
             if (
                 model_family in QWEN_TOOL_CALL_FAMILY
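
chat_context_var, imported from ..core in the first hunk, is a context variable: setting it to the chat-template kwargs makes them visible to downstream template helpers without adding a parameter to every call. A minimal illustration with a locally defined ContextVar (the enable_thinking key is just an example value):

import contextvars

# Stand-in for the chat_context_var imported from ..core in the diff.
chat_context_var: contextvars.ContextVar[dict] = contextvars.ContextVar(
    "chat_context", default={}
)


def render_prompt() -> str:
    # Deep inside template rendering, the kwargs can be recovered
    # without being threaded through every intermediate call.
    kwargs = chat_context_var.get()
    return f"enable_thinking={kwargs.get('enable_thinking', True)}"


chat_template_kwargs = {"enable_thinking": False}
chat_context_var.set(chat_template_kwargs)
print(render_prompt())  # enable_thinking=False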
@@ -1214,20 +1319,23 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
         generate_config: Optional[Dict] = None,
         request_id: Optional[str] = None,
     ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]:
-        messages = self._transform_messages(messages)
         tools = generate_config.pop("tools", []) if generate_config else None
 
         model_family = self.model_family.model_family or self.model_family.model_name
 
-        if "internvl2" not in model_family.lower():
+        if "internvl" not in model_family.lower():
             from qwen_vl_utils import process_vision_info
 
-            full_context_kwargs = (
+            messages = self._transform_messages(messages)
+
+            chat_template_kwargs = (
                 self._get_chat_template_kwargs_from_generate_config(
                     generate_config, self.reasoning_parser
                 )
                 or {}
             )
+            chat_context_var.set(chat_template_kwargs)
+            full_context_kwargs = chat_template_kwargs.copy()
             if tools and model_family in QWEN_TOOL_CALL_FAMILY:
                 full_context_kwargs["tools"] = tools
             assert self.model_family.chat_template is not None

xinference/model/llm/vllm/distributed_executor.py
@@ -19,11 +19,19 @@ from functools import partial
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
 
 import xoscar as xo
+from vllm import envs
 from vllm.executor.executor_base import DistributedExecutorBase
 from vllm.utils import _run_task_with_lock, get_distributed_init_method
 from vllm.worker.worker_base import WorkerWrapperBase
 from xoscar.utils import get_next_port
 
+try:
+    from vllm.v1.executor.abstract import Executor as ExecutorV1
+except ImportError:
+    ExecutorV1 = None
+
+from ....isolation import Isolation
+
 if TYPE_CHECKING:
     from vllm.config import VllmConfig
     from vllm.model_executor.layers.sampler import SamplerOutput
@@ -31,6 +39,8 @@ if TYPE_CHECKING:
 
 logger = logging.getLogger(__name__)
 
+DEBUG_EXECUTOR = bool(int(os.getenv("XINFERENCE_DEBUG_VLLM_EXECUTOR", "0")))
+
 
 class WorkerActor(xo.StatelessActor):
     def __init__(self, vllm_config: "VllmConfig", rpc_rank: int = 0, **kwargs):
@@ -54,14 +64,15 @@ class WorkerActor(xo.StatelessActor):
         return f"VllmWorker_{rank}"
 
     def execute_method(self, method: Union[str, Callable], *args, **kwargs):
-        # NOTE: too many logs, but useful for debug
-        # logger.debug(
-        #     "Calling method %s in vllm worker %s, args: %s, kwargs: %s",
-        #     method,
-        #     self.uid,
-        #     args,
-        #     kwargs,
-        # )
+        if DEBUG_EXECUTOR:
+            # NOTE: too many logs, but useful for debug
+            logger.debug(
+                "Calling method %s in vllm worker %s, args: %s, kwargs: %s",
+                method,
+                self.uid,
+                args,
+                kwargs,
+            )
         if isinstance(method, str):
             return getattr(self._worker, method)(*args, **kwargs)
         else:
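
The per-call worker logging that used to be commented out is now gated behind the XINFERENCE_DEBUG_VLLM_EXECUTOR environment variable, parsed once at import time. A small standalone sketch of the same parse-and-gate pattern (demo logger and method name, not the module itself):

import logging
import os

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("executor-debug-demo")

# Same parsing as the diff: unset or "0" -> False, "1" -> True.
DEBUG_EXECUTOR = bool(int(os.getenv("XINFERENCE_DEBUG_VLLM_EXECUTOR", "0")))


def execute_method(method: str, *args, **kwargs):
    if DEBUG_EXECUTOR:
        logger.debug("Calling method %s, args: %s, kwargs: %s", method, args, kwargs)
    return f"ran {method}"


print(execute_method("determine_num_available_blocks"))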
@@ -92,7 +103,7 @@ class WorkerWrapper:
 class XinferenceDistributedExecutor(DistributedExecutorBase):
     """Xoscar based distributed executor"""
 
-    use_ray: bool = False
+    uses_ray: bool = False
     _loop: asyncio.AbstractEventLoop
     _pool_addresses: List[str]
     _n_worker: int
@@ -112,15 +123,27 @@ class XinferenceDistributedExecutor(DistributedExecutorBase):
         self._is_shutdown = False
         super().__init__(vllm_config, *args, **kwargs)
 
+    def _create_workers(self, refs: xo.ActorRefType[WorkerActor]) -> None:
+        self.driver_worker: Optional[WorkerActor] = None
+        # The remaining workers are Xoscar actors
+        self.workers: List[WorkerWrapper] = []
+
+        self.workers = [WorkerWrapper(self._loop, ref) for ref in refs[1:]]
+
+        # driver worker only for vllm v0
+        self.driver_worker = WorkerActor(self.vllm_config, rpc_rank=0)
+
+        def driver_execute_method(*args, **kwargs):
+            func = partial(self.driver_worker.execute_method, *args, **kwargs)
+            return self._loop.run_in_executor(None, func)
+
+        self.driver_exec_method = driver_execute_method
+
     def _init_executor(self) -> None:
         # Create the parallel GPU workers.
         world_size = self.parallel_config.world_size
         tensor_parallel_size = self.parallel_config.tensor_parallel_size
 
-        self.driver_worker: Optional[WorkerActor] = None
-        # The remaining workers are Xoscar actors
-        self.workers: List[WorkerWrapper] = []
-
         assert (
             self._pool_addresses and len(self._pool_addresses) == world_size
         ), f"Pool addresses(#{len(self._pool_addresses or [])} must be equal to worldsize(#{world_size})"
@@ -135,15 +158,10 @@ class XinferenceDistributedExecutor(DistributedExecutorBase):
                 uid=WorkerActor.gen_uid(rank),
             )
             futures.append(asyncio.run_coroutine_threadsafe(coro, self._loop))
-        refs = [fut.result() for fut in futures]
-        self.workers = [WorkerWrapper(self._loop, ref) for ref in refs[1:]]
-        self.driver_worker = WorkerActor(self.vllm_config, rpc_rank=0)
-
-        def driver_execute_method(*args, **kwargs):
-            func = partial(self.driver_worker.execute_method, *args, **kwargs)
-            return self._loop.run_in_executor(None, func)
+        refs: List[xo.ActorRefType[WorkerActor]] = [fut.result() for fut in futures]
 
-        self.driver_exec_method = driver_execute_method
+        # create workers
+        self._create_workers(refs)
 
         # Set environment variables for the driver and workers.
         all_args_to_update_environment_variables: List[Dict[str, str]] = [
@@ -319,3 +337,60 @@
             for worker in self.non_driver_workers
         ]
         return await asyncio.gather(*coros)
+
+
+if ExecutorV1:
+
+    class XinferenceDistributedExecutorV1(XinferenceDistributedExecutor, ExecutorV1):
+        def __init__(
+            self,
+            vllm_config: "VllmConfig",
+            pool_addresses: List[str],
+            n_worker: int,
+            *args,
+            **kwargs,
+        ):
+            assert envs.VLLM_USE_V1
+
+            isolation = Isolation(asyncio.new_event_loop())
+            isolation.start()
+            loop = isolation.loop
+
+            XinferenceDistributedExecutor.__init__(
+                self, vllm_config, pool_addresses, n_worker, loop, *args, **kwargs
+            )
+
+        def _create_workers(self, refs: xo.ActorRefType[WorkerActor]) -> None:
+            self.workers = [WorkerWrapper(self._loop, ref) for ref in refs]
+
+        def execute_model(
+            self,
+            execute_model_req: "ExecuteModelRequest",
+        ) -> List["SamplerOutput"]:
+            outputs = self._run_workers("execute_model", execute_model_req)
+            return outputs[0]
+
+        def _run_workers(
+            self,
+            method: Union[str, Callable],
+            *args,
+            async_run_tensor_parallel_workers_only: bool = False,
+            max_concurrent_workers: Optional[int] = None,
+            **kwargs,
+        ) -> Any:
+            if max_concurrent_workers:
+                raise NotImplementedError(
+                    "max_concurrent_workers is not supported yet."
+                )
+
+            workers = self.workers
+            if async_run_tensor_parallel_workers_only:
+                workers = self.non_driver_workers
+            worker_outputs = [
+                worker.execute_method(method, *args, **kwargs) for worker in workers
+            ]
+
+            if async_run_tensor_parallel_workers_only:
+                return worker_outputs
+
+            return [output.result() for output in worker_outputs]
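
XinferenceDistributedExecutorV1 is only defined when vllm.v1's Executor imports successfully, so the module still loads on older vLLM releases that lack the V1 engine. The guard pattern reduced to a runnable stub, with a hypothetical optional module name in place of vllm.v1:

# Optional-base-class pattern: define the V1 subclass only if the base exists.
try:
    from some_optional_backend import ExecutorV1  # hypothetical optional import
except ImportError:
    ExecutorV1 = None


class BaseExecutor:
    def run(self):
        return "v0 path"


if ExecutorV1:

    class ExecutorV1Impl(BaseExecutor, ExecutorV1):
        def run(self):
            return "v1 path"


# Callers branch on availability; the V1 class name is never referenced otherwise.
executor_cls = ExecutorV1Impl if ExecutorV1 else BaseExecutor
print(executor_cls().run())  # "v0 path" when the optional backend is absent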

xinference/model/llm/vllm/xavier/transfer.py
@@ -31,8 +31,8 @@ logger = logging.getLogger(__name__)
 class BufferTransferMixin:
     def __init__(self):
         self.num_buffer: int = 0
-        self.buffers: List[torch.Tensor] = []
-        self.buffer_queue: Optional[Queue] = None
+        self.buffers: List[torch.Tensor] = []  # type: ignore
+        self.buffer_queue: Optional[Queue] = None  # type: ignore
         self.transfer_block_num = 0
         self.num_attn_layers = 0
 

xinference/model/rerank/core.py
@@ -252,11 +252,13 @@ class RerankModel:
             tokenizer = AutoTokenizer.from_pretrained(
                 self._model_path, padding_side="left"
             )
-            flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
+            enable_flash_attn = self._model_config.get("enable_flash_attn", True)
             model_kwargs = {"device_map": "auto"}
-            if flash_attn_installed:
+            if flash_attn_installed and enable_flash_attn:
                 model_kwargs["attn_implementation"] = "flash_attention_2"
                 model_kwargs["torch_dtype"] = torch.float16
+            model_kwargs.update(self._model_config)
+            logger.debug("Loading qwen3 rerank with kwargs %s", model_kwargs)
             model = self._model = AutoModelForCausalLM.from_pretrained(
                 self._model_path, **model_kwargs
             ).eval()
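
With this hunk, flash attention is applied to the Qwen3 reranker only when the flash_attn package is importable and the caller has not set enable_flash_attn to false in the model config, and the model config is then merged into the kwargs passed to from_pretrained. A sketch of that kwargs assembly under those assumptions (stub config dict, string dtype instead of torch.float16, availability computed locally):

import importlib.util


def build_model_kwargs(model_config: dict) -> dict:
    flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
    enable_flash_attn = model_config.get("enable_flash_attn", True)

    model_kwargs = {"device_map": "auto"}
    if flash_attn_installed and enable_flash_attn:
        model_kwargs["attn_implementation"] = "flash_attention_2"
        model_kwargs["torch_dtype"] = "float16"  # torch.float16 in the real code
    model_kwargs.update(model_config)  # user-supplied config wins, mirroring the diff
    return model_kwargs


print(build_model_kwargs({"enable_flash_attn": False}))
# {'device_map': 'auto', 'enable_flash_attn': False}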
@@ -368,13 +370,18 @@ class RerankModel:
                )
                return output
 
-            pairs = [
-                format_instruction(kwargs.get("instruction", None), query, doc)
-                for doc in documents
-            ]
-            # Tokenize the input texts
-            inputs = self.process_inputs(pairs)
-            similarity_scores = self.compute_logits(inputs)
+            # reduce memory usage.
+            micro_bs = 4
+            similarity_scores = []
+            for i in range(0, len(documents), micro_bs):
+                sub_docs = documents[i : i + micro_bs]
+                pairs = [
+                    format_instruction(kwargs.get("instruction", None), query, doc)
+                    for doc in sub_docs
+                ]
+                # Tokenize the input texts
+                inputs = self.process_inputs(pairs)
+                similarity_scores.extend(self.compute_logits(inputs))
         else:
             # Related issue: https://github.com/xorbitsai/inference/issues/1775
             similarity_scores = self._model.compute_score(
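
The scoring path above now tokenizes and scores documents in micro-batches of four instead of building one large batch, which caps peak GPU memory at the cost of a few extra forward passes. The batching skeleton on its own, with a dummy scorer standing in for process_inputs and compute_logits:

def score_in_micro_batches(query, documents, score_batch, micro_bs=4):
    """Score (query, doc) pairs a few documents at a time to cap peak memory."""
    scores = []
    for i in range(0, len(documents), micro_bs):
        sub_docs = documents[i : i + micro_bs]
        pairs = [(query, doc) for doc in sub_docs]
        scores.extend(score_batch(pairs))
    return scores


# Dummy scorer standing in for tokenization plus the model forward pass.
dummy_score_batch = lambda pairs: [float(len(doc)) for _, doc in pairs]

docs = [f"document {i}" for i in range(10)]
print(score_in_micro_batches("what is xinference?", docs, dummy_score_batch))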

xinference/model/rerank/model_spec.json
@@ -67,7 +67,7 @@
     "model_name": "Qwen3-Reranker-0.6B",
     "type": "normal",
     "language": ["en", "zh"],
-    "max_tokens": 40960,
+    "max_tokens": 32768,
     "model_id": "Qwen/Qwen3-Reranker-0.6B",
     "model_revision": "6e9e69830b95c52b5fd889b7690dda3329508de3"
   },
@@ -75,7 +75,7 @@
     "model_name": "Qwen3-Reranker-4B",
     "type": "normal",
     "language": ["en", "zh"],
-    "max_tokens": 40960,
+    "max_tokens": 32768,
     "model_id": "Qwen/Qwen3-Reranker-4B",
     "model_revision": "f16fc5d5d2b9b1d0db8280929242745d79794ef5"
   },
@@ -83,7 +83,7 @@
     "model_name": "Qwen3-Reranker-8B",
     "type": "normal",
     "language": ["en", "zh"],
-    "max_tokens": 40960,
+    "max_tokens": 32768,
     "model_id": "Qwen/Qwen3-Reranker-8B",
     "model_revision": "5fa94080caafeaa45a15d11f969d7978e087a3db"
   }

xinference/model/rerank/model_spec_modelscope.json
@@ -62,7 +62,7 @@
     "model_name": "Qwen3-Reranker-0.6B",
     "type": "normal",
     "language": ["en", "zh"],
-    "max_tokens": 40960,
+    "max_tokens": 32768,
     "model_id": "Qwen/Qwen3-Reranker-0.6B",
     "model_hub": "modelscope"
   },
@@ -70,7 +70,7 @@
     "model_name": "Qwen3-Reranker-4B",
     "type": "normal",
     "language": ["en", "zh"],
-    "max_tokens": 40960,
+    "max_tokens": 32768,
     "model_id": "Qwen/Qwen3-Reranker-4B",
     "model_hub": "modelscope"
   },
@@ -78,7 +78,7 @@
     "model_name": "Qwen3-Reranker-8B",
     "type": "normal",
     "language": ["en", "zh"],
-    "max_tokens": 40960,
+    "max_tokens": 32768,
     "model_id": "Qwen/Qwen3-Reranker-8B",
     "model_hub": "modelscope"
   }

xinference/web/ui/build/asset-manifest.json
@@ -1,14 +1,14 @@
 {
   "files": {
     "main.css": "./static/css/main.013f296b.css",
-    "main.js": "./static/js/main.8a9e3ba0.js",
+    "main.js": "./static/js/main.9b12b7f9.js",
     "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
     "index.html": "./index.html",
     "main.013f296b.css.map": "./static/css/main.013f296b.css.map",
-    "main.8a9e3ba0.js.map": "./static/js/main.8a9e3ba0.js.map"
+    "main.9b12b7f9.js.map": "./static/js/main.9b12b7f9.js.map"
   },
   "entrypoints": [
     "static/css/main.013f296b.css",
-    "static/js/main.8a9e3ba0.js"
+    "static/js/main.9b12b7f9.js"
   ]
 }

xinference/web/ui/build/index.html
@@ -1 +1 @@
-<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.8a9e3ba0.js"></script><link href="./static/css/main.013f296b.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
+<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.9b12b7f9.js"></script><link href="./static/css/main.013f296b.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>