xinference 1.6.0.post1__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (124)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +79 -2
  3. xinference/client/restful/restful_client.py +65 -3
  4. xinference/conftest.py +0 -7
  5. xinference/core/media_interface.py +132 -8
  6. xinference/core/model.py +44 -6
  7. xinference/core/scheduler.py +1 -10
  8. xinference/core/supervisor.py +8 -17
  9. xinference/core/worker.py +5 -27
  10. xinference/deploy/cmdline.py +6 -2
  11. xinference/model/audio/chattts.py +24 -39
  12. xinference/model/audio/cosyvoice.py +18 -30
  13. xinference/model/audio/funasr.py +42 -0
  14. xinference/model/audio/model_spec.json +71 -1
  15. xinference/model/audio/model_spec_modelscope.json +76 -2
  16. xinference/model/audio/utils.py +75 -0
  17. xinference/model/core.py +1 -0
  18. xinference/model/embedding/__init__.py +74 -18
  19. xinference/model/embedding/core.py +98 -589
  20. xinference/model/embedding/embed_family.py +133 -0
  21. xinference/{thirdparty/omnilmm/train → model/embedding/flag}/__init__.py +1 -1
  22. xinference/model/embedding/flag/core.py +282 -0
  23. xinference/model/embedding/model_spec.json +24 -0
  24. xinference/model/embedding/model_spec_modelscope.json +24 -0
  25. xinference/model/embedding/sentence_transformers/__init__.py +13 -0
  26. xinference/model/embedding/sentence_transformers/core.py +399 -0
  27. xinference/model/embedding/vllm/core.py +95 -0
  28. xinference/model/image/model_spec.json +30 -3
  29. xinference/model/image/model_spec_modelscope.json +41 -2
  30. xinference/model/image/stable_diffusion/core.py +144 -53
  31. xinference/model/llm/__init__.py +6 -54
  32. xinference/model/llm/core.py +19 -5
  33. xinference/model/llm/llama_cpp/core.py +59 -3
  34. xinference/model/llm/llama_cpp/memory.py +457 -0
  35. xinference/model/llm/llm_family.json +247 -402
  36. xinference/model/llm/llm_family.py +88 -16
  37. xinference/model/llm/llm_family_modelscope.json +260 -421
  38. xinference/model/llm/llm_family_openmind_hub.json +0 -34
  39. xinference/model/llm/sglang/core.py +8 -0
  40. xinference/model/llm/transformers/__init__.py +27 -6
  41. xinference/model/llm/transformers/chatglm.py +4 -2
  42. xinference/model/llm/transformers/core.py +49 -28
  43. xinference/model/llm/transformers/deepseek_v2.py +6 -49
  44. xinference/model/llm/transformers/gemma3.py +119 -164
  45. xinference/model/llm/transformers/multimodal/__init__.py +13 -0
  46. xinference/model/llm/transformers/{cogagent.py → multimodal/cogagent.py} +58 -95
  47. xinference/model/llm/transformers/multimodal/core.py +205 -0
  48. xinference/model/llm/transformers/{deepseek_vl2.py → multimodal/deepseek_vl2.py} +59 -120
  49. xinference/model/llm/transformers/multimodal/gemma3.py +117 -0
  50. xinference/model/llm/transformers/{glm4v.py → multimodal/glm4v.py} +57 -93
  51. xinference/model/llm/transformers/multimodal/intern_vl.py +412 -0
  52. xinference/model/llm/transformers/{minicpmv26.py → multimodal/minicpmv26.py} +55 -102
  53. xinference/model/llm/transformers/{ovis2.py → multimodal/ovis2.py} +114 -175
  54. xinference/model/llm/transformers/{qwen-omni.py → multimodal/qwen-omni.py} +82 -167
  55. xinference/model/llm/transformers/multimodal/qwen2_audio.py +131 -0
  56. xinference/model/llm/transformers/{qwen2_vl.py → multimodal/qwen2_vl.py} +224 -256
  57. xinference/model/llm/transformers/opt.py +4 -2
  58. xinference/model/llm/transformers/utils.py +6 -37
  59. xinference/model/llm/utils.py +11 -0
  60. xinference/model/llm/vllm/core.py +7 -0
  61. xinference/model/rerank/core.py +91 -3
  62. xinference/model/rerank/model_spec.json +24 -0
  63. xinference/model/rerank/model_spec_modelscope.json +24 -0
  64. xinference/model/rerank/utils.py +20 -2
  65. xinference/model/utils.py +38 -1
  66. xinference/model/video/diffusers.py +65 -3
  67. xinference/model/video/model_spec.json +31 -4
  68. xinference/model/video/model_spec_modelscope.json +32 -4
  69. xinference/web/ui/build/asset-manifest.json +6 -6
  70. xinference/web/ui/build/index.html +1 -1
  71. xinference/web/ui/build/static/css/main.013f296b.css +2 -0
  72. xinference/web/ui/build/static/css/main.013f296b.css.map +1 -0
  73. xinference/web/ui/build/static/js/main.8a9e3ba0.js +3 -0
  74. xinference/web/ui/build/static/js/main.8a9e3ba0.js.map +1 -0
  75. xinference/web/ui/node_modules/.cache/babel-loader/34cfbfb7836e136ba3261cfd411cc554bf99ba24b35dcceebeaa4f008cb3c9dc.json +1 -0
  76. xinference/web/ui/node_modules/.cache/babel-loader/55b9fb40b57fa926e8f05f31c2f96467e76e5ad62f033dca97c03f9e8c4eb4fe.json +1 -0
  77. xinference/web/ui/node_modules/.cache/babel-loader/567e49df411efb24425d289bb484758cb57067ca54f8b5c67fe4505f698deb96.json +1 -0
  78. xinference/web/ui/node_modules/.cache/babel-loader/6595880facebca7ceace6f17cf21c3a5a9219a2f52fb0ba9f3cf1131eddbcf6b.json +1 -0
  79. xinference/web/ui/node_modules/.cache/babel-loader/aa998bc2d9c11853add6b8a2e08f50327f56d8824ccaaec92d6dde1b305f0d85.json +1 -0
  80. xinference/web/ui/node_modules/.cache/babel-loader/c748246b1d7bcebc16153be69f37e955bb2145526c47dd425aeeff70d3004dbc.json +1 -0
  81. xinference/web/ui/node_modules/.cache/babel-loader/e31234e95d60a5a7883fbcd70de2475dc1c88c90705df1a530abb68f86f80a51.json +1 -0
  82. xinference/web/ui/src/locales/en.json +21 -8
  83. xinference/web/ui/src/locales/ja.json +224 -0
  84. xinference/web/ui/src/locales/ko.json +224 -0
  85. xinference/web/ui/src/locales/zh.json +21 -8
  86. {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/METADATA +14 -11
  87. {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/RECORD +93 -100
  88. {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/WHEEL +1 -1
  89. xinference/model/llm/transformers/cogvlm2.py +0 -442
  90. xinference/model/llm/transformers/cogvlm2_video.py +0 -333
  91. xinference/model/llm/transformers/deepseek_vl.py +0 -280
  92. xinference/model/llm/transformers/glm_edge_v.py +0 -213
  93. xinference/model/llm/transformers/intern_vl.py +0 -526
  94. xinference/model/llm/transformers/internlm2.py +0 -94
  95. xinference/model/llm/transformers/minicpmv25.py +0 -193
  96. xinference/model/llm/transformers/omnilmm.py +0 -132
  97. xinference/model/llm/transformers/qwen2_audio.py +0 -179
  98. xinference/model/llm/transformers/qwen_vl.py +0 -360
  99. xinference/thirdparty/omnilmm/LICENSE +0 -201
  100. xinference/thirdparty/omnilmm/chat.py +0 -218
  101. xinference/thirdparty/omnilmm/constants.py +0 -4
  102. xinference/thirdparty/omnilmm/conversation.py +0 -332
  103. xinference/thirdparty/omnilmm/model/__init__.py +0 -1
  104. xinference/thirdparty/omnilmm/model/omnilmm.py +0 -595
  105. xinference/thirdparty/omnilmm/model/resampler.py +0 -166
  106. xinference/thirdparty/omnilmm/model/utils.py +0 -578
  107. xinference/thirdparty/omnilmm/train/train_utils.py +0 -150
  108. xinference/thirdparty/omnilmm/utils.py +0 -134
  109. xinference/web/ui/build/static/css/main.337afe76.css +0 -2
  110. xinference/web/ui/build/static/css/main.337afe76.css.map +0 -1
  111. xinference/web/ui/build/static/js/main.ae579a97.js +0 -3
  112. xinference/web/ui/build/static/js/main.ae579a97.js.map +0 -1
  113. xinference/web/ui/node_modules/.cache/babel-loader/12e02ee790dbf57ead09a241a93bb5f893393aa36628ca741d44390e836a103f.json +0 -1
  114. xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +0 -1
  115. xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +0 -1
  116. xinference/web/ui/node_modules/.cache/babel-loader/5c08e2cd07809ed3e41486b16652253404cbb63a3ff8d0366ee50f57e2413cea.json +0 -1
  117. xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +0 -1
  118. xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +0 -1
  119. xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +0 -1
  120. /xinference/{thirdparty/omnilmm → model/embedding/vllm}/__init__.py +0 -0
  121. /xinference/web/ui/build/static/js/{main.ae579a97.js.LICENSE.txt → main.8a9e3ba0.js.LICENSE.txt} +0 -0
  122. {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/entry_points.txt +0 -0
  123. {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/licenses/LICENSE +0 -0
  124. {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,131 @@
+ # Copyright 2022-2025 XProbe Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import logging
+ from io import BytesIO
+ from threading import Thread
+ from typing import Any, Dict, Iterator, List, Tuple
+ from urllib.request import urlopen
+
+ import numpy as np
+
+ from .....model.utils import select_device
+ from ...llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
+ from ..core import register_non_default_model
+ from .core import PytorchMultiModalModel
+
+ logger = logging.getLogger(__name__)
+
+
+ @register_transformer
+ @register_non_default_model("qwen2-audio-instruct")
+ class Qwen2AudioChatModel(PytorchMultiModalModel):
+     @classmethod
+     def match_json(
+         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
+     ) -> bool:
+         llm_family = model_family.model_family or model_family.model_name
+         if "qwen2-audio".lower() in llm_family.lower():
+             return True
+         return False
+
+     def decide_device(self):
+         device = self._pytorch_model_config.get("device", "auto")
+         self._device = select_device(device)
+
+     def load_processor(self):
+         from transformers import AutoProcessor
+
+         self._processor = AutoProcessor.from_pretrained(
+             self.model_path,
+             device_map="auto" if self._device == "cuda" else self._device,
+             # trust_remote_code=True,
+             code_revision=self.model_spec.model_revision,
+         )
+
+         self._tokenizer = self._processor.tokenizer
+
+     def load_multimodal_model(self):
+         from transformers import Qwen2AudioForConditionalGeneration
+
+         kwargs = self.apply_bnb_quantization()
+         self._model = Qwen2AudioForConditionalGeneration.from_pretrained(
+             self.model_path,
+             device_map="auto" if self._device == "cuda" else self._device,
+             # trust_remote_code=True,
+             revision=self.model_spec.model_revision,
+             **kwargs,
+         )
+
+     def _transform_messages(
+         self,
+         messages: List[dict],  # type: ignore
+     ):
+         import librosa
+
+         text = self._processor.apply_chat_template(
+             messages, add_generation_prompt=True, tokenize=False
+         )
+         audios: List[np.ndarray] = []
+         for msg in messages:
+             content = msg["content"]
+             if isinstance(content, List):
+                 for item in content:  # type: ignore
+                     if item.get("type") == "audio" and "audio_url" in item:
+                         audio = librosa.load(
+                             BytesIO(urlopen(item["audio_url"]["url"]).read()),
+                             sr=self._processor.feature_extractor.sampling_rate,
+                         )[0]
+                         audios.append(audio)
+
+         return text, audios
+
+     def build_inputs_from_messages(
+         self,
+         messages: List[Dict],
+         generate_config: Dict,
+     ):
+         text, audios = self._transform_messages(messages)
+         inputs = self._processor(
+             text=text, audios=audios, return_tensors="pt", padding=True
+         )
+         # Make sure that the inputs and the model are on the same device.
+         inputs.data = {k: v.to(self._device) for k, v in inputs.data.items()}
+         inputs.input_ids = inputs.input_ids.to(self._device)
+         return inputs
+
+     def build_generate_kwargs(
+         self,
+         generate_config: Dict,
+     ) -> Dict[str, Any]:
+         return dict(max_length=generate_config.get("max_tokens", 512))
+
+     def build_streaming_iter(
+         self,
+         messages: List[Dict],
+         generate_config: Dict,
+     ) -> Tuple[Iterator, int]:
+         from transformers import TextIteratorStreamer
+
+         inputs = self.build_inputs_from_messages(messages, generate_config)
+         config = self.build_generate_kwargs(generate_config)
+
+         tokenizer = self._processor.tokenizer
+         streamer = TextIteratorStreamer(
+             tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True
+         )
+
+         gen_kwargs = {"streamer": streamer, **inputs, **config}
+         thread = Thread(target=self._model.generate, kwargs=gen_kwargs)
+         thread.start()
+         return streamer, len(inputs.input_ids[0])
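For context, here is a minimal sketch of how the new qwen2-audio-instruct registration above might be exercised through the xinference client. The endpoint, model_engine, audio URL, and generate_config values are illustrative assumptions, not part of this diff; the "audio" content item simply mirrors the shape read by _transform_messages above.

# Hedged usage sketch for the new Qwen2AudioChatModel (all concrete values are assumptions).
from xinference.client import Client

client = Client("http://localhost:9997")  # assumed local xinference endpoint
model_uid = client.launch_model(
    model_name="qwen2-audio-instruct",
    model_type="LLM",
    model_engine="transformers",  # assumed engine selection
    model_format="pytorch",
)
model = client.get_model(model_uid)

# The audio content part matches what _transform_messages expects:
# {"type": "audio", "audio_url": {"url": ...}}.
response = model.chat(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "audio", "audio_url": {"url": "https://example.com/sample.wav"}},
                {"type": "text", "text": "What is being said in this clip?"},
            ],
        }
    ],
    generate_config={"max_tokens": 256},
)
print(response["choices"][0]["message"]["content"])

Note that build_generate_kwargs above maps the max_tokens value onto max_length for generation.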
@@ -1,256 +1,224 @@
- # Copyright 2022-2023 XProbe Inc.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- import importlib.util
- import logging
- import sys
- import uuid
- from typing import Iterator, List, Optional, Union
-
- from ....device_utils import is_npu_available
- from ....model.utils import select_device
- from ....types import (
-     ChatCompletion,
-     ChatCompletionChunk,
-     ChatCompletionMessage,
-     CompletionChunk,
-     PytorchModelConfig,
- )
- from ..llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
- from ..utils import generate_chat_completion, generate_completion_chunk
- from .core import PytorchChatModel, PytorchGenerateConfig, register_non_default_model
- from .utils import cache_clean
-
- logger = logging.getLogger(__name__)
-
-
- @register_transformer
- @register_non_default_model("qwen2-vl-instruct", "qwen2.5-vl-instruct")
- class Qwen2VLChatModel(PytorchChatModel):
-     def __init__(self, *args, **kwargs):
-         super().__init__(*args, **kwargs)
-         self._tokenizer = None
-         self._model = None
-         self._device = None
-         self._processor = None
-
-     def _sanitize_model_config(
-         self, pytorch_model_config: Optional[PytorchModelConfig]
-     ) -> PytorchModelConfig:
-         pytorch_model_config = super()._sanitize_model_config(pytorch_model_config)
-         assert pytorch_model_config is not None
-         pytorch_model_config.setdefault("min_pixels", 256 * 28 * 28)
-         pytorch_model_config.setdefault("max_pixels", 1280 * 28 * 28)
-         return pytorch_model_config
-
-     @classmethod
-     def match_json(
-         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
-     ) -> bool:
-         if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
-             return False
-         llm_family = model_family.model_family or model_family.model_name
-         if "qwen2-vl-instruct".lower() in llm_family.lower():
-             return True
-         if "qwen2.5-vl-instruct".lower() in llm_family.lower():
-             return True
-         if "qvq-72b-preview".lower() in llm_family.lower():
-             return True
-         return False
-
-     def load(self):
-         from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
-
-         try:
-             from transformers import Qwen2_5_VLForConditionalGeneration
-         except ImportError:
-             Qwen2_5_VLForConditionalGeneration = None
-
-         device = self._pytorch_model_config.get("device", "auto")
-         device = select_device(device)
-         self._device = device
-         # for multiple GPU, set back to auto to make multiple devices work
-         device = "auto" if device == "cuda" else device
-         kwargs = self.apply_bnb_quantization()
-
-         min_pixels = self._pytorch_model_config.get("min_pixels")
-         max_pixels = self._pytorch_model_config.get("max_pixels")
-         self._processor = AutoProcessor.from_pretrained(
-             self.model_path,
-             trust_remote_code=True,
-             min_pixels=min_pixels,
-             max_pixels=max_pixels,
-         )
-         self._tokenizer = self._processor.tokenizer
-         flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
-         llm_family = self.model_family.model_family or self.model_family.model_name
-         model_cls = (
-             Qwen2_5_VLForConditionalGeneration
-             if "qwen2.5" in llm_family
-             else Qwen2VLForConditionalGeneration
-         )
-         if model_cls is None:
-             raise ImportError("`transformers` version is too old, please upgrade it")
-         if flash_attn_installed:
-             self._model = model_cls.from_pretrained(
-                 self.model_path,
-                 torch_dtype="bfloat16",
-                 device_map=device,
-                 attn_implementation="flash_attention_2",
-                 trust_remote_code=True,
-                 **kwargs,
-             ).eval()
-         elif is_npu_available():
-             # Ascend do not support bf16
-             self._model = model_cls.from_pretrained(
-                 self.model_path,
-                 device_map="auto",
-                 trust_remote_code=True,
-                 torch_dtype="float16",
-                 **kwargs,
-             ).eval()
-         else:
-             self._model = model_cls.from_pretrained(
-                 self.model_path, device_map=device, trust_remote_code=True
-             ).eval()
-
-     @cache_clean
-     def chat(
-         self,
-         messages: List[ChatCompletionMessage],  # type: ignore
-         generate_config: Optional[PytorchGenerateConfig] = None,
-     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-         messages = self._transform_messages(messages)
-
-         generate_config = generate_config if generate_config else {}
-
-         stream = generate_config.get("stream", False) if generate_config else False
-
-         if stream:
-             it = self._generate_stream(messages, generate_config)
-             return self._to_chat_completion_chunks(it)
-         else:
-             c = self._generate(messages, generate_config)
-             return c
-
-     def _generate(
-         self, messages: List, config: PytorchGenerateConfig = {}
-     ) -> ChatCompletion:
-         from qwen_vl_utils import process_vision_info
-
-         # Preparation for inference
-         text = self._processor.apply_chat_template(
-             messages, tokenize=False, add_generation_prompt=True
-         )
-         image_inputs, video_inputs = process_vision_info(messages)
-         inputs = self._processor(
-             text=[text],
-             images=image_inputs,
-             videos=video_inputs,
-             padding=True,
-             return_tensors="pt",
-         )
-         inputs = inputs.to(self._device)
-
-         # Inference: Generation of the output
-         generated_ids = self._model.generate(
-             **inputs,
-             max_new_tokens=config.get("max_tokens", 512),
-             temperature=config.get("temperature", 1),
-         )
-         generated_ids_trimmed = [
-             out_ids[len(in_ids) :]
-             for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-         ]
-         output_text = self._processor.batch_decode(
-             generated_ids_trimmed,
-             skip_special_tokens=True,
-             clean_up_tokenization_spaces=False,
-         )[0]
-         return generate_chat_completion(self.model_uid, output_text)
-
-     def _generate_stream(
-         self, messages: List, config: PytorchGenerateConfig = {}
-     ) -> Iterator[CompletionChunk]:
-         from threading import Thread
-
-         from qwen_vl_utils import process_vision_info
-         from transformers import TextIteratorStreamer
-
-         text = self._processor.apply_chat_template(
-             messages, tokenize=False, add_generation_prompt=True
-         )
-         image_inputs, video_inputs = process_vision_info(messages)
-         inputs = self._processor(
-             text=[text],
-             images=image_inputs,
-             videos=video_inputs,
-             padding=True,
-             return_tensors="pt",
-         )
-         inputs = inputs.to(self._model.device)
-
-         tokenizer = self._tokenizer
-         streamer = TextIteratorStreamer(
-             tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True
-         )
-
-         gen_kwargs = {
-             "max_new_tokens": config.get("max_tokens", 512),
-             "temperature": config.get("temperature", 1),
-             "streamer": streamer,
-             **inputs,
-         }
-         error = None
-
-         def model_generate():
-             try:
-                 return self._model.generate(**gen_kwargs)
-             except Exception:
-                 nonlocal error
-                 error = sys.exc_info()
-                 streamer.end()
-                 raise
-
-         thread = Thread(target=model_generate)
-         thread.start()
-
-         completion_id = str(uuid.uuid1())
-         for new_text in streamer:
-             yield generate_completion_chunk(
-                 chunk_text=new_text,
-                 finish_reason=None,
-                 chunk_id=completion_id,
-                 model_uid=self.model_uid,
-                 prompt_tokens=-1,
-                 completion_tokens=-1,
-                 total_tokens=-1,
-                 has_choice=True,
-                 has_content=True,
-             )
-
-         if error:
-             _, err, tb = error  # type: ignore
-             raise err.with_traceback(tb)
-
-         yield generate_completion_chunk(
-             chunk_text=None,
-             finish_reason="stop",
-             chunk_id=completion_id,
-             model_uid=self.model_uid,
-             prompt_tokens=-1,
-             completion_tokens=-1,
-             total_tokens=-1,
-             has_choice=True,
-             has_content=False,
-         )
+ # Copyright 2022-2025 XProbe Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import importlib.util
+ import logging
+ from typing import Any, Dict, Iterator, List, Optional, Tuple
+
+ from .....core.model import register_batching_multimodal_models
+ from .....core.scheduler import InferenceRequest
+ from .....device_utils import is_npu_available
+ from .....model.utils import select_device
+ from .....types import PytorchModelConfig
+ from ...llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
+ from ..core import register_non_default_model
+ from .core import PytorchMultiModalModel
+
+ logger = logging.getLogger(__name__)
+
+
+ @register_batching_multimodal_models(
+     "qwen2-vl-instruct", "qwen2.5-vl-instruct", "QvQ-72B-Preview"
+ )
+ @register_transformer
+ @register_non_default_model(
+     "qwen2-vl-instruct", "qwen2.5-vl-instruct", "QvQ-72B-Preview"
+ )
+ class Qwen2VLChatModel(PytorchMultiModalModel):
+     def _sanitize_model_config(
+         self, pytorch_model_config: Optional[PytorchModelConfig]
+     ) -> PytorchModelConfig:
+         pytorch_model_config = super()._sanitize_model_config(pytorch_model_config)
+         assert pytorch_model_config is not None
+         pytorch_model_config.setdefault("min_pixels", 256 * 28 * 28)
+         pytorch_model_config.setdefault("max_pixels", 1280 * 28 * 28)
+         return pytorch_model_config
+
+     @classmethod
+     def match_json(
+         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
+     ) -> bool:
+         if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
+             return False
+         llm_family = model_family.model_family or model_family.model_name
+         if "qwen2-vl-instruct".lower() in llm_family.lower():
+             return True
+         if "qwen2.5-vl-instruct".lower() in llm_family.lower():
+             return True
+         if "qvq-72b-preview".lower() in llm_family.lower():
+             return True
+         return False
+
+     def decide_device(self):
+         device = self._pytorch_model_config.get("device", "auto")
+         device = select_device(device)
+         # for multiple GPU, set back to auto to make multiple devices work
+         self._device = device
+
+     def load_processor(self):
+         from transformers import AutoProcessor
+
+         min_pixels = self._pytorch_model_config.get("min_pixels")
+         max_pixels = self._pytorch_model_config.get("max_pixels")
+         self._processor = AutoProcessor.from_pretrained(
+             self.model_path,
+             trust_remote_code=True,
+             min_pixels=min_pixels,
+             max_pixels=max_pixels,
+         )
+         self._tokenizer = self._processor.tokenizer
+
+     def load_multimodal_model(self):
+         from transformers import Qwen2VLForConditionalGeneration
+
+         try:
+             from transformers import Qwen2_5_VLForConditionalGeneration
+         except ImportError:
+             Qwen2_5_VLForConditionalGeneration = None
+
+         kwargs = self.apply_bnb_quantization()
+         flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
+         llm_family = self.model_family.model_family or self.model_family.model_name
+         model_cls = (
+             Qwen2_5_VLForConditionalGeneration
+             if "qwen2.5" in llm_family
+             else Qwen2VLForConditionalGeneration
+         )
+         if model_cls is None:
+             raise ImportError("`transformers` version is too old, please upgrade it")
+         device = "auto" if self._device == "cuda" else self._device
+         if flash_attn_installed:
+             self._model = model_cls.from_pretrained(
+                 self.model_path,
+                 torch_dtype="bfloat16",
+                 device_map=device,
+                 attn_implementation="flash_attention_2",
+                 trust_remote_code=True,
+                 **kwargs,
+             ).eval()
+         elif is_npu_available():
+             # Ascend do not support bf16
+             self._model = model_cls.from_pretrained(
+                 self.model_path,
+                 device_map="auto",
+                 trust_remote_code=True,
+                 torch_dtype="float16",
+                 **kwargs,
+             ).eval()
+         else:
+             self._model = model_cls.from_pretrained(
+                 self.model_path,
+                 device_map=device,
+                 trust_remote_code=True,
+                 **kwargs,
+             ).eval()
+
+     def build_inputs_from_messages(
+         self,
+         messages: List[Dict],
+         generate_config: Dict,
+     ):
+         from qwen_vl_utils import process_vision_info
+
+         messages = self._transform_messages(messages)
+         # Preparation for inference
+         text = self._processor.apply_chat_template(
+             messages, tokenize=False, add_generation_prompt=True
+         )
+         image_inputs, video_inputs = process_vision_info(messages)
+         inputs = self._processor(
+             text=[text],
+             images=image_inputs,
+             videos=video_inputs,
+             padding=True,
+             return_tensors="pt",
+         )
+         inputs = inputs.to(self._device)
+         return inputs
+
+     def build_generate_kwargs(self, generate_config: Dict) -> Dict[str, Any]:
+         max_new_tokens = generate_config.get("max_tokens", 512)
+         temperature = generate_config.get("temperature", 1)
+         return {"max_new_tokens": max_new_tokens, "temperature": temperature}
+
+     def build_streaming_iter(
+         self,
+         messages: List[Dict],
+         generate_config: Dict,
+     ) -> Tuple[Iterator, int]:
+         from threading import Thread
+
+         from transformers import TextIteratorStreamer
+
+         tokenizer = self._tokenizer
+         streamer = TextIteratorStreamer(
+             tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True
+         )
+
+         inputs = self.build_inputs_from_messages(messages, generate_config)
+         config = self.build_generate_kwargs(generate_config)
+
+         def model_generate():
+             try:
+                 return self._model.generate(**inputs, **config, streamer=streamer)
+             except Exception:
+                 streamer.end()
+                 raise
+
+         thread = Thread(target=model_generate)
+         thread.start()
+         return streamer, len(inputs.input_ids[0])
+
+     def prepare_sanitize_generate_config(self, req: InferenceRequest):
+         """
+         This file corresponds to multiple models,
+         so the corresponding configuration is read directly through the transformers interface.
+         """
+         from transformers import GenerationConfig
+
+         gen_config = GenerationConfig.from_pretrained(self.model_path).to_dict()
+         raw_config = req.inference_kwargs.get("raw_params", {})
+         gen_config.update(raw_config)
+         return gen_config
+
+     def _get_full_prompt(self, messages: List[Dict], tools, generate_config: dict):
+         return self._transform_messages(messages)
+
+     def build_prefill_kwargs(self, prompts: List, req_list: List[InferenceRequest]):
+         import torch
+         from qwen_vl_utils import process_vision_info
+
+         batch_text = self._processor.apply_chat_template(
+             prompts, tokenize=False, add_generation_prompt=True
+         )
+         image_inputs, video_inputs = process_vision_info(prompts)
+         inputs = self._processor(
+             text=batch_text,
+             images=image_inputs,
+             videos=video_inputs,
+             padding=True,
+             padding_side="left",
+             return_tensors="pt",
+         )
+         inputs = inputs.to(self._model.device)
+         for r, _ids, attn_mask in zip(
+             req_list, inputs["input_ids"], inputs["attention_mask"]
+         ):
+             r.prompt_tokens = _ids.tolist()
+             real_len = torch.sum(attn_mask).item()
+             r.padding_len = attn_mask.numel() - real_len
+             r.extra_kwargs["attention_mask_seq_len"] = real_len
+         input_ids = inputs["input_ids"]
+         batch_size, seq_len = input_ids.shape
+         position_ids = self.build_prefill_position_ids(batch_size, seq_len, req_list)
+         return {**inputs, "position_ids": position_ids}
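The rewrite above drops the hand-rolled load()/chat()/_generate_stream() in favor of hook methods consumed by the new PytorchMultiModalModel base class (multimodal/core.py). Below is a rough sketch of how such a template base class could orchestrate those hooks; the hook names follow the overrides above, while the orchestration itself is an illustrative assumption rather than the actual core.py implementation.

# Illustrative template-method sketch only; not the real multimodal/core.py.
from typing import Any, Dict, Iterator, List, Tuple


class MultiModalTemplateSketch:
    """Shows the hook-based flow that subclasses such as Qwen2VLChatModel fill in."""

    def load(self) -> None:
        # Each step is supplied by the concrete model class.
        self.decide_device()
        self.load_processor()
        self.load_multimodal_model()

    def stream_chat(self, messages: List[Dict], generate_config: Dict) -> Iterator[str]:
        # The subclass returns a text streamer plus the prompt length;
        # the base class only has to drain the streamer.
        streamer, _prompt_len = self.build_streaming_iter(messages, generate_config)
        for new_text in streamer:
            yield new_text

    # Hooks overridden by subclasses:
    def decide_device(self) -> None: ...
    def load_processor(self) -> None: ...
    def load_multimodal_model(self) -> None: ...
    def build_inputs_from_messages(self, messages: List[Dict], generate_config: Dict) -> Any: ...
    def build_generate_kwargs(self, generate_config: Dict) -> Dict[str, Any]: ...
    def build_streaming_iter(self, messages: List[Dict], generate_config: Dict) -> Tuple[Iterator, int]: ...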
@@ -16,10 +16,12 @@ from typing import List, Optional
 
  from ....core.scheduler import InferenceRequest
  from ....types import LoRA
- from ..llm_family import LLMFamilyV1, LLMSpecV1
- from .core import PytorchModel, PytorchModelConfig
+ from ..llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
+ from .core import PytorchModel, PytorchModelConfig, register_non_default_model
 
 
+ @register_transformer
+ @register_non_default_model("opt")
  class OptPytorchModel(PytorchModel):
      def __init__(
          self,
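For orientation, decorators like register_transformer and register_non_default_model typically just record the decorated class in a name-keyed registry so it is matched only for the listed model names rather than serving as the generic fallback. A minimal sketch of that pattern follows; the names and behavior here are assumptions, not the actual xinference implementation.

# Minimal sketch of a name-keyed registry decorator; illustrative only.
from typing import Dict, List, Type

TRANSFORMER_CLASSES: List[Type] = []
NON_DEFAULT_MODELS: Dict[str, Type] = {}


def register_transformer(cls: Type) -> Type:
    # Collect every transformers-backed model class for later matching.
    TRANSFORMER_CLASSES.append(cls)
    return cls


def register_non_default_model(*model_names: str):
    # Map specific model names to a class so it is only chosen for those names,
    # not used as the generic default implementation.
    def wrapper(cls: Type) -> Type:
        for name in model_names:
            NON_DEFAULT_MODELS[name] = cls
        return cls

    return wrapper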