xinference 0.14.4.post1__py3-none-any.whl → 0.15.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (194)
  1. xinference/_compat.py +51 -0
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +209 -40
  4. xinference/client/restful/restful_client.py +7 -26
  5. xinference/conftest.py +1 -1
  6. xinference/constants.py +5 -0
  7. xinference/core/cache_tracker.py +1 -1
  8. xinference/core/chat_interface.py +8 -14
  9. xinference/core/event.py +1 -1
  10. xinference/core/image_interface.py +28 -0
  11. xinference/core/model.py +110 -31
  12. xinference/core/scheduler.py +37 -37
  13. xinference/core/status_guard.py +1 -1
  14. xinference/core/supervisor.py +17 -10
  15. xinference/core/utils.py +80 -22
  16. xinference/core/worker.py +17 -16
  17. xinference/deploy/cmdline.py +8 -16
  18. xinference/deploy/local.py +1 -1
  19. xinference/deploy/supervisor.py +1 -1
  20. xinference/deploy/utils.py +1 -1
  21. xinference/deploy/worker.py +1 -1
  22. xinference/model/audio/cosyvoice.py +86 -41
  23. xinference/model/audio/fish_speech.py +9 -9
  24. xinference/model/audio/model_spec.json +9 -9
  25. xinference/model/audio/whisper.py +4 -1
  26. xinference/model/embedding/core.py +52 -31
  27. xinference/model/image/core.py +2 -1
  28. xinference/model/image/model_spec.json +16 -4
  29. xinference/model/image/model_spec_modelscope.json +16 -4
  30. xinference/model/image/sdapi.py +136 -0
  31. xinference/model/image/stable_diffusion/core.py +164 -19
  32. xinference/model/llm/__init__.py +29 -11
  33. xinference/model/llm/llama_cpp/core.py +16 -33
  34. xinference/model/llm/llm_family.json +1011 -1296
  35. xinference/model/llm/llm_family.py +34 -53
  36. xinference/model/llm/llm_family_csghub.json +18 -35
  37. xinference/model/llm/llm_family_modelscope.json +981 -1122
  38. xinference/model/llm/lmdeploy/core.py +56 -88
  39. xinference/model/llm/mlx/core.py +46 -69
  40. xinference/model/llm/sglang/core.py +36 -18
  41. xinference/model/llm/transformers/chatglm.py +168 -306
  42. xinference/model/llm/transformers/cogvlm2.py +36 -63
  43. xinference/model/llm/transformers/cogvlm2_video.py +33 -223
  44. xinference/model/llm/transformers/core.py +55 -50
  45. xinference/model/llm/transformers/deepseek_v2.py +340 -0
  46. xinference/model/llm/transformers/deepseek_vl.py +53 -96
  47. xinference/model/llm/transformers/glm4v.py +55 -111
  48. xinference/model/llm/transformers/intern_vl.py +39 -70
  49. xinference/model/llm/transformers/internlm2.py +32 -54
  50. xinference/model/llm/transformers/minicpmv25.py +22 -55
  51. xinference/model/llm/transformers/minicpmv26.py +158 -68
  52. xinference/model/llm/transformers/omnilmm.py +5 -28
  53. xinference/model/llm/transformers/qwen2_audio.py +168 -0
  54. xinference/model/llm/transformers/qwen2_vl.py +234 -0
  55. xinference/model/llm/transformers/qwen_vl.py +34 -86
  56. xinference/model/llm/transformers/utils.py +32 -38
  57. xinference/model/llm/transformers/yi_vl.py +32 -72
  58. xinference/model/llm/utils.py +280 -554
  59. xinference/model/llm/vllm/core.py +161 -100
  60. xinference/model/rerank/core.py +41 -8
  61. xinference/model/rerank/model_spec.json +7 -0
  62. xinference/model/rerank/model_spec_modelscope.json +7 -1
  63. xinference/model/utils.py +1 -31
  64. xinference/thirdparty/cosyvoice/bin/export_jit.py +64 -0
  65. xinference/thirdparty/cosyvoice/bin/export_trt.py +8 -0
  66. xinference/thirdparty/cosyvoice/bin/inference.py +5 -2
  67. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +38 -22
  68. xinference/thirdparty/cosyvoice/cli/model.py +139 -26
  69. xinference/thirdparty/cosyvoice/flow/flow.py +15 -9
  70. xinference/thirdparty/cosyvoice/flow/length_regulator.py +20 -1
  71. xinference/thirdparty/cosyvoice/hifigan/generator.py +8 -4
  72. xinference/thirdparty/cosyvoice/llm/llm.py +14 -13
  73. xinference/thirdparty/cosyvoice/transformer/attention.py +7 -3
  74. xinference/thirdparty/cosyvoice/transformer/decoder.py +1 -1
  75. xinference/thirdparty/cosyvoice/transformer/embedding.py +4 -3
  76. xinference/thirdparty/cosyvoice/transformer/encoder.py +4 -2
  77. xinference/thirdparty/cosyvoice/utils/common.py +36 -0
  78. xinference/thirdparty/cosyvoice/utils/file_utils.py +16 -0
  79. xinference/thirdparty/deepseek_vl/serve/assets/Kelpy-Codos.js +100 -0
  80. xinference/thirdparty/deepseek_vl/serve/assets/avatar.png +0 -0
  81. xinference/thirdparty/deepseek_vl/serve/assets/custom.css +355 -0
  82. xinference/thirdparty/deepseek_vl/serve/assets/custom.js +22 -0
  83. xinference/thirdparty/deepseek_vl/serve/assets/favicon.ico +0 -0
  84. xinference/thirdparty/deepseek_vl/serve/examples/app.png +0 -0
  85. xinference/thirdparty/deepseek_vl/serve/examples/chart.png +0 -0
  86. xinference/thirdparty/deepseek_vl/serve/examples/mirror.png +0 -0
  87. xinference/thirdparty/deepseek_vl/serve/examples/pipeline.png +0 -0
  88. xinference/thirdparty/deepseek_vl/serve/examples/puzzle.png +0 -0
  89. xinference/thirdparty/deepseek_vl/serve/examples/rap.jpeg +0 -0
  90. xinference/thirdparty/fish_speech/fish_speech/configs/base.yaml +87 -0
  91. xinference/thirdparty/fish_speech/fish_speech/configs/firefly_gan_vq.yaml +33 -0
  92. xinference/thirdparty/fish_speech/fish_speech/configs/lora/r_8_alpha_16.yaml +4 -0
  93. xinference/thirdparty/fish_speech/fish_speech/configs/text2semantic_finetune.yaml +83 -0
  94. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text-data.proto +24 -0
  95. xinference/thirdparty/fish_speech/fish_speech/i18n/README.md +27 -0
  96. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +1 -1
  97. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +1 -1
  98. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +1 -1
  99. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/pt_BR.json +1 -1
  100. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +1 -1
  101. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +2 -2
  102. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/__init__.py +0 -3
  103. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +169 -198
  104. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +4 -27
  105. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/.gitignore +114 -0
  106. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/README.md +36 -0
  107. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +9 -47
  108. xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +2 -2
  109. xinference/thirdparty/fish_speech/fish_speech/train.py +2 -0
  110. xinference/thirdparty/fish_speech/fish_speech/webui/css/style.css +161 -0
  111. xinference/thirdparty/fish_speech/fish_speech/webui/html/footer.html +11 -0
  112. xinference/thirdparty/fish_speech/fish_speech/webui/js/animate.js +69 -0
  113. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +12 -10
  114. xinference/thirdparty/fish_speech/tools/api.py +79 -134
  115. xinference/thirdparty/fish_speech/tools/commons.py +35 -0
  116. xinference/thirdparty/fish_speech/tools/download_models.py +3 -3
  117. xinference/thirdparty/fish_speech/tools/file.py +17 -0
  118. xinference/thirdparty/fish_speech/tools/llama/build_dataset.py +1 -1
  119. xinference/thirdparty/fish_speech/tools/llama/generate.py +29 -24
  120. xinference/thirdparty/fish_speech/tools/llama/merge_lora.py +1 -1
  121. xinference/thirdparty/fish_speech/tools/llama/quantize.py +2 -2
  122. xinference/thirdparty/fish_speech/tools/msgpack_api.py +34 -0
  123. xinference/thirdparty/fish_speech/tools/post_api.py +85 -44
  124. xinference/thirdparty/fish_speech/tools/sensevoice/README.md +59 -0
  125. xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py +1 -1
  126. xinference/thirdparty/fish_speech/tools/smart_pad.py +16 -3
  127. xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +2 -2
  128. xinference/thirdparty/fish_speech/tools/vqgan/inference.py +4 -2
  129. xinference/thirdparty/fish_speech/tools/webui.py +12 -146
  130. xinference/thirdparty/matcha/VERSION +1 -0
  131. xinference/thirdparty/matcha/hifigan/LICENSE +21 -0
  132. xinference/thirdparty/matcha/hifigan/README.md +101 -0
  133. xinference/thirdparty/omnilmm/LICENSE +201 -0
  134. xinference/thirdparty/whisper/__init__.py +156 -0
  135. xinference/thirdparty/whisper/__main__.py +3 -0
  136. xinference/thirdparty/whisper/assets/gpt2.tiktoken +50256 -0
  137. xinference/thirdparty/whisper/assets/mel_filters.npz +0 -0
  138. xinference/thirdparty/whisper/assets/multilingual.tiktoken +50257 -0
  139. xinference/thirdparty/whisper/audio.py +157 -0
  140. xinference/thirdparty/whisper/decoding.py +826 -0
  141. xinference/thirdparty/whisper/model.py +314 -0
  142. xinference/thirdparty/whisper/normalizers/__init__.py +2 -0
  143. xinference/thirdparty/whisper/normalizers/basic.py +76 -0
  144. xinference/thirdparty/whisper/normalizers/english.json +1741 -0
  145. xinference/thirdparty/whisper/normalizers/english.py +550 -0
  146. xinference/thirdparty/whisper/timing.py +386 -0
  147. xinference/thirdparty/whisper/tokenizer.py +395 -0
  148. xinference/thirdparty/whisper/transcribe.py +605 -0
  149. xinference/thirdparty/whisper/triton_ops.py +109 -0
  150. xinference/thirdparty/whisper/utils.py +316 -0
  151. xinference/thirdparty/whisper/version.py +1 -0
  152. xinference/types.py +14 -53
  153. xinference/web/ui/build/asset-manifest.json +6 -6
  154. xinference/web/ui/build/index.html +1 -1
  155. xinference/web/ui/build/static/css/{main.4bafd904.css → main.5061c4c3.css} +2 -2
  156. xinference/web/ui/build/static/css/main.5061c4c3.css.map +1 -0
  157. xinference/web/ui/build/static/js/main.754740c0.js +3 -0
  158. xinference/web/ui/build/static/js/{main.eb13fe95.js.LICENSE.txt → main.754740c0.js.LICENSE.txt} +2 -0
  159. xinference/web/ui/build/static/js/main.754740c0.js.map +1 -0
  160. xinference/web/ui/node_modules/.cache/babel-loader/10c69dc7a296779fcffedeff9393d832dfcb0013c36824adf623d3c518b801ff.json +1 -0
  161. xinference/web/ui/node_modules/.cache/babel-loader/68bede6d95bb5ef0b35bbb3ec5b8c937eaf6862c6cdbddb5ef222a7776aaf336.json +1 -0
  162. xinference/web/ui/node_modules/.cache/babel-loader/77d50223f3e734d4485cca538cb098a8c3a7a0a1a9f01f58cdda3af42fe1adf5.json +1 -0
  163. xinference/web/ui/node_modules/.cache/babel-loader/a56d5a642409a84988891089c98ca28ad0546432dfbae8aaa51bc5a280e1cdd2.json +1 -0
  164. xinference/web/ui/node_modules/.cache/babel-loader/cd90b08d177025dfe84209596fc51878f8a86bcaa6a240848a3d2e5fd4c7ff24.json +1 -0
  165. xinference/web/ui/node_modules/.cache/babel-loader/d9ff696a3e3471f01b46c63d18af32e491eb5dc0e43cb30202c96871466df57f.json +1 -0
  166. xinference/web/ui/node_modules/.cache/babel-loader/e42b72d4cc1ea412ebecbb8d040dc6c6bfee462c33903c2f1f3facb602ad742e.json +1 -0
  167. xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +1 -0
  168. xinference/web/ui/node_modules/.package-lock.json +37 -0
  169. xinference/web/ui/node_modules/a-sync-waterfall/package.json +21 -0
  170. xinference/web/ui/node_modules/nunjucks/node_modules/commander/package.json +48 -0
  171. xinference/web/ui/node_modules/nunjucks/package.json +112 -0
  172. xinference/web/ui/package-lock.json +38 -0
  173. xinference/web/ui/package.json +1 -0
  174. {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/METADATA +16 -10
  175. {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/RECORD +179 -127
  176. xinference/model/llm/transformers/llama_2.py +0 -108
  177. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/lit_module.py +0 -442
  178. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/discriminator.py +0 -44
  179. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/reference.py +0 -115
  180. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/wavenet.py +0 -225
  181. xinference/thirdparty/fish_speech/tools/auto_rerank.py +0 -159
  182. xinference/thirdparty/fish_speech/tools/gen_ref.py +0 -36
  183. xinference/thirdparty/fish_speech/tools/merge_asr_files.py +0 -55
  184. xinference/web/ui/build/static/css/main.4bafd904.css.map +0 -1
  185. xinference/web/ui/build/static/js/main.eb13fe95.js +0 -3
  186. xinference/web/ui/build/static/js/main.eb13fe95.js.map +0 -1
  187. xinference/web/ui/node_modules/.cache/babel-loader/0b11a5339468c13b2d31ac085e7effe4303259b2071abd46a0a8eb8529233a5e.json +0 -1
  188. xinference/web/ui/node_modules/.cache/babel-loader/213b5913e164773c2b0567455377765715f5f07225fbac77ad8e1e9dc9648a47.json +0 -1
  189. xinference/web/ui/node_modules/.cache/babel-loader/5c26a23b5eacf5b752a08531577ae3840bb247745ef9a39583dc2d05ba93a82a.json +0 -1
  190. xinference/web/ui/node_modules/.cache/babel-loader/978b57d1a04a701bc3fcfebc511f5f274eed6ed7eade67f6fb76c27d5fd9ecc8.json +0 -1
  191. {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/LICENSE +0 -0
  192. {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/WHEEL +0 -0
  193. {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/entry_points.txt +0 -0
  194. {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/transformers/qwen2_vl.py (new file)

@@ -0,0 +1,234 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import importlib.util
+import logging
+import sys
+import uuid
+from typing import Iterator, List, Optional, Union
+
+from ....model.utils import select_device
+from ....types import (
+    ChatCompletion,
+    ChatCompletionChunk,
+    ChatCompletionMessage,
+    CompletionChunk,
+)
+from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..utils import generate_chat_completion, generate_completion_chunk
+from .core import PytorchChatModel, PytorchGenerateConfig
+
+logger = logging.getLogger(__name__)
+
+
+class Qwen2VLChatModel(PytorchChatModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._tokenizer = None
+        self._model = None
+        self._device = None
+        self._processor = None
+
+    @classmethod
+    def match(
+        cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        llm_family = model_family.model_family or model_family.model_name
+        if "qwen2-vl-instruct".lower() in llm_family.lower():
+            return True
+        return False
+
+    def load(self):
+        from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
+
+        device = self._pytorch_model_config.get("device", "auto")
+        device = select_device(device)
+        self._device = device
+        # for multiple GPU, set back to auto to make multiple devices work
+        device = "auto" if device == "cuda" else device
+
+        self._processor = AutoProcessor.from_pretrained(
+            self.model_path, trust_remote_code=True
+        )
+        self._tokenizer = self._processor.tokenizer
+        flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
+        if flash_attn_installed:
+            self._model = Qwen2VLForConditionalGeneration.from_pretrained(
+                self.model_path,
+                torch_dtype="bfloat16",
+                device_map=device,
+                attn_implementation="flash_attention_2",
+                trust_remote_code=True,
+            ).eval()
+        else:
+            self._model = Qwen2VLForConditionalGeneration.from_pretrained(
+                self.model_path, device_map=device, trust_remote_code=True
+            ).eval()
+
+    def _transform_messages(
+        self,
+        messages: List[ChatCompletionMessage],
+    ):
+        transformed_messages = []
+        for msg in messages:
+            new_content = []
+            role = msg["role"]
+            content = msg["content"]
+            if isinstance(content, str):
+                new_content.append({"type": "text", "text": content})
+            elif isinstance(content, List):
+                for item in content:  # type: ignore
+                    if "text" in item:
+                        new_content.append({"type": "text", "text": item["text"]})
+                    elif "image_url" in item:
+                        new_content.append(
+                            {"type": "image", "image": item["image_url"]["url"]}
+                        )
+                    elif "video_url" in item:
+                        new_content.append(
+                            {"type": "video", "video": item["video_url"]["url"]}
+                        )
+            new_message = {"role": role, "content": new_content}
+            transformed_messages.append(new_message)
+
+        return transformed_messages
+
+    def chat(
+        self,
+        messages: List[ChatCompletionMessage],  # type: ignore
+        generate_config: Optional[PytorchGenerateConfig] = None,
+    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        messages = self._transform_messages(messages)
+
+        generate_config = generate_config if generate_config else {}
+
+        stream = generate_config.get("stream", False) if generate_config else False
+
+        if stream:
+            it = self._generate_stream(messages, generate_config)
+            return self._to_chat_completion_chunks(it)
+        else:
+            c = self._generate(messages, generate_config)
+            return c
+
+    def _generate(
+        self, messages: List, config: PytorchGenerateConfig = {}
+    ) -> ChatCompletion:
+        from qwen_vl_utils import process_vision_info
+
+        # Preparation for inference
+        text = self._processor.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        image_inputs, video_inputs = process_vision_info(messages)
+        inputs = self._processor(
+            text=[text],
+            images=image_inputs,
+            videos=video_inputs,
+            padding=True,
+            return_tensors="pt",
+        )
+        inputs = inputs.to("cuda")
+
+        # Inference: Generation of the output
+        generated_ids = self._model.generate(
+            **inputs,
+            max_new_tokens=config.get("max_tokens", 512),
+            temperature=config.get("temperature", 1),
+        )
+        generated_ids_trimmed = [
+            out_ids[len(in_ids) :]
+            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        output_text = self._processor.batch_decode(
+            generated_ids_trimmed,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False,
+        )[0]
+        return generate_chat_completion(self.model_uid, output_text)
+
+    def _generate_stream(
+        self, messages: List, config: PytorchGenerateConfig = {}
+    ) -> Iterator[CompletionChunk]:
+        from threading import Thread
+
+        from qwen_vl_utils import process_vision_info
+        from transformers import TextIteratorStreamer
+
+        text = self._processor.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        image_inputs, video_inputs = process_vision_info(messages)
+        inputs = self._processor(
+            text=[text],
+            images=image_inputs,
+            videos=video_inputs,
+            padding=True,
+            return_tensors="pt",
+        )
+        inputs = inputs.to(self._model.device)
+
+        tokenizer = self._tokenizer
+        streamer = TextIteratorStreamer(
+            tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True
+        )
+
+        gen_kwargs = {
+            "max_new_tokens": config.get("max_tokens", 512),
+            "temperature": config.get("temperature", 1),
+            "streamer": streamer,
+            **inputs,
+        }
+        error = None
+
+        def model_generate():
+            try:
+                return self._model.generate(**gen_kwargs)
+            except Exception:
+                nonlocal error
+                error = sys.exc_info()
+                streamer.end()
+                raise
+
+        thread = Thread(target=model_generate)
+        thread.start()
+
+        completion_id = str(uuid.uuid1())
+        for new_text in streamer:
+            yield generate_completion_chunk(
+                chunk_text=new_text,
+                finish_reason=None,
+                chunk_id=completion_id,
+                model_uid=self.model_uid,
+                prompt_tokens=-1,
+                completion_tokens=-1,
+                total_tokens=-1,
+                has_choice=True,
+                has_content=True,
+            )
+
+        if error:
+            _, err, tb = error  # type: ignore
+            raise err.with_traceback(tb)
+
+        yield generate_completion_chunk(
+            chunk_text=None,
+            finish_reason="stop",
+            chunk_id=completion_id,
+            model_uid=self.model_uid,
+            prompt_tokens=-1,
+            completion_tokens=-1,
+            total_tokens=-1,
+            has_choice=True,
+            has_content=False,
+        )
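
The hunk above adds Qwen2-VL support on the transformers backend, consuming OpenAI-style messages whose content items may carry image_url or video_url parts. A minimal client-side sketch of how the messages-based chat API drives this model is shown below; the server URL, the launch arguments, and the example image URL are illustrative assumptions, not part of the diff.

# Illustrative only: endpoint, launch parameters and image URL are assumptions.
from xinference.client import Client

client = Client("http://localhost:9997")
model_uid = client.launch_model(
    model_name="qwen2-vl-instruct",
    model_type="LLM",
    model_engine="transformers",
)
model = client.get_model(model_uid)

# An OpenAI-style message mixing text and an image part.
# Qwen2VLChatModel._transform_messages rewrites the "image_url" item into the
# {"type": "image", "image": <url>} entry expected by qwen_vl_utils.process_vision_info.
response = model.chat(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image."},
                {
                    "type": "image_url",
                    "image_url": {"url": "https://example.com/cat.png"},
                },
            ],
        }
    ],
    generate_config={"max_tokens": 256},
)
print(response["choices"][0]["message"]["content"])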
xinference/model/llm/transformers/qwen_vl.py

@@ -15,7 +15,6 @@ import base64
 import logging
 import operator
 import tempfile
-import time
 import typing
 import uuid
 from typing import Dict, Iterator, List, Optional, Tuple, Union
@@ -25,16 +24,9 @@ from transformers import PreTrainedTokenizer
 
 from ....core.scheduler import InferenceRequest
 from ....model.utils import select_device
-from ....types import (
-    ChatCompletion,
-    ChatCompletionChunk,
-    ChatCompletionMessage,
-    Completion,
-    CompletionChoice,
-    CompletionChunk,
-    CompletionUsage,
-)
+from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk
 from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..utils import generate_chat_completion, generate_completion_chunk
 from .core import PytorchChatModel, PytorchGenerateConfig
 from .utils import pad_prefill_tokens
 
@@ -53,7 +45,7 @@ class QwenVLChatModel(PytorchChatModel):
         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         llm_family = model_family.model_family or model_family.model_name
-        if "qwen" in llm_family and "vision" in model_family.model_ability:
+        if "qwen-" in llm_family and "vision" in model_family.model_ability:
            return True
        return False
 
@@ -129,18 +121,12 @@ class QwenVLChatModel(PytorchChatModel):
            return self._tokenizer.from_list_format(content)
        return content
 
-    def _get_prompt_and_chat_history(
-        self,
-        prompt: Union[str, List[Dict]],
-        chat_history: Optional[List[ChatCompletionMessage]] = None,
-    ):
-        prompt = self._message_content_to_qwen(prompt)
-        # Convert openai history to qwen vl history
+    def _get_prompt_and_chat_history(self, messages: List[Dict]):
         qwen_history = []
         query_to_response: List = []
-        for h in chat_history or []:
-            role = h["role"]
-            content = self._message_content_to_qwen(h["content"])
+        for message in messages[:-1]:
+            role = message["role"]
+            content = self._message_content_to_qwen(message["content"])
             if len(query_to_response) == 0 and role == "user":
                 query_to_response.append(content)
             if len(query_to_response) == 1 and role == "assistant":
@@ -148,18 +134,15 @@ class QwenVLChatModel(PytorchChatModel):
             if len(query_to_response) == 2:
                 qwen_history.append(query_to_response)
                 query_to_response = []
+        prompt = self._message_content_to_qwen(messages[-1]["content"])
         return prompt, qwen_history
 
     def chat(
         self,
-        prompt: Union[str, List[Dict]],
-        system_prompt: Optional[str] = None,
-        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        messages: List[Dict],
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-        prompt, qwen_history = self._get_prompt_and_chat_history(
-            prompt, chat_history=chat_history
-        )
+        prompt, qwen_history = self._get_prompt_and_chat_history(messages)
 
         stream = generate_config.get("stream", False) if generate_config else False
         stream_options = (
@@ -174,33 +157,17 @@ class QwenVLChatModel(PytorchChatModel):
            it = self._generate_stream(prompt, qwen_history, include_usage)  # type: ignore
            return self._to_chat_completion_chunks(it)
        else:
-            c = self._generate(prompt, qwen_history)  # type: ignore
-            return self._to_chat_completion(c)
+            return self._generate(prompt, qwen_history)  # type: ignore
 
-    def _generate(self, prompt: str, qwen_history: List) -> Completion:
+    def _generate(self, prompt: str, qwen_history: List) -> ChatCompletion:
         response, history = self._model.chat(
             self._tokenizer, query=prompt, history=qwen_history
         )
-        c = Completion(
-            id=str(uuid.uuid1()),
-            object="text_completion",
-            created=int(time.time()),
-            model=self.model_uid,
-            choices=[
-                CompletionChoice(
-                    index=0, text=response, finish_reason="stop", logprobs=None
-                )
-            ],
-            usage=CompletionUsage(
-                prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
-            ),
-        )
-        return c
+        return generate_chat_completion(self.model_uid, response)
 
     def _generate_stream(
         self, prompt: str, qwen_history: List, include_usage
     ) -> Iterator[CompletionChunk]:
-        # response, history = model.chat(tokenizer, message, history=history)
         response_generator = self._model.chat_stream(
             self._tokenizer, query=prompt, history=qwen_history
         )
@@ -212,57 +179,40 @@ class QwenVLChatModel(PytorchChatModel):
         for response in response_generator:
             inc_content = response[len(full_response) :]
             full_response = response
-            completion_choice = CompletionChoice(
-                text=inc_content, index=0, logprobs=None, finish_reason=None
-            )
-            completion_chunk = CompletionChunk(
-                id=completion_id,
-                object="text_completion",
-                created=int(time.time()),
-                model=self.model_uid,
-                choices=[completion_choice],
-            )
             completion_tokens = completion_tokens + 1
             total_tokens = prompt_tokens + completion_tokens
-            completion_usage = CompletionUsage(
+            yield generate_completion_chunk(
+                chunk_text=inc_content,
+                finish_reason=None,
+                chunk_id=completion_id,
+                model_uid=self.model_uid,
                 prompt_tokens=prompt_tokens,
                 completion_tokens=completion_tokens,
                 total_tokens=total_tokens,
            )
-            completion_chunk["usage"] = completion_usage
-            yield completion_chunk
-
-        completion_choice = CompletionChoice(
-            text="", index=0, logprobs=None, finish_reason="stop"
-        )
-        completion_chunk = CompletionChunk(
-            id=completion_id,
-            object="text_completion",
-            created=int(time.time()),
-            model=self.model_uid,
-            choices=[completion_choice],
-        )
-        completion_usage = CompletionUsage(
+        yield generate_completion_chunk(
+            chunk_text=None,
+            finish_reason="stop",
+            chunk_id=completion_id,
+            model_uid=self.model_uid,
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=total_tokens,
+            has_choice=True,
+            has_content=False,
        )
-        completion_chunk["usage"] = completion_usage
-        yield completion_chunk
         if include_usage:
-            chunk = CompletionChunk(
-                id=completion_id,
-                object="text_completion",
-                created=int(time.time()),
-                model=self.model_uid,
-                choices=[],
-            )
-            chunk["usage"] = CompletionUsage(
+            yield generate_completion_chunk(
+                chunk_text=None,
+                finish_reason=None,
+                chunk_id=completion_id,
+                model_uid=self.model_uid,
                prompt_tokens=prompt_tokens,
                completion_tokens=completion_tokens,
                total_tokens=total_tokens,
+                has_choice=False,
+                has_content=False,
            )
-            yield chunk
 
     @staticmethod
     def get_batch_size_and_seq_len_indexes_from_kv() -> Tuple[int, int]:
@@ -359,10 +309,8 @@
 
         return raw_text, context_tokens
 
-    def _get_full_prompt(self, prompt, system_prompt, chat_history, tools):
-        prompt, qwen_history = self._get_prompt_and_chat_history(
-            prompt, chat_history=chat_history
-        )
+    def _get_full_prompt(self, messages: List[Dict], tools):
         _, context_tokens = self.make_context(self._tokenizer, prompt, qwen_history)
         return context_tokens
 
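
The qwen_vl.py changes above drop the old (prompt, system_prompt, chat_history) signature in favor of a single OpenAI-style messages list; _get_prompt_and_chat_history now folds every message except the last into Qwen-style [query, response] pairs and treats the final message as the current prompt. A text-only sketch of that pairing logic follows; it is an illustration, not the shipped code, which additionally routes multimodal content through _message_content_to_qwen.

# Illustrative re-implementation of the history pairing in the new
# _get_prompt_and_chat_history (text-only, hypothetical helper name).
from typing import Dict, List, Tuple


def split_messages(messages: List[Dict]) -> Tuple[str, List[List[str]]]:
    history: List[List[str]] = []
    pair: List[str] = []
    for message in messages[:-1]:
        role, content = message["role"], message["content"]
        # A user message opens a pair, the following assistant message closes it.
        if len(pair) == 0 and role == "user":
            pair.append(content)
        if len(pair) == 1 and role == "assistant":
            pair.append(content)
        if len(pair) == 2:
            history.append(pair)
            pair = []
    # The last message becomes the current prompt.
    return messages[-1]["content"], history


prompt, history = split_messages(
    [
        {"role": "user", "content": "What is in this picture?"},
        {"role": "assistant", "content": "A cat on a sofa."},
        {"role": "user", "content": "What color is the cat?"},
    ]
)
assert prompt == "What color is the cat?"
assert history == [["What is in this picture?", "A cat on a sofa."]]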
xinference/model/llm/transformers/utils.py

@@ -321,7 +321,7 @@ def generate_stream(
 
    if stream:
        completion_choice = CompletionChoice(
-            text="", index=0, logprobs=None, finish_reason=finish_reason
+            text=output, index=0, logprobs=None, finish_reason=finish_reason
        )
    else:
        completion_choice = CompletionChoice(
@@ -430,39 +430,6 @@ pad_prefill_tokens(
     return prompt_tokens
 
 
-def _get_completion_chunk(
-    output: str,
-    chunk_id: str,
-    finish_reason: Optional[str],
-    model_uid: str,
-    r: InferenceRequest,
-    just_usage: bool,
-):
-    completion_choice = (
-        [
-            CompletionChoice(
-                text=output, index=0, logprobs=None, finish_reason=finish_reason
-            )
-        ]
-        if not just_usage
-        else []
-    )
-    completion_chunk = CompletionChunk(
-        id=chunk_id,
-        object="text_completion",
-        created=int(time.time()),
-        model=model_uid,
-        choices=completion_choice,
-    )
-    completion_usage = CompletionUsage(
-        prompt_tokens=len(r.prompt_tokens),
-        completion_tokens=len(r.new_tokens),
-        total_tokens=len(r.prompt_tokens) + len(r.new_tokens),
-    )
-    completion_chunk["usage"] = completion_usage
-    return completion_chunk
-
-
 def _get_completion(
     output: str,
     chunk_id: str,
@@ -551,6 +518,8 @@ def _batch_inference_one_step_internal(
     bos_flag: str = "<bos_stream>",
     eos_flag: str = "<eos_stream>",
 ):
+    from ..utils import generate_completion_chunk
+
     # need to judge stopped here,
     # since some requests state may change to stopped due to invalid parameters, e.g. max_src_len
     valid_req_list = [r for r in req_list if not r.stopped]
@@ -710,11 +679,28 @@
                    output = output[r.last_output_length :]
                    r.last_output_length += len(output)
 
-                    completion_chunk = _get_completion_chunk(
-                        output, r.chunk_id, r.finish_reason, model_uid, r, False
+                    completion_chunk = generate_completion_chunk(
+                        chunk_text=output,
+                        finish_reason=None,
+                        chunk_id=r.chunk_id,
+                        model_uid=model_uid,
+                        prompt_tokens=len(r.prompt_tokens),
+                        completion_tokens=len(r.new_tokens),
+                        total_tokens=len(r.prompt_tokens) + len(r.new_tokens),
                    )
                    r.completion.append(completion_chunk)
                    if r.stopped:
+                        # OpenAI compatible chunk
+                        completion_chunk = generate_completion_chunk(
+                            chunk_text="",
+                            finish_reason=r.finish_reason,
+                            chunk_id=r.chunk_id,
+                            model_uid=model_uid,
+                            prompt_tokens=len(r.prompt_tokens),
+                            completion_tokens=len(r.new_tokens),
+                            total_tokens=len(r.prompt_tokens) + len(r.new_tokens),
+                        )
+                        r.completion.append(completion_chunk)
                        r.completion.append(eos_flag)
 
                # last round, handle stream result
@@ -723,8 +709,16 @@
                # these tokens are real generated and should be counted.
                if r.stopped and _i == decode_round - 1 and include_usage:
                    r.completion.append(
-                        _get_completion_chunk(
-                            "", r.chunk_id, r.finish_reason, model_uid, r, True
+                        generate_completion_chunk(
+                            chunk_text=None,
+                            finish_reason=None,
+                            chunk_id=r.chunk_id,
+                            model_uid=model_uid,
+                            prompt_tokens=len(r.prompt_tokens),
+                            completion_tokens=len(r.new_tokens),
+                            total_tokens=len(r.prompt_tokens) + len(r.new_tokens),
+                            has_choice=False,
+                            has_content=False,
                        )
                    )
                else:
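
Both qwen_vl.py and the batching utilities above now build streaming chunks through the shared generate_completion_chunk helper in xinference/model/llm/utils.py instead of per-module _get_completion_chunk/CompletionChunk boilerplate. The sketch below is a plausible shape for that helper, inferred purely from its call sites in this diff; the shipped implementation may differ in detail (for example, in how it omits the text field when has_content is False).

# Sketch inferred from the call sites above -- not the shipped implementation.
import time
from typing import Optional

from xinference.types import CompletionChoice, CompletionChunk, CompletionUsage


def generate_completion_chunk(
    chunk_text: Optional[str],
    finish_reason: Optional[str],
    chunk_id: str,
    model_uid: str,
    prompt_tokens: int,
    completion_tokens: int,
    total_tokens: int,
    has_choice: bool = True,
    has_content: bool = True,
) -> CompletionChunk:
    # has_choice=False produces the usage-only trailing chunk emitted when
    # stream_options.include_usage is set; has_content=False marks the final
    # finish_reason-only chunk of an OpenAI-compatible stream.
    choices = (
        [
            CompletionChoice(
                text=chunk_text if has_content else "",
                index=0,
                logprobs=None,
                finish_reason=finish_reason,
            )
        ]
        if has_choice
        else []
    )
    chunk = CompletionChunk(
        id=chunk_id,
        object="text_completion",
        created=int(time.time()),
        model=model_uid,
        choices=choices,
    )
    chunk["usage"] = CompletionUsage(
        prompt_tokens=prompt_tokens,
        completion_tokens=completion_tokens,
        total_tokens=total_tokens,
    )
    return chunk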