xinference 0.14.4.post1__py3-none-any.whl → 0.15.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (194)
  1. xinference/_compat.py +51 -0
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +209 -40
  4. xinference/client/restful/restful_client.py +7 -26
  5. xinference/conftest.py +1 -1
  6. xinference/constants.py +5 -0
  7. xinference/core/cache_tracker.py +1 -1
  8. xinference/core/chat_interface.py +8 -14
  9. xinference/core/event.py +1 -1
  10. xinference/core/image_interface.py +28 -0
  11. xinference/core/model.py +110 -31
  12. xinference/core/scheduler.py +37 -37
  13. xinference/core/status_guard.py +1 -1
  14. xinference/core/supervisor.py +17 -10
  15. xinference/core/utils.py +80 -22
  16. xinference/core/worker.py +17 -16
  17. xinference/deploy/cmdline.py +8 -16
  18. xinference/deploy/local.py +1 -1
  19. xinference/deploy/supervisor.py +1 -1
  20. xinference/deploy/utils.py +1 -1
  21. xinference/deploy/worker.py +1 -1
  22. xinference/model/audio/cosyvoice.py +86 -41
  23. xinference/model/audio/fish_speech.py +9 -9
  24. xinference/model/audio/model_spec.json +9 -9
  25. xinference/model/audio/whisper.py +4 -1
  26. xinference/model/embedding/core.py +52 -31
  27. xinference/model/image/core.py +2 -1
  28. xinference/model/image/model_spec.json +16 -4
  29. xinference/model/image/model_spec_modelscope.json +16 -4
  30. xinference/model/image/sdapi.py +136 -0
  31. xinference/model/image/stable_diffusion/core.py +164 -19
  32. xinference/model/llm/__init__.py +29 -11
  33. xinference/model/llm/llama_cpp/core.py +16 -33
  34. xinference/model/llm/llm_family.json +1011 -1296
  35. xinference/model/llm/llm_family.py +34 -53
  36. xinference/model/llm/llm_family_csghub.json +18 -35
  37. xinference/model/llm/llm_family_modelscope.json +981 -1122
  38. xinference/model/llm/lmdeploy/core.py +56 -88
  39. xinference/model/llm/mlx/core.py +46 -69
  40. xinference/model/llm/sglang/core.py +36 -18
  41. xinference/model/llm/transformers/chatglm.py +168 -306
  42. xinference/model/llm/transformers/cogvlm2.py +36 -63
  43. xinference/model/llm/transformers/cogvlm2_video.py +33 -223
  44. xinference/model/llm/transformers/core.py +55 -50
  45. xinference/model/llm/transformers/deepseek_v2.py +340 -0
  46. xinference/model/llm/transformers/deepseek_vl.py +53 -96
  47. xinference/model/llm/transformers/glm4v.py +55 -111
  48. xinference/model/llm/transformers/intern_vl.py +39 -70
  49. xinference/model/llm/transformers/internlm2.py +32 -54
  50. xinference/model/llm/transformers/minicpmv25.py +22 -55
  51. xinference/model/llm/transformers/minicpmv26.py +158 -68
  52. xinference/model/llm/transformers/omnilmm.py +5 -28
  53. xinference/model/llm/transformers/qwen2_audio.py +168 -0
  54. xinference/model/llm/transformers/qwen2_vl.py +234 -0
  55. xinference/model/llm/transformers/qwen_vl.py +34 -86
  56. xinference/model/llm/transformers/utils.py +32 -38
  57. xinference/model/llm/transformers/yi_vl.py +32 -72
  58. xinference/model/llm/utils.py +280 -554
  59. xinference/model/llm/vllm/core.py +161 -100
  60. xinference/model/rerank/core.py +41 -8
  61. xinference/model/rerank/model_spec.json +7 -0
  62. xinference/model/rerank/model_spec_modelscope.json +7 -1
  63. xinference/model/utils.py +1 -31
  64. xinference/thirdparty/cosyvoice/bin/export_jit.py +64 -0
  65. xinference/thirdparty/cosyvoice/bin/export_trt.py +8 -0
  66. xinference/thirdparty/cosyvoice/bin/inference.py +5 -2
  67. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +38 -22
  68. xinference/thirdparty/cosyvoice/cli/model.py +139 -26
  69. xinference/thirdparty/cosyvoice/flow/flow.py +15 -9
  70. xinference/thirdparty/cosyvoice/flow/length_regulator.py +20 -1
  71. xinference/thirdparty/cosyvoice/hifigan/generator.py +8 -4
  72. xinference/thirdparty/cosyvoice/llm/llm.py +14 -13
  73. xinference/thirdparty/cosyvoice/transformer/attention.py +7 -3
  74. xinference/thirdparty/cosyvoice/transformer/decoder.py +1 -1
  75. xinference/thirdparty/cosyvoice/transformer/embedding.py +4 -3
  76. xinference/thirdparty/cosyvoice/transformer/encoder.py +4 -2
  77. xinference/thirdparty/cosyvoice/utils/common.py +36 -0
  78. xinference/thirdparty/cosyvoice/utils/file_utils.py +16 -0
  79. xinference/thirdparty/deepseek_vl/serve/assets/Kelpy-Codos.js +100 -0
  80. xinference/thirdparty/deepseek_vl/serve/assets/avatar.png +0 -0
  81. xinference/thirdparty/deepseek_vl/serve/assets/custom.css +355 -0
  82. xinference/thirdparty/deepseek_vl/serve/assets/custom.js +22 -0
  83. xinference/thirdparty/deepseek_vl/serve/assets/favicon.ico +0 -0
  84. xinference/thirdparty/deepseek_vl/serve/examples/app.png +0 -0
  85. xinference/thirdparty/deepseek_vl/serve/examples/chart.png +0 -0
  86. xinference/thirdparty/deepseek_vl/serve/examples/mirror.png +0 -0
  87. xinference/thirdparty/deepseek_vl/serve/examples/pipeline.png +0 -0
  88. xinference/thirdparty/deepseek_vl/serve/examples/puzzle.png +0 -0
  89. xinference/thirdparty/deepseek_vl/serve/examples/rap.jpeg +0 -0
  90. xinference/thirdparty/fish_speech/fish_speech/configs/base.yaml +87 -0
  91. xinference/thirdparty/fish_speech/fish_speech/configs/firefly_gan_vq.yaml +33 -0
  92. xinference/thirdparty/fish_speech/fish_speech/configs/lora/r_8_alpha_16.yaml +4 -0
  93. xinference/thirdparty/fish_speech/fish_speech/configs/text2semantic_finetune.yaml +83 -0
  94. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text-data.proto +24 -0
  95. xinference/thirdparty/fish_speech/fish_speech/i18n/README.md +27 -0
  96. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +1 -1
  97. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +1 -1
  98. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +1 -1
  99. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/pt_BR.json +1 -1
  100. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +1 -1
  101. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +2 -2
  102. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/__init__.py +0 -3
  103. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +169 -198
  104. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +4 -27
  105. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/.gitignore +114 -0
  106. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/README.md +36 -0
  107. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +9 -47
  108. xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +2 -2
  109. xinference/thirdparty/fish_speech/fish_speech/train.py +2 -0
  110. xinference/thirdparty/fish_speech/fish_speech/webui/css/style.css +161 -0
  111. xinference/thirdparty/fish_speech/fish_speech/webui/html/footer.html +11 -0
  112. xinference/thirdparty/fish_speech/fish_speech/webui/js/animate.js +69 -0
  113. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +12 -10
  114. xinference/thirdparty/fish_speech/tools/api.py +79 -134
  115. xinference/thirdparty/fish_speech/tools/commons.py +35 -0
  116. xinference/thirdparty/fish_speech/tools/download_models.py +3 -3
  117. xinference/thirdparty/fish_speech/tools/file.py +17 -0
  118. xinference/thirdparty/fish_speech/tools/llama/build_dataset.py +1 -1
  119. xinference/thirdparty/fish_speech/tools/llama/generate.py +29 -24
  120. xinference/thirdparty/fish_speech/tools/llama/merge_lora.py +1 -1
  121. xinference/thirdparty/fish_speech/tools/llama/quantize.py +2 -2
  122. xinference/thirdparty/fish_speech/tools/msgpack_api.py +34 -0
  123. xinference/thirdparty/fish_speech/tools/post_api.py +85 -44
  124. xinference/thirdparty/fish_speech/tools/sensevoice/README.md +59 -0
  125. xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py +1 -1
  126. xinference/thirdparty/fish_speech/tools/smart_pad.py +16 -3
  127. xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +2 -2
  128. xinference/thirdparty/fish_speech/tools/vqgan/inference.py +4 -2
  129. xinference/thirdparty/fish_speech/tools/webui.py +12 -146
  130. xinference/thirdparty/matcha/VERSION +1 -0
  131. xinference/thirdparty/matcha/hifigan/LICENSE +21 -0
  132. xinference/thirdparty/matcha/hifigan/README.md +101 -0
  133. xinference/thirdparty/omnilmm/LICENSE +201 -0
  134. xinference/thirdparty/whisper/__init__.py +156 -0
  135. xinference/thirdparty/whisper/__main__.py +3 -0
  136. xinference/thirdparty/whisper/assets/gpt2.tiktoken +50256 -0
  137. xinference/thirdparty/whisper/assets/mel_filters.npz +0 -0
  138. xinference/thirdparty/whisper/assets/multilingual.tiktoken +50257 -0
  139. xinference/thirdparty/whisper/audio.py +157 -0
  140. xinference/thirdparty/whisper/decoding.py +826 -0
  141. xinference/thirdparty/whisper/model.py +314 -0
  142. xinference/thirdparty/whisper/normalizers/__init__.py +2 -0
  143. xinference/thirdparty/whisper/normalizers/basic.py +76 -0
  144. xinference/thirdparty/whisper/normalizers/english.json +1741 -0
  145. xinference/thirdparty/whisper/normalizers/english.py +550 -0
  146. xinference/thirdparty/whisper/timing.py +386 -0
  147. xinference/thirdparty/whisper/tokenizer.py +395 -0
  148. xinference/thirdparty/whisper/transcribe.py +605 -0
  149. xinference/thirdparty/whisper/triton_ops.py +109 -0
  150. xinference/thirdparty/whisper/utils.py +316 -0
  151. xinference/thirdparty/whisper/version.py +1 -0
  152. xinference/types.py +14 -53
  153. xinference/web/ui/build/asset-manifest.json +6 -6
  154. xinference/web/ui/build/index.html +1 -1
  155. xinference/web/ui/build/static/css/{main.4bafd904.css → main.5061c4c3.css} +2 -2
  156. xinference/web/ui/build/static/css/main.5061c4c3.css.map +1 -0
  157. xinference/web/ui/build/static/js/main.754740c0.js +3 -0
  158. xinference/web/ui/build/static/js/{main.eb13fe95.js.LICENSE.txt → main.754740c0.js.LICENSE.txt} +2 -0
  159. xinference/web/ui/build/static/js/main.754740c0.js.map +1 -0
  160. xinference/web/ui/node_modules/.cache/babel-loader/10c69dc7a296779fcffedeff9393d832dfcb0013c36824adf623d3c518b801ff.json +1 -0
  161. xinference/web/ui/node_modules/.cache/babel-loader/68bede6d95bb5ef0b35bbb3ec5b8c937eaf6862c6cdbddb5ef222a7776aaf336.json +1 -0
  162. xinference/web/ui/node_modules/.cache/babel-loader/77d50223f3e734d4485cca538cb098a8c3a7a0a1a9f01f58cdda3af42fe1adf5.json +1 -0
  163. xinference/web/ui/node_modules/.cache/babel-loader/a56d5a642409a84988891089c98ca28ad0546432dfbae8aaa51bc5a280e1cdd2.json +1 -0
  164. xinference/web/ui/node_modules/.cache/babel-loader/cd90b08d177025dfe84209596fc51878f8a86bcaa6a240848a3d2e5fd4c7ff24.json +1 -0
  165. xinference/web/ui/node_modules/.cache/babel-loader/d9ff696a3e3471f01b46c63d18af32e491eb5dc0e43cb30202c96871466df57f.json +1 -0
  166. xinference/web/ui/node_modules/.cache/babel-loader/e42b72d4cc1ea412ebecbb8d040dc6c6bfee462c33903c2f1f3facb602ad742e.json +1 -0
  167. xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +1 -0
  168. xinference/web/ui/node_modules/.package-lock.json +37 -0
  169. xinference/web/ui/node_modules/a-sync-waterfall/package.json +21 -0
  170. xinference/web/ui/node_modules/nunjucks/node_modules/commander/package.json +48 -0
  171. xinference/web/ui/node_modules/nunjucks/package.json +112 -0
  172. xinference/web/ui/package-lock.json +38 -0
  173. xinference/web/ui/package.json +1 -0
  174. {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/METADATA +16 -10
  175. {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/RECORD +179 -127
  176. xinference/model/llm/transformers/llama_2.py +0 -108
  177. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/lit_module.py +0 -442
  178. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/discriminator.py +0 -44
  179. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/reference.py +0 -115
  180. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/wavenet.py +0 -225
  181. xinference/thirdparty/fish_speech/tools/auto_rerank.py +0 -159
  182. xinference/thirdparty/fish_speech/tools/gen_ref.py +0 -36
  183. xinference/thirdparty/fish_speech/tools/merge_asr_files.py +0 -55
  184. xinference/web/ui/build/static/css/main.4bafd904.css.map +0 -1
  185. xinference/web/ui/build/static/js/main.eb13fe95.js +0 -3
  186. xinference/web/ui/build/static/js/main.eb13fe95.js.map +0 -1
  187. xinference/web/ui/node_modules/.cache/babel-loader/0b11a5339468c13b2d31ac085e7effe4303259b2071abd46a0a8eb8529233a5e.json +0 -1
  188. xinference/web/ui/node_modules/.cache/babel-loader/213b5913e164773c2b0567455377765715f5f07225fbac77ad8e1e9dc9648a47.json +0 -1
  189. xinference/web/ui/node_modules/.cache/babel-loader/5c26a23b5eacf5b752a08531577ae3840bb247745ef9a39583dc2d05ba93a82a.json +0 -1
  190. xinference/web/ui/node_modules/.cache/babel-loader/978b57d1a04a701bc3fcfebc511f5f274eed6ed7eade67f6fb76c27d5fd9ecc8.json +0 -1
  191. {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/LICENSE +0 -0
  192. {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/WHEEL +0 -0
  193. {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/entry_points.txt +0 -0
  194. {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/top_level.txt +0 -0

xinference/model/llm/transformers/minicpmv26.py

@@ -12,26 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
-import time
 import uuid
 from concurrent.futures import ThreadPoolExecutor
-from typing import Dict, Iterator, List, Optional, Union
+from typing import Dict, Iterator, List, Optional, Tuple, Union

 import torch
 from PIL import Image

-from ....types import (
-    ChatCompletion,
-    ChatCompletionChunk,
-    ChatCompletionMessage,
-    Completion,
-    CompletionChoice,
-    CompletionChunk,
-    CompletionUsage,
-)
+from ....core.scheduler import InferenceRequest
+from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk
 from ...utils import select_device
 from ..llm_family import LLMFamilyV1, LLMSpecV1
-from ..utils import _decode_image
+from ..utils import (
+    _decode_image,
+    generate_chat_completion,
+    generate_completion_chunk,
+    parse_messages,
+)
 from .core import PytorchChatModel, PytorchGenerateConfig

 logger = logging.getLogger(__name__)
@@ -43,6 +40,7 @@ class MiniCPMV26Model(PytorchChatModel):
         self._device = None
         self._tokenizer = None
         self._model = None
+        self._processor = None

     @classmethod
     def match(
@@ -59,7 +57,7 @@ class MiniCPMV26Model(PytorchChatModel):
         return AutoModel

     def load(self, **kwargs):
-        from transformers import AutoModel, AutoTokenizer
+        from transformers import AutoModel, AutoProcessor, AutoTokenizer
         from transformers.generation import GenerationConfig

         device = self._pytorch_model_config.get("device", "auto")
@@ -100,6 +98,10 @@
             self.model_path,
             trust_remote_code=True,
         )
+        self._processor = AutoProcessor.from_pretrained(
+            self.model_path, trust_remote_code=True
+        )
+        self._device = self._model.device
         self._save_tensorizer()

     def _message_content_to_chat(self, content):
@@ -120,7 +122,9 @@
             frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
             frames = vr.get_batch(frame_idx).asnumpy()
             frames = [Image.fromarray(v.astype("uint8")) for v in frames]
-            print("num frames:", len(frames))
+            logger.info(
+                f"Num frames: {len(frames)} when decoding video for {self.model_uid}"
+            )
             return frames

         def _load_video(_url):
@@ -158,19 +162,13 @@
             return text, images, frames
         return content, [], []

-    def chat(
-        self,
-        prompt: Union[str, List[Dict]],
-        system_prompt: Optional[str] = None,
-        chat_history: Optional[List[ChatCompletionMessage]] = None,
-        generate_config: Optional[PytorchGenerateConfig] = None,
-    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-        stream = generate_config.get("stream", False) if generate_config else False
-        videoExisted = False
+    def _convert_to_specific_style(self, messages: List[Dict]) -> Tuple:
+        video_existed = False
+        prompt, _, chat_history = parse_messages(messages)

         content, images_chat, video_frames = self._message_content_to_chat(prompt)
         if len(video_frames) > 0:
-            videoExisted = True
+            video_existed = True
             images_chat = video_frames

         msgs = []
@@ -184,7 +182,7 @@
             if images_tmp != []:
                 images_history = images_tmp
             if len(video_frames_h) > 0:
-                videoExisted = True
+                video_existed = True
                 images_history = video_frames_h
             if len(query_to_response) == 0 and role == "user":
                 query_to_response.append(
@@ -198,10 +196,19 @@
                 msgs.extend(query_to_response)
                 query_to_response = []
         msgs.append({"role": "user", "content": images_chat + [content]})
+        return msgs, video_existed
+
+    def chat(
+        self,
+        messages: List[Dict],
+        generate_config: Optional[PytorchGenerateConfig] = None,
+    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        stream = generate_config.get("stream", False) if generate_config else False
+        msgs, video_existed = self._convert_to_specific_style(messages)

         # Set decode params for video
         params = {}
-        if videoExisted:
+        if video_existed:
             params = {"use_image_id": False, "max_slice_nums": 1}

         chat = self._model.chat(
@@ -216,57 +223,140 @@
             it = self.chat_stream(chat)
             return self._to_chat_completion_chunks(it)
         else:
-            c = Completion(
-                id=str(uuid.uuid1()),
-                object="text_completion",
-                created=int(time.time()),
-                model=self.model_uid,
-                choices=[
-                    CompletionChoice(
-                        index=0, text=chat, finish_reason="stop", logprobs=None
-                    )
-                ],
-                usage=CompletionUsage(
-                    prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
-                ),
-            )
-            return self._to_chat_completion(c)
+            return generate_chat_completion(self.model_uid, chat)

     def chat_stream(self, chat) -> Iterator[CompletionChunk]:
         completion_id = str(uuid.uuid1())
         for new_text in chat:
-            completion_choice = CompletionChoice(
-                text=new_text, index=0, logprobs=None, finish_reason=None
-            )
-            chunk = CompletionChunk(
-                id=completion_id,
-                object="text_completion",
-                created=int(time.time()),
-                model=self.model_uid,
-                choices=[completion_choice],
-            )
-            completion_usage = CompletionUsage(
+            yield generate_completion_chunk(
+                chunk_text=new_text,
+                finish_reason=None,
+                chunk_id=completion_id,
+                model_uid=self.model_uid,
                 prompt_tokens=-1,
                 completion_tokens=-1,
                 total_tokens=-1,
             )
-            chunk["usage"] = completion_usage
-            yield chunk
-
-        completion_choice = CompletionChoice(
-            text="", index=0, logprobs=None, finish_reason="stop"
-        )
-        chunk = CompletionChunk(
-            id=completion_id,
-            object="text_completion",
-            created=int(time.time()),
-            model=self.model_uid,
-            choices=[completion_choice],
-        )
-        completion_usage = CompletionUsage(
+        yield generate_completion_chunk(
+            chunk_text=None,
+            finish_reason="stop",
+            chunk_id=completion_id,
+            model_uid=self.model_uid,
             prompt_tokens=-1,
             completion_tokens=-1,
             total_tokens=-1,
+            has_choice=True,
+            has_content=False,
+        )
+
+    def prepare_sanitize_generate_config(self, req: InferenceRequest):
+        """
+        Refer to https://huggingface.co/openbmb/MiniCPM-V-2_6/blob/main/modeling_minicpmv.py
+        """
+        raw_config = req.inference_kwargs.get("raw_params", {})
+        temperature = raw_config.get("temperature", None)
+        if temperature is None:
+            raw_config["temperature"] = 0.7
+        top_p = raw_config.get("top_p", None)
+        if top_p is None:
+            raw_config["top_p"] = 0.8
+        top_k = raw_config.get("top_k", None)
+        if top_k is None:
+            raw_config["top_k"] = 100
+        repetition_penalty = raw_config.get("repetition_penalty", None)
+        if repetition_penalty is None:
+            raw_config["repetition_penalty"] = 1.05
+        return raw_config
+
+    def _handle_input_ids_and_images(self, msgs: List[Dict]) -> Dict:
+        """
+        Copied from https://huggingface.co/openbmb/MiniCPM-V-2_6/blob/main/modeling_minicpmv.py#L315
+        """
+        from copy import deepcopy
+
+        copy_msgs = deepcopy(msgs)
+
+        images = []
+        for i, msg in enumerate(copy_msgs):
+            role = msg["role"]
+            content = msg["content"]
+            assert role in ["user", "assistant"]
+            if i == 0:
+                assert role == "user", "The role of first msg should be user"
+            if isinstance(content, str):
+                content = [content]
+            cur_msgs = []
+            for c in content:
+                if isinstance(c, Image.Image):
+                    images.append(c)
+                    cur_msgs.append("(<image>./</image>)")
+                elif isinstance(c, str):
+                    cur_msgs.append(c)
+            msg["content"] = "\n".join(cur_msgs)
+
+        return {
+            "prompt": self._processor.tokenizer.apply_chat_template(
+                copy_msgs, tokenize=False, add_generation_prompt=True
+            ),
+            "input_image": images,
+        }
+
+    def _get_full_prompt(self, messages: List[Dict], tools):
+        msgs, video_existed = self._convert_to_specific_style(messages)
+        if video_existed:
+            raise RuntimeError(
+                f"Continuous batching does not support video inputs for this model: {self.model_uid}"
+            )
+        return self._handle_input_ids_and_images(msgs)
+
+    def build_prefill_kwargs(self, prompts: List, req_list: List[InferenceRequest]):
+        prompts_lists = [x["prompt"] for x in prompts]
+        input_images_lists = [x["input_image"] for x in prompts]
+        inputs = self._processor(
+            prompts_lists,
+            input_images_lists,
+            max_slice_nums=None,
+            use_image_id=None,
+            return_tensors="pt",
+            max_length=8192,
+        ).to(self._model.device)
+        inputs.pop("image_sizes")
+
+        masked_input_ids = inputs["input_ids"] * inputs["attention_mask"]
+        for i in range(masked_input_ids.shape[0]):
+            non_zero_values = masked_input_ids[i][masked_input_ids[i] != 0].tolist()
+            req_list[i].prompt_tokens = non_zero_values
+            req_list[i].extra_kwargs["attention_mask_seq_len"] = len(non_zero_values)
+            req_list[i].padding_len = masked_input_ids.shape[1] - len(non_zero_values)
+
+        model_inputs = {
+            "input_ids": inputs["input_ids"],
+            "image_bound": inputs["image_bound"],
+            "pixel_values": inputs["pixel_values"],
+            "tgt_sizes": inputs["tgt_sizes"],
+        }
+        model_inputs["inputs_embeds"], _ = self._model.get_vllm_embedding(model_inputs)
+
+        return {
+            "inputs_embeds": model_inputs["inputs_embeds"],
+            "attention_mask": inputs["attention_mask"],
+        }
+
+    def build_decode_position_ids(
+        self, batch_size: int, seq_length: int, reqs: List[InferenceRequest]
+    ):
+        return None
+
+    def batch_inference(self, req_list: List[InferenceRequest]):
+        """
+        This method is rewritten
+        because the specific inference process is performed by `self._model.llm`,
+        not `self._model` itself
+        """
+        from .utils import batch_inference_one_step
+
+        self.prepare_batch_inference(req_list)
+        batch_inference_one_step(
+            self, req_list, self.model_uid, self._model.llm, self._tokenizer
         )
-        chunk["usage"] = completion_usage
-        yield chunk
+        self.handle_batch_inference_results(req_list)
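
The hunks above replace MiniCPMV26Model's old chat(prompt, system_prompt, chat_history, ...) entry point with a single OpenAI-style messages list (split back out via parse_messages) and add continuous-batching hooks. As a rough illustration only, a caller would now build the conversation like this; the model handle and the image URL are placeholders, not values taken from this diff:

    # Illustrative sketch: the OpenAI-style messages list accepted by the new
    # chat(messages, generate_config) signature. "model" stands for a loaded
    # MiniCPM-V-2.6 handle; the image URL is made up for the example.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is shown in this image?"},
                {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
            ],
        }
    ]
    completion = model.chat(messages=messages, generate_config={"stream": False})
    print(completion["choices"][0]["message"]["content"])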

xinference/model/llm/transformers/omnilmm.py

@@ -16,20 +16,13 @@ import json
 import logging
 import operator
 import tempfile
-import time
-import uuid
 from typing import Dict, Iterator, List, Optional, Tuple, Union

 from ....thirdparty.omnilmm.chat import OmniLMMChat, img2base64
-from ....types import (
-    ChatCompletion,
-    ChatCompletionChoice,
-    ChatCompletionChunk,
-    ChatCompletionMessage,
-    CompletionUsage,
-)
+from ....types import ChatCompletion, ChatCompletionChunk
 from ...utils import select_device
 from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..utils import generate_chat_completion, parse_messages
 from .core import PytorchChatModel, PytorchGenerateConfig

 logger = logging.getLogger(__name__)
@@ -96,15 +89,14 @@

     def chat(
         self,
-        prompt: Union[str, List[Dict]],
-        system_prompt: Optional[str] = None,
-        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        messages: List[Dict],
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         if generate_config and generate_config.get("stream"):
             raise Exception(
                 f"Chat with model {self.model_family.model_name} does not support stream."
             )
+        prompt, _, chat_history = parse_messages(messages)
         image_first, prompt = self._message_content_to_OmniLMM(prompt)

         msgs = []
@@ -135,19 +127,4 @@
         input = {"image": im_64, "question": json.dumps(msgs, ensure_ascii=True)}
         answer = self._model.chat(input=input)

-        return ChatCompletion(
-            id="chat" + str(uuid.uuid1()),
-            object="chat.completion",
-            created=int(time.time()),
-            model=self.model_uid,
-            choices=[
-                ChatCompletionChoice(
-                    index=0,
-                    message={"role": "assistant", "content": answer},
-                    finish_reason="stop",
-                )
-            ],
-            usage=CompletionUsage(
-                prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
-            ),
-        )
+        return generate_chat_completion(self.model_uid, answer)
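
Both files above drop their hand-rolled ChatCompletion construction in favor of the shared generate_chat_completion helper imported from ..utils. The helper's body is not part of this diff; the sketch below only approximates the shape it returns, inferred from the inline code that was removed (including the -1 placeholder token counts):

    import time
    import uuid


    def generate_chat_completion_sketch(model_uid: str, content: str) -> dict:
        # Approximation of the helper's output, reconstructed from the removed
        # inline ChatCompletion(...) calls; not the actual xinference implementation.
        return {
            "id": "chat" + str(uuid.uuid1()),
            "object": "chat.completion",
            "created": int(time.time()),
            "model": model_uid,
            "choices": [
                {
                    "index": 0,
                    "message": {"role": "assistant", "content": content},
                    "finish_reason": "stop",
                }
            ],
            "usage": {"prompt_tokens": -1, "completion_tokens": -1, "total_tokens": -1},
        }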

xinference/model/llm/transformers/qwen2_audio.py (new file)

@@ -0,0 +1,168 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+import uuid
+from io import BytesIO
+from typing import Dict, Iterator, List, Optional, Union
+from urllib.request import urlopen
+
+import numpy as np
+
+from ....model.utils import select_device
+from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk
+from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..utils import generate_chat_completion, generate_completion_chunk
+from .core import PytorchChatModel, PytorchGenerateConfig
+
+logger = logging.getLogger(__name__)
+
+
+class Qwen2AudioChatModel(PytorchChatModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._processor = None
+        self._model = None
+        self._device = None
+
+    @classmethod
+    def match(
+        cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        llm_family = model_family.model_family or model_family.model_name
+        if "qwen2-audio".lower() in llm_family.lower():
+            return True
+        return False
+
+    def load(self):
+        from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration
+
+        device = self._pytorch_model_config.get("device", "auto")
+        device = select_device(device)
+        self._device = device
+        # for multiple GPU, set back to auto to make multiple devices work
+        device = "auto" if device == "cuda" else device
+
+        self._processor = AutoProcessor.from_pretrained(
+            self.model_path,
+            device_map=device,
+            # trust_remote_code=True,
+            code_revision=self.model_spec.model_revision,
+        )
+        self._model = Qwen2AudioForConditionalGeneration.from_pretrained(
+            self.model_path,
+            device_map=device,
+            # trust_remote_code=True,
+            revision=self.model_spec.model_revision,
+        )
+
+    def _transform_messages(
+        self,
+        messages: List[Dict],
+    ):
+        import librosa
+
+        text = self._processor.apply_chat_template(
+            messages, add_generation_prompt=True, tokenize=False
+        )
+        audios: List[np.ndarray] = []
+        for msg in messages:
+            content = msg["content"]
+            if isinstance(content, List):
+                for item in content:  # type: ignore
+                    if item.get("type") == "audio" and "audio_url" in item:
+                        audio = librosa.load(
+                            BytesIO(urlopen(item["audio_url"]).read()),
+                            sr=self._processor.feature_extractor.sampling_rate,
+                        )[0]
+                        audios.append(audio)
+
+        return text, audios
+
+    def chat(
+        self,
+        messages: List[Dict],
+        generate_config: Optional[PytorchGenerateConfig] = None,
+    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        text, audios = self._transform_messages(messages)
+        inputs = self._processor(
+            text=text, audios=audios, return_tensors="pt", padding=True
+        )
+        inputs.input_ids = inputs.input_ids.to(self._device)
+        generate_config = generate_config if generate_config else {}
+        stream = generate_config.get("stream", False) if generate_config else False
+
+        if stream:
+            it = self._generate_stream(inputs, generate_config)
+            return self._to_chat_completion_chunks(it)
+        else:
+            c = self._generate(inputs, generate_config)
+            return c
+
+    def _generate(self, inputs, config: PytorchGenerateConfig = {}) -> ChatCompletion:
+        generate_ids = self._model.generate(
+            **inputs,
+            max_length=config.get("max_tokens", 512),
+        )
+        generate_ids = generate_ids[:, inputs.input_ids.size(1) :]
+        response = self._processor.batch_decode(
+            generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )[0]
+        return generate_chat_completion(self.model_uid, response)
+
+    def _generate_stream(
+        self, inputs, config: PytorchGenerateConfig = {}
+    ) -> Iterator[CompletionChunk]:
+        from threading import Thread
+
+        from transformers import TextIteratorStreamer
+
+        tokenizer = self._processor.tokenizer
+        streamer = TextIteratorStreamer(
+            tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True
+        )
+
+        gen_kwargs = {
+            "max_new_tokens": config.get("max_tokens", 512),
+            "streamer": streamer,
+            **inputs,
+        }
+
+        thread = Thread(target=self._model.generate, kwargs=gen_kwargs)
+        thread.start()
+
+        completion_id = str(uuid.uuid1())
+        for new_text in streamer:
+            yield generate_completion_chunk(
+                chunk_text=new_text,
+                finish_reason=None,
+                chunk_id=completion_id,
+                model_uid=self.model_uid,
+                prompt_tokens=-1,
+                completion_tokens=-1,
+                total_tokens=-1,
+                has_choice=True,
+                has_content=True,
+            )
+
+        yield generate_completion_chunk(
+            chunk_text=None,
+            finish_reason="stop",
+            chunk_id=completion_id,
+            model_uid=self.model_uid,
+            prompt_tokens=-1,
+            completion_tokens=-1,
+            total_tokens=-1,
+            has_choice=True,
+            has_content=False,
+        )
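
As _transform_messages above shows, the new Qwen2AudioChatModel pulls audio from "audio_url" entries in the message content and decodes them with librosa before generation. Below is a hedged sketch of a request a client might send once such a model is running; the host, port, model uid, and audio URL are placeholders, not values taken from this diff:

    # Illustrative request against an OpenAI-compatible chat endpoint; all
    # concrete values (host, model uid, audio URL) are made up for the example.
    import requests

    payload = {
        "model": "my-qwen2-audio",
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "audio", "audio_url": "https://example.com/clip.wav"},
                    {"type": "text", "text": "What is being said in this clip?"},
                ],
            }
        ],
        "stream": False,
    }
    resp = requests.post("http://127.0.0.1:9997/v1/chat/completions", json=payload)
    print(resp.json()["choices"][0]["message"]["content"])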