xinference 0.14.4.post1__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release.



Files changed (149)
  1. xinference/_compat.py +51 -0
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +5 -39
  4. xinference/client/restful/restful_client.py +3 -24
  5. xinference/conftest.py +1 -1
  6. xinference/constants.py +5 -0
  7. xinference/core/cache_tracker.py +1 -1
  8. xinference/core/chat_interface.py +8 -14
  9. xinference/core/event.py +1 -1
  10. xinference/core/model.py +82 -31
  11. xinference/core/scheduler.py +37 -37
  12. xinference/core/status_guard.py +1 -1
  13. xinference/core/supervisor.py +11 -10
  14. xinference/core/utils.py +80 -22
  15. xinference/core/worker.py +17 -16
  16. xinference/deploy/cmdline.py +8 -16
  17. xinference/deploy/local.py +1 -1
  18. xinference/deploy/supervisor.py +1 -1
  19. xinference/deploy/utils.py +1 -1
  20. xinference/deploy/worker.py +1 -1
  21. xinference/model/audio/cosyvoice.py +86 -41
  22. xinference/model/embedding/core.py +52 -31
  23. xinference/model/image/stable_diffusion/core.py +18 -1
  24. xinference/model/llm/__init__.py +21 -11
  25. xinference/model/llm/llama_cpp/core.py +16 -33
  26. xinference/model/llm/llm_family.json +619 -1297
  27. xinference/model/llm/llm_family.py +31 -52
  28. xinference/model/llm/llm_family_csghub.json +18 -35
  29. xinference/model/llm/llm_family_modelscope.json +573 -1119
  30. xinference/model/llm/lmdeploy/core.py +56 -88
  31. xinference/model/llm/mlx/core.py +46 -69
  32. xinference/model/llm/sglang/core.py +33 -18
  33. xinference/model/llm/transformers/chatglm.py +167 -305
  34. xinference/model/llm/transformers/cogvlm2.py +36 -63
  35. xinference/model/llm/transformers/cogvlm2_video.py +33 -223
  36. xinference/model/llm/transformers/core.py +49 -50
  37. xinference/model/llm/transformers/deepseek_vl.py +53 -96
  38. xinference/model/llm/transformers/glm4v.py +55 -111
  39. xinference/model/llm/transformers/intern_vl.py +39 -70
  40. xinference/model/llm/transformers/internlm2.py +32 -54
  41. xinference/model/llm/transformers/minicpmv25.py +22 -55
  42. xinference/model/llm/transformers/minicpmv26.py +158 -68
  43. xinference/model/llm/transformers/omnilmm.py +5 -28
  44. xinference/model/llm/transformers/qwen2_vl.py +208 -0
  45. xinference/model/llm/transformers/qwen_vl.py +34 -86
  46. xinference/model/llm/transformers/utils.py +32 -38
  47. xinference/model/llm/transformers/yi_vl.py +32 -72
  48. xinference/model/llm/utils.py +195 -489
  49. xinference/model/llm/vllm/core.py +153 -100
  50. xinference/model/rerank/core.py +41 -8
  51. xinference/model/rerank/model_spec.json +7 -0
  52. xinference/model/rerank/model_spec_modelscope.json +7 -1
  53. xinference/model/utils.py +1 -31
  54. xinference/thirdparty/cosyvoice/bin/export_jit.py +64 -0
  55. xinference/thirdparty/cosyvoice/bin/export_trt.py +8 -0
  56. xinference/thirdparty/cosyvoice/bin/inference.py +5 -2
  57. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +38 -22
  58. xinference/thirdparty/cosyvoice/cli/model.py +139 -26
  59. xinference/thirdparty/cosyvoice/flow/flow.py +15 -9
  60. xinference/thirdparty/cosyvoice/flow/length_regulator.py +20 -1
  61. xinference/thirdparty/cosyvoice/hifigan/generator.py +8 -4
  62. xinference/thirdparty/cosyvoice/llm/llm.py +14 -13
  63. xinference/thirdparty/cosyvoice/transformer/attention.py +7 -3
  64. xinference/thirdparty/cosyvoice/transformer/decoder.py +1 -1
  65. xinference/thirdparty/cosyvoice/transformer/embedding.py +4 -3
  66. xinference/thirdparty/cosyvoice/transformer/encoder.py +4 -2
  67. xinference/thirdparty/cosyvoice/utils/common.py +36 -0
  68. xinference/thirdparty/cosyvoice/utils/file_utils.py +16 -0
  69. xinference/thirdparty/deepseek_vl/serve/assets/Kelpy-Codos.js +100 -0
  70. xinference/thirdparty/deepseek_vl/serve/assets/avatar.png +0 -0
  71. xinference/thirdparty/deepseek_vl/serve/assets/custom.css +355 -0
  72. xinference/thirdparty/deepseek_vl/serve/assets/custom.js +22 -0
  73. xinference/thirdparty/deepseek_vl/serve/assets/favicon.ico +0 -0
  74. xinference/thirdparty/deepseek_vl/serve/examples/app.png +0 -0
  75. xinference/thirdparty/deepseek_vl/serve/examples/chart.png +0 -0
  76. xinference/thirdparty/deepseek_vl/serve/examples/mirror.png +0 -0
  77. xinference/thirdparty/deepseek_vl/serve/examples/pipeline.png +0 -0
  78. xinference/thirdparty/deepseek_vl/serve/examples/puzzle.png +0 -0
  79. xinference/thirdparty/deepseek_vl/serve/examples/rap.jpeg +0 -0
  80. xinference/thirdparty/fish_speech/fish_speech/configs/base.yaml +87 -0
  81. xinference/thirdparty/fish_speech/fish_speech/configs/firefly_gan_vq.yaml +34 -0
  82. xinference/thirdparty/fish_speech/fish_speech/configs/lora/r_8_alpha_16.yaml +4 -0
  83. xinference/thirdparty/fish_speech/fish_speech/configs/text2semantic_finetune.yaml +83 -0
  84. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text-data.proto +24 -0
  85. xinference/thirdparty/fish_speech/fish_speech/i18n/README.md +27 -0
  86. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/.gitignore +114 -0
  87. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/README.md +36 -0
  88. xinference/thirdparty/fish_speech/fish_speech/webui/css/style.css +161 -0
  89. xinference/thirdparty/fish_speech/fish_speech/webui/html/footer.html +11 -0
  90. xinference/thirdparty/fish_speech/fish_speech/webui/js/animate.js +69 -0
  91. xinference/thirdparty/fish_speech/tools/sensevoice/README.md +59 -0
  92. xinference/thirdparty/matcha/VERSION +1 -0
  93. xinference/thirdparty/matcha/hifigan/LICENSE +21 -0
  94. xinference/thirdparty/matcha/hifigan/README.md +101 -0
  95. xinference/thirdparty/omnilmm/LICENSE +201 -0
  96. xinference/thirdparty/whisper/__init__.py +156 -0
  97. xinference/thirdparty/whisper/__main__.py +3 -0
  98. xinference/thirdparty/whisper/assets/gpt2.tiktoken +50256 -0
  99. xinference/thirdparty/whisper/assets/mel_filters.npz +0 -0
  100. xinference/thirdparty/whisper/assets/multilingual.tiktoken +50257 -0
  101. xinference/thirdparty/whisper/audio.py +157 -0
  102. xinference/thirdparty/whisper/decoding.py +826 -0
  103. xinference/thirdparty/whisper/model.py +314 -0
  104. xinference/thirdparty/whisper/normalizers/__init__.py +2 -0
  105. xinference/thirdparty/whisper/normalizers/basic.py +76 -0
  106. xinference/thirdparty/whisper/normalizers/english.json +1741 -0
  107. xinference/thirdparty/whisper/normalizers/english.py +550 -0
  108. xinference/thirdparty/whisper/timing.py +386 -0
  109. xinference/thirdparty/whisper/tokenizer.py +395 -0
  110. xinference/thirdparty/whisper/transcribe.py +605 -0
  111. xinference/thirdparty/whisper/triton_ops.py +109 -0
  112. xinference/thirdparty/whisper/utils.py +316 -0
  113. xinference/thirdparty/whisper/version.py +1 -0
  114. xinference/types.py +7 -49
  115. xinference/web/ui/build/asset-manifest.json +6 -6
  116. xinference/web/ui/build/index.html +1 -1
  117. xinference/web/ui/build/static/css/{main.4bafd904.css → main.632e9148.css} +2 -2
  118. xinference/web/ui/build/static/css/main.632e9148.css.map +1 -0
  119. xinference/web/ui/build/static/js/main.9cfafbd6.js +3 -0
  120. xinference/web/ui/build/static/js/{main.eb13fe95.js.LICENSE.txt → main.9cfafbd6.js.LICENSE.txt} +2 -0
  121. xinference/web/ui/build/static/js/main.9cfafbd6.js.map +1 -0
  122. xinference/web/ui/node_modules/.cache/babel-loader/01d6d198156bacbd436c51435edbd4b2cacd47a79db929105eba30f74b67d48d.json +1 -0
  123. xinference/web/ui/node_modules/.cache/babel-loader/10c69dc7a296779fcffedeff9393d832dfcb0013c36824adf623d3c518b801ff.json +1 -0
  124. xinference/web/ui/node_modules/.cache/babel-loader/59eb25f514afcc4fefd1b309d192b2455f1e0aec68a9de598ca4b2333fe2c774.json +1 -0
  125. xinference/web/ui/node_modules/.cache/babel-loader/68bede6d95bb5ef0b35bbb3ec5b8c937eaf6862c6cdbddb5ef222a7776aaf336.json +1 -0
  126. xinference/web/ui/node_modules/.cache/babel-loader/77d50223f3e734d4485cca538cb098a8c3a7a0a1a9f01f58cdda3af42fe1adf5.json +1 -0
  127. xinference/web/ui/node_modules/.cache/babel-loader/a56d5a642409a84988891089c98ca28ad0546432dfbae8aaa51bc5a280e1cdd2.json +1 -0
  128. xinference/web/ui/node_modules/.cache/babel-loader/d9ff696a3e3471f01b46c63d18af32e491eb5dc0e43cb30202c96871466df57f.json +1 -0
  129. xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +1 -0
  130. xinference/web/ui/node_modules/.package-lock.json +37 -0
  131. xinference/web/ui/node_modules/a-sync-waterfall/package.json +21 -0
  132. xinference/web/ui/node_modules/nunjucks/node_modules/commander/package.json +48 -0
  133. xinference/web/ui/node_modules/nunjucks/package.json +112 -0
  134. xinference/web/ui/package-lock.json +38 -0
  135. xinference/web/ui/package.json +1 -0
  136. {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/METADATA +8 -8
  137. {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/RECORD +141 -87
  138. xinference/model/llm/transformers/llama_2.py +0 -108
  139. xinference/web/ui/build/static/css/main.4bafd904.css.map +0 -1
  140. xinference/web/ui/build/static/js/main.eb13fe95.js +0 -3
  141. xinference/web/ui/build/static/js/main.eb13fe95.js.map +0 -1
  142. xinference/web/ui/node_modules/.cache/babel-loader/0b11a5339468c13b2d31ac085e7effe4303259b2071abd46a0a8eb8529233a5e.json +0 -1
  143. xinference/web/ui/node_modules/.cache/babel-loader/213b5913e164773c2b0567455377765715f5f07225fbac77ad8e1e9dc9648a47.json +0 -1
  144. xinference/web/ui/node_modules/.cache/babel-loader/5c26a23b5eacf5b752a08531577ae3840bb247745ef9a39583dc2d05ba93a82a.json +0 -1
  145. xinference/web/ui/node_modules/.cache/babel-loader/978b57d1a04a701bc3fcfebc511f5f274eed6ed7eade67f6fb76c27d5fd9ecc8.json +0 -1
  146. {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/LICENSE +0 -0
  147. {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/WHEEL +0 -0
  148. {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/entry_points.txt +0 -0
  149. {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/transformers/internlm2.py
@@ -11,23 +11,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import time
 import uuid
 from typing import Any, Dict, Iterator, List, Optional, Union

 from ....core.scheduler import InferenceRequest
-from ....types import (
-    ChatCompletion,
-    ChatCompletionChoice,
-    ChatCompletionChunk,
-    ChatCompletionMessage,
-    CompletionChoice,
-    CompletionChunk,
-    CompletionUsage,
-    LoRA,
-    PytorchGenerateConfig,
-)
+from ....types import ChatCompletion, ChatCompletionChunk, LoRA, PytorchGenerateConfig
 from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..utils import generate_chat_completion, generate_completion_chunk, parse_messages
 from .core import PytorchChatModel, PytorchModelConfig
@@ -106,9 +96,7 @@ class Internlm2PytorchChatModel(PytorchChatModel):

     def chat(
         self,
-        prompt: str,
-        system_prompt: Optional[str] = None,
-        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        messages: List[Dict],
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         kwargs: Dict[str, Any] = {}
@@ -130,6 +118,8 @@ class Internlm2PytorchChatModel(PytorchChatModel):
             if isinstance(stream_options, dict)
             else False
         )
+
+        prompt, system_prompt, chat_history = parse_messages(messages)
         if chat_history:
             input_history = [
                 (chat_history[i]["content"], (chat_history[i + 1]["content"]))
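The two hunks above capture the interface change that recurs throughout this release: `chat()` now takes a single OpenAI-style `messages` list instead of separate `prompt`, `system_prompt`, and `chat_history` arguments, and `parse_messages` splits that list apart inside the model class. Below is a minimal sketch of what that split presumably looks like, inferred only from the call sites in this diff; the real helper lives in xinference/model/llm/utils.py and may differ in details.

    # Hypothetical stand-in for parse_messages, inferred from its call sites here:
    # it returns (prompt, system_prompt, chat_history) from an OpenAI-style list.
    from typing import Dict, List, Optional, Tuple

    def parse_messages_sketch(
        messages: List[Dict],
    ) -> Tuple[str, Optional[str], List[Dict]]:
        system_prompt = None
        if messages and messages[0].get("role") == "system":
            system_prompt = messages[0]["content"]
            messages = messages[1:]
        # The trailing user turn becomes the prompt; earlier turns become the history.
        prompt = messages[-1]["content"]
        chat_history = messages[:-1]
        return prompt, system_prompt, chat_history

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Introduce InternLM2 in one sentence."},
    ]
    print(parse_messages_sketch(messages))
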
@@ -155,54 +145,42 @@ class Internlm2PytorchChatModel(PytorchChatModel):
                     total_tokens = prompt_tokens + completion_tokens
                     chunk_text = chunk_text[last_chunk_text_length:]
                     last_chunk_text_length += len(chunk_text)
-                    completion_choice = CompletionChoice(
-                        text=chunk_text, index=0, logprobs=None, finish_reason=None
-                    )
-                    yield CompletionChunk(
-                        id=chunk_id,
-                        object="text_completion",
-                        created=int(time.time()),
-                        model=self.model_uid,
-                        choices=[completion_choice],
-                        usage=CompletionUsage(
-                            prompt_tokens=prompt_tokens,
-                            completion_tokens=completion_tokens,
-                            total_tokens=total_tokens,
-                        ),
+
+                    yield generate_completion_chunk(
+                        chunk_text,
+                        finish_reason=None,
+                        chunk_id=chunk_id,
+                        model_uid=self.model_uid,
+                        prompt_tokens=prompt_tokens,
+                        completion_tokens=completion_tokens,
+                        total_tokens=total_tokens,
                     )
+                yield generate_completion_chunk(
+                    None,
+                    finish_reason="stop",
+                    chunk_id=chunk_id,
+                    model_uid=self.model_uid,
+                    prompt_tokens=prompt_tokens,
+                    completion_tokens=completion_tokens,
+                    total_tokens=total_tokens,
+                    has_choice=True,
+                    has_content=False,
+                )
                 if include_usage:
-                    chunk = CompletionChunk(
-                        id=chunk_id,
-                        object="text_completion",
-                        created=int(time.time()),
-                        model=self.model_uid,
-                        choices=[],
-                    )
-                    chunk["usage"] = CompletionUsage(
+                    yield generate_completion_chunk(
+                        None,
+                        finish_reason=None,
+                        chunk_id=chunk_id,
+                        model_uid=self.model_uid,
                         prompt_tokens=prompt_tokens,
                         completion_tokens=completion_tokens,
                         total_tokens=total_tokens,
+                        has_choice=False,
                     )
-                    yield chunk

             return self._to_chat_completion_chunks(_stream_generator())
         else:
             response, _ = self._model.chat(
                 self._tokenizer, prompt, input_history, **kwargs
             )
-            return ChatCompletion(
-                id="chat" + str(uuid.uuid1()),
-                object="chat.completion",
-                created=int(time.time()),
-                model=self.model_uid,
-                choices=[
-                    ChatCompletionChoice(
-                        index=0,
-                        message={"role": "assistant", "content": response},
-                        finish_reason="stop",
-                    )
-                ],
-                usage=CompletionUsage(
-                    prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
-                ),
-            )
+            return generate_chat_completion(self.model_uid, response)
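The removed lines above show exactly what the hand-rolled streaming payloads looked like; 0.15.0 centralizes that construction in `generate_completion_chunk` and `generate_chat_completion`, with `has_choice`/`has_content` flags controlling the terminal "stop" and usage-only chunks. The sketch below is an illustration of a chunk builder with the same shape as the removed code, not the actual helper from xinference/model/llm/utils.py.

    # Illustration only: a chunk with the same fields the removed code built by hand.
    import time
    from typing import Optional

    def build_completion_chunk(
        chunk_text: Optional[str],
        finish_reason: Optional[str],
        chunk_id: str,
        model_uid: str,
        prompt_tokens: int,
        completion_tokens: int,
        total_tokens: int,
        has_choice: bool = True,
        has_content: bool = True,
    ) -> dict:
        # One OpenAI-style "text_completion" chunk, mirroring the removed payloads.
        choices = []
        if has_choice:
            choice = {"index": 0, "logprobs": None, "finish_reason": finish_reason}
            if has_content:
                choice["text"] = chunk_text
            choices.append(choice)
        return {
            "id": chunk_id,
            "object": "text_completion",
            "created": int(time.time()),
            "model": model_uid,
            "choices": choices,
            "usage": {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": total_tokens,
            },
        }

    print(build_completion_chunk("Hello", None, "chunk-0", "my-model", 5, 1, 6))
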
xinference/model/llm/transformers/minicpmv25.py
@@ -13,25 +13,21 @@
 # limitations under the License.
 import json
 import logging
-import time
 import uuid
 from concurrent.futures import ThreadPoolExecutor
 from typing import Dict, Iterator, List, Optional, Union

 import torch

-from ....types import (
-    ChatCompletion,
-    ChatCompletionChunk,
-    ChatCompletionMessage,
-    Completion,
-    CompletionChoice,
-    CompletionChunk,
-    CompletionUsage,
-)
+from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk
 from ...utils import select_device
 from ..llm_family import LLMFamilyV1, LLMSpecV1
-from ..utils import _decode_image
+from ..utils import (
+    _decode_image,
+    generate_chat_completion,
+    generate_completion_chunk,
+    parse_messages,
+)
 from .core import PytorchChatModel, PytorchGenerateConfig

 logger = logging.getLogger(__name__)
@@ -125,12 +121,11 @@ class MiniCPMV25Model(PytorchChatModel):

     def chat(
         self,
-        prompt: Union[str, List[Dict]],
-        system_prompt: Optional[str] = None,
-        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        messages: List[Dict],
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         stream = generate_config.get("stream", False) if generate_config else False
+        prompt, _, chat_history = parse_messages(messages)
         content, images_chat = self._message_content_to_chat(prompt)

         msgs = []
@@ -166,57 +161,29 @@ class MiniCPMV25Model(PytorchChatModel):
             it = self.chat_stream(chat)
             return self._to_chat_completion_chunks(it)
         else:
-            c = Completion(
-                id=str(uuid.uuid1()),
-                object="text_completion",
-                created=int(time.time()),
-                model=self.model_uid,
-                choices=[
-                    CompletionChoice(
-                        index=0, text=chat, finish_reason="stop", logprobs=None
-                    )
-                ],
-                usage=CompletionUsage(
-                    prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
-                ),
-            )
-            return self._to_chat_completion(c)
+            return generate_chat_completion(self.model_uid, chat)

     def chat_stream(self, chat) -> Iterator[CompletionChunk]:
         completion_id = str(uuid.uuid1())
         for new_text in chat:
-            completion_choice = CompletionChoice(
-                text=new_text, index=0, logprobs=None, finish_reason=None
-            )
-            chunk = CompletionChunk(
-                id=completion_id,
-                object="text_completion",
-                created=int(time.time()),
-                model=self.model_uid,
-                choices=[completion_choice],
-            )
-            completion_usage = CompletionUsage(
+            yield generate_completion_chunk(
+                chunk_text=new_text,
+                finish_reason=None,
+                chunk_id=completion_id,
+                model_uid=self.model_uid,
                 prompt_tokens=-1,
                 completion_tokens=-1,
                 total_tokens=-1,
             )
-            chunk["usage"] = completion_usage
-            yield chunk

-        completion_choice = CompletionChoice(
-            text="", index=0, logprobs=None, finish_reason="stop"
-        )
-        chunk = CompletionChunk(
-            id=completion_id,
-            object="text_completion",
-            created=int(time.time()),
-            model=self.model_uid,
-            choices=[completion_choice],
-        )
-        completion_usage = CompletionUsage(
+        yield generate_completion_chunk(
+            chunk_text=None,
+            finish_reason="stop",
+            chunk_id=completion_id,
+            model_uid=self.model_uid,
            prompt_tokens=-1,
            completion_tokens=-1,
            total_tokens=-1,
+            has_choice=True,
+            has_content=False,
         )
-        chunk["usage"] = completion_usage
-        yield chunk
xinference/model/llm/transformers/minicpmv26.py
@@ -12,26 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
-import time
 import uuid
 from concurrent.futures import ThreadPoolExecutor
-from typing import Dict, Iterator, List, Optional, Union
+from typing import Dict, Iterator, List, Optional, Tuple, Union

 import torch
 from PIL import Image

-from ....types import (
-    ChatCompletion,
-    ChatCompletionChunk,
-    ChatCompletionMessage,
-    Completion,
-    CompletionChoice,
-    CompletionChunk,
-    CompletionUsage,
-)
+from ....core.scheduler import InferenceRequest
+from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk
 from ...utils import select_device
 from ..llm_family import LLMFamilyV1, LLMSpecV1
-from ..utils import _decode_image
+from ..utils import (
+    _decode_image,
+    generate_chat_completion,
+    generate_completion_chunk,
+    parse_messages,
+)
 from .core import PytorchChatModel, PytorchGenerateConfig

 logger = logging.getLogger(__name__)
@@ -43,6 +40,7 @@ class MiniCPMV26Model(PytorchChatModel):
         self._device = None
         self._tokenizer = None
         self._model = None
+        self._processor = None

     @classmethod
     def match(
@@ -59,7 +57,7 @@ class MiniCPMV26Model(PytorchChatModel):
         return AutoModel

     def load(self, **kwargs):
-        from transformers import AutoModel, AutoTokenizer
+        from transformers import AutoModel, AutoProcessor, AutoTokenizer
         from transformers.generation import GenerationConfig

         device = self._pytorch_model_config.get("device", "auto")
@@ -100,6 +98,10 @@ class MiniCPMV26Model(PytorchChatModel):
             self.model_path,
             trust_remote_code=True,
         )
+        self._processor = AutoProcessor.from_pretrained(
+            self.model_path, trust_remote_code=True
+        )
+        self._device = self._model.device
         self._save_tensorizer()

     def _message_content_to_chat(self, content):
@@ -120,7 +122,9 @@ class MiniCPMV26Model(PytorchChatModel):
             frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
             frames = vr.get_batch(frame_idx).asnumpy()
             frames = [Image.fromarray(v.astype("uint8")) for v in frames]
-            print("num frames:", len(frames))
+            logger.info(
+                f"Num frames: {len(frames)} when decoding video for {self.model_uid}"
+            )
             return frames

         def _load_video(_url):
@@ -158,19 +162,13 @@ class MiniCPMV26Model(PytorchChatModel):
             return text, images, frames
         return content, [], []

-    def chat(
-        self,
-        prompt: Union[str, List[Dict]],
-        system_prompt: Optional[str] = None,
-        chat_history: Optional[List[ChatCompletionMessage]] = None,
-        generate_config: Optional[PytorchGenerateConfig] = None,
-    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-        stream = generate_config.get("stream", False) if generate_config else False
-        videoExisted = False
+    def _convert_to_specific_style(self, messages: List[Dict]) -> Tuple:
+        video_existed = False
+        prompt, _, chat_history = parse_messages(messages)

         content, images_chat, video_frames = self._message_content_to_chat(prompt)
         if len(video_frames) > 0:
-            videoExisted = True
+            video_existed = True
             images_chat = video_frames

         msgs = []
@@ -184,7 +182,7 @@ class MiniCPMV26Model(PytorchChatModel):
             if images_tmp != []:
                 images_history = images_tmp
             if len(video_frames_h) > 0:
-                videoExisted = True
+                video_existed = True
                 images_history = video_frames_h
             if len(query_to_response) == 0 and role == "user":
                 query_to_response.append(
@@ -198,10 +196,19 @@ class MiniCPMV26Model(PytorchChatModel):
                 msgs.extend(query_to_response)
                 query_to_response = []
         msgs.append({"role": "user", "content": images_chat + [content]})
+        return msgs, video_existed
+
+    def chat(
+        self,
+        messages: List[Dict],
+        generate_config: Optional[PytorchGenerateConfig] = None,
+    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        stream = generate_config.get("stream", False) if generate_config else False
+        msgs, video_existed = self._convert_to_specific_style(messages)

         # Set decode params for video
         params = {}
-        if videoExisted:
+        if video_existed:
             params = {"use_image_id": False, "max_slice_nums": 1}

         chat = self._model.chat(
@@ -216,57 +223,140 @@ class MiniCPMV26Model(PytorchChatModel):
             it = self.chat_stream(chat)
             return self._to_chat_completion_chunks(it)
         else:
-            c = Completion(
-                id=str(uuid.uuid1()),
-                object="text_completion",
-                created=int(time.time()),
-                model=self.model_uid,
-                choices=[
-                    CompletionChoice(
-                        index=0, text=chat, finish_reason="stop", logprobs=None
-                    )
-                ],
-                usage=CompletionUsage(
-                    prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
-                ),
-            )
-            return self._to_chat_completion(c)
+            return generate_chat_completion(self.model_uid, chat)

     def chat_stream(self, chat) -> Iterator[CompletionChunk]:
         completion_id = str(uuid.uuid1())
         for new_text in chat:
-            completion_choice = CompletionChoice(
-                text=new_text, index=0, logprobs=None, finish_reason=None
-            )
-            chunk = CompletionChunk(
-                id=completion_id,
-                object="text_completion",
-                created=int(time.time()),
-                model=self.model_uid,
-                choices=[completion_choice],
-            )
-            completion_usage = CompletionUsage(
+            yield generate_completion_chunk(
+                chunk_text=new_text,
+                finish_reason=None,
+                chunk_id=completion_id,
+                model_uid=self.model_uid,
                 prompt_tokens=-1,
                 completion_tokens=-1,
                 total_tokens=-1,
             )
-            chunk["usage"] = completion_usage
-            yield chunk
-
-        completion_choice = CompletionChoice(
-            text="", index=0, logprobs=None, finish_reason="stop"
-        )
-        chunk = CompletionChunk(
-            id=completion_id,
-            object="text_completion",
-            created=int(time.time()),
-            model=self.model_uid,
-            choices=[completion_choice],
-        )
-        completion_usage = CompletionUsage(
+        yield generate_completion_chunk(
+            chunk_text=None,
+            finish_reason="stop",
+            chunk_id=completion_id,
+            model_uid=self.model_uid,
             prompt_tokens=-1,
             completion_tokens=-1,
             total_tokens=-1,
+            has_choice=True,
+            has_content=False,
+        )
+
+    def prepare_sanitize_generate_config(self, req: InferenceRequest):
+        """
+        Refer to https://huggingface.co/openbmb/MiniCPM-V-2_6/blob/main/modeling_minicpmv.py
+        """
+        raw_config = req.inference_kwargs.get("raw_params", {})
+        temperature = raw_config.get("temperature", None)
+        if temperature is None:
+            raw_config["temperature"] = 0.7
+        top_p = raw_config.get("top_p", None)
+        if top_p is None:
+            raw_config["top_p"] = 0.8
+        top_k = raw_config.get("top_k", None)
+        if top_k is None:
+            raw_config["top_k"] = 100
+        repetition_penalty = raw_config.get("repetition_penalty", None)
+        if repetition_penalty is None:
+            raw_config["repetition_penalty"] = 1.05
+        return raw_config
+
+    def _handle_input_ids_and_images(self, msgs: List[Dict]) -> Dict:
+        """
+        Copied from https://huggingface.co/openbmb/MiniCPM-V-2_6/blob/main/modeling_minicpmv.py#L315
+        """
+        from copy import deepcopy
+
+        copy_msgs = deepcopy(msgs)
+
+        images = []
+        for i, msg in enumerate(copy_msgs):
+            role = msg["role"]
+            content = msg["content"]
+            assert role in ["user", "assistant"]
+            if i == 0:
+                assert role == "user", "The role of first msg should be user"
+            if isinstance(content, str):
+                content = [content]
+            cur_msgs = []
+            for c in content:
+                if isinstance(c, Image.Image):
+                    images.append(c)
+                    cur_msgs.append("(<image>./</image>)")
+                elif isinstance(c, str):
+                    cur_msgs.append(c)
+            msg["content"] = "\n".join(cur_msgs)
+
+        return {
+            "prompt": self._processor.tokenizer.apply_chat_template(
+                copy_msgs, tokenize=False, add_generation_prompt=True
+            ),
+            "input_image": images,
+        }
+
+    def _get_full_prompt(self, messages: List[Dict], tools):
+        msgs, video_existed = self._convert_to_specific_style(messages)
+        if video_existed:
+            raise RuntimeError(
+                f"Continuous batching does not support video inputs for this model: {self.model_uid}"
+            )
+        return self._handle_input_ids_and_images(msgs)
+
+    def build_prefill_kwargs(self, prompts: List, req_list: List[InferenceRequest]):
+        prompts_lists = [x["prompt"] for x in prompts]
+        input_images_lists = [x["input_image"] for x in prompts]
+        inputs = self._processor(
+            prompts_lists,
+            input_images_lists,
+            max_slice_nums=None,
+            use_image_id=None,
+            return_tensors="pt",
+            max_length=8192,
+        ).to(self._model.device)
+        inputs.pop("image_sizes")
+
+        masked_input_ids = inputs["input_ids"] * inputs["attention_mask"]
+        for i in range(masked_input_ids.shape[0]):
+            non_zero_values = masked_input_ids[i][masked_input_ids[i] != 0].tolist()
+            req_list[i].prompt_tokens = non_zero_values
+            req_list[i].extra_kwargs["attention_mask_seq_len"] = len(non_zero_values)
+            req_list[i].padding_len = masked_input_ids.shape[1] - len(non_zero_values)
+
+        model_inputs = {
+            "input_ids": inputs["input_ids"],
+            "image_bound": inputs["image_bound"],
+            "pixel_values": inputs["pixel_values"],
+            "tgt_sizes": inputs["tgt_sizes"],
+        }
+        model_inputs["inputs_embeds"], _ = self._model.get_vllm_embedding(model_inputs)
+
+        return {
+            "inputs_embeds": model_inputs["inputs_embeds"],
+            "attention_mask": inputs["attention_mask"],
+        }
+
+    def build_decode_position_ids(
+        self, batch_size: int, seq_length: int, reqs: List[InferenceRequest]
+    ):
+        return None
+
+    def batch_inference(self, req_list: List[InferenceRequest]):
+        """
+        This method is rewritten
+        because the specific inference process is performed by `self._model.llm`,
+        not `self._model` itself
+        """
+        from .utils import batch_inference_one_step
+
+        self.prepare_batch_inference(req_list)
+        batch_inference_one_step(
+            self, req_list, self.model_uid, self._model.llm, self._tokenizer
         )
-        chunk["usage"] = completion_usage
-        yield chunk
+        self.handle_batch_inference_results(req_list)
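Besides the streaming cleanup, this hunk wires MiniCPM-V 2.6 into continuous batching and adds `prepare_sanitize_generate_config`, which back-fills the model's recommended sampling parameters when a request leaves them unset. Below is a standalone illustration of that defaulting behavior; the values come from the hunk, while the function name and dict-merge framing are assumptions made for the example.

    # Illustration only: defaults applied when a request omits these parameters.
    MINICPM_V26_DEFAULTS = {
        "temperature": 0.7,
        "top_p": 0.8,
        "top_k": 100,
        "repetition_penalty": 1.05,
    }

    def fill_sampling_defaults(raw_params: dict) -> dict:
        # Keep caller-provided values, fall back to the defaults from the hunk above.
        merged = dict(MINICPM_V26_DEFAULTS)
        merged.update({k: v for k, v in raw_params.items() if v is not None})
        return merged

    print(fill_sampling_defaults({"temperature": 0.2}))
    # -> {'temperature': 0.2, 'top_p': 0.8, 'top_k': 100, 'repetition_penalty': 1.05}
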
xinference/model/llm/transformers/omnilmm.py
@@ -16,20 +16,13 @@ import json
 import logging
 import operator
 import tempfile
-import time
-import uuid
 from typing import Dict, Iterator, List, Optional, Tuple, Union

 from ....thirdparty.omnilmm.chat import OmniLMMChat, img2base64
-from ....types import (
-    ChatCompletion,
-    ChatCompletionChoice,
-    ChatCompletionChunk,
-    ChatCompletionMessage,
-    CompletionUsage,
-)
+from ....types import ChatCompletion, ChatCompletionChunk
 from ...utils import select_device
 from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..utils import generate_chat_completion, parse_messages
 from .core import PytorchChatModel, PytorchGenerateConfig

 logger = logging.getLogger(__name__)
@@ -96,15 +89,14 @@ class OmniLMMModel(PytorchChatModel):

     def chat(
         self,
-        prompt: Union[str, List[Dict]],
-        system_prompt: Optional[str] = None,
-        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        messages: List[Dict],
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         if generate_config and generate_config.get("stream"):
             raise Exception(
                 f"Chat with model {self.model_family.model_name} does not support stream."
             )
+        prompt, _, chat_history = parse_messages(messages)
         image_first, prompt = self._message_content_to_OmniLMM(prompt)

         msgs = []
@@ -135,19 +127,4 @@ class OmniLMMModel(PytorchChatModel):
         input = {"image": im_64, "question": json.dumps(msgs, ensure_ascii=True)}
         answer = self._model.chat(input=input)

-        return ChatCompletion(
-            id="chat" + str(uuid.uuid1()),
-            object="chat.completion",
-            created=int(time.time()),
-            model=self.model_uid,
-            choices=[
-                ChatCompletionChoice(
-                    index=0,
-                    message={"role": "assistant", "content": answer},
-                    finish_reason="stop",
-                )
-            ],
-            usage=CompletionUsage(
-                prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
-            ),
-        )
+        return generate_chat_completion(self.model_uid, answer)