xinference 0.14.4.post1__py3-none-any.whl → 0.15.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xinference might be problematic.

Files changed (194)
  1. xinference/_compat.py +51 -0
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +209 -40
  4. xinference/client/restful/restful_client.py +7 -26
  5. xinference/conftest.py +1 -1
  6. xinference/constants.py +5 -0
  7. xinference/core/cache_tracker.py +1 -1
  8. xinference/core/chat_interface.py +8 -14
  9. xinference/core/event.py +1 -1
  10. xinference/core/image_interface.py +28 -0
  11. xinference/core/model.py +110 -31
  12. xinference/core/scheduler.py +37 -37
  13. xinference/core/status_guard.py +1 -1
  14. xinference/core/supervisor.py +17 -10
  15. xinference/core/utils.py +80 -22
  16. xinference/core/worker.py +17 -16
  17. xinference/deploy/cmdline.py +8 -16
  18. xinference/deploy/local.py +1 -1
  19. xinference/deploy/supervisor.py +1 -1
  20. xinference/deploy/utils.py +1 -1
  21. xinference/deploy/worker.py +1 -1
  22. xinference/model/audio/cosyvoice.py +86 -41
  23. xinference/model/audio/fish_speech.py +9 -9
  24. xinference/model/audio/model_spec.json +9 -9
  25. xinference/model/audio/whisper.py +4 -1
  26. xinference/model/embedding/core.py +52 -31
  27. xinference/model/image/core.py +2 -1
  28. xinference/model/image/model_spec.json +16 -4
  29. xinference/model/image/model_spec_modelscope.json +16 -4
  30. xinference/model/image/sdapi.py +136 -0
  31. xinference/model/image/stable_diffusion/core.py +164 -19
  32. xinference/model/llm/__init__.py +29 -11
  33. xinference/model/llm/llama_cpp/core.py +16 -33
  34. xinference/model/llm/llm_family.json +1011 -1296
  35. xinference/model/llm/llm_family.py +34 -53
  36. xinference/model/llm/llm_family_csghub.json +18 -35
  37. xinference/model/llm/llm_family_modelscope.json +981 -1122
  38. xinference/model/llm/lmdeploy/core.py +56 -88
  39. xinference/model/llm/mlx/core.py +46 -69
  40. xinference/model/llm/sglang/core.py +36 -18
  41. xinference/model/llm/transformers/chatglm.py +168 -306
  42. xinference/model/llm/transformers/cogvlm2.py +36 -63
  43. xinference/model/llm/transformers/cogvlm2_video.py +33 -223
  44. xinference/model/llm/transformers/core.py +55 -50
  45. xinference/model/llm/transformers/deepseek_v2.py +340 -0
  46. xinference/model/llm/transformers/deepseek_vl.py +53 -96
  47. xinference/model/llm/transformers/glm4v.py +55 -111
  48. xinference/model/llm/transformers/intern_vl.py +39 -70
  49. xinference/model/llm/transformers/internlm2.py +32 -54
  50. xinference/model/llm/transformers/minicpmv25.py +22 -55
  51. xinference/model/llm/transformers/minicpmv26.py +158 -68
  52. xinference/model/llm/transformers/omnilmm.py +5 -28
  53. xinference/model/llm/transformers/qwen2_audio.py +168 -0
  54. xinference/model/llm/transformers/qwen2_vl.py +234 -0
  55. xinference/model/llm/transformers/qwen_vl.py +34 -86
  56. xinference/model/llm/transformers/utils.py +32 -38
  57. xinference/model/llm/transformers/yi_vl.py +32 -72
  58. xinference/model/llm/utils.py +280 -554
  59. xinference/model/llm/vllm/core.py +161 -100
  60. xinference/model/rerank/core.py +41 -8
  61. xinference/model/rerank/model_spec.json +7 -0
  62. xinference/model/rerank/model_spec_modelscope.json +7 -1
  63. xinference/model/utils.py +1 -31
  64. xinference/thirdparty/cosyvoice/bin/export_jit.py +64 -0
  65. xinference/thirdparty/cosyvoice/bin/export_trt.py +8 -0
  66. xinference/thirdparty/cosyvoice/bin/inference.py +5 -2
  67. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +38 -22
  68. xinference/thirdparty/cosyvoice/cli/model.py +139 -26
  69. xinference/thirdparty/cosyvoice/flow/flow.py +15 -9
  70. xinference/thirdparty/cosyvoice/flow/length_regulator.py +20 -1
  71. xinference/thirdparty/cosyvoice/hifigan/generator.py +8 -4
  72. xinference/thirdparty/cosyvoice/llm/llm.py +14 -13
  73. xinference/thirdparty/cosyvoice/transformer/attention.py +7 -3
  74. xinference/thirdparty/cosyvoice/transformer/decoder.py +1 -1
  75. xinference/thirdparty/cosyvoice/transformer/embedding.py +4 -3
  76. xinference/thirdparty/cosyvoice/transformer/encoder.py +4 -2
  77. xinference/thirdparty/cosyvoice/utils/common.py +36 -0
  78. xinference/thirdparty/cosyvoice/utils/file_utils.py +16 -0
  79. xinference/thirdparty/deepseek_vl/serve/assets/Kelpy-Codos.js +100 -0
  80. xinference/thirdparty/deepseek_vl/serve/assets/avatar.png +0 -0
  81. xinference/thirdparty/deepseek_vl/serve/assets/custom.css +355 -0
  82. xinference/thirdparty/deepseek_vl/serve/assets/custom.js +22 -0
  83. xinference/thirdparty/deepseek_vl/serve/assets/favicon.ico +0 -0
  84. xinference/thirdparty/deepseek_vl/serve/examples/app.png +0 -0
  85. xinference/thirdparty/deepseek_vl/serve/examples/chart.png +0 -0
  86. xinference/thirdparty/deepseek_vl/serve/examples/mirror.png +0 -0
  87. xinference/thirdparty/deepseek_vl/serve/examples/pipeline.png +0 -0
  88. xinference/thirdparty/deepseek_vl/serve/examples/puzzle.png +0 -0
  89. xinference/thirdparty/deepseek_vl/serve/examples/rap.jpeg +0 -0
  90. xinference/thirdparty/fish_speech/fish_speech/configs/base.yaml +87 -0
  91. xinference/thirdparty/fish_speech/fish_speech/configs/firefly_gan_vq.yaml +33 -0
  92. xinference/thirdparty/fish_speech/fish_speech/configs/lora/r_8_alpha_16.yaml +4 -0
  93. xinference/thirdparty/fish_speech/fish_speech/configs/text2semantic_finetune.yaml +83 -0
  94. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text-data.proto +24 -0
  95. xinference/thirdparty/fish_speech/fish_speech/i18n/README.md +27 -0
  96. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +1 -1
  97. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +1 -1
  98. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +1 -1
  99. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/pt_BR.json +1 -1
  100. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +1 -1
  101. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +2 -2
  102. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/__init__.py +0 -3
  103. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +169 -198
  104. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +4 -27
  105. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/.gitignore +114 -0
  106. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/README.md +36 -0
  107. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +9 -47
  108. xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +2 -2
  109. xinference/thirdparty/fish_speech/fish_speech/train.py +2 -0
  110. xinference/thirdparty/fish_speech/fish_speech/webui/css/style.css +161 -0
  111. xinference/thirdparty/fish_speech/fish_speech/webui/html/footer.html +11 -0
  112. xinference/thirdparty/fish_speech/fish_speech/webui/js/animate.js +69 -0
  113. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +12 -10
  114. xinference/thirdparty/fish_speech/tools/api.py +79 -134
  115. xinference/thirdparty/fish_speech/tools/commons.py +35 -0
  116. xinference/thirdparty/fish_speech/tools/download_models.py +3 -3
  117. xinference/thirdparty/fish_speech/tools/file.py +17 -0
  118. xinference/thirdparty/fish_speech/tools/llama/build_dataset.py +1 -1
  119. xinference/thirdparty/fish_speech/tools/llama/generate.py +29 -24
  120. xinference/thirdparty/fish_speech/tools/llama/merge_lora.py +1 -1
  121. xinference/thirdparty/fish_speech/tools/llama/quantize.py +2 -2
  122. xinference/thirdparty/fish_speech/tools/msgpack_api.py +34 -0
  123. xinference/thirdparty/fish_speech/tools/post_api.py +85 -44
  124. xinference/thirdparty/fish_speech/tools/sensevoice/README.md +59 -0
  125. xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py +1 -1
  126. xinference/thirdparty/fish_speech/tools/smart_pad.py +16 -3
  127. xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +2 -2
  128. xinference/thirdparty/fish_speech/tools/vqgan/inference.py +4 -2
  129. xinference/thirdparty/fish_speech/tools/webui.py +12 -146
  130. xinference/thirdparty/matcha/VERSION +1 -0
  131. xinference/thirdparty/matcha/hifigan/LICENSE +21 -0
  132. xinference/thirdparty/matcha/hifigan/README.md +101 -0
  133. xinference/thirdparty/omnilmm/LICENSE +201 -0
  134. xinference/thirdparty/whisper/__init__.py +156 -0
  135. xinference/thirdparty/whisper/__main__.py +3 -0
  136. xinference/thirdparty/whisper/assets/gpt2.tiktoken +50256 -0
  137. xinference/thirdparty/whisper/assets/mel_filters.npz +0 -0
  138. xinference/thirdparty/whisper/assets/multilingual.tiktoken +50257 -0
  139. xinference/thirdparty/whisper/audio.py +157 -0
  140. xinference/thirdparty/whisper/decoding.py +826 -0
  141. xinference/thirdparty/whisper/model.py +314 -0
  142. xinference/thirdparty/whisper/normalizers/__init__.py +2 -0
  143. xinference/thirdparty/whisper/normalizers/basic.py +76 -0
  144. xinference/thirdparty/whisper/normalizers/english.json +1741 -0
  145. xinference/thirdparty/whisper/normalizers/english.py +550 -0
  146. xinference/thirdparty/whisper/timing.py +386 -0
  147. xinference/thirdparty/whisper/tokenizer.py +395 -0
  148. xinference/thirdparty/whisper/transcribe.py +605 -0
  149. xinference/thirdparty/whisper/triton_ops.py +109 -0
  150. xinference/thirdparty/whisper/utils.py +316 -0
  151. xinference/thirdparty/whisper/version.py +1 -0
  152. xinference/types.py +14 -53
  153. xinference/web/ui/build/asset-manifest.json +6 -6
  154. xinference/web/ui/build/index.html +1 -1
  155. xinference/web/ui/build/static/css/{main.4bafd904.css → main.5061c4c3.css} +2 -2
  156. xinference/web/ui/build/static/css/main.5061c4c3.css.map +1 -0
  157. xinference/web/ui/build/static/js/main.754740c0.js +3 -0
  158. xinference/web/ui/build/static/js/{main.eb13fe95.js.LICENSE.txt → main.754740c0.js.LICENSE.txt} +2 -0
  159. xinference/web/ui/build/static/js/main.754740c0.js.map +1 -0
  160. xinference/web/ui/node_modules/.cache/babel-loader/10c69dc7a296779fcffedeff9393d832dfcb0013c36824adf623d3c518b801ff.json +1 -0
  161. xinference/web/ui/node_modules/.cache/babel-loader/68bede6d95bb5ef0b35bbb3ec5b8c937eaf6862c6cdbddb5ef222a7776aaf336.json +1 -0
  162. xinference/web/ui/node_modules/.cache/babel-loader/77d50223f3e734d4485cca538cb098a8c3a7a0a1a9f01f58cdda3af42fe1adf5.json +1 -0
  163. xinference/web/ui/node_modules/.cache/babel-loader/a56d5a642409a84988891089c98ca28ad0546432dfbae8aaa51bc5a280e1cdd2.json +1 -0
  164. xinference/web/ui/node_modules/.cache/babel-loader/cd90b08d177025dfe84209596fc51878f8a86bcaa6a240848a3d2e5fd4c7ff24.json +1 -0
  165. xinference/web/ui/node_modules/.cache/babel-loader/d9ff696a3e3471f01b46c63d18af32e491eb5dc0e43cb30202c96871466df57f.json +1 -0
  166. xinference/web/ui/node_modules/.cache/babel-loader/e42b72d4cc1ea412ebecbb8d040dc6c6bfee462c33903c2f1f3facb602ad742e.json +1 -0
  167. xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +1 -0
  168. xinference/web/ui/node_modules/.package-lock.json +37 -0
  169. xinference/web/ui/node_modules/a-sync-waterfall/package.json +21 -0
  170. xinference/web/ui/node_modules/nunjucks/node_modules/commander/package.json +48 -0
  171. xinference/web/ui/node_modules/nunjucks/package.json +112 -0
  172. xinference/web/ui/package-lock.json +38 -0
  173. xinference/web/ui/package.json +1 -0
  174. {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/METADATA +16 -10
  175. {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/RECORD +179 -127
  176. xinference/model/llm/transformers/llama_2.py +0 -108
  177. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/lit_module.py +0 -442
  178. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/discriminator.py +0 -44
  179. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/reference.py +0 -115
  180. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/wavenet.py +0 -225
  181. xinference/thirdparty/fish_speech/tools/auto_rerank.py +0 -159
  182. xinference/thirdparty/fish_speech/tools/gen_ref.py +0 -36
  183. xinference/thirdparty/fish_speech/tools/merge_asr_files.py +0 -55
  184. xinference/web/ui/build/static/css/main.4bafd904.css.map +0 -1
  185. xinference/web/ui/build/static/js/main.eb13fe95.js +0 -3
  186. xinference/web/ui/build/static/js/main.eb13fe95.js.map +0 -1
  187. xinference/web/ui/node_modules/.cache/babel-loader/0b11a5339468c13b2d31ac085e7effe4303259b2071abd46a0a8eb8529233a5e.json +0 -1
  188. xinference/web/ui/node_modules/.cache/babel-loader/213b5913e164773c2b0567455377765715f5f07225fbac77ad8e1e9dc9648a47.json +0 -1
  189. xinference/web/ui/node_modules/.cache/babel-loader/5c26a23b5eacf5b752a08531577ae3840bb247745ef9a39583dc2d05ba93a82a.json +0 -1
  190. xinference/web/ui/node_modules/.cache/babel-loader/978b57d1a04a701bc3fcfebc511f5f274eed6ed7eade67f6fb76c27d5fd9ecc8.json +0 -1
  191. {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/LICENSE +0 -0
  192. {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/WHEEL +0 -0
  193. {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/entry_points.txt +0 -0
  194. {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/transformers/glm4v.py

@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
-import time
 import typing
 import uuid
 from concurrent.futures import ThreadPoolExecutor
@@ -22,18 +21,10 @@ from typing import Dict, Iterator, List, Optional, Union
 import torch
 
 from ....core.scheduler import InferenceRequest
-from ....types import (
-    ChatCompletion,
-    ChatCompletionChunk,
-    ChatCompletionMessage,
-    Completion,
-    CompletionChoice,
-    CompletionChunk,
-    CompletionUsage,
-)
+from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk
 from ...utils import select_device
 from ..llm_family import LLMFamilyV1, LLMSpecV1
-from ..utils import _decode_image
+from ..utils import _decode_image, generate_chat_completion, generate_completion_chunk
 from .core import PytorchChatModel, PytorchGenerateConfig
 from .utils import get_max_src_len
 
@@ -102,66 +93,45 @@ class Glm4VModel(PytorchChatModel):
         self._tokenizer = tokenizer
         self._save_tensorizer()
 
-    def _message_content_to_chat(self, content):
-        if not isinstance(content, str):
-            texts = []
-            image_urls = []
-            for c in content:
-                c_type = c.get("type")
-                if c_type == "text":
-                    texts.append(c["text"])
-                elif c_type == "image_url":
-                    image_urls.append(c["image_url"]["url"])
-            image_futures = []
-            with ThreadPoolExecutor() as executor:
-                for image_url in image_urls:
-                    fut = executor.submit(_decode_image, image_url)
-                    image_futures.append(fut)
-            images = [fut.result() for fut in image_futures]
-            text = " ".join(texts)
-            if len(images) == 0:
-                return text, []
-            elif len(images) == 1:
-                return text, images
+    @staticmethod
+    def _get_processed_msgs(messages: List[Dict]) -> List[Dict]:
+        res = []
+        for message in messages:
+            role = message["role"]
+            content = message["content"]
+            if isinstance(content, str):
+                res.append({"role": role, "content": content})
             else:
-                raise RuntimeError("Only one image per message is supported")
-        return content, []
-
-    def _get_chat_msgs(
-        self,
-        prompt: Union[str, List[Dict]],
-        chat_history: Optional[List[ChatCompletionMessage]] = None,
-    ):
-        content, images_chat = self._message_content_to_chat(prompt)
-
-        msgs = []
-        query_to_response: List[Dict] = []
-        images_history = []
-        for h in chat_history or []:
-            role = h["role"]
-            content_h, images_tmp = self._message_content_to_chat(h["content"])
-            if images_tmp:
-                images_history = images_tmp
-            if len(query_to_response) == 0 and role == "user":
-                query_to_response.append({"role": "user", "content": content_h})
-            if len(query_to_response) == 1 and role == "assistant":
-                query_to_response.append({"role": "assistant", "content": content_h})
-            if len(query_to_response) == 2:
-                msgs.extend(query_to_response)
-                query_to_response = []
-        image = None
-        if len(images_chat) > 0:
-            image = images_chat[0]
-        elif len(images_history) > 0:
-            image = images_history[0]
-        msgs.append({"role": "user", "content": content, "image": image})
-        return msgs
+                texts = []
+                image_urls = []
+                for c in content:
+                    c_type = c.get("type")
+                    if c_type == "text":
+                        texts.append(c["text"])
+                    else:
+                        assert (
+                            c_type == "image_url"
+                        ), "Please follow the image input of the OpenAI API."
+                        image_urls.append(c["image_url"]["url"])
+                if len(image_urls) > 1:
+                    raise RuntimeError("Only one image per message is supported")
+                image_futures = []
+                with ThreadPoolExecutor() as executor:
+                    for image_url in image_urls:
+                        fut = executor.submit(_decode_image, image_url)
+                        image_futures.append(fut)
+                images = [fut.result() for fut in image_futures]
+                assert len(images) <= 1
+                text = " ".join(texts)
+                if images:
+                    res.append({"role": role, "content": text, "image": images[0]})
+                else:
+                    res.append({"role": role, "content": text})
+        return res
 
     def chat(
         self,
-        prompt: Union[str, List[Dict]],
-        system_prompt: Optional[str] = None,
-        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        messages: List[Dict],
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         from transformers import TextIteratorStreamer
@@ -170,7 +140,7 @@ class Glm4VModel(PytorchChatModel):
             generate_config = {}
 
         stream = generate_config.get("stream", False)
-        msgs = self._get_chat_msgs(prompt, chat_history)
+        msgs = self._get_processed_msgs(messages)
 
         inputs = self._tokenizer.apply_chat_template(
             msgs,
@@ -213,64 +183,38 @@ class Glm4VModel(PytorchChatModel):
         response = self._tokenizer.decode(outputs[0])
         if response.endswith(stop_str):
             response = response[: -len(stop_str)]
-        c = Completion(
-            id=str(uuid.uuid1()),
-            object="text_completion",
-            created=int(time.time()),
-            model=self.model_uid,
-            choices=[
-                CompletionChoice(
-                    index=0, text=response, finish_reason="stop", logprobs=None
-                )
-            ],
-            usage=CompletionUsage(
-                prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
-            ),
-        )
-        return self._to_chat_completion(c)
+        return generate_chat_completion(self.model_uid, response)
 
     def chat_stream(self, streamer, stop_str) -> Iterator[CompletionChunk]:
         completion_id = str(uuid.uuid1())
         for new_text in streamer:
             if not new_text.endswith(stop_str):
-                completion_choice = CompletionChoice(
-                    text=new_text, index=0, logprobs=None, finish_reason=None
-                )
-                chunk = CompletionChunk(
-                    id=completion_id,
-                    object="text_completion",
-                    created=int(time.time()),
-                    model=self.model_uid,
-                    choices=[completion_choice],
-                )
-                completion_usage = CompletionUsage(
+                yield generate_completion_chunk(
+                    chunk_text=new_text,
+                    finish_reason=None,
+                    chunk_id=completion_id,
+                    model_uid=self.model_uid,
                     prompt_tokens=-1,
                     completion_tokens=-1,
                     total_tokens=-1,
+                    has_choice=True,
+                    has_content=True,
                 )
-                chunk["usage"] = completion_usage
-                yield chunk
 
-        completion_choice = CompletionChoice(
-            text="", index=0, logprobs=None, finish_reason="stop"
-        )
-        chunk = CompletionChunk(
-            id=completion_id,
-            object="text_completion",
-            created=int(time.time()),
-            model=self.model_uid,
-            choices=[completion_choice],
-        )
-        completion_usage = CompletionUsage(
+        yield generate_completion_chunk(
+            chunk_text=None,
+            finish_reason="stop",
+            chunk_id=completion_id,
+            model_uid=self.model_uid,
            prompt_tokens=-1,
            completion_tokens=-1,
            total_tokens=-1,
+            has_choice=True,
+            has_content=False,
        )
-        chunk["usage"] = completion_usage
-        yield chunk
 
-    def _get_full_prompt(self, prompt, system_prompt, chat_history, tools):
-        msgs = self._get_chat_msgs(prompt, chat_history)
+    def _get_full_prompt(self, messages, tools):
+        msgs = self._get_processed_msgs(messages)
         inputs = self._tokenizer.apply_chat_template(
             msgs,
             add_generation_prompt=True,
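
The `_get_processed_msgs` helper added above flattens OpenAI-style multimodal messages into the `{"role", "content", "image"}` dicts that GLM-4V's chat template consumes. A minimal usage sketch of the expected input and output; the message list and image URL are made-up examples, and the assumption is that `_decode_image` returns a PIL image:

# Illustration only: the messages and URL below are placeholders, not values from this diff.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What is in this picture?"},
            {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
        ],
    },
]

processed = Glm4VModel._get_processed_msgs(messages)
# Expected shape, per the code above:
# [
#     {"role": "system", "content": "You are a helpful assistant."},
#     {"role": "user", "content": "What is in this picture?", "image": <PIL image>},
# ]
# A message carrying more than one image_url part now raises RuntimeError.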
xinference/model/llm/transformers/intern_vl.py

@@ -12,24 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
-import time
 import uuid
 from concurrent.futures import ThreadPoolExecutor
 from typing import Dict, Iterator, List, Optional, Union
 
 import torch
 
-from ....types import (
-    ChatCompletion,
-    ChatCompletionChunk,
-    ChatCompletionMessage,
-    Completion,
-    CompletionChoice,
-    CompletionChunk,
-    CompletionUsage,
-)
+from ....types import ChatCompletion, ChatCompletionChunk
 from ..llm_family import LLMFamilyV1, LLMSpecV1
-from ..utils import _decode_image
+from ..utils import (
+    _decode_image,
+    generate_chat_completion,
+    generate_completion_chunk,
+    parse_messages,
+)
 from .core import PytorchChatModel, PytorchGenerateConfig
 
 logger = logging.getLogger(__name__)
@@ -78,7 +74,7 @@ def _message_content_to_intern(content, image_cnt):
 
 def _get_prompt_and_chat_history(
     prompt: Union[str, List[Dict]],
-    chat_history: Optional[List[ChatCompletionMessage]] = None,
+    chat_history: Optional[List[Dict]] = None,
 ):
     # Convert openai history to intern vl history
     images = []
@@ -332,9 +328,7 @@ class InternVLChatModel(PytorchChatModel):
 
     def chat(
        self,
-        prompt: Union[str, List[Dict]],
-        system_prompt: Optional[str] = None,
-        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        messages: List[Dict],
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         from ....thirdparty.internvl.conversation import get_conv_template
@@ -366,6 +360,7 @@ class InternVLChatModel(PytorchChatModel):
             else False
         )
 
+        prompt, _, chat_history = parse_messages(messages)
         content, history, images, videos = _get_prompt_and_chat_history(
             prompt, chat_history
         )
@@ -434,10 +429,9 @@ class InternVLChatModel(PytorchChatModel):
             chunk = self._generate_stream(generate_kwargs, input_ids, include_usage)
             return self._to_chat_completion_chunks(chunk)
         else:
-            chunk = self._generate(generate_kwargs, input_ids, template)
-            return self._to_chat_completion(chunk)
+            return self._generate(generate_kwargs, input_ids, template)
 
-    def _generate(self, generate_kwargs, input_ids, template):
+    def _generate(self, generate_kwargs, input_ids, template) -> ChatCompletion:
         prompt_tokens = len(input_ids[0])
         generation_output = self._model.generate(**generate_kwargs)
         completion_tokens = len(generation_output[0])
@@ -445,23 +439,13 @@ class InternVLChatModel(PytorchChatModel):
             generation_output, skip_special_tokens=True
         )[0]
         response = response.split(template.sep)[0].strip()
-        chunk = Completion(
-            id=str(uuid.uuid1()),
-            object="text_completion",
-            created=int(time.time()),
-            model=self.model_uid,
-            choices=[
-                CompletionChoice(
-                    index=0, text=response, finish_reason="stop", logprobs=None
-                )
-            ],
-            usage=CompletionUsage(
-                prompt_tokens=prompt_tokens,
-                completion_tokens=completion_tokens,
-                total_tokens=prompt_tokens + completion_tokens,
-            ),
+        return generate_chat_completion(
+            self.model_uid,
+            response,
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+            total_tokens=prompt_tokens + completion_tokens,
         )
-        return chunk
 
     def _generate_stream(self, generate_kwargs, input_ids, include_usage):
         from threading import Thread
@@ -483,58 +467,43 @@ class InternVLChatModel(PytorchChatModel):
 
         completion_id = str(uuid.uuid1())
         prompt_tokens = len(input_ids[0])
-        completion_tokens = 0
+        total_tokens, completion_tokens = 0, 0
         # Loop through the streamer to get the new text as it is generated
         for i, new_text in enumerate(streamer):
             if new_text == self._model.conv_template.sep:
                 break
-            completion_choice = CompletionChoice(
-                text=new_text, index=0, logprobs=None, finish_reason=None
-            )
-            chunk = CompletionChunk(
-                id=completion_id,
-                object="text_completion",
-                created=int(time.time()),
-                model=self.model_uid,
-                choices=[completion_choice],
-            )
             completion_tokens = max(completion_tokens, len(streamer.token_cache))
             total_tokens = prompt_tokens + completion_tokens
-            completion_usage = CompletionUsage(
+            yield generate_completion_chunk(
+                chunk_text=new_text,
+                finish_reason=None,
+                chunk_id=completion_id,
+                model_uid=self.model_uid,
                 prompt_tokens=prompt_tokens,
                 completion_tokens=completion_tokens,
                 total_tokens=total_tokens,
            )
-            chunk["usage"] = completion_usage
-            yield chunk
-        completion_choice = CompletionChoice(
-            text="", index=0, logprobs=None, finish_reason="stop"
-        )
-        chunk = CompletionChunk(
-            id=completion_id,
-            object="text_completion",
-            created=int(time.time()),
-            model=self.model_uid,
-            choices=[completion_choice],
-        )
-        completion_usage = CompletionUsage(
+        yield generate_completion_chunk(
+            chunk_text=None,
+            finish_reason="stop",
+            chunk_id=completion_id,
+            model_uid=self.model_uid,
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=total_tokens,
+            has_choice=True,
+            has_content=False,
        )
-        chunk["usage"] = completion_usage
-        yield chunk
+
         if include_usage:
-            chunk = CompletionChunk(
-                id=completion_id,
-                object="text_completion",
-                created=int(time.time()),
-                model=self.model_uid,
-                choices=[],
-            )
-            chunk["usage"] = CompletionUsage(
+            yield generate_completion_chunk(
+                chunk_text=None,
+                finish_reason=None,
+                chunk_id=completion_id,
+                model_uid=self.model_uid,
                prompt_tokens=prompt_tokens,
                completion_tokens=completion_tokens,
                total_tokens=total_tokens,
+                has_choice=False,
+                has_content=False,
            )
-            yield chunk
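
`generate_completion_chunk` and `generate_chat_completion` are imported from `xinference/model/llm/utils.py`, which this release also touches (entry 58 above) but whose body is not shown here. Judging only from the call sites and from the hand-rolled chunks they replace, `generate_completion_chunk` presumably assembles a `CompletionChunk` roughly as follows; this is a sketch inferred from the diff, not the actual implementation:

import time
from typing import Optional

from xinference.types import CompletionChoice, CompletionChunk, CompletionUsage


def generate_completion_chunk(
    chunk_text: Optional[str],
    finish_reason: Optional[str],
    chunk_id: str,
    model_uid: str,
    prompt_tokens: int,
    completion_tokens: int,
    total_tokens: int,
    has_choice: bool = True,
    has_content: bool = True,
) -> CompletionChunk:
    # Sketch: emit a choice only when has_choice is set, and carry the text only
    # when has_content is set, mirroring the inline chunks removed in the hunks above.
    choices = []
    if has_choice:
        choices.append(
            CompletionChoice(
                text=chunk_text if has_content else "",
                index=0,
                logprobs=None,
                finish_reason=finish_reason,
            )
        )
    chunk = CompletionChunk(
        id=chunk_id,
        object="text_completion",
        created=int(time.time()),
        model=model_uid,
        choices=choices,
    )
    # Usage is attached the same way the removed code did it.
    chunk["usage"] = CompletionUsage(
        prompt_tokens=prompt_tokens,
        completion_tokens=completion_tokens,
        total_tokens=total_tokens,
    )
    return chunk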
xinference/model/llm/transformers/internlm2.py

@@ -11,23 +11,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import time
 import uuid
 from typing import Any, Dict, Iterator, List, Optional, Union
 
 from ....core.scheduler import InferenceRequest
-from ....types import (
-    ChatCompletion,
-    ChatCompletionChoice,
-    ChatCompletionChunk,
-    ChatCompletionMessage,
-    CompletionChoice,
-    CompletionChunk,
-    CompletionUsage,
-    LoRA,
-    PytorchGenerateConfig,
-)
+from ....types import ChatCompletion, ChatCompletionChunk, LoRA, PytorchGenerateConfig
 from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..utils import generate_chat_completion, generate_completion_chunk, parse_messages
 from .core import PytorchChatModel, PytorchModelConfig
 
 
@@ -106,9 +96,7 @@ class Internlm2PytorchChatModel(PytorchChatModel):
 
     def chat(
         self,
-        prompt: str,
-        system_prompt: Optional[str] = None,
-        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        messages: List[Dict],
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         kwargs: Dict[str, Any] = {}
@@ -130,6 +118,8 @@ class Internlm2PytorchChatModel(PytorchChatModel):
             if isinstance(stream_options, dict)
             else False
         )
+
+        prompt, system_prompt, chat_history = parse_messages(messages)
         if chat_history:
             input_history = [
                 (chat_history[i]["content"], (chat_history[i + 1]["content"]))
@@ -155,54 +145,42 @@ class Internlm2PytorchChatModel(PytorchChatModel):
                     total_tokens = prompt_tokens + completion_tokens
                     chunk_text = chunk_text[last_chunk_text_length:]
                     last_chunk_text_length += len(chunk_text)
-                    completion_choice = CompletionChoice(
-                        text=chunk_text, index=0, logprobs=None, finish_reason=None
-                    )
-                    yield CompletionChunk(
-                        id=chunk_id,
-                        object="text_completion",
-                        created=int(time.time()),
-                        model=self.model_uid,
-                        choices=[completion_choice],
-                        usage=CompletionUsage(
-                            prompt_tokens=prompt_tokens,
-                            completion_tokens=completion_tokens,
-                            total_tokens=total_tokens,
-                        ),
+
+                    yield generate_completion_chunk(
+                        chunk_text,
+                        finish_reason=None,
+                        chunk_id=chunk_id,
+                        model_uid=self.model_uid,
+                        prompt_tokens=prompt_tokens,
+                        completion_tokens=completion_tokens,
+                        total_tokens=total_tokens,
                    )
+                yield generate_completion_chunk(
+                    None,
+                    finish_reason="stop",
+                    chunk_id=chunk_id,
+                    model_uid=self.model_uid,
+                    prompt_tokens=prompt_tokens,
+                    completion_tokens=completion_tokens,
+                    total_tokens=total_tokens,
+                    has_choice=True,
+                    has_content=False,
+                )
                 if include_usage:
-                    chunk = CompletionChunk(
-                        id=chunk_id,
-                        object="text_completion",
-                        created=int(time.time()),
-                        model=self.model_uid,
-                        choices=[],
-                    )
-                    chunk["usage"] = CompletionUsage(
+                    yield generate_completion_chunk(
+                        None,
+                        finish_reason=None,
+                        chunk_id=chunk_id,
+                        model_uid=self.model_uid,
                        prompt_tokens=prompt_tokens,
                        completion_tokens=completion_tokens,
                        total_tokens=total_tokens,
+                        has_choice=False,
                    )
-                    yield chunk
 
            return self._to_chat_completion_chunks(_stream_generator())
        else:
            response, _ = self._model.chat(
                self._tokenizer, prompt, input_history, **kwargs
            )
-            return ChatCompletion(
-                id="chat" + str(uuid.uuid1()),
-                object="chat.completion",
-                created=int(time.time()),
-                model=self.model_uid,
-                choices=[
-                    ChatCompletionChoice(
-                        index=0,
-                        message={"role": "assistant", "content": response},
-                        finish_reason="stop",
-                    )
-                ],
-                usage=CompletionUsage(
-                    prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
-                ),
-            )
+            return generate_chat_completion(self.model_uid, response)
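
`parse_messages` likewise comes from `..utils` without its body appearing in this diff. Every call site unpacks it as `prompt, system_prompt, chat_history = parse_messages(messages)`, so it evidently splits an OpenAI-style message list back into the three values the old signatures took. A rough sketch under that assumption only; the real helper may handle more cases:

from typing import Dict, List, Optional, Tuple


def parse_messages(messages: List[Dict]) -> Tuple[str, Optional[str], List[Dict]]:
    # Sketch inferred from the call sites above: a leading system message becomes
    # system_prompt, the last user message becomes the prompt, and everything in
    # between is returned as the chat history.
    system_prompt = None
    if messages and messages[0]["role"] == "system":
        system_prompt = messages[0]["content"]
        messages = messages[1:]
    if not messages or messages[-1]["role"] != "user":
        raise ValueError("Expected the last message to come from the user")
    prompt = messages[-1]["content"]
    chat_history = messages[:-1]
    return prompt, system_prompt, chat_history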
xinference/model/llm/transformers/minicpmv25.py

@@ -13,25 +13,21 @@
 # limitations under the License.
 import json
 import logging
-import time
 import uuid
 from concurrent.futures import ThreadPoolExecutor
 from typing import Dict, Iterator, List, Optional, Union
 
 import torch
 
-from ....types import (
-    ChatCompletion,
-    ChatCompletionChunk,
-    ChatCompletionMessage,
-    Completion,
-    CompletionChoice,
-    CompletionChunk,
-    CompletionUsage,
-)
+from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk
 from ...utils import select_device
 from ..llm_family import LLMFamilyV1, LLMSpecV1
-from ..utils import _decode_image
+from ..utils import (
+    _decode_image,
+    generate_chat_completion,
+    generate_completion_chunk,
+    parse_messages,
+)
 from .core import PytorchChatModel, PytorchGenerateConfig
 
 logger = logging.getLogger(__name__)
@@ -125,12 +121,11 @@ class MiniCPMV25Model(PytorchChatModel):
 
     def chat(
         self,
-        prompt: Union[str, List[Dict]],
-        system_prompt: Optional[str] = None,
-        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        messages: List[Dict],
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         stream = generate_config.get("stream", False) if generate_config else False
+        prompt, _, chat_history = parse_messages(messages)
         content, images_chat = self._message_content_to_chat(prompt)
 
         msgs = []
@@ -166,57 +161,29 @@ class MiniCPMV25Model(PytorchChatModel):
             it = self.chat_stream(chat)
             return self._to_chat_completion_chunks(it)
         else:
-            c = Completion(
-                id=str(uuid.uuid1()),
-                object="text_completion",
-                created=int(time.time()),
-                model=self.model_uid,
-                choices=[
-                    CompletionChoice(
-                        index=0, text=chat, finish_reason="stop", logprobs=None
-                    )
-                ],
-                usage=CompletionUsage(
-                    prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
-                ),
-            )
-            return self._to_chat_completion(c)
+            return generate_chat_completion(self.model_uid, chat)
 
     def chat_stream(self, chat) -> Iterator[CompletionChunk]:
         completion_id = str(uuid.uuid1())
         for new_text in chat:
-            completion_choice = CompletionChoice(
-                text=new_text, index=0, logprobs=None, finish_reason=None
-            )
-            chunk = CompletionChunk(
-                id=completion_id,
-                object="text_completion",
-                created=int(time.time()),
-                model=self.model_uid,
-                choices=[completion_choice],
-            )
-            completion_usage = CompletionUsage(
+            yield generate_completion_chunk(
+                chunk_text=new_text,
+                finish_reason=None,
+                chunk_id=completion_id,
+                model_uid=self.model_uid,
                prompt_tokens=-1,
                completion_tokens=-1,
                total_tokens=-1,
            )
-            chunk["usage"] = completion_usage
-            yield chunk
 
-        completion_choice = CompletionChoice(
-            text="", index=0, logprobs=None, finish_reason="stop"
-        )
-        chunk = CompletionChunk(
-            id=completion_id,
-            object="text_completion",
-            created=int(time.time()),
-            model=self.model_uid,
-            choices=[completion_choice],
-        )
-        completion_usage = CompletionUsage(
+        yield generate_completion_chunk(
+            chunk_text=None,
+            finish_reason="stop",
+            chunk_id=completion_id,
+            model_uid=self.model_uid,
            prompt_tokens=-1,
            completion_tokens=-1,
            total_tokens=-1,
+            has_choice=True,
+            has_content=False,
        )
-        chunk["usage"] = completion_usage
-        yield chunk
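
Across all four files the pattern is the same: the old `chat(prompt, system_prompt, chat_history, generate_config)` signature is replaced by a single OpenAI-style `chat(messages, generate_config)` call, with message parsing and response assembly centralized in `..utils`. A hypothetical caller against the new signature; `model` stands for a loaded chat model instance, and the prompt text and config values are placeholders rather than anything taken from this diff:

# Hypothetical usage of the refactored interface.
messages = [
    {"role": "system", "content": "You are a concise assistant."},
    {"role": "user", "content": "Summarize the weather report I pasted earlier."},
]
completion = model.chat(
    messages=messages,
    generate_config={"stream": False, "max_tokens": 256},
)
# The return value keeps the OpenAI chat-completion shape built by
# generate_chat_completion above.
print(completion["choices"][0]["message"]["content"])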