xinference 0.14.4.post1__py3-none-any.whl → 0.15.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xinference might be problematic.
- xinference/_compat.py +51 -0
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +209 -40
- xinference/client/restful/restful_client.py +7 -26
- xinference/conftest.py +1 -1
- xinference/constants.py +5 -0
- xinference/core/cache_tracker.py +1 -1
- xinference/core/chat_interface.py +8 -14
- xinference/core/event.py +1 -1
- xinference/core/image_interface.py +28 -0
- xinference/core/model.py +110 -31
- xinference/core/scheduler.py +37 -37
- xinference/core/status_guard.py +1 -1
- xinference/core/supervisor.py +17 -10
- xinference/core/utils.py +80 -22
- xinference/core/worker.py +17 -16
- xinference/deploy/cmdline.py +8 -16
- xinference/deploy/local.py +1 -1
- xinference/deploy/supervisor.py +1 -1
- xinference/deploy/utils.py +1 -1
- xinference/deploy/worker.py +1 -1
- xinference/model/audio/cosyvoice.py +86 -41
- xinference/model/audio/fish_speech.py +9 -9
- xinference/model/audio/model_spec.json +9 -9
- xinference/model/audio/whisper.py +4 -1
- xinference/model/embedding/core.py +52 -31
- xinference/model/image/core.py +2 -1
- xinference/model/image/model_spec.json +16 -4
- xinference/model/image/model_spec_modelscope.json +16 -4
- xinference/model/image/sdapi.py +136 -0
- xinference/model/image/stable_diffusion/core.py +164 -19
- xinference/model/llm/__init__.py +29 -11
- xinference/model/llm/llama_cpp/core.py +16 -33
- xinference/model/llm/llm_family.json +1011 -1296
- xinference/model/llm/llm_family.py +34 -53
- xinference/model/llm/llm_family_csghub.json +18 -35
- xinference/model/llm/llm_family_modelscope.json +981 -1122
- xinference/model/llm/lmdeploy/core.py +56 -88
- xinference/model/llm/mlx/core.py +46 -69
- xinference/model/llm/sglang/core.py +36 -18
- xinference/model/llm/transformers/chatglm.py +168 -306
- xinference/model/llm/transformers/cogvlm2.py +36 -63
- xinference/model/llm/transformers/cogvlm2_video.py +33 -223
- xinference/model/llm/transformers/core.py +55 -50
- xinference/model/llm/transformers/deepseek_v2.py +340 -0
- xinference/model/llm/transformers/deepseek_vl.py +53 -96
- xinference/model/llm/transformers/glm4v.py +55 -111
- xinference/model/llm/transformers/intern_vl.py +39 -70
- xinference/model/llm/transformers/internlm2.py +32 -54
- xinference/model/llm/transformers/minicpmv25.py +22 -55
- xinference/model/llm/transformers/minicpmv26.py +158 -68
- xinference/model/llm/transformers/omnilmm.py +5 -28
- xinference/model/llm/transformers/qwen2_audio.py +168 -0
- xinference/model/llm/transformers/qwen2_vl.py +234 -0
- xinference/model/llm/transformers/qwen_vl.py +34 -86
- xinference/model/llm/transformers/utils.py +32 -38
- xinference/model/llm/transformers/yi_vl.py +32 -72
- xinference/model/llm/utils.py +280 -554
- xinference/model/llm/vllm/core.py +161 -100
- xinference/model/rerank/core.py +41 -8
- xinference/model/rerank/model_spec.json +7 -0
- xinference/model/rerank/model_spec_modelscope.json +7 -1
- xinference/model/utils.py +1 -31
- xinference/thirdparty/cosyvoice/bin/export_jit.py +64 -0
- xinference/thirdparty/cosyvoice/bin/export_trt.py +8 -0
- xinference/thirdparty/cosyvoice/bin/inference.py +5 -2
- xinference/thirdparty/cosyvoice/cli/cosyvoice.py +38 -22
- xinference/thirdparty/cosyvoice/cli/model.py +139 -26
- xinference/thirdparty/cosyvoice/flow/flow.py +15 -9
- xinference/thirdparty/cosyvoice/flow/length_regulator.py +20 -1
- xinference/thirdparty/cosyvoice/hifigan/generator.py +8 -4
- xinference/thirdparty/cosyvoice/llm/llm.py +14 -13
- xinference/thirdparty/cosyvoice/transformer/attention.py +7 -3
- xinference/thirdparty/cosyvoice/transformer/decoder.py +1 -1
- xinference/thirdparty/cosyvoice/transformer/embedding.py +4 -3
- xinference/thirdparty/cosyvoice/transformer/encoder.py +4 -2
- xinference/thirdparty/cosyvoice/utils/common.py +36 -0
- xinference/thirdparty/cosyvoice/utils/file_utils.py +16 -0
- xinference/thirdparty/deepseek_vl/serve/assets/Kelpy-Codos.js +100 -0
- xinference/thirdparty/deepseek_vl/serve/assets/avatar.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/assets/custom.css +355 -0
- xinference/thirdparty/deepseek_vl/serve/assets/custom.js +22 -0
- xinference/thirdparty/deepseek_vl/serve/assets/favicon.ico +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/app.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/chart.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/mirror.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/pipeline.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/puzzle.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/rap.jpeg +0 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/base.yaml +87 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/firefly_gan_vq.yaml +33 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/lora/r_8_alpha_16.yaml +4 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/text2semantic_finetune.yaml +83 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text-data.proto +24 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/README.md +27 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +1 -1
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +1 -1
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +1 -1
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/pt_BR.json +1 -1
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +1 -1
- xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +2 -2
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/__init__.py +0 -3
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +169 -198
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +4 -27
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/.gitignore +114 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/README.md +36 -0
- xinference/thirdparty/fish_speech/fish_speech/text/clean.py +9 -47
- xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +2 -2
- xinference/thirdparty/fish_speech/fish_speech/train.py +2 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/css/style.css +161 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/html/footer.html +11 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/js/animate.js +69 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +12 -10
- xinference/thirdparty/fish_speech/tools/api.py +79 -134
- xinference/thirdparty/fish_speech/tools/commons.py +35 -0
- xinference/thirdparty/fish_speech/tools/download_models.py +3 -3
- xinference/thirdparty/fish_speech/tools/file.py +17 -0
- xinference/thirdparty/fish_speech/tools/llama/build_dataset.py +1 -1
- xinference/thirdparty/fish_speech/tools/llama/generate.py +29 -24
- xinference/thirdparty/fish_speech/tools/llama/merge_lora.py +1 -1
- xinference/thirdparty/fish_speech/tools/llama/quantize.py +2 -2
- xinference/thirdparty/fish_speech/tools/msgpack_api.py +34 -0
- xinference/thirdparty/fish_speech/tools/post_api.py +85 -44
- xinference/thirdparty/fish_speech/tools/sensevoice/README.md +59 -0
- xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py +1 -1
- xinference/thirdparty/fish_speech/tools/smart_pad.py +16 -3
- xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +2 -2
- xinference/thirdparty/fish_speech/tools/vqgan/inference.py +4 -2
- xinference/thirdparty/fish_speech/tools/webui.py +12 -146
- xinference/thirdparty/matcha/VERSION +1 -0
- xinference/thirdparty/matcha/hifigan/LICENSE +21 -0
- xinference/thirdparty/matcha/hifigan/README.md +101 -0
- xinference/thirdparty/omnilmm/LICENSE +201 -0
- xinference/thirdparty/whisper/__init__.py +156 -0
- xinference/thirdparty/whisper/__main__.py +3 -0
- xinference/thirdparty/whisper/assets/gpt2.tiktoken +50256 -0
- xinference/thirdparty/whisper/assets/mel_filters.npz +0 -0
- xinference/thirdparty/whisper/assets/multilingual.tiktoken +50257 -0
- xinference/thirdparty/whisper/audio.py +157 -0
- xinference/thirdparty/whisper/decoding.py +826 -0
- xinference/thirdparty/whisper/model.py +314 -0
- xinference/thirdparty/whisper/normalizers/__init__.py +2 -0
- xinference/thirdparty/whisper/normalizers/basic.py +76 -0
- xinference/thirdparty/whisper/normalizers/english.json +1741 -0
- xinference/thirdparty/whisper/normalizers/english.py +550 -0
- xinference/thirdparty/whisper/timing.py +386 -0
- xinference/thirdparty/whisper/tokenizer.py +395 -0
- xinference/thirdparty/whisper/transcribe.py +605 -0
- xinference/thirdparty/whisper/triton_ops.py +109 -0
- xinference/thirdparty/whisper/utils.py +316 -0
- xinference/thirdparty/whisper/version.py +1 -0
- xinference/types.py +14 -53
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/{main.4bafd904.css → main.5061c4c3.css} +2 -2
- xinference/web/ui/build/static/css/main.5061c4c3.css.map +1 -0
- xinference/web/ui/build/static/js/main.754740c0.js +3 -0
- xinference/web/ui/build/static/js/{main.eb13fe95.js.LICENSE.txt → main.754740c0.js.LICENSE.txt} +2 -0
- xinference/web/ui/build/static/js/main.754740c0.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/10c69dc7a296779fcffedeff9393d832dfcb0013c36824adf623d3c518b801ff.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/68bede6d95bb5ef0b35bbb3ec5b8c937eaf6862c6cdbddb5ef222a7776aaf336.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/77d50223f3e734d4485cca538cb098a8c3a7a0a1a9f01f58cdda3af42fe1adf5.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a56d5a642409a84988891089c98ca28ad0546432dfbae8aaa51bc5a280e1cdd2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/cd90b08d177025dfe84209596fc51878f8a86bcaa6a240848a3d2e5fd4c7ff24.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d9ff696a3e3471f01b46c63d18af32e491eb5dc0e43cb30202c96871466df57f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e42b72d4cc1ea412ebecbb8d040dc6c6bfee462c33903c2f1f3facb602ad742e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +1 -0
- xinference/web/ui/node_modules/.package-lock.json +37 -0
- xinference/web/ui/node_modules/a-sync-waterfall/package.json +21 -0
- xinference/web/ui/node_modules/nunjucks/node_modules/commander/package.json +48 -0
- xinference/web/ui/node_modules/nunjucks/package.json +112 -0
- xinference/web/ui/package-lock.json +38 -0
- xinference/web/ui/package.json +1 -0
- {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/METADATA +16 -10
- {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/RECORD +179 -127
- xinference/model/llm/transformers/llama_2.py +0 -108
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/lit_module.py +0 -442
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/discriminator.py +0 -44
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/reference.py +0 -115
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/wavenet.py +0 -225
- xinference/thirdparty/fish_speech/tools/auto_rerank.py +0 -159
- xinference/thirdparty/fish_speech/tools/gen_ref.py +0 -36
- xinference/thirdparty/fish_speech/tools/merge_asr_files.py +0 -55
- xinference/web/ui/build/static/css/main.4bafd904.css.map +0 -1
- xinference/web/ui/build/static/js/main.eb13fe95.js +0 -3
- xinference/web/ui/build/static/js/main.eb13fe95.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0b11a5339468c13b2d31ac085e7effe4303259b2071abd46a0a8eb8529233a5e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/213b5913e164773c2b0567455377765715f5f07225fbac77ad8e1e9dc9648a47.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5c26a23b5eacf5b752a08531577ae3840bb247745ef9a39583dc2d05ba93a82a.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/978b57d1a04a701bc3fcfebc511f5f274eed6ed7eade67f6fb76c27d5fd9ecc8.json +0 -1
- {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/LICENSE +0 -0
- {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/WHEEL +0 -0
- {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/entry_points.txt +0 -0
- {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/top_level.txt +0 -0
--- a/xinference/model/llm/transformers/glm4v.py
+++ b/xinference/model/llm/transformers/glm4v.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
-import time
 import typing
 import uuid
 from concurrent.futures import ThreadPoolExecutor
@@ -22,18 +21,10 @@ from typing import Dict, Iterator, List, Optional, Union
 import torch

 from ....core.scheduler import InferenceRequest
-from ....types import (
-    ChatCompletion,
-    ChatCompletionChunk,
-    ChatCompletionMessage,
-    Completion,
-    CompletionChoice,
-    CompletionChunk,
-    CompletionUsage,
-)
+from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk
 from ...utils import select_device
 from ..llm_family import LLMFamilyV1, LLMSpecV1
-from ..utils import _decode_image
+from ..utils import _decode_image, generate_chat_completion, generate_completion_chunk
 from .core import PytorchChatModel, PytorchGenerateConfig
 from .utils import get_max_src_len

@@ -102,66 +93,45 @@ class Glm4VModel(PytorchChatModel):
         self._tokenizer = tokenizer
         self._save_tensorizer()

-
-
-
-
-
-
-
-
-                elif c_type == "image_url":
-                    image_urls.append(c["image_url"]["url"])
-            image_futures = []
-            with ThreadPoolExecutor() as executor:
-                for image_url in image_urls:
-                    fut = executor.submit(_decode_image, image_url)
-                    image_futures.append(fut)
-            images = [fut.result() for fut in image_futures]
-            text = " ".join(texts)
-            if len(images) == 0:
-                return text, []
-            elif len(images) == 1:
-                return text, images
+    @staticmethod
+    def _get_processed_msgs(messages: List[Dict]) -> List[Dict]:
+        res = []
+        for message in messages:
+            role = message["role"]
+            content = message["content"]
+            if isinstance(content, str):
+                res.append({"role": role, "content": content})
             else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        if len(images_chat) > 0:
-            image = images_chat[0]
-        elif len(images_history) > 0:
-            image = images_history[0]
-        msgs.append({"role": "user", "content": content, "image": image})
-        return msgs
+                texts = []
+                image_urls = []
+                for c in content:
+                    c_type = c.get("type")
+                    if c_type == "text":
+                        texts.append(c["text"])
+                    else:
+                        assert (
+                            c_type == "image_url"
+                        ), "Please follow the image input of the OpenAI API."
+                        image_urls.append(c["image_url"]["url"])
+                if len(image_urls) > 1:
+                    raise RuntimeError("Only one image per message is supported")
+                image_futures = []
+                with ThreadPoolExecutor() as executor:
+                    for image_url in image_urls:
+                        fut = executor.submit(_decode_image, image_url)
+                        image_futures.append(fut)
+                images = [fut.result() for fut in image_futures]
+                assert len(images) <= 1
+                text = " ".join(texts)
+                if images:
+                    res.append({"role": role, "content": text, "image": images[0]})
+                else:
+                    res.append({"role": role, "content": text})
+        return res

     def chat(
         self,
-
-        system_prompt: Optional[str] = None,
-        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        messages: List[Dict],
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         from transformers import TextIteratorStreamer
@@ -170,7 +140,7 @@ class Glm4VModel(PytorchChatModel):
             generate_config = {}

         stream = generate_config.get("stream", False)
-        msgs = self.
+        msgs = self._get_processed_msgs(messages)

         inputs = self._tokenizer.apply_chat_template(
             msgs,
@@ -213,64 +183,38 @@ class Glm4VModel(PytorchChatModel):
             response = self._tokenizer.decode(outputs[0])
             if response.endswith(stop_str):
                 response = response[: -len(stop_str)]
-            c = Completion(
-                id=str(uuid.uuid1()),
-                object="text_completion",
-                created=int(time.time()),
-                model=self.model_uid,
-                choices=[
-                    CompletionChoice(
-                        index=0, text=response, finish_reason="stop", logprobs=None
-                    )
-                ],
-                usage=CompletionUsage(
-                    prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
-                ),
-            )
-            return self._to_chat_completion(c)
+            return generate_chat_completion(self.model_uid, response)

     def chat_stream(self, streamer, stop_str) -> Iterator[CompletionChunk]:
         completion_id = str(uuid.uuid1())
         for new_text in streamer:
             if not new_text.endswith(stop_str):
-
-
-
-
-
-                    object="text_completion",
-                    created=int(time.time()),
-                    model=self.model_uid,
-                    choices=[completion_choice],
-                )
-                completion_usage = CompletionUsage(
+                yield generate_completion_chunk(
+                    chunk_text=new_text,
+                    finish_reason=None,
+                    chunk_id=completion_id,
+                    model_uid=self.model_uid,
                     prompt_tokens=-1,
                     completion_tokens=-1,
                     total_tokens=-1,
+                    has_choice=True,
+                    has_content=True,
                 )
-                chunk["usage"] = completion_usage
-                yield chunk

-
-
-
-
-
-            object="text_completion",
-            created=int(time.time()),
-            model=self.model_uid,
-            choices=[completion_choice],
-        )
-        completion_usage = CompletionUsage(
+        yield generate_completion_chunk(
+            chunk_text=None,
+            finish_reason="stop",
+            chunk_id=completion_id,
+            model_uid=self.model_uid,
             prompt_tokens=-1,
             completion_tokens=-1,
             total_tokens=-1,
+            has_choice=True,
+            has_content=False,
         )
-        chunk["usage"] = completion_usage
-        yield chunk

-    def _get_full_prompt(self,
-        msgs = self.
+    def _get_full_prompt(self, messages, tools):
+        msgs = self._get_processed_msgs(messages)
         inputs = self._tokenizer.apply_chat_template(
             msgs,
             add_generation_prompt=True,
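The hunks above replace the per-model `prompt`/`system_prompt`/`chat_history` arguments with a single OpenAI-style `messages` list, and `_get_processed_msgs` now expects content parts of type `"text"` and `"image_url"`. A minimal sketch of a call against the new in-process interface; the `model` handle, the image URL, and the `max_tokens` value are placeholders, not part of this diff:

```python
# Placeholder handle to an already-loaded Glm4VModel instance; how it is
# launched/obtained is outside the scope of this diff.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What is shown in this image?"},
            {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
        ],
    }
]

# New 0.15.x signature shown above: chat(messages, generate_config).
completion = model.chat(messages, generate_config={"max_tokens": 512})
print(completion["choices"][0]["message"]["content"])
```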
--- a/xinference/model/llm/transformers/intern_vl.py
+++ b/xinference/model/llm/transformers/intern_vl.py
@@ -12,24 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
-import time
 import uuid
 from concurrent.futures import ThreadPoolExecutor
 from typing import Dict, Iterator, List, Optional, Union

 import torch

-from ....types import (
-    ChatCompletion,
-    ChatCompletionChunk,
-    ChatCompletionMessage,
-    Completion,
-    CompletionChoice,
-    CompletionChunk,
-    CompletionUsage,
-)
+from ....types import ChatCompletion, ChatCompletionChunk
 from ..llm_family import LLMFamilyV1, LLMSpecV1
-from ..utils import
+from ..utils import (
+    _decode_image,
+    generate_chat_completion,
+    generate_completion_chunk,
+    parse_messages,
+)
 from .core import PytorchChatModel, PytorchGenerateConfig

 logger = logging.getLogger(__name__)
@@ -78,7 +74,7 @@ def _message_content_to_intern(content, image_cnt):

 def _get_prompt_and_chat_history(
     prompt: Union[str, List[Dict]],
-    chat_history: Optional[List[
+    chat_history: Optional[List[Dict]] = None,
 ):
     # Convert openai history to intern vl history
     images = []
@@ -332,9 +328,7 @@ class InternVLChatModel(PytorchChatModel):

     def chat(
         self,
-
-        system_prompt: Optional[str] = None,
-        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        messages: List[Dict],
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         from ....thirdparty.internvl.conversation import get_conv_template
@@ -366,6 +360,7 @@ class InternVLChatModel(PytorchChatModel):
             else False
         )

+        prompt, _, chat_history = parse_messages(messages)
         content, history, images, videos = _get_prompt_and_chat_history(
             prompt, chat_history
         )
@@ -434,10 +429,9 @@
             chunk = self._generate_stream(generate_kwargs, input_ids, include_usage)
             return self._to_chat_completion_chunks(chunk)
         else:
-
-            return self._to_chat_completion(chunk)
+            return self._generate(generate_kwargs, input_ids, template)

-    def _generate(self, generate_kwargs, input_ids, template):
+    def _generate(self, generate_kwargs, input_ids, template) -> ChatCompletion:
         prompt_tokens = len(input_ids[0])
         generation_output = self._model.generate(**generate_kwargs)
         completion_tokens = len(generation_output[0])
@@ -445,23 +439,13 @@
             generation_output, skip_special_tokens=True
         )[0]
         response = response.split(template.sep)[0].strip()
-
-
-
-
-
-
-                CompletionChoice(
-                    index=0, text=response, finish_reason="stop", logprobs=None
-                )
-            ],
-            usage=CompletionUsage(
-                prompt_tokens=prompt_tokens,
-                completion_tokens=completion_tokens,
-                total_tokens=prompt_tokens + completion_tokens,
-            ),
+        return generate_chat_completion(
+            self.model_uid,
+            response,
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+            total_tokens=prompt_tokens + completion_tokens,
         )
-        return chunk

     def _generate_stream(self, generate_kwargs, input_ids, include_usage):
         from threading import Thread
@@ -483,58 +467,43 @@

         completion_id = str(uuid.uuid1())
         prompt_tokens = len(input_ids[0])
-        completion_tokens = 0
+        total_tokens, completion_tokens = 0, 0
         # Loop through the streamer to get the new text as it is generated
         for i, new_text in enumerate(streamer):
             if new_text == self._model.conv_template.sep:
                 break
-            completion_choice = CompletionChoice(
-                text=new_text, index=0, logprobs=None, finish_reason=None
-            )
-            chunk = CompletionChunk(
-                id=completion_id,
-                object="text_completion",
-                created=int(time.time()),
-                model=self.model_uid,
-                choices=[completion_choice],
-            )
             completion_tokens = max(completion_tokens, len(streamer.token_cache))
             total_tokens = prompt_tokens + completion_tokens
-
+            yield generate_completion_chunk(
+                chunk_text=new_text,
+                finish_reason=None,
+                chunk_id=completion_id,
+                model_uid=self.model_uid,
                 prompt_tokens=prompt_tokens,
                 completion_tokens=completion_tokens,
                 total_tokens=total_tokens,
             )
-
-
-
-
-
-        chunk = CompletionChunk(
-            id=completion_id,
-            object="text_completion",
-            created=int(time.time()),
-            model=self.model_uid,
-            choices=[completion_choice],
-        )
-        completion_usage = CompletionUsage(
+        yield generate_completion_chunk(
+            chunk_text=None,
+            finish_reason="stop",
+            chunk_id=completion_id,
+            model_uid=self.model_uid,
             prompt_tokens=prompt_tokens,
             completion_tokens=completion_tokens,
             total_tokens=total_tokens,
+            has_choice=True,
+            has_content=False,
         )
-
-        yield chunk
+
         if include_usage:
-
-
-
-
-
-                choices=[],
-            )
-            chunk["usage"] = CompletionUsage(
+            yield generate_completion_chunk(
+                chunk_text=None,
+                finish_reason=None,
+                chunk_id=completion_id,
+                model_uid=self.model_uid,
                 prompt_tokens=prompt_tokens,
                 completion_tokens=completion_tokens,
                 total_tokens=total_tokens,
+                has_choice=False,
+                has_content=False,
             )
-            yield chunk
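Both files now delegate chunk construction to `generate_chat_completion` and `generate_completion_chunk`, imported from `..utils` (i.e. `xinference/model/llm/utils.py`). The diff only shows the call sites, not the helpers themselves; the sketch below is a plausible reconstruction of `generate_completion_chunk` inferred from the parameters used above and from the inline `CompletionChunk`/`CompletionUsage` code it replaces, not the actual implementation:

```python
import time
from typing import Optional

from xinference.types import CompletionChoice, CompletionChunk, CompletionUsage


def generate_completion_chunk(
    chunk_text: Optional[str],
    finish_reason: Optional[str],
    chunk_id: str,
    model_uid: str,
    prompt_tokens: int,
    completion_tokens: int,
    total_tokens: int,
    has_choice: bool = True,
    has_content: bool = True,
) -> CompletionChunk:
    # Mirror the removed inline construction: an optional single choice plus usage.
    choices = []
    if has_choice:
        choices.append(
            CompletionChoice(
                text=chunk_text if has_content else "",
                index=0,
                logprobs=None,
                finish_reason=finish_reason,
            )
        )
    chunk = CompletionChunk(
        id=chunk_id,
        object="text_completion",
        created=int(time.time()),
        model=model_uid,
        choices=choices,
    )
    chunk["usage"] = CompletionUsage(
        prompt_tokens=prompt_tokens,
        completion_tokens=completion_tokens,
        total_tokens=total_tokens,
    )
    return chunk
```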
--- a/xinference/model/llm/transformers/internlm2.py
+++ b/xinference/model/llm/transformers/internlm2.py
@@ -11,23 +11,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import time
 import uuid
 from typing import Any, Dict, Iterator, List, Optional, Union

 from ....core.scheduler import InferenceRequest
-from ....types import (
-    ChatCompletion,
-    ChatCompletionChoice,
-    ChatCompletionChunk,
-    ChatCompletionMessage,
-    CompletionChoice,
-    CompletionChunk,
-    CompletionUsage,
-    LoRA,
-    PytorchGenerateConfig,
-)
+from ....types import ChatCompletion, ChatCompletionChunk, LoRA, PytorchGenerateConfig
 from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..utils import generate_chat_completion, generate_completion_chunk, parse_messages
 from .core import PytorchChatModel, PytorchModelConfig


@@ -106,9 +96,7 @@ class Internlm2PytorchChatModel(PytorchChatModel):

     def chat(
         self,
-
-        system_prompt: Optional[str] = None,
-        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        messages: List[Dict],
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         kwargs: Dict[str, Any] = {}
@@ -130,6 +118,8 @@ class Internlm2PytorchChatModel(PytorchChatModel):
             if isinstance(stream_options, dict)
             else False
         )
+
+        prompt, system_prompt, chat_history = parse_messages(messages)
         if chat_history:
             input_history = [
                 (chat_history[i]["content"], (chat_history[i + 1]["content"]))
@@ -155,54 +145,42 @@ class Internlm2PytorchChatModel(PytorchChatModel):
                     total_tokens = prompt_tokens + completion_tokens
                     chunk_text = chunk_text[last_chunk_text_length:]
                     last_chunk_text_length += len(chunk_text)
-
-
-
-
-
-
-
-
-
-                        usage=CompletionUsage(
-                            prompt_tokens=prompt_tokens,
-                            completion_tokens=completion_tokens,
-                            total_tokens=total_tokens,
-                        ),
+
+                    yield generate_completion_chunk(
+                        chunk_text,
+                        finish_reason=None,
+                        chunk_id=chunk_id,
+                        model_uid=self.model_uid,
+                        prompt_tokens=prompt_tokens,
+                        completion_tokens=completion_tokens,
+                        total_tokens=total_tokens,
                     )
+                yield generate_completion_chunk(
+                    None,
+                    finish_reason="stop",
+                    chunk_id=chunk_id,
+                    model_uid=self.model_uid,
+                    prompt_tokens=prompt_tokens,
+                    completion_tokens=completion_tokens,
+                    total_tokens=total_tokens,
+                    has_choice=True,
+                    has_content=False,
+                )
                 if include_usage:
-
-
-
-
-
-                        choices=[],
-                    )
-                    chunk["usage"] = CompletionUsage(
+                    yield generate_completion_chunk(
+                        None,
+                        finish_reason=None,
+                        chunk_id=chunk_id,
+                        model_uid=self.model_uid,
                         prompt_tokens=prompt_tokens,
                         completion_tokens=completion_tokens,
                         total_tokens=total_tokens,
+                        has_choice=False,
                     )
-                    yield chunk

             return self._to_chat_completion_chunks(_stream_generator())
         else:
             response, _ = self._model.chat(
                 self._tokenizer, prompt, input_history, **kwargs
             )
-            return
-                id="chat" + str(uuid.uuid1()),
-                object="chat.completion",
-                created=int(time.time()),
-                model=self.model_uid,
-                choices=[
-                    ChatCompletionChoice(
-                        index=0,
-                        message={"role": "assistant", "content": response},
-                        finish_reason="stop",
-                    )
-                ],
-                usage=CompletionUsage(
-                    prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
-                ),
-            )
+            return generate_chat_completion(self.model_uid, response)
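`parse_messages` is the other new helper imported from `..utils` above: it recovers the `(prompt, system_prompt, chat_history)` triple that the old signatures took explicitly. Its implementation is not part of this diff; the sketch below is one plausible reading of the behaviour implied by the call sites, and the real helper in `xinference/model/llm/utils.py` may differ in detail:

```python
from typing import Dict, List, Optional, Tuple, Union


def parse_messages(
    messages: List[Dict],
) -> Tuple[Union[str, List[Dict]], Optional[str], List[Dict]]:
    # The last non-system message becomes the current prompt (its content may be
    # a plain string or a list of OpenAI-style parts for vision models), an
    # optional leading system message becomes system_prompt, and everything in
    # between is returned as the chat history.
    system_prompt = next(
        (m["content"] for m in messages if m["role"] == "system"), None
    )
    non_system = [m for m in messages if m["role"] != "system"]
    if not non_system:
        raise ValueError("messages must contain at least one non-system message")
    prompt = non_system[-1]["content"]
    chat_history = non_system[:-1]
    return prompt, system_prompt, chat_history
```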
--- a/xinference/model/llm/transformers/minicpmv25.py
+++ b/xinference/model/llm/transformers/minicpmv25.py
@@ -13,25 +13,21 @@
 # limitations under the License.
 import json
 import logging
-import time
 import uuid
 from concurrent.futures import ThreadPoolExecutor
 from typing import Dict, Iterator, List, Optional, Union

 import torch

-from ....types import (
-    ChatCompletion,
-    ChatCompletionChunk,
-    ChatCompletionMessage,
-    Completion,
-    CompletionChoice,
-    CompletionChunk,
-    CompletionUsage,
-)
+from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk
 from ...utils import select_device
 from ..llm_family import LLMFamilyV1, LLMSpecV1
-from ..utils import
+from ..utils import (
+    _decode_image,
+    generate_chat_completion,
+    generate_completion_chunk,
+    parse_messages,
+)
 from .core import PytorchChatModel, PytorchGenerateConfig

 logger = logging.getLogger(__name__)
@@ -125,12 +121,11 @@ class MiniCPMV25Model(PytorchChatModel):

     def chat(
         self,
-
-        system_prompt: Optional[str] = None,
-        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        messages: List[Dict],
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         stream = generate_config.get("stream", False) if generate_config else False
+        prompt, _, chat_history = parse_messages(messages)
         content, images_chat = self._message_content_to_chat(prompt)

         msgs = []
@@ -166,57 +161,29 @@ class MiniCPMV25Model(PytorchChatModel):
             it = self.chat_stream(chat)
             return self._to_chat_completion_chunks(it)
         else:
-            c = Completion(
-                id=str(uuid.uuid1()),
-                object="text_completion",
-                created=int(time.time()),
-                model=self.model_uid,
-                choices=[
-                    CompletionChoice(
-                        index=0, text=chat, finish_reason="stop", logprobs=None
-                    )
-                ],
-                usage=CompletionUsage(
-                    prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
-                ),
-            )
-            return self._to_chat_completion(c)
+            return generate_chat_completion(self.model_uid, chat)

     def chat_stream(self, chat) -> Iterator[CompletionChunk]:
         completion_id = str(uuid.uuid1())
         for new_text in chat:
-
-
-
-
-
-                object="text_completion",
-                created=int(time.time()),
-                model=self.model_uid,
-                choices=[completion_choice],
-            )
-            completion_usage = CompletionUsage(
+            yield generate_completion_chunk(
+                chunk_text=new_text,
+                finish_reason=None,
+                chunk_id=completion_id,
+                model_uid=self.model_uid,
                 prompt_tokens=-1,
                 completion_tokens=-1,
                 total_tokens=-1,
             )
-            chunk["usage"] = completion_usage
-            yield chunk

-
-
-
-
-
-            object="text_completion",
-            created=int(time.time()),
-            model=self.model_uid,
-            choices=[completion_choice],
-        )
-        completion_usage = CompletionUsage(
+        yield generate_completion_chunk(
+            chunk_text=None,
+            finish_reason="stop",
+            chunk_id=completion_id,
+            model_uid=self.model_uid,
             prompt_tokens=-1,
             completion_tokens=-1,
             total_tokens=-1,
+            has_choice=True,
+            has_content=False,
         )
-        chunk["usage"] = completion_usage
-        yield chunk