xinference 0.14.4.post1__py3-none-any.whl → 0.15.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xinference might be problematic.
- xinference/_compat.py +51 -0
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +209 -40
- xinference/client/restful/restful_client.py +7 -26
- xinference/conftest.py +1 -1
- xinference/constants.py +5 -0
- xinference/core/cache_tracker.py +1 -1
- xinference/core/chat_interface.py +8 -14
- xinference/core/event.py +1 -1
- xinference/core/image_interface.py +28 -0
- xinference/core/model.py +110 -31
- xinference/core/scheduler.py +37 -37
- xinference/core/status_guard.py +1 -1
- xinference/core/supervisor.py +17 -10
- xinference/core/utils.py +80 -22
- xinference/core/worker.py +17 -16
- xinference/deploy/cmdline.py +8 -16
- xinference/deploy/local.py +1 -1
- xinference/deploy/supervisor.py +1 -1
- xinference/deploy/utils.py +1 -1
- xinference/deploy/worker.py +1 -1
- xinference/model/audio/cosyvoice.py +86 -41
- xinference/model/audio/fish_speech.py +9 -9
- xinference/model/audio/model_spec.json +9 -9
- xinference/model/audio/whisper.py +4 -1
- xinference/model/embedding/core.py +52 -31
- xinference/model/image/core.py +2 -1
- xinference/model/image/model_spec.json +16 -4
- xinference/model/image/model_spec_modelscope.json +16 -4
- xinference/model/image/sdapi.py +136 -0
- xinference/model/image/stable_diffusion/core.py +164 -19
- xinference/model/llm/__init__.py +29 -11
- xinference/model/llm/llama_cpp/core.py +16 -33
- xinference/model/llm/llm_family.json +1011 -1296
- xinference/model/llm/llm_family.py +34 -53
- xinference/model/llm/llm_family_csghub.json +18 -35
- xinference/model/llm/llm_family_modelscope.json +981 -1122
- xinference/model/llm/lmdeploy/core.py +56 -88
- xinference/model/llm/mlx/core.py +46 -69
- xinference/model/llm/sglang/core.py +36 -18
- xinference/model/llm/transformers/chatglm.py +168 -306
- xinference/model/llm/transformers/cogvlm2.py +36 -63
- xinference/model/llm/transformers/cogvlm2_video.py +33 -223
- xinference/model/llm/transformers/core.py +55 -50
- xinference/model/llm/transformers/deepseek_v2.py +340 -0
- xinference/model/llm/transformers/deepseek_vl.py +53 -96
- xinference/model/llm/transformers/glm4v.py +55 -111
- xinference/model/llm/transformers/intern_vl.py +39 -70
- xinference/model/llm/transformers/internlm2.py +32 -54
- xinference/model/llm/transformers/minicpmv25.py +22 -55
- xinference/model/llm/transformers/minicpmv26.py +158 -68
- xinference/model/llm/transformers/omnilmm.py +5 -28
- xinference/model/llm/transformers/qwen2_audio.py +168 -0
- xinference/model/llm/transformers/qwen2_vl.py +234 -0
- xinference/model/llm/transformers/qwen_vl.py +34 -86
- xinference/model/llm/transformers/utils.py +32 -38
- xinference/model/llm/transformers/yi_vl.py +32 -72
- xinference/model/llm/utils.py +280 -554
- xinference/model/llm/vllm/core.py +161 -100
- xinference/model/rerank/core.py +41 -8
- xinference/model/rerank/model_spec.json +7 -0
- xinference/model/rerank/model_spec_modelscope.json +7 -1
- xinference/model/utils.py +1 -31
- xinference/thirdparty/cosyvoice/bin/export_jit.py +64 -0
- xinference/thirdparty/cosyvoice/bin/export_trt.py +8 -0
- xinference/thirdparty/cosyvoice/bin/inference.py +5 -2
- xinference/thirdparty/cosyvoice/cli/cosyvoice.py +38 -22
- xinference/thirdparty/cosyvoice/cli/model.py +139 -26
- xinference/thirdparty/cosyvoice/flow/flow.py +15 -9
- xinference/thirdparty/cosyvoice/flow/length_regulator.py +20 -1
- xinference/thirdparty/cosyvoice/hifigan/generator.py +8 -4
- xinference/thirdparty/cosyvoice/llm/llm.py +14 -13
- xinference/thirdparty/cosyvoice/transformer/attention.py +7 -3
- xinference/thirdparty/cosyvoice/transformer/decoder.py +1 -1
- xinference/thirdparty/cosyvoice/transformer/embedding.py +4 -3
- xinference/thirdparty/cosyvoice/transformer/encoder.py +4 -2
- xinference/thirdparty/cosyvoice/utils/common.py +36 -0
- xinference/thirdparty/cosyvoice/utils/file_utils.py +16 -0
- xinference/thirdparty/deepseek_vl/serve/assets/Kelpy-Codos.js +100 -0
- xinference/thirdparty/deepseek_vl/serve/assets/avatar.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/assets/custom.css +355 -0
- xinference/thirdparty/deepseek_vl/serve/assets/custom.js +22 -0
- xinference/thirdparty/deepseek_vl/serve/assets/favicon.ico +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/app.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/chart.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/mirror.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/pipeline.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/puzzle.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/rap.jpeg +0 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/base.yaml +87 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/firefly_gan_vq.yaml +33 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/lora/r_8_alpha_16.yaml +4 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/text2semantic_finetune.yaml +83 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text-data.proto +24 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/README.md +27 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +1 -1
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +1 -1
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +1 -1
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/pt_BR.json +1 -1
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +1 -1
- xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +2 -2
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/__init__.py +0 -3
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +169 -198
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +4 -27
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/.gitignore +114 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/README.md +36 -0
- xinference/thirdparty/fish_speech/fish_speech/text/clean.py +9 -47
- xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +2 -2
- xinference/thirdparty/fish_speech/fish_speech/train.py +2 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/css/style.css +161 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/html/footer.html +11 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/js/animate.js +69 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +12 -10
- xinference/thirdparty/fish_speech/tools/api.py +79 -134
- xinference/thirdparty/fish_speech/tools/commons.py +35 -0
- xinference/thirdparty/fish_speech/tools/download_models.py +3 -3
- xinference/thirdparty/fish_speech/tools/file.py +17 -0
- xinference/thirdparty/fish_speech/tools/llama/build_dataset.py +1 -1
- xinference/thirdparty/fish_speech/tools/llama/generate.py +29 -24
- xinference/thirdparty/fish_speech/tools/llama/merge_lora.py +1 -1
- xinference/thirdparty/fish_speech/tools/llama/quantize.py +2 -2
- xinference/thirdparty/fish_speech/tools/msgpack_api.py +34 -0
- xinference/thirdparty/fish_speech/tools/post_api.py +85 -44
- xinference/thirdparty/fish_speech/tools/sensevoice/README.md +59 -0
- xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py +1 -1
- xinference/thirdparty/fish_speech/tools/smart_pad.py +16 -3
- xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +2 -2
- xinference/thirdparty/fish_speech/tools/vqgan/inference.py +4 -2
- xinference/thirdparty/fish_speech/tools/webui.py +12 -146
- xinference/thirdparty/matcha/VERSION +1 -0
- xinference/thirdparty/matcha/hifigan/LICENSE +21 -0
- xinference/thirdparty/matcha/hifigan/README.md +101 -0
- xinference/thirdparty/omnilmm/LICENSE +201 -0
- xinference/thirdparty/whisper/__init__.py +156 -0
- xinference/thirdparty/whisper/__main__.py +3 -0
- xinference/thirdparty/whisper/assets/gpt2.tiktoken +50256 -0
- xinference/thirdparty/whisper/assets/mel_filters.npz +0 -0
- xinference/thirdparty/whisper/assets/multilingual.tiktoken +50257 -0
- xinference/thirdparty/whisper/audio.py +157 -0
- xinference/thirdparty/whisper/decoding.py +826 -0
- xinference/thirdparty/whisper/model.py +314 -0
- xinference/thirdparty/whisper/normalizers/__init__.py +2 -0
- xinference/thirdparty/whisper/normalizers/basic.py +76 -0
- xinference/thirdparty/whisper/normalizers/english.json +1741 -0
- xinference/thirdparty/whisper/normalizers/english.py +550 -0
- xinference/thirdparty/whisper/timing.py +386 -0
- xinference/thirdparty/whisper/tokenizer.py +395 -0
- xinference/thirdparty/whisper/transcribe.py +605 -0
- xinference/thirdparty/whisper/triton_ops.py +109 -0
- xinference/thirdparty/whisper/utils.py +316 -0
- xinference/thirdparty/whisper/version.py +1 -0
- xinference/types.py +14 -53
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/{main.4bafd904.css → main.5061c4c3.css} +2 -2
- xinference/web/ui/build/static/css/main.5061c4c3.css.map +1 -0
- xinference/web/ui/build/static/js/main.754740c0.js +3 -0
- xinference/web/ui/build/static/js/{main.eb13fe95.js.LICENSE.txt → main.754740c0.js.LICENSE.txt} +2 -0
- xinference/web/ui/build/static/js/main.754740c0.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/10c69dc7a296779fcffedeff9393d832dfcb0013c36824adf623d3c518b801ff.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/68bede6d95bb5ef0b35bbb3ec5b8c937eaf6862c6cdbddb5ef222a7776aaf336.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/77d50223f3e734d4485cca538cb098a8c3a7a0a1a9f01f58cdda3af42fe1adf5.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a56d5a642409a84988891089c98ca28ad0546432dfbae8aaa51bc5a280e1cdd2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/cd90b08d177025dfe84209596fc51878f8a86bcaa6a240848a3d2e5fd4c7ff24.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d9ff696a3e3471f01b46c63d18af32e491eb5dc0e43cb30202c96871466df57f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e42b72d4cc1ea412ebecbb8d040dc6c6bfee462c33903c2f1f3facb602ad742e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +1 -0
- xinference/web/ui/node_modules/.package-lock.json +37 -0
- xinference/web/ui/node_modules/a-sync-waterfall/package.json +21 -0
- xinference/web/ui/node_modules/nunjucks/node_modules/commander/package.json +48 -0
- xinference/web/ui/node_modules/nunjucks/package.json +112 -0
- xinference/web/ui/package-lock.json +38 -0
- xinference/web/ui/package.json +1 -0
- {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/METADATA +16 -10
- {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/RECORD +179 -127
- xinference/model/llm/transformers/llama_2.py +0 -108
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/lit_module.py +0 -442
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/discriminator.py +0 -44
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/reference.py +0 -115
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/wavenet.py +0 -225
- xinference/thirdparty/fish_speech/tools/auto_rerank.py +0 -159
- xinference/thirdparty/fish_speech/tools/gen_ref.py +0 -36
- xinference/thirdparty/fish_speech/tools/merge_asr_files.py +0 -55
- xinference/web/ui/build/static/css/main.4bafd904.css.map +0 -1
- xinference/web/ui/build/static/js/main.eb13fe95.js +0 -3
- xinference/web/ui/build/static/js/main.eb13fe95.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0b11a5339468c13b2d31ac085e7effe4303259b2071abd46a0a8eb8529233a5e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/213b5913e164773c2b0567455377765715f5f07225fbac77ad8e1e9dc9648a47.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5c26a23b5eacf5b752a08531577ae3840bb247745ef9a39583dc2d05ba93a82a.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/978b57d1a04a701bc3fcfebc511f5f274eed6ed7eade67f6fb76c27d5fd9ecc8.json +0 -1
- {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/LICENSE +0 -0
- {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/WHEEL +0 -0
- {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/entry_points.txt +0 -0
- {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/transformers/minicpmv26.py

@@ -12,26 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
-import time
 import uuid
 from concurrent.futures import ThreadPoolExecutor
-from typing import Dict, Iterator, List, Optional, Union
+from typing import Dict, Iterator, List, Optional, Tuple, Union

 import torch
 from PIL import Image

-from ....types import (
-    ChatCompletion,
-    ChatCompletionChunk,
-    ChatCompletionMessage,
-    Completion,
-    CompletionChoice,
-    CompletionChunk,
-    CompletionUsage,
-)
+from ....core.scheduler import InferenceRequest
+from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk
 from ...utils import select_device
 from ..llm_family import LLMFamilyV1, LLMSpecV1
-from ..utils import
+from ..utils import (
+    _decode_image,
+    generate_chat_completion,
+    generate_completion_chunk,
+    parse_messages,
+)
 from .core import PytorchChatModel, PytorchGenerateConfig

 logger = logging.getLogger(__name__)
@@ -43,6 +40,7 @@ class MiniCPMV26Model(PytorchChatModel):
         self._device = None
         self._tokenizer = None
         self._model = None
+        self._processor = None

     @classmethod
     def match(
@@ -59,7 +57,7 @@ class MiniCPMV26Model(PytorchChatModel):
         return AutoModel

     def load(self, **kwargs):
-        from transformers import AutoModel, AutoTokenizer
+        from transformers import AutoModel, AutoProcessor, AutoTokenizer
         from transformers.generation import GenerationConfig

         device = self._pytorch_model_config.get("device", "auto")
@@ -100,6 +98,10 @@ class MiniCPMV26Model(PytorchChatModel):
             self.model_path,
             trust_remote_code=True,
         )
+        self._processor = AutoProcessor.from_pretrained(
+            self.model_path, trust_remote_code=True
+        )
+        self._device = self._model.device
         self._save_tensorizer()

     def _message_content_to_chat(self, content):
@@ -120,7 +122,9 @@ class MiniCPMV26Model(PytorchChatModel):
             frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
             frames = vr.get_batch(frame_idx).asnumpy()
             frames = [Image.fromarray(v.astype("uint8")) for v in frames]
-
+            logger.info(
+                f"Num frames: {len(frames)} when decoding video for {self.model_uid}"
+            )
             return frames

         def _load_video(_url):
@@ -158,19 +162,13 @@ class MiniCPMV26Model(PytorchChatModel):
             return text, images, frames
         return content, [], []

-    def chat(
-        self,
-        prompt: str,
-        system_prompt: Optional[str] = None,
-        chat_history: Optional[List[ChatCompletionMessage]] = None,
-        generate_config: Optional[PytorchGenerateConfig] = None,
-    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-        stream = generate_config.get("stream", False) if generate_config else False
-        videoExisted = False
+    def _convert_to_specific_style(self, messages: List[Dict]) -> Tuple:
+        video_existed = False
+        prompt, _, chat_history = parse_messages(messages)

         content, images_chat, video_frames = self._message_content_to_chat(prompt)
         if len(video_frames) > 0:
-            videoExisted = True
+            video_existed = True
             images_chat = video_frames

         msgs = []
@@ -184,7 +182,7 @@ class MiniCPMV26Model(PytorchChatModel):
             if images_tmp != []:
                 images_history = images_tmp
             if len(video_frames_h) > 0:
-                videoExisted = True
+                video_existed = True
                 images_history = video_frames_h
             if len(query_to_response) == 0 and role == "user":
                 query_to_response.append(
@@ -198,10 +196,19 @@ class MiniCPMV26Model(PytorchChatModel):
                 msgs.extend(query_to_response)
                 query_to_response = []
         msgs.append({"role": "user", "content": images_chat + [content]})
+        return msgs, video_existed
+
+    def chat(
+        self,
+        messages: List[Dict],
+        generate_config: Optional[PytorchGenerateConfig] = None,
+    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        stream = generate_config.get("stream", False) if generate_config else False
+        msgs, video_existed = self._convert_to_specific_style(messages)

         # Set decode params for video
         params = {}
-        if videoExisted:
+        if video_existed:
             params = {"use_image_id": False, "max_slice_nums": 1}

         chat = self._model.chat(
@@ -216,57 +223,140 @@ class MiniCPMV26Model(PytorchChatModel):
             it = self.chat_stream(chat)
             return self._to_chat_completion_chunks(it)
         else:
-            c = Completion(
-                id=str(uuid.uuid1()),
-                object="text_completion",
-                created=int(time.time()),
-                model=self.model_uid,
-                choices=[
-                    CompletionChoice(
-                        index=0, text=chat, finish_reason="stop", logprobs=None
-                    )
-                ],
-                usage=CompletionUsage(
-                    prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
-                ),
-            )
-            return self._to_chat_completion(c)
+            return generate_chat_completion(self.model_uid, chat)

     def chat_stream(self, chat) -> Iterator[CompletionChunk]:
         completion_id = str(uuid.uuid1())
         for new_text in chat:
-            completion_choice = CompletionChoice(
-                text=new_text, index=0, logprobs=None, finish_reason=None
-            )
-            chunk = CompletionChunk(
-                id=completion_id,
-                object="text_completion",
-                created=int(time.time()),
-                model=self.model_uid,
-                choices=[completion_choice],
-            )
-            completion_usage = CompletionUsage(
+            yield generate_completion_chunk(
+                chunk_text=new_text,
+                finish_reason=None,
+                chunk_id=completion_id,
+                model_uid=self.model_uid,
                 prompt_tokens=-1,
                 completion_tokens=-1,
                 total_tokens=-1,
             )
-            chunk["usage"] = completion_usage
-            yield chunk
-
-        completion_choice = CompletionChoice(
-            text="", index=0, logprobs=None, finish_reason="stop"
-        )
-        chunk = CompletionChunk(
-            id=completion_id,
-            object="text_completion",
-            created=int(time.time()),
-            model=self.model_uid,
-            choices=[completion_choice],
-        )
-        completion_usage = CompletionUsage(
+        yield generate_completion_chunk(
+            chunk_text=None,
+            finish_reason="stop",
+            chunk_id=completion_id,
+            model_uid=self.model_uid,
             prompt_tokens=-1,
             completion_tokens=-1,
             total_tokens=-1,
+            has_choice=True,
+            has_content=False,
+        )
+
+    def prepare_sanitize_generate_config(self, req: InferenceRequest):
+        """
+        Refer to https://huggingface.co/openbmb/MiniCPM-V-2_6/blob/main/modeling_minicpmv.py
+        """
+        raw_config = req.inference_kwargs.get("raw_params", {})
+        temperature = raw_config.get("temperature", None)
+        if temperature is None:
+            raw_config["temperature"] = 0.7
+        top_p = raw_config.get("top_p", None)
+        if top_p is None:
+            raw_config["top_p"] = 0.8
+        top_k = raw_config.get("top_k", None)
+        if top_k is None:
+            raw_config["top_k"] = 100
+        repetition_penalty = raw_config.get("repetition_penalty", None)
+        if repetition_penalty is None:
+            raw_config["repetition_penalty"] = 1.05
+        return raw_config
+
+    def _handle_input_ids_and_images(self, msgs: List[Dict]) -> Dict:
+        """
+        Copied from https://huggingface.co/openbmb/MiniCPM-V-2_6/blob/main/modeling_minicpmv.py#L315
+        """
+        from copy import deepcopy
+
+        copy_msgs = deepcopy(msgs)
+
+        images = []
+        for i, msg in enumerate(copy_msgs):
+            role = msg["role"]
+            content = msg["content"]
+            assert role in ["user", "assistant"]
+            if i == 0:
+                assert role == "user", "The role of first msg should be user"
+            if isinstance(content, str):
+                content = [content]
+            cur_msgs = []
+            for c in content:
+                if isinstance(c, Image.Image):
+                    images.append(c)
+                    cur_msgs.append("(<image>./</image>)")
+                elif isinstance(c, str):
+                    cur_msgs.append(c)
+            msg["content"] = "\n".join(cur_msgs)
+
+        return {
+            "prompt": self._processor.tokenizer.apply_chat_template(
+                copy_msgs, tokenize=False, add_generation_prompt=True
+            ),
+            "input_image": images,
+        }
+
+    def _get_full_prompt(self, messages: List[Dict], tools):
+        msgs, video_existed = self._convert_to_specific_style(messages)
+        if video_existed:
+            raise RuntimeError(
+                f"Continuous batching does not support video inputs for this model: {self.model_uid}"
+            )
+        return self._handle_input_ids_and_images(msgs)
+
+    def build_prefill_kwargs(self, prompts: List, req_list: List[InferenceRequest]):
+        prompts_lists = [x["prompt"] for x in prompts]
+        input_images_lists = [x["input_image"] for x in prompts]
+        inputs = self._processor(
+            prompts_lists,
+            input_images_lists,
+            max_slice_nums=None,
+            use_image_id=None,
+            return_tensors="pt",
+            max_length=8192,
+        ).to(self._model.device)
+        inputs.pop("image_sizes")
+
+        masked_input_ids = inputs["input_ids"] * inputs["attention_mask"]
+        for i in range(masked_input_ids.shape[0]):
+            non_zero_values = masked_input_ids[i][masked_input_ids[i] != 0].tolist()
+            req_list[i].prompt_tokens = non_zero_values
+            req_list[i].extra_kwargs["attention_mask_seq_len"] = len(non_zero_values)
+            req_list[i].padding_len = masked_input_ids.shape[1] - len(non_zero_values)
+
+        model_inputs = {
+            "input_ids": inputs["input_ids"],
+            "image_bound": inputs["image_bound"],
+            "pixel_values": inputs["pixel_values"],
+            "tgt_sizes": inputs["tgt_sizes"],
+        }
+        model_inputs["inputs_embeds"], _ = self._model.get_vllm_embedding(model_inputs)
+
+        return {
+            "inputs_embeds": model_inputs["inputs_embeds"],
+            "attention_mask": inputs["attention_mask"],
+        }
+
+    def build_decode_position_ids(
+        self, batch_size: int, seq_length: int, reqs: List[InferenceRequest]
+    ):
+        return None
+
+    def batch_inference(self, req_list: List[InferenceRequest]):
+        """
+        This method is rewritten
+        because the specific inference process is performed by `self._model.llm`,
+        not `self._model` itself
+        """
+        from .utils import batch_inference_one_step
+
+        self.prepare_batch_inference(req_list)
+        batch_inference_one_step(
+            self, req_list, self.model_uid, self._model.llm, self._tokenizer
         )
-        chunk["usage"] = completion_usage
-        yield chunk
+        self.handle_batch_inference_results(req_list)
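Across the hunks above, MiniCPMV26Model (like the other transformers backends in this release) drops the separate `prompt` / `system_prompt` / `chat_history` arguments and takes a single OpenAI-style `messages` list, which `parse_messages` splits back into prompt and history on the server side. Below is a minimal sketch of how a caller might exercise the new signature through the RESTful client; the endpoint URL, model uid, and image URL are illustrative assumptions and not part of this diff, and the exact client keyword names may differ slightly.

```python
# Hypothetical usage sketch for the messages-based chat() introduced in 0.15.x.
from xinference.client import Client

client = Client("http://localhost:9997")      # assumed local endpoint
model = client.get_model("my-minicpm-v-2.6")  # assumed model uid

completion = model.chat(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image."},
                # image_url items are decoded server-side (see _decode_image in ..utils)
                {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
            ],
        }
    ],
    generate_config={"max_tokens": 512},
)
print(completion["choices"][0]["message"]["content"])
```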
xinference/model/llm/transformers/omnilmm.py

@@ -16,20 +16,13 @@ import json
 import logging
 import operator
 import tempfile
-import time
-import uuid
 from typing import Dict, Iterator, List, Optional, Tuple, Union

 from ....thirdparty.omnilmm.chat import OmniLMMChat, img2base64
-from ....types import (
-    ChatCompletion,
-    ChatCompletionChoice,
-    ChatCompletionChunk,
-    ChatCompletionMessage,
-    CompletionUsage,
-)
+from ....types import ChatCompletion, ChatCompletionChunk
 from ...utils import select_device
 from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..utils import generate_chat_completion, parse_messages
 from .core import PytorchChatModel, PytorchGenerateConfig

 logger = logging.getLogger(__name__)
@@ -96,15 +89,14 @@ class OmniLMMModel(PytorchChatModel):

     def chat(
         self,
-        prompt: str,
-        system_prompt: Optional[str] = None,
-        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        messages: List[Dict],
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         if generate_config and generate_config.get("stream"):
             raise Exception(
                 f"Chat with model {self.model_family.model_name} does not support stream."
             )
+        prompt, _, chat_history = parse_messages(messages)
         image_first, prompt = self._message_content_to_OmniLMM(prompt)

         msgs = []
@@ -135,19 +127,4 @@ class OmniLMMModel(PytorchChatModel):
         input = {"image": im_64, "question": json.dumps(msgs, ensure_ascii=True)}
         answer = self._model.chat(input=input)

-        return ChatCompletion(
-            id="chat" + str(uuid.uuid1()),
-            object="chat.completion",
-            created=int(time.time()),
-            model=self.model_uid,
-            choices=[
-                ChatCompletionChoice(
-                    index=0,
-                    message={"role": "assistant", "content": answer},
-                    finish_reason="stop",
-                )
-            ],
-            usage=CompletionUsage(
-                prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
-            ),
-        )
+        return generate_chat_completion(self.model_uid, answer)
xinference/model/llm/transformers/qwen2_audio.py (new file)

@@ -0,0 +1,168 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+import uuid
+from io import BytesIO
+from typing import Dict, Iterator, List, Optional, Union
+from urllib.request import urlopen
+
+import numpy as np
+
+from ....model.utils import select_device
+from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk
+from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..utils import generate_chat_completion, generate_completion_chunk
+from .core import PytorchChatModel, PytorchGenerateConfig
+
+logger = logging.getLogger(__name__)
+
+
+class Qwen2AudioChatModel(PytorchChatModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._processor = None
+        self._model = None
+        self._device = None
+
+    @classmethod
+    def match(
+        cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        llm_family = model_family.model_family or model_family.model_name
+        if "qwen2-audio".lower() in llm_family.lower():
+            return True
+        return False
+
+    def load(self):
+        from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration
+
+        device = self._pytorch_model_config.get("device", "auto")
+        device = select_device(device)
+        self._device = device
+        # for multiple GPU, set back to auto to make multiple devices work
+        device = "auto" if device == "cuda" else device
+
+        self._processor = AutoProcessor.from_pretrained(
+            self.model_path,
+            device_map=device,
+            # trust_remote_code=True,
+            code_revision=self.model_spec.model_revision,
+        )
+        self._model = Qwen2AudioForConditionalGeneration.from_pretrained(
+            self.model_path,
+            device_map=device,
+            # trust_remote_code=True,
+            revision=self.model_spec.model_revision,
+        )
+
+    def _transform_messages(
+        self,
+        messages: List[Dict],
+    ):
+        import librosa
+
+        text = self._processor.apply_chat_template(
+            messages, add_generation_prompt=True, tokenize=False
+        )
+        audios: List[np.ndarray] = []
+        for msg in messages:
+            content = msg["content"]
+            if isinstance(content, List):
+                for item in content:  # type: ignore
+                    if item.get("type") == "audio" and "audio_url" in item:
+                        audio = librosa.load(
+                            BytesIO(urlopen(item["audio_url"]).read()),
+                            sr=self._processor.feature_extractor.sampling_rate,
+                        )[0]
+                        audios.append(audio)
+
+        return text, audios
+
+    def chat(
+        self,
+        messages: List[Dict],
+        generate_config: Optional[PytorchGenerateConfig] = None,
+    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        text, audios = self._transform_messages(messages)
+        inputs = self._processor(
+            text=text, audios=audios, return_tensors="pt", padding=True
+        )
+        inputs.input_ids = inputs.input_ids.to(self._device)
+        generate_config = generate_config if generate_config else {}
+        stream = generate_config.get("stream", False) if generate_config else False
+
+        if stream:
+            it = self._generate_stream(inputs, generate_config)
+            return self._to_chat_completion_chunks(it)
+        else:
+            c = self._generate(inputs, generate_config)
+            return c
+
+    def _generate(self, inputs, config: PytorchGenerateConfig = {}) -> ChatCompletion:
+        generate_ids = self._model.generate(
+            **inputs,
+            max_length=config.get("max_tokens", 512),
+        )
+        generate_ids = generate_ids[:, inputs.input_ids.size(1) :]
+        response = self._processor.batch_decode(
+            generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )[0]
+        return generate_chat_completion(self.model_uid, response)
+
+    def _generate_stream(
+        self, inputs, config: PytorchGenerateConfig = {}
+    ) -> Iterator[CompletionChunk]:
+        from threading import Thread
+
+        from transformers import TextIteratorStreamer
+
+        tokenizer = self._processor.tokenizer
+        streamer = TextIteratorStreamer(
+            tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True
+        )
+
+        gen_kwargs = {
+            "max_new_tokens": config.get("max_tokens", 512),
+            "streamer": streamer,
+            **inputs,
+        }
+
+        thread = Thread(target=self._model.generate, kwargs=gen_kwargs)
+        thread.start()
+
+        completion_id = str(uuid.uuid1())
+        for new_text in streamer:
+            yield generate_completion_chunk(
+                chunk_text=new_text,
+                finish_reason=None,
+                chunk_id=completion_id,
+                model_uid=self.model_uid,
+                prompt_tokens=-1,
+                completion_tokens=-1,
+                total_tokens=-1,
+                has_choice=True,
+                has_content=True,
+            )
+
+        yield generate_completion_chunk(
+            chunk_text=None,
+            finish_reason="stop",
+            chunk_id=completion_id,
+            model_uid=self.model_uid,
+            prompt_tokens=-1,
+            completion_tokens=-1,
+            total_tokens=-1,
+            has_choice=True,
+            has_content=False,
+        )