xinference 0.14.4.post1__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of xinference might be problematic.
- xinference/_compat.py +51 -0
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +5 -39
- xinference/client/restful/restful_client.py +3 -24
- xinference/conftest.py +1 -1
- xinference/constants.py +5 -0
- xinference/core/cache_tracker.py +1 -1
- xinference/core/chat_interface.py +8 -14
- xinference/core/event.py +1 -1
- xinference/core/model.py +82 -31
- xinference/core/scheduler.py +37 -37
- xinference/core/status_guard.py +1 -1
- xinference/core/supervisor.py +11 -10
- xinference/core/utils.py +80 -22
- xinference/core/worker.py +17 -16
- xinference/deploy/cmdline.py +8 -16
- xinference/deploy/local.py +1 -1
- xinference/deploy/supervisor.py +1 -1
- xinference/deploy/utils.py +1 -1
- xinference/deploy/worker.py +1 -1
- xinference/model/audio/cosyvoice.py +86 -41
- xinference/model/embedding/core.py +52 -31
- xinference/model/image/stable_diffusion/core.py +18 -1
- xinference/model/llm/__init__.py +21 -11
- xinference/model/llm/llama_cpp/core.py +16 -33
- xinference/model/llm/llm_family.json +619 -1297
- xinference/model/llm/llm_family.py +31 -52
- xinference/model/llm/llm_family_csghub.json +18 -35
- xinference/model/llm/llm_family_modelscope.json +573 -1119
- xinference/model/llm/lmdeploy/core.py +56 -88
- xinference/model/llm/mlx/core.py +46 -69
- xinference/model/llm/sglang/core.py +33 -18
- xinference/model/llm/transformers/chatglm.py +167 -305
- xinference/model/llm/transformers/cogvlm2.py +36 -63
- xinference/model/llm/transformers/cogvlm2_video.py +33 -223
- xinference/model/llm/transformers/core.py +49 -50
- xinference/model/llm/transformers/deepseek_vl.py +53 -96
- xinference/model/llm/transformers/glm4v.py +55 -111
- xinference/model/llm/transformers/intern_vl.py +39 -70
- xinference/model/llm/transformers/internlm2.py +32 -54
- xinference/model/llm/transformers/minicpmv25.py +22 -55
- xinference/model/llm/transformers/minicpmv26.py +158 -68
- xinference/model/llm/transformers/omnilmm.py +5 -28
- xinference/model/llm/transformers/qwen2_vl.py +208 -0
- xinference/model/llm/transformers/qwen_vl.py +34 -86
- xinference/model/llm/transformers/utils.py +32 -38
- xinference/model/llm/transformers/yi_vl.py +32 -72
- xinference/model/llm/utils.py +195 -489
- xinference/model/llm/vllm/core.py +153 -100
- xinference/model/rerank/core.py +41 -8
- xinference/model/rerank/model_spec.json +7 -0
- xinference/model/rerank/model_spec_modelscope.json +7 -1
- xinference/model/utils.py +1 -31
- xinference/thirdparty/cosyvoice/bin/export_jit.py +64 -0
- xinference/thirdparty/cosyvoice/bin/export_trt.py +8 -0
- xinference/thirdparty/cosyvoice/bin/inference.py +5 -2
- xinference/thirdparty/cosyvoice/cli/cosyvoice.py +38 -22
- xinference/thirdparty/cosyvoice/cli/model.py +139 -26
- xinference/thirdparty/cosyvoice/flow/flow.py +15 -9
- xinference/thirdparty/cosyvoice/flow/length_regulator.py +20 -1
- xinference/thirdparty/cosyvoice/hifigan/generator.py +8 -4
- xinference/thirdparty/cosyvoice/llm/llm.py +14 -13
- xinference/thirdparty/cosyvoice/transformer/attention.py +7 -3
- xinference/thirdparty/cosyvoice/transformer/decoder.py +1 -1
- xinference/thirdparty/cosyvoice/transformer/embedding.py +4 -3
- xinference/thirdparty/cosyvoice/transformer/encoder.py +4 -2
- xinference/thirdparty/cosyvoice/utils/common.py +36 -0
- xinference/thirdparty/cosyvoice/utils/file_utils.py +16 -0
- xinference/thirdparty/deepseek_vl/serve/assets/Kelpy-Codos.js +100 -0
- xinference/thirdparty/deepseek_vl/serve/assets/avatar.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/assets/custom.css +355 -0
- xinference/thirdparty/deepseek_vl/serve/assets/custom.js +22 -0
- xinference/thirdparty/deepseek_vl/serve/assets/favicon.ico +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/app.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/chart.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/mirror.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/pipeline.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/puzzle.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/rap.jpeg +0 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/base.yaml +87 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/firefly_gan_vq.yaml +34 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/lora/r_8_alpha_16.yaml +4 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/text2semantic_finetune.yaml +83 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text-data.proto +24 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/README.md +27 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/.gitignore +114 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/README.md +36 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/css/style.css +161 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/html/footer.html +11 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/js/animate.js +69 -0
- xinference/thirdparty/fish_speech/tools/sensevoice/README.md +59 -0
- xinference/thirdparty/matcha/VERSION +1 -0
- xinference/thirdparty/matcha/hifigan/LICENSE +21 -0
- xinference/thirdparty/matcha/hifigan/README.md +101 -0
- xinference/thirdparty/omnilmm/LICENSE +201 -0
- xinference/thirdparty/whisper/__init__.py +156 -0
- xinference/thirdparty/whisper/__main__.py +3 -0
- xinference/thirdparty/whisper/assets/gpt2.tiktoken +50256 -0
- xinference/thirdparty/whisper/assets/mel_filters.npz +0 -0
- xinference/thirdparty/whisper/assets/multilingual.tiktoken +50257 -0
- xinference/thirdparty/whisper/audio.py +157 -0
- xinference/thirdparty/whisper/decoding.py +826 -0
- xinference/thirdparty/whisper/model.py +314 -0
- xinference/thirdparty/whisper/normalizers/__init__.py +2 -0
- xinference/thirdparty/whisper/normalizers/basic.py +76 -0
- xinference/thirdparty/whisper/normalizers/english.json +1741 -0
- xinference/thirdparty/whisper/normalizers/english.py +550 -0
- xinference/thirdparty/whisper/timing.py +386 -0
- xinference/thirdparty/whisper/tokenizer.py +395 -0
- xinference/thirdparty/whisper/transcribe.py +605 -0
- xinference/thirdparty/whisper/triton_ops.py +109 -0
- xinference/thirdparty/whisper/utils.py +316 -0
- xinference/thirdparty/whisper/version.py +1 -0
- xinference/types.py +7 -49
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/{main.4bafd904.css → main.632e9148.css} +2 -2
- xinference/web/ui/build/static/css/main.632e9148.css.map +1 -0
- xinference/web/ui/build/static/js/main.9cfafbd6.js +3 -0
- xinference/web/ui/build/static/js/{main.eb13fe95.js.LICENSE.txt → main.9cfafbd6.js.LICENSE.txt} +2 -0
- xinference/web/ui/build/static/js/main.9cfafbd6.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/01d6d198156bacbd436c51435edbd4b2cacd47a79db929105eba30f74b67d48d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/10c69dc7a296779fcffedeff9393d832dfcb0013c36824adf623d3c518b801ff.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/59eb25f514afcc4fefd1b309d192b2455f1e0aec68a9de598ca4b2333fe2c774.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/68bede6d95bb5ef0b35bbb3ec5b8c937eaf6862c6cdbddb5ef222a7776aaf336.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/77d50223f3e734d4485cca538cb098a8c3a7a0a1a9f01f58cdda3af42fe1adf5.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a56d5a642409a84988891089c98ca28ad0546432dfbae8aaa51bc5a280e1cdd2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d9ff696a3e3471f01b46c63d18af32e491eb5dc0e43cb30202c96871466df57f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +1 -0
- xinference/web/ui/node_modules/.package-lock.json +37 -0
- xinference/web/ui/node_modules/a-sync-waterfall/package.json +21 -0
- xinference/web/ui/node_modules/nunjucks/node_modules/commander/package.json +48 -0
- xinference/web/ui/node_modules/nunjucks/package.json +112 -0
- xinference/web/ui/package-lock.json +38 -0
- xinference/web/ui/package.json +1 -0
- {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/METADATA +8 -8
- {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/RECORD +141 -87
- xinference/model/llm/transformers/llama_2.py +0 -108
- xinference/web/ui/build/static/css/main.4bafd904.css.map +0 -1
- xinference/web/ui/build/static/js/main.eb13fe95.js +0 -3
- xinference/web/ui/build/static/js/main.eb13fe95.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0b11a5339468c13b2d31ac085e7effe4303259b2071abd46a0a8eb8529233a5e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/213b5913e164773c2b0567455377765715f5f07225fbac77ad8e1e9dc9648a47.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5c26a23b5eacf5b752a08531577ae3840bb247745ef9a39583dc2d05ba93a82a.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/978b57d1a04a701bc3fcfebc511f5f274eed6ed7eade67f6fb76c27d5fd9ecc8.json +0 -1
- {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/LICENSE +0 -0
- {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/WHEEL +0 -0
- {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/entry_points.txt +0 -0
- {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/transformers/internlm2.py

@@ -11,23 +11,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import time
 import uuid
 from typing import Any, Dict, Iterator, List, Optional, Union
 
 from ....core.scheduler import InferenceRequest
-from ....types import (
-    ChatCompletion,
-    ChatCompletionChoice,
-    ChatCompletionChunk,
-    ChatCompletionMessage,
-    CompletionChoice,
-    CompletionChunk,
-    CompletionUsage,
-    LoRA,
-    PytorchGenerateConfig,
-)
+from ....types import ChatCompletion, ChatCompletionChunk, LoRA, PytorchGenerateConfig
 from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..utils import generate_chat_completion, generate_completion_chunk, parse_messages
 from .core import PytorchChatModel, PytorchModelConfig
 
 
@@ -106,9 +96,7 @@ class Internlm2PytorchChatModel(PytorchChatModel):
 
     def chat(
         self,
-        prompt: str,
-        system_prompt: Optional[str] = None,
-        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        messages: List[Dict],
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         kwargs: Dict[str, Any] = {}
@@ -130,6 +118,8 @@ class Internlm2PytorchChatModel(PytorchChatModel):
             if isinstance(stream_options, dict)
             else False
         )
+
+        prompt, system_prompt, chat_history = parse_messages(messages)
         if chat_history:
             input_history = [
                 (chat_history[i]["content"], (chat_history[i + 1]["content"]))
@@ -155,54 +145,42 @@ class Internlm2PytorchChatModel(PytorchChatModel):
                     total_tokens = prompt_tokens + completion_tokens
                     chunk_text = chunk_text[last_chunk_text_length:]
                     last_chunk_text_length += len(chunk_text)
-
-
-
-
-
-
-
-
-
-                        usage=CompletionUsage(
-                            prompt_tokens=prompt_tokens,
-                            completion_tokens=completion_tokens,
-                            total_tokens=total_tokens,
-                        ),
+
+                    yield generate_completion_chunk(
+                        chunk_text,
+                        finish_reason=None,
+                        chunk_id=chunk_id,
+                        model_uid=self.model_uid,
+                        prompt_tokens=prompt_tokens,
+                        completion_tokens=completion_tokens,
+                        total_tokens=total_tokens,
                     )
+                yield generate_completion_chunk(
+                    None,
+                    finish_reason="stop",
+                    chunk_id=chunk_id,
+                    model_uid=self.model_uid,
+                    prompt_tokens=prompt_tokens,
+                    completion_tokens=completion_tokens,
+                    total_tokens=total_tokens,
+                    has_choice=True,
+                    has_content=False,
+                )
                 if include_usage:
-
-
-
-
-
-                        choices=[],
-                    )
-                    chunk["usage"] = CompletionUsage(
+                    yield generate_completion_chunk(
+                        None,
+                        finish_reason=None,
+                        chunk_id=chunk_id,
+                        model_uid=self.model_uid,
                         prompt_tokens=prompt_tokens,
                         completion_tokens=completion_tokens,
                         total_tokens=total_tokens,
+                        has_choice=False,
                    )
-                    yield chunk
 
            return self._to_chat_completion_chunks(_stream_generator())
        else:
            response, _ = self._model.chat(
                self._tokenizer, prompt, input_history, **kwargs
            )
-            return ChatCompletion(
-                id="chat" + str(uuid.uuid1()),
-                object="chat.completion",
-                created=int(time.time()),
-                model=self.model_uid,
-                choices=[
-                    ChatCompletionChoice(
-                        index=0,
-                        message={"role": "assistant", "content": response},
-                        finish_reason="stop",
-                    )
-                ],
-                usage=CompletionUsage(
-                    prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
-                ),
-            )
+            return generate_chat_completion(self.model_uid, response)
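The same refactor repeats across the transformers backends below: the old `chat(prompt, system_prompt, chat_history, generate_config)` signature becomes `chat(messages, generate_config)`, and hand-rolled `ChatCompletion`/`CompletionChunk` dicts are replaced by shared helpers imported from `..utils`. A minimal usage sketch follows, with the helper signatures inferred only from the call sites visible in these hunks (the helpers' own source, `xinference/model/llm/utils.py`, is not shown in this section):

```python
# Sketch only: signatures inferred from the call sites in the hunks above.
from xinference.model.llm.utils import (
    generate_chat_completion,
    generate_completion_chunk,
    parse_messages,
)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello"},
]

# OpenAI-style messages are split back into the pieces the old signature took.
prompt, system_prompt, chat_history = parse_messages(messages)

# Non-streaming path: build a ChatCompletion from the raw response text.
completion = generate_chat_completion("my-model-uid", "Hi, how can I help?")

# Streaming path: one chunk per text delta, then a final "stop" chunk with no
# content (has_choice/has_content control the shape of the trailing chunks).
chunk = generate_completion_chunk(
    "Hi",
    finish_reason=None,
    chunk_id="chat-1",
    model_uid="my-model-uid",
    prompt_tokens=-1,
    completion_tokens=-1,
    total_tokens=-1,
)
```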
xinference/model/llm/transformers/minicpmv25.py

@@ -13,25 +13,21 @@
 # limitations under the License.
 import json
 import logging
-import time
 import uuid
 from concurrent.futures import ThreadPoolExecutor
 from typing import Dict, Iterator, List, Optional, Union
 
 import torch
 
-from ....types import (
-    ChatCompletion,
-    ChatCompletionChunk,
-    ChatCompletionMessage,
-    Completion,
-    CompletionChoice,
-    CompletionChunk,
-    CompletionUsage,
-)
+from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk
 from ...utils import select_device
 from ..llm_family import LLMFamilyV1, LLMSpecV1
-from ..utils import _decode_image
+from ..utils import (
+    _decode_image,
+    generate_chat_completion,
+    generate_completion_chunk,
+    parse_messages,
+)
 from .core import PytorchChatModel, PytorchGenerateConfig
 
 logger = logging.getLogger(__name__)
@@ -125,12 +121,11 @@ class MiniCPMV25Model(PytorchChatModel):
 
     def chat(
         self,
-        prompt: str,
-        system_prompt: Optional[str] = None,
-        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        messages: List[Dict],
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         stream = generate_config.get("stream", False) if generate_config else False
+        prompt, _, chat_history = parse_messages(messages)
         content, images_chat = self._message_content_to_chat(prompt)
 
         msgs = []
@@ -166,57 +161,29 @@ class MiniCPMV25Model(PytorchChatModel):
             it = self.chat_stream(chat)
             return self._to_chat_completion_chunks(it)
         else:
-            c = Completion(
-                id=str(uuid.uuid1()),
-                object="text_completion",
-                created=int(time.time()),
-                model=self.model_uid,
-                choices=[
-                    CompletionChoice(
-                        index=0, text=chat, finish_reason="stop", logprobs=None
-                    )
-                ],
-                usage=CompletionUsage(
-                    prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
-                ),
-            )
-            return self._to_chat_completion(c)
+            return generate_chat_completion(self.model_uid, chat)
 
     def chat_stream(self, chat) -> Iterator[CompletionChunk]:
         completion_id = str(uuid.uuid1())
         for new_text in chat:
-
-
-
-
-
-                object="text_completion",
-                created=int(time.time()),
-                model=self.model_uid,
-                choices=[completion_choice],
-            )
-            completion_usage = CompletionUsage(
+            yield generate_completion_chunk(
+                chunk_text=new_text,
+                finish_reason=None,
+                chunk_id=completion_id,
+                model_uid=self.model_uid,
                 prompt_tokens=-1,
                 completion_tokens=-1,
                 total_tokens=-1,
             )
-            chunk["usage"] = completion_usage
-            yield chunk
 
-
-
-
-
-
-            object="text_completion",
-            created=int(time.time()),
-            model=self.model_uid,
-            choices=[completion_choice],
-        )
-        completion_usage = CompletionUsage(
+        yield generate_completion_chunk(
+            chunk_text=None,
+            finish_reason="stop",
+            chunk_id=completion_id,
+            model_uid=self.model_uid,
            prompt_tokens=-1,
            completion_tokens=-1,
            total_tokens=-1,
+            has_choice=True,
+            has_content=False,
        )
-        chunk["usage"] = completion_usage
-        yield chunk
xinference/model/llm/transformers/minicpmv26.py

@@ -12,26 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
-import time
 import uuid
 from concurrent.futures import ThreadPoolExecutor
-from typing import Dict, Iterator, List, Optional, Union
+from typing import Dict, Iterator, List, Optional, Tuple, Union
 
 import torch
 from PIL import Image
 
-from ....types import (
-    ChatCompletion,
-    ChatCompletionChunk,
-    ChatCompletionMessage,
-    Completion,
-    CompletionChoice,
-    CompletionChunk,
-    CompletionUsage,
-)
+from ....core.scheduler import InferenceRequest
+from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk
 from ...utils import select_device
 from ..llm_family import LLMFamilyV1, LLMSpecV1
-from ..utils import _decode_image
+from ..utils import (
+    _decode_image,
+    generate_chat_completion,
+    generate_completion_chunk,
+    parse_messages,
+)
 from .core import PytorchChatModel, PytorchGenerateConfig
 
 logger = logging.getLogger(__name__)
@@ -43,6 +40,7 @@ class MiniCPMV26Model(PytorchChatModel):
         self._device = None
         self._tokenizer = None
         self._model = None
+        self._processor = None
 
     @classmethod
     def match(
@@ -59,7 +57,7 @@ class MiniCPMV26Model(PytorchChatModel):
         return AutoModel
 
     def load(self, **kwargs):
-        from transformers import AutoModel, AutoTokenizer
+        from transformers import AutoModel, AutoProcessor, AutoTokenizer
         from transformers.generation import GenerationConfig
 
         device = self._pytorch_model_config.get("device", "auto")
@@ -100,6 +98,10 @@ class MiniCPMV26Model(PytorchChatModel):
             self.model_path,
             trust_remote_code=True,
         )
+        self._processor = AutoProcessor.from_pretrained(
+            self.model_path, trust_remote_code=True
+        )
+        self._device = self._model.device
         self._save_tensorizer()
 
     def _message_content_to_chat(self, content):
@@ -120,7 +122,9 @@ class MiniCPMV26Model(PytorchChatModel):
             frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
             frames = vr.get_batch(frame_idx).asnumpy()
             frames = [Image.fromarray(v.astype("uint8")) for v in frames]
-
+            logger.info(
+                f"Num frames: {len(frames)} when decoding video for {self.model_uid}"
+            )
             return frames
 
         def _load_video(_url):
@@ -158,19 +162,13 @@ class MiniCPMV26Model(PytorchChatModel):
                 return text, images, frames
         return content, [], []
 
-    def chat(
-        self,
-        prompt: str,
-        system_prompt: Optional[str] = None,
-        chat_history: Optional[List[ChatCompletionMessage]] = None,
-        generate_config: Optional[PytorchGenerateConfig] = None,
-    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-        stream = generate_config.get("stream", False) if generate_config else False
-        videoExisted = False
+    def _convert_to_specific_style(self, messages: List[Dict]) -> Tuple:
+        video_existed = False
+        prompt, _, chat_history = parse_messages(messages)
 
         content, images_chat, video_frames = self._message_content_to_chat(prompt)
         if len(video_frames) > 0:
-            videoExisted = True
+            video_existed = True
             images_chat = video_frames
 
         msgs = []
@@ -184,7 +182,7 @@ class MiniCPMV26Model(PytorchChatModel):
                 if images_tmp != []:
                     images_history = images_tmp
                 if len(video_frames_h) > 0:
-                    videoExisted = True
+                    video_existed = True
                     images_history = video_frames_h
                 if len(query_to_response) == 0 and role == "user":
                     query_to_response.append(
@@ -198,10 +196,19 @@ class MiniCPMV26Model(PytorchChatModel):
                 msgs.extend(query_to_response)
                 query_to_response = []
         msgs.append({"role": "user", "content": images_chat + [content]})
+        return msgs, video_existed
+
+    def chat(
+        self,
+        messages: List[Dict],
+        generate_config: Optional[PytorchGenerateConfig] = None,
+    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        stream = generate_config.get("stream", False) if generate_config else False
+        msgs, video_existed = self._convert_to_specific_style(messages)
 
         # Set decode params for video
         params = {}
-        if videoExisted:
+        if video_existed:
             params = {"use_image_id": False, "max_slice_nums": 1}
 
         chat = self._model.chat(
@@ -216,57 +223,140 @@ class MiniCPMV26Model(PytorchChatModel):
             it = self.chat_stream(chat)
             return self._to_chat_completion_chunks(it)
         else:
-            c = Completion(
-                id=str(uuid.uuid1()),
-                object="text_completion",
-                created=int(time.time()),
-                model=self.model_uid,
-                choices=[
-                    CompletionChoice(
-                        index=0, text=chat, finish_reason="stop", logprobs=None
-                    )
-                ],
-                usage=CompletionUsage(
-                    prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
-                ),
-            )
-            return self._to_chat_completion(c)
+            return generate_chat_completion(self.model_uid, chat)
 
     def chat_stream(self, chat) -> Iterator[CompletionChunk]:
         completion_id = str(uuid.uuid1())
         for new_text in chat:
-
-
-
-
-
-                object="text_completion",
-                created=int(time.time()),
-                model=self.model_uid,
-                choices=[completion_choice],
-            )
-            completion_usage = CompletionUsage(
+            yield generate_completion_chunk(
+                chunk_text=new_text,
+                finish_reason=None,
+                chunk_id=completion_id,
+                model_uid=self.model_uid,
                 prompt_tokens=-1,
                 completion_tokens=-1,
                 total_tokens=-1,
             )
-
-
-
-
-
-        )
-        chunk = CompletionChunk(
-            id=completion_id,
-            object="text_completion",
-            created=int(time.time()),
-            model=self.model_uid,
-            choices=[completion_choice],
-        )
-        completion_usage = CompletionUsage(
+        yield generate_completion_chunk(
+            chunk_text=None,
+            finish_reason="stop",
+            chunk_id=completion_id,
+            model_uid=self.model_uid,
            prompt_tokens=-1,
            completion_tokens=-1,
            total_tokens=-1,
+            has_choice=True,
+            has_content=False,
+        )
+
+    def prepare_sanitize_generate_config(self, req: InferenceRequest):
+        """
+        Refer to https://huggingface.co/openbmb/MiniCPM-V-2_6/blob/main/modeling_minicpmv.py
+        """
+        raw_config = req.inference_kwargs.get("raw_params", {})
+        temperature = raw_config.get("temperature", None)
+        if temperature is None:
+            raw_config["temperature"] = 0.7
+        top_p = raw_config.get("top_p", None)
+        if top_p is None:
+            raw_config["top_p"] = 0.8
+        top_k = raw_config.get("top_k", None)
+        if top_k is None:
+            raw_config["top_k"] = 100
+        repetition_penalty = raw_config.get("repetition_penalty", None)
+        if repetition_penalty is None:
+            raw_config["repetition_penalty"] = 1.05
+        return raw_config
+
+    def _handle_input_ids_and_images(self, msgs: List[Dict]) -> Dict:
+        """
+        Copied from https://huggingface.co/openbmb/MiniCPM-V-2_6/blob/main/modeling_minicpmv.py#L315
+        """
+        from copy import deepcopy
+
+        copy_msgs = deepcopy(msgs)
+
+        images = []
+        for i, msg in enumerate(copy_msgs):
+            role = msg["role"]
+            content = msg["content"]
+            assert role in ["user", "assistant"]
+            if i == 0:
+                assert role == "user", "The role of first msg should be user"
+            if isinstance(content, str):
+                content = [content]
+            cur_msgs = []
+            for c in content:
+                if isinstance(c, Image.Image):
+                    images.append(c)
+                    cur_msgs.append("(<image>./</image>)")
+                elif isinstance(c, str):
+                    cur_msgs.append(c)
+            msg["content"] = "\n".join(cur_msgs)
+
+        return {
+            "prompt": self._processor.tokenizer.apply_chat_template(
+                copy_msgs, tokenize=False, add_generation_prompt=True
+            ),
+            "input_image": images,
+        }
+
+    def _get_full_prompt(self, messages: List[Dict], tools):
+        msgs, video_existed = self._convert_to_specific_style(messages)
+        if video_existed:
+            raise RuntimeError(
+                f"Continuous batching does not support video inputs for this model: {self.model_uid}"
+            )
+        return self._handle_input_ids_and_images(msgs)
+
+    def build_prefill_kwargs(self, prompts: List, req_list: List[InferenceRequest]):
+        prompts_lists = [x["prompt"] for x in prompts]
+        input_images_lists = [x["input_image"] for x in prompts]
+        inputs = self._processor(
+            prompts_lists,
+            input_images_lists,
+            max_slice_nums=None,
+            use_image_id=None,
+            return_tensors="pt",
+            max_length=8192,
+        ).to(self._model.device)
+        inputs.pop("image_sizes")
+
+        masked_input_ids = inputs["input_ids"] * inputs["attention_mask"]
+        for i in range(masked_input_ids.shape[0]):
+            non_zero_values = masked_input_ids[i][masked_input_ids[i] != 0].tolist()
+            req_list[i].prompt_tokens = non_zero_values
+            req_list[i].extra_kwargs["attention_mask_seq_len"] = len(non_zero_values)
+            req_list[i].padding_len = masked_input_ids.shape[1] - len(non_zero_values)
+
+        model_inputs = {
+            "input_ids": inputs["input_ids"],
+            "image_bound": inputs["image_bound"],
+            "pixel_values": inputs["pixel_values"],
+            "tgt_sizes": inputs["tgt_sizes"],
+        }
+        model_inputs["inputs_embeds"], _ = self._model.get_vllm_embedding(model_inputs)
+
+        return {
+            "inputs_embeds": model_inputs["inputs_embeds"],
+            "attention_mask": inputs["attention_mask"],
+        }
+
+    def build_decode_position_ids(
+        self, batch_size: int, seq_length: int, reqs: List[InferenceRequest]
+    ):
+        return None
+
+    def batch_inference(self, req_list: List[InferenceRequest]):
+        """
+        This method is rewritten
+        because the specific inference process is performed by `self._model.llm`,
+        not `self._model` itself
+        """
+        from .utils import batch_inference_one_step
+
+        self.prepare_batch_inference(req_list)
+        batch_inference_one_step(
+            self, req_list, self.model_uid, self._model.llm, self._tokenizer
        )
-
-        yield chunk
+        self.handle_batch_inference_results(req_list)
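Besides the rename to `video_existed` and the messages-based `chat()`, the MiniCPM-V-2.6 backend gains continuous-batching hooks (`_get_full_prompt`, `build_prefill_kwargs`, `batch_inference`) that delegate generation to `self._model.llm` and reject video inputs. For reference, a plausible multimodal request that the new `chat(messages=...)` entry point would consume; the image content-part schema follows the usual OpenAI vision convention and is an assumption, since this diff only shows how the parsed messages are consumed:

```python
# Assumed OpenAI-style payload; only the consumption side is visible in the diff.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image."},
            {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
        ],
    }
]

# Sampling defaults the new prepare_sanitize_generate_config() fills in when a
# request leaves them unset (values taken directly from the hunk above):
generate_config = {
    "temperature": 0.7,
    "top_p": 0.8,
    "top_k": 100,
    "repetition_penalty": 1.05,
}
```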
xinference/model/llm/transformers/omnilmm.py

@@ -16,20 +16,13 @@ import json
 import logging
 import operator
 import tempfile
-import time
-import uuid
 from typing import Dict, Iterator, List, Optional, Tuple, Union
 
 from ....thirdparty.omnilmm.chat import OmniLMMChat, img2base64
-from ....types import (
-    ChatCompletion,
-    ChatCompletionChoice,
-    ChatCompletionChunk,
-    ChatCompletionMessage,
-    CompletionUsage,
-)
+from ....types import ChatCompletion, ChatCompletionChunk
 from ...utils import select_device
 from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..utils import generate_chat_completion, parse_messages
 from .core import PytorchChatModel, PytorchGenerateConfig
 
 logger = logging.getLogger(__name__)
@@ -96,15 +89,14 @@ class OmniLMMModel(PytorchChatModel):
 
     def chat(
         self,
-        prompt: str,
-        system_prompt: Optional[str] = None,
-        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        messages: List[Dict],
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         if generate_config and generate_config.get("stream"):
             raise Exception(
                 f"Chat with model {self.model_family.model_name} does not support stream."
             )
+        prompt, _, chat_history = parse_messages(messages)
         image_first, prompt = self._message_content_to_OmniLMM(prompt)
 
         msgs = []
@@ -135,19 +127,4 @@ class OmniLMMModel(PytorchChatModel):
         input = {"image": im_64, "question": json.dumps(msgs, ensure_ascii=True)}
         answer = self._model.chat(input=input)
 
-        return ChatCompletion(
-            id="chat" + str(uuid.uuid1()),
-            object="chat.completion",
-            created=int(time.time()),
-            model=self.model_uid,
-            choices=[
-                ChatCompletionChoice(
-                    index=0,
-                    message={"role": "assistant", "content": answer},
-                    finish_reason="stop",
-                )
-            ],
-            usage=CompletionUsage(
-                prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
-            ),
-        )
+        return generate_chat_completion(self.model_uid, answer)