xinference 0.9.4__py3-none-any.whl → 0.10.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/oauth2/auth_service.py +47 -18
- xinference/api/oauth2/types.py +1 -0
- xinference/api/restful_api.py +34 -7
- xinference/client/oscar/actor_client.py +4 -3
- xinference/client/restful/restful_client.py +20 -4
- xinference/conftest.py +13 -2
- xinference/core/supervisor.py +48 -1
- xinference/core/worker.py +139 -20
- xinference/deploy/cmdline.py +119 -20
- xinference/model/embedding/core.py +1 -2
- xinference/model/llm/__init__.py +4 -6
- xinference/model/llm/ggml/llamacpp.py +2 -10
- xinference/model/llm/llm_family.json +877 -13
- xinference/model/llm/llm_family.py +15 -0
- xinference/model/llm/llm_family_modelscope.json +571 -0
- xinference/model/llm/pytorch/chatglm.py +2 -0
- xinference/model/llm/pytorch/core.py +22 -26
- xinference/model/llm/pytorch/deepseek_vl.py +232 -0
- xinference/model/llm/pytorch/internlm2.py +2 -0
- xinference/model/llm/pytorch/omnilmm.py +153 -0
- xinference/model/llm/pytorch/qwen_vl.py +2 -0
- xinference/model/llm/pytorch/yi_vl.py +4 -2
- xinference/model/llm/utils.py +53 -5
- xinference/model/llm/vllm/core.py +54 -6
- xinference/model/rerank/core.py +3 -0
- xinference/thirdparty/deepseek_vl/__init__.py +31 -0
- xinference/thirdparty/deepseek_vl/models/__init__.py +28 -0
- xinference/thirdparty/deepseek_vl/models/clip_encoder.py +242 -0
- xinference/thirdparty/deepseek_vl/models/image_processing_vlm.py +208 -0
- xinference/thirdparty/deepseek_vl/models/modeling_vlm.py +170 -0
- xinference/thirdparty/deepseek_vl/models/processing_vlm.py +390 -0
- xinference/thirdparty/deepseek_vl/models/projector.py +100 -0
- xinference/thirdparty/deepseek_vl/models/sam.py +593 -0
- xinference/thirdparty/deepseek_vl/models/siglip_vit.py +681 -0
- xinference/thirdparty/deepseek_vl/utils/__init__.py +18 -0
- xinference/thirdparty/deepseek_vl/utils/conversation.py +348 -0
- xinference/thirdparty/deepseek_vl/utils/io.py +78 -0
- xinference/thirdparty/omnilmm/__init__.py +0 -0
- xinference/thirdparty/omnilmm/chat.py +216 -0
- xinference/thirdparty/omnilmm/constants.py +4 -0
- xinference/thirdparty/omnilmm/conversation.py +332 -0
- xinference/thirdparty/omnilmm/model/__init__.py +1 -0
- xinference/thirdparty/omnilmm/model/omnilmm.py +594 -0
- xinference/thirdparty/omnilmm/model/resampler.py +166 -0
- xinference/thirdparty/omnilmm/model/utils.py +563 -0
- xinference/thirdparty/omnilmm/train/__init__.py +13 -0
- xinference/thirdparty/omnilmm/train/train_utils.py +150 -0
- xinference/thirdparty/omnilmm/utils.py +134 -0
- xinference/types.py +15 -19
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.76ef2b17.js +3 -0
- xinference/web/ui/build/static/js/main.76ef2b17.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/15e2cf8cd8d0989719b6349428ff576f9009ff4c2dcc52378be0bd938e82495e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/35d0e4a317e5582cbb79d901302e9d706520ac53f8a734c2fd8bfde6eb5a4f02.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3c2f277c93c5f1638e08db38df0d0fb4e58d1c5571aea03241a5c04ff4094704.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3fa1f69162f9c6dc0f6a6e21b64d49d6b8e6fa8dfa59a82cf829931c5f97d99f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/44774c783428f952d8e2e4ad0998a9c5bc16a57cd9c68b7c5ff18aaa5a41d65c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5393569d846332075b93b55656716a34f50e0a8c970be789502d7e6c49755fd7.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/59ce49eae0f486af4c5034d4d2f9ca77c3ec3a32ecc560085caf5ef482b5f4c9.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/62e257ed9016471035fa1a7da57c9e2a4250974ed566b4d1295873d747c68eb2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/63a4c48f0326d071c7772c46598215c006ae41fd3d4ff3577fe717de66ad6e89.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b9cbcb6d77ba21b22c6950b6fb5b305d23c19cf747f99f7d48b6b046f8f7b1b0.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d06a96a3c9c32e42689094aa3aaad41c8125894e956b8f84a70fadce6e3f65b3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d076fd56cf3b15ed2433e3744b98c6b4e4410a19903d1db4de5bba0e1a1b3347.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/daad8131d91134f6d7aef895a0c9c32e1cb928277cb5aa66c01028126d215be0.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/de0299226173b0662b573f49e3992220f6611947073bd66ac079728a8bc8837d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e606671420d2937102c3c34b4b04056c11736408c1d3347b8cf42dfe61fb394b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e6eccc9aa641e7da833492e27846dc965f9750281420977dc84654ca6ed221e4.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e9b52d171223bb59fb918316297a051cdfd42dd453e8260fd918e90bc0a4ebdf.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f16aec63602a77bd561d0e67fa00b76469ac54b8033754bba114ec5eb3257964.json +1 -0
- {xinference-0.9.4.dist-info → xinference-0.10.1.dist-info}/METADATA +25 -12
- {xinference-0.9.4.dist-info → xinference-0.10.1.dist-info}/RECORD +79 -58
- xinference/model/llm/ggml/ctransformers.py +0 -281
- xinference/model/llm/ggml/ctransformers_util.py +0 -161
- xinference/web/ui/build/static/js/main.66b1c4fb.js +0 -3
- xinference/web/ui/build/static/js/main.66b1c4fb.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0bd70b1ecf307e2681318e864f4692305b6350c8683863007f4caf2f9ac33b6e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0db651c046ef908f45cde73af0dbea0a797d3e35bb57f4a0863b481502103a64.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/18e5d5422e2464abf4a3e6d38164570e2e426e0a921e9a2628bbae81b18da353.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3d93bd9a74a1ab0cec85af40f9baa5f6a8e7384b9e18c409b95a81a7b45bb7e2.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3e055de705e397e1d413d7f429589b1a98dd78ef378b97f0cdb462c5f2487d5e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/4fd24800544873512b540544ae54601240a5bfefd9105ff647855c64f8ad828f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/52aa27272b4b9968f62666262b47661cb1992336a2aff3b13994cc36877b3ec3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/60c4b98d8ea7479fb0c94cfd19c8128f17bd7e27a1e73e6dd9adf6e9d88d18eb.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/7e094845f611802b024b57439cbf911038169d06cdf6c34a72a7277f35aa71a4.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/95c8cc049fadd23085d8623e1d43d70b614a4e52217676f186a417dca894aa09.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/98b7ef307f436affe13d75a4f265b27e828ccc2b10ffae6513abe2681bc11971.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/a8070ce4b780b4a044218536e158a9e7192a6c80ff593fdc126fee43f46296b5.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/b400cfc9db57fa6c70cd2bad055b73c5079fde0ed37974009d898083f6af8cd8.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/bd04667474fd9cac2983b03725c218908a6cc0ee9128a5953cd00d26d4877f60.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/c2124cfe036b26befcbd386d1d17743b1a58d0b7a041a17bb67f9924400d63c3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/c230a727b8f68f0e62616a75e14a3d33026dc4164f2e325a9a8072d733850edb.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d44a6eb6106e09082b691a315c9f6ce17fcfe25beb7547810e0d271ce3301cd2.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e1d9b2ae4e1248658704bc6bfc5d6160dcd1a9e771ea4ae8c1fed0aaddeedd29.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/fd4a8ae5d192331af1bedd1d2d70efcc569708ee6cc4cb479b225d059482aa81.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/fe5db70859503a54cbe71f9637e5a314cda88b1f0eecb733b6e6f837697db1ef.json +0 -1
- /xinference/web/ui/build/static/js/{main.66b1c4fb.js.LICENSE.txt → main.76ef2b17.js.LICENSE.txt} +0 -0
- {xinference-0.9.4.dist-info → xinference-0.10.1.dist-info}/LICENSE +0 -0
- {xinference-0.9.4.dist-info → xinference-0.10.1.dist-info}/WHEEL +0 -0
- {xinference-0.9.4.dist-info → xinference-0.10.1.dist-info}/entry_points.txt +0 -0
- {xinference-0.9.4.dist-info → xinference-0.10.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/pytorch/deepseek_vl.py
ADDED

@@ -0,0 +1,232 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import base64
+import logging
+import os.path
+import tempfile
+import time
+import uuid
+from concurrent.futures import ThreadPoolExecutor
+from io import BytesIO
+from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
+
+import requests
+import torch
+
+from ....model.utils import select_device
+from ....types import (
+    ChatCompletion,
+    ChatCompletionChoice,
+    ChatCompletionChunk,
+    ChatCompletionMessage,
+    CompletionUsage,
+)
+from ..llm_family import LLMFamilyV1, LLMSpecV1
+from .core import PytorchChatModel, PytorchGenerateConfig
+
+logger = logging.getLogger(__name__)
+
+
+class DeepSeekVLChatModel(PytorchChatModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._tokenizer = None
+        self._model = None
+        self._vl_chat_processor = None
+        self._type = None
+
+    @classmethod
+    def match(
+        cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        if "deepseek" in model_family.model_name:
+            return True
+        return False
+
+    def load(self):
+        from transformers import AutoModelForCausalLM
+
+        from ....thirdparty.deepseek_vl.models import (
+            MultiModalityCausalLM,
+            VLChatProcessor,
+        )
+
+        self._device = self._pytorch_model_config.get("device", "auto")
+        self._device = select_device(self._device)
+        self._type = torch.float16 if self._device == "mps" else torch.bfloat16
+
+        # specify the path to the model
+        self._vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(
+            self.model_path
+        )
+        self._tokenizer = self._vl_chat_processor.tokenizer
+
+        vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
+            self.model_path, trust_remote_code=True, device_map=self._device
+        )
+        self._model = vl_gpt.to(self._type).eval()
+
+    @staticmethod
+    def _message_content_to_deepseek(content) -> Tuple[str, List[str]]:
+        def _ensure_url(_url):
+            if _url.startswith("data:"):
+                logging.info("Parse url by base64 decoder.")
+                # https://platform.openai.com/docs/guides/vision/uploading-base-64-encoded-images
+                # e.g. f"data:image/jpeg;base64,{base64_image}"
+                _type, data = _url.split(";")
+                _, ext = _type.split("/")
+                data = data[len("base64,") :]
+                data = base64.b64decode(data.encode("utf-8"))
+
+                with tempfile.NamedTemporaryFile(suffix=f".{ext}", delete=False) as f:
+                    f.write(data)
+                logging.info("Dump base64 data to %s", f.name)
+                return f.name
+            else:
+                if len(_url) > 2048:
+                    raise Exception(f"Image url is too long, {len(_url)} > 2048.")
+
+                return _url
+
+        def _download(_images):
+            local_images = []
+
+            # To make requests.get works
+            headers = {
+                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
+            }
+            with ThreadPoolExecutor() as executor:
+                for url in images:
+                    try:
+                        if os.path.exists(url):
+                            local_images.append(url)
+                            continue
+                    except Exception as e:
+                        logger.debug("Image is remote: %s, e: %s", url, e)
+                        pass
+                    # Append a placeholder
+                    local_images.append(None)
+
+                    def _fill_placeholder(_url, _index):
+                        response = requests.get(url, headers=headers)
+                        local_images[_index] = BytesIO(response.content)
+
+                    executor.submit(_fill_placeholder, url, len(local_images) - 1)
+            return local_images
+
+        if not isinstance(content, str):
+            # TODO(codingl2k1): Optimize _ensure_url
+
+            images = []
+            new_content = []
+            for c in content:
+                c_type = c.get("type")
+                if c_type == "image_url":
+                    images.append(_ensure_url(c["image_url"]["url"]))
+                elif c_type == "text":
+                    new_content.append(c["text"])
+            if images:
+                new_content.insert(0, "<image_placeholder>")
+                images = _download(images)
+            return "".join(new_content), images
+        return content, []
+
+    def chat(
+        self,
+        prompt: Union[str, List[Dict]],
+        system_prompt: Optional[str] = None,
+        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        generate_config: Optional[PytorchGenerateConfig] = None,
+    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        if generate_config and generate_config.get("stream"):
+            raise Exception(
+                f"Chat with model {self.model_family.model_name} does not support stream."
+            )
+        prompt, images = self._message_content_to_deepseek(prompt)
+        prompt_messages: List[Dict[str, Any]] = [
+            {
+                "role": "User",
+                "content": prompt,
+            },
+            {"role": "Assistant", "content": ""},
+        ]
+        if images:
+            prompt_messages[0]["images"] = images
+
+        # Convert openai history to qwen vl history
+        deepseek_history = []
+        for h in chat_history or []:
+            role = h["role"]
+            if role == "user":
+                content, images = self._message_content_to_deepseek(h["content"])
+                msg: Dict[str, Any] = {
+                    "role": "User",
+                    "content": content,
+                }
+                if images:
+                    msg["images"] = images
+                deepseek_history.append(msg)
+            elif role == "assistant":
+                deepseek_history.append({"role": "Assistant", "content": h["content"]})
+            else:
+                logger.error("Unexpected msg in chat history: %s", h)
+
+        deepseek_history.extend(prompt_messages)
+
+        from ....thirdparty.deepseek_vl.utils.io import load_pil_images
+
+        # load images and prepare for inputs
+        pil_images = load_pil_images(deepseek_history)
+        prepare_inputs = self._vl_chat_processor(
+            conversations=deepseek_history, images=pil_images, force_batchify=True
+        ).to(self._model.device, self._model.dtype)
+
+        # run image encoder to get the image embeddings
+        inputs_embeds = self._model.prepare_inputs_embeds(**prepare_inputs)
+
+        # run the model to get the response
+        outputs = self._model.language_model.generate(
+            inputs_embeds=inputs_embeds,
+            attention_mask=prepare_inputs.attention_mask,
+            pad_token_id=self._tokenizer.eos_token_id,
+            bos_token_id=self._tokenizer.bos_token_id,
+            eos_token_id=self._tokenizer.eos_token_id,
+            max_new_tokens=512,
+            do_sample=True,
+            top_p=0.95,
+            temperature=0.2,
+            repetition_penalty=1.1,
+            use_cache=True,
+        )
+
+        answer = self._tokenizer.decode(
+            outputs[0].cpu().tolist(), skip_special_tokens=True
+        )
+
+        return ChatCompletion(
+            id="chat" + str(uuid.uuid1()),
+            object="chat.completion",
+            created=int(time.time()),
+            model=self.model_uid,
+            choices=[
+                ChatCompletionChoice(
+                    index=0,
+                    message={"role": "assistant", "content": answer},
+                    finish_reason="stop",
+                )
+            ],
+            usage=CompletionUsage(
+                prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
+            ),
+        )
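Note: the new DeepSeekVLChatModel accepts OpenAI vision-style content for the user message, a list of "text" and "image_url" parts where the image may be an http(s) URL or a base64 data URI. A minimal illustrative payload follows (a rough sketch; the file name and wording are made up, not taken from this diff):

    # Illustrative payload for _message_content_to_deepseek(); values are examples only.
    import base64

    with open("example.jpg", "rb") as f:  # hypothetical local image
        b64_image = base64.b64encode(f.read()).decode("utf-8")

    prompt = [
        {"type": "text", "text": "Describe this image."},
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{b64_image}"},
        },
    ]
    # The handler above converts this into the pair
    # ("<image_placeholder>Describe this image.", [<path of the decoded temp file>]).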
xinference/model/llm/pytorch/omnilmm.py
ADDED

@@ -0,0 +1,153 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import base64
+import json
+import logging
+import operator
+import tempfile
+import time
+import uuid
+from typing import Dict, Iterator, List, Optional, Tuple, Union
+
+from ....thirdparty.omnilmm.chat import OmniLMMChat, img2base64
+from ....types import (
+    ChatCompletion,
+    ChatCompletionChoice,
+    ChatCompletionChunk,
+    ChatCompletionMessage,
+    CompletionUsage,
+)
+from ...utils import select_device
+from ..llm_family import LLMFamilyV1, LLMSpecV1
+from .core import PytorchChatModel, PytorchGenerateConfig
+
+logger = logging.getLogger(__name__)
+
+
+class OmniLMMModel(PytorchChatModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._model = None
+
+    @classmethod
+    def match(
+        cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        if "OmniLMM" in model_family.model_name:
+            return True
+        return False
+
+    def load(self):
+        device = self._pytorch_model_config.get("device", "auto")
+        device = select_device(device)
+
+        self._model = OmniLMMChat(self.model_path, device_map=device)
+
+    def _message_content_to_OmniLMM(
+        self, content
+    ) -> Tuple[List[Dict[str, str]], List[Dict[str, str]]]:
+        def _ensure_url(_url):
+            if _url.startswith("data:"):
+                logging.info("Parse url by base64 decoder.")
+                # https://platform.openai.com/docs/guides/vision/uploading-base-64-encoded-images
+                # e.g. f"data:image/jpeg;base64,{base64_image}"
+                _type, data = _url.split(";")
+                _, ext = _type.split("/")
+                data = data[len("base64,") :]
+                data = base64.b64decode(data.encode("utf-8"))
+
+                with tempfile.NamedTemporaryFile(suffix=f".{ext}", delete=False) as f:
+                    f.write(data)
+                logging.info("Dump base64 data to %s", f.name)
+                return f.name
+            else:
+                if len(_url) > 2048:
+                    raise Exception(f"Image url is too long, {len(_url)} > 2048.")
+                return _url
+
+        if not isinstance(content, str):
+            images = []
+            other_content = []
+
+            for c in content:
+                if c.get("type") == "image_url":
+                    images.append(
+                        {"image": _ensure_url(c["image_url"]["url"]), "type": "image"}
+                    )
+                else:
+                    other_content.append(c)
+
+            images = sorted(images, key=operator.itemgetter("type"))
+            other_content = sorted(other_content, key=operator.itemgetter("type"))
+
+            return images, other_content
+        return [], [{"type": "text", "text": content}]
+
+    def chat(
+        self,
+        prompt: Union[str, List[Dict]],
+        system_prompt: Optional[str] = None,
+        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        generate_config: Optional[PytorchGenerateConfig] = None,
+    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        if generate_config and generate_config.get("stream"):
+            raise Exception(
+                f"Chat with model {self.model_family.model_name} does not support stream."
+            )
+        image_first, prompt = self._message_content_to_OmniLMM(prompt)
+
+        msgs = []
+        query_to_response: List[Dict] = []
+        image_another = []
+        for h in chat_history or []:
+            role = h["role"]
+            image_tmp, content = self._message_content_to_OmniLMM(h["content"])
+            if image_tmp != []:
+                image_another = image_tmp
+            if len(query_to_response) == 0 and role == "user":
+                query_to_response.append(
+                    {"role": "user", "content": content[0]["text"]}
+                )
+            if len(query_to_response) == 1 and role == "assistant":
+                query_to_response.append(
+                    {"role": "assistant", "content": content[0]["text"]}
+                )
+            if len(query_to_response) == 2:
+                msgs.extend(query_to_response)
+                query_to_response = []
+        if image_first != []:
+            image = image_first
+        if image_another != []:
+            image = image_another
+        im_64 = img2base64(image[0]["image"])
+        msgs.append({"role": "user", "content": prompt[0]["text"]})
+        input = {"image": im_64, "question": json.dumps(msgs, ensure_ascii=True)}
+        answer = self._model.chat(input=input)
+
+        return ChatCompletion(
+            id="chat" + str(uuid.uuid1()),
+            object="chat.completion",
+            created=int(time.time()),
+            model=self.model_uid,
+            choices=[
+                ChatCompletionChoice(
+                    index=0,
+                    message={"role": "assistant", "content": answer},
+                    finish_reason="stop",
+                )
+            ],
+            usage=CompletionUsage(
+                prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
+            ),
+        )
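Both new multimodal handlers plug into the existing chat interface. The sketch below shows how a client might exercise one of them; the endpoint, the model name "deepseek-vl-chat", and the launch parameters are assumptions for illustration only, and the names actually registered live in the updated llm_family.json:

    # Rough client-side sketch, not part of the diff; names and endpoint are assumed.
    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")  # assumed local endpoint
    model_uid = client.launch_model(
        model_name="deepseek-vl-chat",  # assumed registered name
        model_format="pytorch",
    )
    model = client.get_model(model_uid)
    completion = model.chat(
        prompt=[
            {"type": "text", "text": "What is in this picture?"},
            {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
        ]
    )
    print(completion["choices"][0]["message"]["content"])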
xinference/model/llm/pytorch/qwen_vl.py
CHANGED

@@ -53,6 +53,8 @@ class QwenVLChatModel(PytorchChatModel):
 
         device = self._pytorch_model_config.get("device", "auto")
         device = select_device(device)
+        # for multiple GPU, set back to auto to make multiple devices work
+        device = "auto" if device == "cuda" else device
 
         self._tokenizer = AutoTokenizer.from_pretrained(
             self.model_path,
xinference/model/llm/pytorch/yi_vl.py
CHANGED

@@ -59,6 +59,8 @@ class YiVLChatModel(PytorchChatModel):
 
         self._device = self._pytorch_model_config.get("device", "auto")
         self._device = select_device(self._device)
+        # for multiple GPU, set back to auto to make multiple devices work
+        self._device = "auto" if self._device == "cuda" else self._device
 
         key_info["model_path"] = self.model_path
         # Default device_map is auto, it can loads model to multiple cards.
@@ -190,7 +192,7 @@ class YiVLChatModel(PytorchChatModel):
                 prompt, self._tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
             )
             .unsqueeze(0)
-            .to(self._device)
+            .to(self._model.device)
         )
 
         images = state.get_images(return_pil=True)
@@ -215,7 +217,7 @@ class YiVLChatModel(PytorchChatModel):
             "input_ids": input_ids,
             "images": image_tensor.unsqueeze(0)
             .to(dtype=torch.bfloat16)
-            .to(self._device),
+            .to(self._model.device),
             "streamer": streamer,
             "do_sample": True,
             "top_p": float(top_p),
xinference/model/llm/utils.py
CHANGED

@@ -163,7 +163,7 @@ class ChatModelMixin:
 
            for i, message in enumerate(chat_history):
                role = get_role(message["role"])
-                content = message["content"]
+                content = message.get("content")
                tool_calls = message.get("tool_calls")
                if tool_calls:
                    content = tool_calls[0]["function"]
@@ -248,7 +248,7 @@ Begin!"""
            ret = f"<|im_start|>system\n{prompt_style.system_prompt}<|im_end|>"
            for message in chat_history:
                role = get_role(message["role"])
-                content = message["content"]
+                content = message.get("content")
 
                ret += prompt_style.intra_message_sep
                if tools:
@@ -421,6 +421,16 @@ Begin!"""
                else:
                    ret += f"{role}".rstrip()
            return ret
+        elif prompt_style.style_name == "MINICPM-2B":
+            ret = ""
+            for message in chat_history:
+                content = message["content"] or ""
+                role = get_role(message["role"])
+                if role == "user":
+                    ret += "<用户>" + content.strip()
+                else:
+                    ret += "<AI>" + content.strip()
+            return ret
        else:
            raise ValueError(f"Invalid prompt style: {prompt_style.style_name}")
 
@@ -436,6 +446,11 @@ Begin!"""
                "index": i,
                "delta": {
                    "content": choice["text"],
+                    **(
+                        {"tool_calls": choice["tool_calls"]}
+                        if "tool_calls" in choice
+                        else {}
+                    ),
                },
                "finish_reason": choice["finish_reason"],
            }
@@ -582,10 +597,9 @@ Begin!"""
        return text, None, None
 
    @classmethod
-    def _tool_calls_completion(cls, model_family, model_uid, c, tools):
-        _id = str(uuid.uuid4())
+    def _eval_tool_arguments(cls, model_family, c, tools):
        family = model_family.model_family or model_family.model_name
-        if "gorilla-openfunctions-v1" == family:
+        if family in ["gorilla-openfunctions-v1", "gorilla-openfunctions-v2"]:
            content, func, args = cls._eval_gorilla_openfunctions_arguments(c, tools)
        elif "chatglm3" == family:
            content, func, args = cls._eval_chatglm3_arguments(c, tools)
@@ -596,7 +610,41 @@ Begin!"""
                f"Model {model_family.model_name} is not support tool calls."
            )
        logger.debug("Tool call content: %s, func: %s, args: %s", content, func, args)
+        return content, func, args
 
+    @classmethod
+    def _tools_token_filter(cls, model_family):
+        """
+        Generates a filter function for Qwen series models to retain outputs after "\nFinal Answer:".
+
+        Returns:
+            A function that takes tokens (string output by the model so far) as input
+            returns True if current token is after "\nFinal Answer:", else False.
+        """
+        family = model_family.model_family or model_family.model_name
+        if family in ["qwen-chat", "qwen1.5-chat"]:
+            # Encapsulating function to reset 'found' after each call
+            found = False
+
+            def process_token(tokens: str):
+                nonlocal found
+                # Once "Final Answer:" is found, future tokens are allowed.
+                if found:
+                    return True
+                # Check if the token ends with "\nFinal Answer:" and update `found`.
+                if tokens.endswith("\nFinal Answer:"):
+                    found = True
+                return False
+
+            return process_token
+        else:
+            # For other families, allow all tokens.
+            return lambda tokens: True
+
+    @classmethod
+    def _tool_calls_completion(cls, model_family, model_uid, c, tools):
+        _id = str(uuid.uuid4())
+        content, func, args = cls._eval_tool_arguments(model_family, c, tools)
        if func:
            m = {
                "role": "assistant",
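The new _tools_token_filter helper returns a closure that, for Qwen-family models, suppresses streamed text until the cumulative output ends with "\nFinal Answer:"; everything produced before that marker (the ReAct thought/action trace) is filtered out, and everything after it passes through. A standalone restatement of that logic, for illustration only:

    # Mirrors the filter semantics added above; not part of the diff.
    def make_qwen_filter():
        found = False

        def process_token(tokens: str) -> bool:
            nonlocal found
            if found:  # marker already seen: let everything through
                return True
            if tokens.endswith("\nFinal Answer:"):
                found = True  # marker just completed; this call itself is suppressed
            return False

        return process_token

    allow = make_qwen_filter()
    assert allow("Thought: I need no tool.") is False
    assert allow("Thought: I need no tool.\nFinal Answer:") is False
    assert allow("Thought: I need no tool.\nFinal Answer: 42") is True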
xinference/model/llm/vllm/core.py
CHANGED

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import json
 import logging
 import multiprocessing
 import time
@@ -36,6 +37,8 @@ from ....types import (
     CompletionChoice,
     CompletionChunk,
     CompletionUsage,
+    ToolCallFunction,
+    ToolCalls,
 )
 from .. import LLM, LLMFamilyV1, LLMSpecV1
 from ..llm_family import CustomLLMFamilyV1
@@ -80,25 +83,36 @@ try:
 except ImportError:
     VLLM_INSTALLED = False
 
-VLLM_SUPPORTED_MODELS = [
+VLLM_SUPPORTED_MODELS = [
+    "llama-2",
+    "baichuan",
+    "internlm-16k",
+    "mistral-v0.1",
+    "Yi",
+    "code-llama",
+    "code-llama-python",
+]
 VLLM_SUPPORTED_CHAT_MODELS = [
     "llama-2-chat",
     "vicuna-v1.3",
     "vicuna-v1.5",
     "baichuan-chat",
+    "baichuan-2-chat",
     "internlm-chat-7b",
     "internlm-chat-8k",
     "internlm-chat-20b",
+    "internlm2-chat",
     "qwen-chat",
-    "Yi",
     "Yi-chat",
-    "code-llama",
-    "code-llama-python",
     "code-llama-instruct",
     "mistral-instruct-v0.1",
     "mistral-instruct-v0.2",
     "mixtral-instruct-v0.1",
     "chatglm3",
+    "chatglm3-32k",
+    "chatglm3-128k",
+    "deepseek-chat",
+    "deepseek-coder-instruct",
 ]
 if VLLM_INSTALLED and vllm.__version__ >= "0.3.0":
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen1.5-chat")
@@ -110,6 +124,9 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.3.3":
     VLLM_SUPPORTED_CHAT_MODELS.append("orion-chat")
     VLLM_SUPPORTED_CHAT_MODELS.append("orion-chat-rag")
 
+if VLLM_INSTALLED and vllm.__version__ >= "0.4.0":
+    VLLM_SUPPORTED_CHAT_MODELS.append("qwen1.5-moe-chat")
+
 
 class VLLMModel(LLM):
     def __init__(
@@ -290,6 +307,7 @@ class VLLMModel(LLM):
         self,
         prompt: str,
         generate_config: Optional[Dict] = None,
+        tools: object = False,
     ) -> Union[Completion, AsyncGenerator[CompletionChunk, None]]:
         try:
             from vllm.sampling_params import SamplingParams
@@ -316,16 +334,46 @@ class VLLMModel(LLM):
 
         async def stream_results() -> AsyncGenerator[CompletionChunk, None]:
             previous_texts = [""] * sanitized_generate_config["n"]
+            tools_token_filter = ChatModelMixin._tools_token_filter(self.model_family)
             async for _request_output in results_generator:
                 chunk = self._convert_request_output_to_completion_chunk(
                     request_id=request_id,
                     model=self.model_uid,
                     request_output=_request_output,
                 )
+
                 for i, choice in enumerate(chunk["choices"]):
                     delta = choice["text"][len(previous_texts[i]) :]
                     previous_texts[i] = choice["text"]
                     choice["text"] = delta
+
+                if tools:
+                    # only handle the first choice
+                    choice = chunk["choices"][0]
+                    if choice["finish_reason"] is not None:
+                        # use previous text for evaluation temporarily
+                        choice_delta = choice["text"]
+                        choice["text"] = previous_texts[0]
+                        _content, func, args = ChatModelMixin._eval_tool_arguments(
+                            self.model_family, chunk, tools
+                        )
+                        choice["text"] = choice_delta
+                        if func is not None:
+                            choice["text"] = None
+                            choice["finish_reason"] = "tool_calls"
+                            choice["tool_calls"] = [
+                                ToolCalls(
+                                    id=str(uuid.uuid4()),
+                                    type="function",
+                                    function=ToolCallFunction(
+                                        name=func,
+                                        arguments=json.dumps(args, ensure_ascii=False),
+                                    ),
+                                )
+                            ]
+                    # use a filter function to skip Qwen's react thought process
+                    elif not tools_token_filter(previous_texts[0]):
+                        continue
                 prompt_tokens = len(_request_output.prompt_token_ids)
                 completion_tokens = sum(
                     len(output.token_ids) for output in _request_output.outputs
@@ -413,7 +461,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         generate_config = self._sanitize_chat_config(generate_config)
         # TODO(codingl2k1): qwen hacky to set stop for function call.
         model_family = self.model_family.model_family or self.model_family.model_name
-        if tools and "qwen-chat" == model_family:
+        if tools and model_family in ["qwen-chat", "qwen1.5-chat"]:
             stop = generate_config.get("stop")
             if isinstance(stop, str):
                 generate_config["stop"] = [stop, "Observation:"]
@@ -426,7 +474,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         stream = generate_config.get("stream", None)
 
         if stream:
-            agen = await self.async_generate(full_prompt, generate_config)
+            agen = await self.async_generate(full_prompt, generate_config, tools)
             assert isinstance(agen, AsyncGenerator)
             return self._async_to_chat_completion_chunks(agen)
         else:
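With the changes above, when a chat request passes tools and the final streamed text parses as a function call, the first choice of the closing chunk is rewritten: its text is cleared, finish_reason becomes "tool_calls", and a tool_calls entry is attached. Once serialized to the client it roughly takes the shape below; the function name and arguments are invented for illustration:

    # Approximate shape of the rewritten final streaming choice; values are examples.
    example_choice = {
        "text": None,
        "finish_reason": "tool_calls",
        "tool_calls": [
            {
                "id": "c0ffee00-0000-4000-8000-000000000000",  # uuid4 placeholder
                "type": "function",
                "function": {
                    "name": "get_current_weather",  # example name
                    "arguments": "{\"location\": \"Paris\"}",  # JSON-encoded args
                },
            }
        ],
    }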
xinference/model/rerank/core.py
CHANGED

@@ -134,8 +134,11 @@ class RerankModel:
         top_n: Optional[int],
         max_chunks_per_doc: Optional[int],
         return_documents: Optional[bool],
+        **kwargs,
     ) -> Rerank:
         assert self._model is not None
+        if kwargs:
+            raise ValueError("rerank hasn't support extra parameter.")
         if max_chunks_per_doc is not None:
             raise ValueError("rerank hasn't support `max_chunks_per_doc` parameter.")
         sentence_combinations = [[query, doc] for doc in documents]