xinference 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +34 -15
- xinference/client/oscar/actor_client.py +4 -3
- xinference/client/restful/restful_client.py +40 -18
- xinference/core/supervisor.py +48 -9
- xinference/core/worker.py +13 -8
- xinference/deploy/cmdline.py +22 -9
- xinference/model/audio/__init__.py +40 -1
- xinference/model/audio/core.py +25 -45
- xinference/model/audio/custom.py +148 -0
- xinference/model/core.py +6 -9
- xinference/model/embedding/core.py +1 -2
- xinference/model/embedding/model_spec.json +24 -0
- xinference/model/embedding/model_spec_modelscope.json +24 -0
- xinference/model/image/core.py +12 -4
- xinference/model/image/stable_diffusion/core.py +8 -7
- xinference/model/llm/__init__.py +0 -6
- xinference/model/llm/core.py +9 -14
- xinference/model/llm/ggml/llamacpp.py +2 -10
- xinference/model/llm/llm_family.json +507 -7
- xinference/model/llm/llm_family.py +41 -4
- xinference/model/llm/llm_family_modelscope.json +260 -0
- xinference/model/llm/pytorch/baichuan.py +4 -3
- xinference/model/llm/pytorch/chatglm.py +5 -2
- xinference/model/llm/pytorch/core.py +37 -41
- xinference/model/llm/pytorch/falcon.py +6 -5
- xinference/model/llm/pytorch/internlm2.py +5 -2
- xinference/model/llm/pytorch/llama_2.py +6 -5
- xinference/model/llm/pytorch/qwen_vl.py +2 -0
- xinference/model/llm/pytorch/vicuna.py +4 -3
- xinference/model/llm/pytorch/yi_vl.py +4 -2
- xinference/model/llm/utils.py +42 -4
- xinference/model/llm/vllm/core.py +54 -6
- xinference/model/rerank/core.py +26 -12
- xinference/model/rerank/model_spec.json +24 -0
- xinference/model/rerank/model_spec_modelscope.json +25 -1
- xinference/model/utils.py +12 -1
- xinference/thirdparty/omnilmm/chat.py +1 -1
- xinference/types.py +70 -19
- xinference/utils.py +1 -0
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.26fdbfbe.js +3 -0
- xinference/web/ui/build/static/js/main.26fdbfbe.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/15e2cf8cd8d0989719b6349428ff576f9009ff4c2dcc52378be0bd938e82495e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/1870cd6f7054d04e049e363c0a85526584fe25519378609d2838e28d7492bbf1.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/1e86938a0cdf706d21e99b21f5d868fa247c0c88b26807047e26dcdc4d9a9db3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3c2f277c93c5f1638e08db38df0d0fb4e58d1c5571aea03241a5c04ff4094704.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3fa1f69162f9c6dc0f6a6e21b64d49d6b8e6fa8dfa59a82cf829931c5f97d99f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/44774c783428f952d8e2e4ad0998a9c5bc16a57cd9c68b7c5ff18aaa5a41d65c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5393569d846332075b93b55656716a34f50e0a8c970be789502d7e6c49755fd7.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/59ce49eae0f486af4c5034d4d2f9ca77c3ec3a32ecc560085caf5ef482b5f4c9.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/62e257ed9016471035fa1a7da57c9e2a4250974ed566b4d1295873d747c68eb2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/63a4c48f0326d071c7772c46598215c006ae41fd3d4ff3577fe717de66ad6e89.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b9cbcb6d77ba21b22c6950b6fb5b305d23c19cf747f99f7d48b6b046f8f7b1b0.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d06a96a3c9c32e42689094aa3aaad41c8125894e956b8f84a70fadce6e3f65b3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/de0299226173b0662b573f49e3992220f6611947073bd66ac079728a8bc8837d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e606671420d2937102c3c34b4b04056c11736408c1d3347b8cf42dfe61fb394b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e6eccc9aa641e7da833492e27846dc965f9750281420977dc84654ca6ed221e4.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e9b52d171223bb59fb918316297a051cdfd42dd453e8260fd918e90bc0a4ebdf.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f4d5d1a41892a754c1ee0237450d804b20612d1b657945b59e564161ea47aa7a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f9290c0738db50065492ceedc6a4af25083fe18399b7c44d942273349ad9e643.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/fad4cd70de36ef6e6d5f8fd74a10ded58d964a8a91ef7681693fbb8376552da7.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/feabb04b4aa507102da0a64398a40818e878fd1df9b75dda8461b3e1e7ff3f11.json +1 -0
- {xinference-0.10.0.dist-info → xinference-0.10.2.dist-info}/METADATA +13 -10
- {xinference-0.10.0.dist-info → xinference-0.10.2.dist-info}/RECORD +71 -74
- xinference/model/llm/ggml/ctransformers.py +0 -281
- xinference/model/llm/ggml/ctransformers_util.py +0 -161
- xinference/web/ui/build/static/js/main.98516614.js +0 -3
- xinference/web/ui/build/static/js/main.98516614.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0bd70b1ecf307e2681318e864f4692305b6350c8683863007f4caf2f9ac33b6e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0db651c046ef908f45cde73af0dbea0a797d3e35bb57f4a0863b481502103a64.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/139969fd25258eb7decc9505f30b779089bba50c402bb5c663008477c7bff73b.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/18e5d5422e2464abf4a3e6d38164570e2e426e0a921e9a2628bbae81b18da353.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3d93bd9a74a1ab0cec85af40f9baa5f6a8e7384b9e18c409b95a81a7b45bb7e2.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3e055de705e397e1d413d7f429589b1a98dd78ef378b97f0cdb462c5f2487d5e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3f357ab57b8e7fade54c667f0e0ebf2787566f72bfdca0fea14e395b5c203753.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/4fd24800544873512b540544ae54601240a5bfefd9105ff647855c64f8ad828f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/52aa27272b4b9968f62666262b47661cb1992336a2aff3b13994cc36877b3ec3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/60c4b98d8ea7479fb0c94cfd19c8128f17bd7e27a1e73e6dd9adf6e9d88d18eb.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/7e094845f611802b024b57439cbf911038169d06cdf6c34a72a7277f35aa71a4.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/95c8cc049fadd23085d8623e1d43d70b614a4e52217676f186a417dca894aa09.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/98b7ef307f436affe13d75a4f265b27e828ccc2b10ffae6513abe2681bc11971.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/9d7c49815d97539207e5aab2fb967591b5fed7791218a0762539efc9491f36af.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/a8070ce4b780b4a044218536e158a9e7192a6c80ff593fdc126fee43f46296b5.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/b400cfc9db57fa6c70cd2bad055b73c5079fde0ed37974009d898083f6af8cd8.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/bd04667474fd9cac2983b03725c218908a6cc0ee9128a5953cd00d26d4877f60.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/c230a727b8f68f0e62616a75e14a3d33026dc4164f2e325a9a8072d733850edb.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d0d0b591d9adaf42b83ad6633f8b7c118541a4b80ea957c303d3bf9b86fbad0a.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d44a6eb6106e09082b691a315c9f6ce17fcfe25beb7547810e0d271ce3301cd2.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e1d9b2ae4e1248658704bc6bfc5d6160dcd1a9e771ea4ae8c1fed0aaddeedd29.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/fe5db70859503a54cbe71f9637e5a314cda88b1f0eecb733b6e6f837697db1ef.json +0 -1
- /xinference/web/ui/build/static/js/{main.98516614.js.LICENSE.txt → main.26fdbfbe.js.LICENSE.txt} +0 -0
- {xinference-0.10.0.dist-info → xinference-0.10.2.dist-info}/LICENSE +0 -0
- {xinference-0.10.0.dist-info → xinference-0.10.2.dist-info}/WHEEL +0 -0
- {xinference-0.10.0.dist-info → xinference-0.10.2.dist-info}/entry_points.txt +0 -0
- {xinference-0.10.0.dist-info → xinference-0.10.2.dist-info}/top_level.txt +0 -0
xinference/model/llm/pytorch/yi_vl.py
CHANGED
@@ -59,6 +59,8 @@ class YiVLChatModel(PytorchChatModel):
 
         self._device = self._pytorch_model_config.get("device", "auto")
         self._device = select_device(self._device)
+        # for multiple GPU, set back to auto to make multiple devices work
+        self._device = "auto" if self._device == "cuda" else self._device
 
         key_info["model_path"] = self.model_path
         # Default device_map is auto, it can loads model to multiple cards.
@@ -190,7 +192,7 @@ class YiVLChatModel(PytorchChatModel):
                 prompt, self._tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
             )
             .unsqueeze(0)
-            .to(self._device)
+            .to(self._model.device)
         )
 
         images = state.get_images(return_pil=True)
@@ -215,7 +217,7 @@ class YiVLChatModel(PytorchChatModel):
             "input_ids": input_ids,
             "images": image_tensor.unsqueeze(0)
             .to(dtype=torch.bfloat16)
-            .to(self._device),
+            .to(self._model.device),
             "streamer": streamer,
             "do_sample": True,
             "top_p": float(top_p),
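The yi_vl.py change stops moving tensors to the originally selected device string and instead follows the model's own placement, which matters once device_map="auto" shards the model across several GPUs. A minimal sketch of the same pattern for any Hugging Face-style model (illustrative only, not xinference code; the helper name is made up):

import torch

def move_to_model_device(model, tensor: torch.Tensor) -> torch.Tensor:
    # With device_map="auto" the model may span multiple GPUs; follow the device
    # reported by the model (or of its first parameter) instead of a fixed one.
    device = getattr(model, "device", None)
    if device is None:
        device = next(model.parameters()).device
    return tensor.to(device)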
xinference/model/llm/utils.py
CHANGED
@@ -163,7 +163,7 @@ class ChatModelMixin:
 
         for i, message in enumerate(chat_history):
             role = get_role(message["role"])
-            content = message["content"]
+            content = message.get("content")
             tool_calls = message.get("tool_calls")
             if tool_calls:
                 content = tool_calls[0]["function"]
@@ -248,7 +248,7 @@ Begin!"""
         ret = f"<|im_start|>system\n{prompt_style.system_prompt}<|im_end|>"
         for message in chat_history:
             role = get_role(message["role"])
-            content = message["content"]
+            content = message.get("content")
 
             ret += prompt_style.intra_message_sep
             if tools:
@@ -446,6 +446,11 @@ Begin!"""
                 "index": i,
                 "delta": {
                     "content": choice["text"],
+                    **(
+                        {"tool_calls": choice["tool_calls"]}
+                        if "tool_calls" in choice
+                        else {}
+                    ),
                 },
                 "finish_reason": choice["finish_reason"],
             }
@@ -592,8 +597,7 @@ Begin!"""
         return text, None, None
 
     @classmethod
-    def _tool_calls_completion(cls, model_family, model_uid, c, tools):
-        _id = str(uuid.uuid4())
+    def _eval_tool_arguments(cls, model_family, c, tools):
         family = model_family.model_family or model_family.model_name
         if family in ["gorilla-openfunctions-v1", "gorilla-openfunctions-v2"]:
             content, func, args = cls._eval_gorilla_openfunctions_arguments(c, tools)
@@ -606,7 +610,41 @@ Begin!"""
                 f"Model {model_family.model_name} is not support tool calls."
             )
         logger.debug("Tool call content: %s, func: %s, args: %s", content, func, args)
+        return content, func, args
+
+    @classmethod
+    def _tools_token_filter(cls, model_family):
+        """
+        Generates a filter function for Qwen series models to retain outputs after "\nFinal Answer:".
 
+        Returns:
+            A function that takes tokens (string output by the model so far) as input
+            returns True if current token is after "\nFinal Answer:", else False.
+        """
+        family = model_family.model_family or model_family.model_name
+        if family in ["qwen-chat", "qwen1.5-chat"]:
+            # Encapsulating function to reset 'found' after each call
+            found = False
+
+            def process_token(tokens: str):
+                nonlocal found
+                # Once "Final Answer:" is found, future tokens are allowed.
+                if found:
+                    return True
+                # Check if the token ends with "\nFinal Answer:" and update `found`.
+                if tokens.endswith("\nFinal Answer:"):
+                    found = True
+                return False
+
+            return process_token
+        else:
+            # For other families, allow all tokens.
+            return lambda tokens: True
+
+    @classmethod
+    def _tool_calls_completion(cls, model_family, model_uid, c, tools):
+        _id = str(uuid.uuid4())
+        content, func, args = cls._eval_tool_arguments(model_family, c, tools)
        if func:
            m = {
                "role": "assistant",
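The new _tools_token_filter helper is a stateful closure: for Qwen-family models it suppresses the ReAct scratchpad and only lets text through once the stream has produced "\nFinal Answer:". A standalone sketch of the same idea (illustrative, not the xinference class):

def make_final_answer_filter():
    # Predicate over the full text generated so far: False until the text ends
    # with "\nFinal Answer:", then True for every call after that point.
    found = False

    def allow(tokens: str) -> bool:
        nonlocal found
        if found:
            return True
        if tokens.endswith("\nFinal Answer:"):
            found = True
        return False

    return allow


allow = make_final_answer_filter()
assert allow("Thought: need a tool\nAction: search") is False
assert allow("Thought: done\nFinal Answer:") is False  # the marker itself is still skipped
assert allow("Thought: done\nFinal Answer: Paris") is True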
xinference/model/llm/vllm/core.py
CHANGED
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import json
 import logging
 import multiprocessing
 import time
@@ -36,6 +37,8 @@ from ....types import (
     CompletionChoice,
     CompletionChunk,
     CompletionUsage,
+    ToolCallFunction,
+    ToolCalls,
 )
 from .. import LLM, LLMFamilyV1, LLMSpecV1
 from ..llm_family import CustomLLMFamilyV1
@@ -80,7 +83,15 @@ try:
 except ImportError:
     VLLM_INSTALLED = False
 
-VLLM_SUPPORTED_MODELS = ["llama-2", "baichuan", "internlm-16k", "mistral-v0.1"]
+VLLM_SUPPORTED_MODELS = [
+    "llama-2",
+    "baichuan",
+    "internlm-16k",
+    "mistral-v0.1",
+    "Yi",
+    "code-llama",
+    "code-llama-python",
+]
 VLLM_SUPPORTED_CHAT_MODELS = [
     "llama-2-chat",
     "vicuna-v1.3",
@@ -90,21 +101,22 @@ VLLM_SUPPORTED_CHAT_MODELS = [
     "internlm-chat-7b",
     "internlm-chat-8k",
     "internlm-chat-20b",
+    "internlm2-chat",
     "qwen-chat",
-    "Yi",
     "Yi-chat",
-    "code-llama",
-    "code-llama-python",
     "code-llama-instruct",
     "mistral-instruct-v0.1",
     "mistral-instruct-v0.2",
     "mixtral-instruct-v0.1",
     "chatglm3",
+    "chatglm3-32k",
+    "chatglm3-128k",
     "deepseek-chat",
     "deepseek-coder-instruct",
 ]
 if VLLM_INSTALLED and vllm.__version__ >= "0.3.0":
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen1.5-chat")
+    VLLM_SUPPORTED_CHAT_MODELS.append("codeqwen1.5-chat")
 
 if VLLM_INSTALLED and vllm.__version__ >= "0.3.2":
     VLLM_SUPPORTED_CHAT_MODELS.append("gemma-it")
@@ -113,6 +125,11 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.3.3":
     VLLM_SUPPORTED_CHAT_MODELS.append("orion-chat")
     VLLM_SUPPORTED_CHAT_MODELS.append("orion-chat-rag")
 
+if VLLM_INSTALLED and vllm.__version__ >= "0.4.0":
+    VLLM_SUPPORTED_CHAT_MODELS.append("qwen1.5-moe-chat")
+    VLLM_SUPPORTED_MODELS.append("c4ai-command-r-v01")
+    VLLM_SUPPORTED_MODELS.append("c4ai-command-r-v01-4bit")
+
 
 class VLLMModel(LLM):
     def __init__(
@@ -293,6 +310,7 @@ class VLLMModel(LLM):
         self,
         prompt: str,
         generate_config: Optional[Dict] = None,
+        tools: object = False,
     ) -> Union[Completion, AsyncGenerator[CompletionChunk, None]]:
         try:
             from vllm.sampling_params import SamplingParams
@@ -319,16 +337,46 @@
 
         async def stream_results() -> AsyncGenerator[CompletionChunk, None]:
             previous_texts = [""] * sanitized_generate_config["n"]
+            tools_token_filter = ChatModelMixin._tools_token_filter(self.model_family)
             async for _request_output in results_generator:
                 chunk = self._convert_request_output_to_completion_chunk(
                     request_id=request_id,
                     model=self.model_uid,
                     request_output=_request_output,
                 )
+
                 for i, choice in enumerate(chunk["choices"]):
                     delta = choice["text"][len(previous_texts[i]) :]
                     previous_texts[i] = choice["text"]
                     choice["text"] = delta
+
+                if tools:
+                    # only handle the first choice
+                    choice = chunk["choices"][0]
+                    if choice["finish_reason"] is not None:
+                        # use previous text for evaluation temporarily
+                        choice_delta = choice["text"]
+                        choice["text"] = previous_texts[0]
+                        _content, func, args = ChatModelMixin._eval_tool_arguments(
+                            self.model_family, chunk, tools
+                        )
+                        choice["text"] = choice_delta
+                        if func is not None:
+                            choice["text"] = None
+                            choice["finish_reason"] = "tool_calls"
+                            choice["tool_calls"] = [
+                                ToolCalls(
+                                    id=str(uuid.uuid4()),
+                                    type="function",
+                                    function=ToolCallFunction(
+                                        name=func,
+                                        arguments=json.dumps(args, ensure_ascii=False),
+                                    ),
+                                )
+                            ]
+                    # use a filter function to skip Qwen's react thought process
+                    elif not tools_token_filter(previous_texts[0]):
+                        continue
                 prompt_tokens = len(_request_output.prompt_token_ids)
                 completion_tokens = sum(
                     len(output.token_ids) for output in _request_output.outputs
@@ -416,7 +464,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         generate_config = self._sanitize_chat_config(generate_config)
         # TODO(codingl2k1): qwen hacky to set stop for function call.
         model_family = self.model_family.model_family or self.model_family.model_name
-        if tools and "qwen-chat" == model_family:
+        if tools and model_family in ["qwen-chat", "qwen1.5-chat"]:
             stop = generate_config.get("stop")
             if isinstance(stop, str):
                 generate_config["stop"] = [stop, "Observation:"]
@@ -429,7 +477,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         stream = generate_config.get("stream", None)
 
         if stream:
-            agen = await self.async_generate(full_prompt, generate_config)
+            agen = await self.async_generate(full_prompt, generate_config, tools)
             assert isinstance(agen, AsyncGenerator)
             return self._async_to_chat_completion_chunks(agen)
         else:
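With tools passed, the vLLM streaming path now rewrites the final chunk: the accumulated text is evaluated for a function call and, if one is found, the terminal choice carries finish_reason "tool_calls" plus a tool_calls list instead of text. Roughly, a client would see a final choice shaped like the following (a hedged illustration; the tool name and arguments are made up):

import json
import uuid

# Hypothetical final CompletionChunk choice once a tool call has been detected.
final_choice = {
    "index": 0,
    "text": None,
    "finish_reason": "tool_calls",
    "tool_calls": [
        {
            "id": str(uuid.uuid4()),
            "type": "function",
            "function": {
                "name": "get_current_weather",  # hypothetical tool name
                "arguments": json.dumps({"location": "Paris"}, ensure_ascii=False),
            },
        }
    ],
}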
xinference/model/rerank/core.py
CHANGED
@@ -42,8 +42,9 @@ def get_rerank_model_descriptions():
 class RerankModelSpec(CacheableModelSpec):
     model_name: str
     language: List[str]
+    type: Optional[str] = "normal"
     model_id: str
-    model_revision: str
+    model_revision: Optional[str]
     model_hub: str = "huggingface"
 
 
@@ -63,6 +64,7 @@ class RerankModelDescription(ModelDescription):
             "model_type": "rerank",
             "address": self.address,
             "accelerators": self.devices,
+            "type": self._model_spec.type,
             "model_name": self._model_spec.model_name,
             "language": self._model_spec.language,
             "model_revision": self._model_spec.model_revision,
@@ -97,12 +99,14 @@ def generate_rerank_description(model_spec: RerankModelSpec) -> Dict[str, List[D
 class RerankModel:
     def __init__(
         self,
+        model_spec: RerankModelSpec,
         model_uid: str,
         model_path: str,
         device: Optional[str] = None,
         use_fp16: bool = False,
         model_config: Optional[Dict] = None,
     ):
+        self._model_spec = model_spec
         self._model_uid = model_uid
         self._model_path = model_path
         self._device = device
@@ -112,20 +116,25 @@ class RerankModel:
 
     def load(self):
         try:
-
+            if self._model_spec.type == "normal":
+                from FlagEmbedding import FlagReranker
+            elif self._model_spec.type == "LLM-based":
+                from FlagEmbedding import FlagLLMReranker as FlagReranker
+            elif self._model_spec.type == "LLM-based layerwise":
+                from FlagEmbedding import LayerWiseFlagLLMReranker as FlagReranker
+            else:
+                raise RuntimeError(
+                    f"Unsupported Rank model type: {self._model_spec.type}"
+                )
         except ImportError:
-            error_message = "Failed to import module '
+            error_message = "Failed to import module 'FlagEmbedding'"
             installation_guide = [
-                "Please make sure '
-                "You can install it by `pip install
+                "Please make sure 'FlagEmbedding' is installed. ",
+                "You can install it by `pip install FlagEmbedding`\n",
             ]
 
             raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
-        self._model =
-            self._model_path, device=self._device, **self._model_config
-        )
-        if self._use_fp16:
-            self._model.model.half()
+        self._model = FlagReranker(self._model_path, use_fp16=True)
 
     def rerank(
         self,
@@ -134,12 +143,15 @@ class RerankModel:
         top_n: Optional[int],
         max_chunks_per_doc: Optional[int],
         return_documents: Optional[bool],
+        **kwargs,
     ) -> Rerank:
         assert self._model is not None
+        if kwargs:
+            raise ValueError("rerank hasn't support extra parameter.")
         if max_chunks_per_doc is not None:
             raise ValueError("rerank hasn't support `max_chunks_per_doc` parameter.")
         sentence_combinations = [[query, doc] for doc in documents]
-        similarity_scores = self._model.
+        similarity_scores = self._model.compute_score(sentence_combinations)
         sim_scores_argsort = list(reversed(np.argsort(similarity_scores)))
         if top_n is not None:
             sim_scores_argsort = sim_scores_argsort[:top_n]
@@ -221,7 +233,9 @@ def create_rerank_model_instance(
 
     model_path = cache(model_spec)
     use_fp16 = kwargs.pop("use_fp16", False)
-    model = RerankModel(model_uid, model_path, use_fp16=use_fp16, model_config=kwargs)
+    model = RerankModel(
+        model_spec, model_uid, model_path, use_fp16=use_fp16, model_config=kwargs
+    )
    model_description = RerankModelDescription(
        subpool_addr, devices, model_spec, model_path=model_path
    )
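The rerank loader now picks the FlagEmbedding class from the spec's new type field ("normal", "LLM-based", "LLM-based layerwise"). A small sketch of that dispatch, with the import deferred so it only fails when a reranker is actually requested (illustrative only; the helper and mapping names are made up, the class names are the ones imported in the diff):

from typing import Optional

_RERANKER_CLASS_BY_TYPE = {
    "normal": "FlagReranker",
    "LLM-based": "FlagLLMReranker",
    "LLM-based layerwise": "LayerWiseFlagLLMReranker",
}


def resolve_reranker_class(spec_type: Optional[str]):
    # Mirrors the branches added in RerankModel.load().
    name = _RERANKER_CLASS_BY_TYPE.get(spec_type or "normal")
    if name is None:
        raise RuntimeError(f"Unsupported Rank model type: {spec_type}")
    import FlagEmbedding  # deferred: raises ImportError only when rerank is used

    return getattr(FlagEmbedding, name)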
xinference/model/rerank/model_spec.json
CHANGED
@@ -1,20 +1,44 @@
 [
   {
     "model_name": "bge-reranker-large",
+    "type": "normal",
     "language": ["en", "zh"],
     "model_id": "BAAI/bge-reranker-large",
     "model_revision": "27c9168d479987529781de8474dff94d69beca11"
   },
   {
     "model_name": "bge-reranker-base",
+    "type": "normal",
     "language": ["en", "zh"],
     "model_id": "BAAI/bge-reranker-base",
     "model_revision": "465b4b7ddf2be0a020c8ad6e525b9bb1dbb708ae"
   },
   {
     "model_name": "bce-reranker-base_v1",
+    "type": "normal",
     "language": ["en", "zh"],
     "model_id": "maidalun1020/bce-reranker-base_v1",
     "model_revision": "eaa31a577a0574e87a08959bd229ca14ce1b5496"
+  },
+  {
+    "model_name": "bge-reranker-v2-m3",
+    "type": "normal",
+    "language": ["en", "zh", "multilingual"],
+    "model_id": "BAAI/bge-reranker-v2-m3",
+    "model_revision": "12e974610ba9083ed95f3edf08d7e899581f4de4"
+  },
+  {
+    "model_name": "bge-reranker-v2-gemma",
+    "type": "LLM-based",
+    "language": ["en", "zh", "multilingual"],
+    "model_id": "BAAI/bge-reranker-v2-gemma",
+    "model_revision": "1787044f8b6fb740a9de4557c3a12377f84d9e17"
+  },
+  {
+    "model_name": "bge-reranker-v2-minicpm-layerwise",
+    "type": "LLM-based layerwise",
+    "language": ["en", "zh", "multilingual"],
+    "model_id": "BAAI/bge-reranker-v2-minicpm-layerwise",
+    "model_revision": "47b5332b296c4d8cb6ee2c60502cc62a0d708881"
   }
 ]
xinference/model/rerank/model_spec_modelscope.json
CHANGED
@@ -1,6 +1,7 @@
 [
   {
     "model_name": "bge-reranker-base",
+    "type": "normal",
     "language": ["en", "zh"],
     "model_id": "Xorbits/bge-reranker-base",
     "model_revision": "v0.0.1",
@@ -8,16 +9,39 @@
   },
   {
     "model_name": "bge-reranker-large",
+    "type": "normal",
     "language": ["en", "zh"],
     "model_id": "Xorbits/bge-reranker-large",
     "model_revision": "v0.0.1",
     "model_hub": "modelscope"
   },
-  {
+  {
     "model_name": "bce-reranker-base_v1",
+    "type": "normal",
     "language": ["en", "zh"],
     "model_id": "maidalun/bce-reranker-base_v1",
     "model_revision": "v0.0.1",
     "model_hub": "modelscope"
+  },
+  {
+    "model_name": "bge-reranker-v2-m3",
+    "type": "normal",
+    "language": ["en", "zh", "multilingual"],
+    "model_id": "AI-ModelScope/bge-reranker-v2-m3",
+    "model_hub": "modelscope"
+  },
+  {
+    "model_name": "bge-reranker-v2-gemma",
+    "type": "LLM-based",
+    "language": ["en", "zh", "multilingual"],
+    "model_id": "AI-ModelScope/bge-reranker-v2-gemma",
+    "model_hub": "modelscope"
+  },
+  {
+    "model_name": "bge-reranker-v2-minicpm-layerwise",
+    "type": "LLM-based layerwise",
+    "language": ["en", "zh", "multilingual"],
+    "model_id": "zfffff/bge-reranker-v2-minicpm-layerwise",
+    "model_hub": "modelscope"
   }
 ]
xinference/model/utils.py
CHANGED
@@ -17,7 +17,7 @@ import os
 import shutil
 from json import JSONDecodeError
 from pathlib import Path
-from typing import Any, Callable, Dict, Optional, Tuple
+from typing import Any, Callable, Dict, Optional, Tuple, Union
 
 from fsspec import AbstractFileSystem
 
@@ -415,3 +415,14 @@ def select_device(device):
         raise ValueError(f"{device} is unavailable in your environment")
 
     return device
+
+
+def convert_float_to_int_or_str(model_size: float) -> Union[int, str]:
+    """convert float to int or string
+
+    if float can be presented as int, convert it to int, otherwise convert it to string
+    """
+    if int(model_size) == model_size:
+        return int(model_size)
+    else:
+        return str(model_size)
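The new convert_float_to_int_or_str helper normalizes model sizes parsed as floats: whole numbers become ints, while fractional sizes (for example 1.8 for a 1.8B model) are kept as strings. A quick usage check, assuming xinference 0.10.2 is installed:

from xinference.model.utils import convert_float_to_int_or_str

assert convert_float_to_int_or_str(7.0) == 7
assert convert_float_to_int_or_str(1.8) == "1.8"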
xinference/thirdparty/omnilmm/chat.py
CHANGED
@@ -207,7 +207,7 @@ class OmniLMM3B:
 
 class OmniLMMChat:
     def __init__(self, model_path, device_map) -> None:
-        if "
+        if "12b" in model_path:
             self.model = OmniLMM12B(model_path, device_map)
         else:
             self.model = OmniLMM3B(model_path, device_map)
xinference/types.py
CHANGED
@@ -91,11 +91,23 @@ class CompletionLogprobs(TypedDict):
     top_logprobs: List[Optional[Dict[str, float]]]
 
 
+class ToolCallFunction(TypedDict):
+    name: str
+    arguments: str
+
+
+class ToolCalls(TypedDict):
+    id: str
+    type: Literal["function"]
+    function: ToolCallFunction
+
+
 class CompletionChoice(TypedDict):
     text: str
     index: int
     logprobs: Optional[CompletionLogprobs]
     finish_reason: Optional[str]
+    tool_calls: NotRequired[List[ToolCalls]]
 
 
 class CompletionUsage(TypedDict):
@@ -147,6 +159,7 @@ class ChatCompletion(TypedDict):
 class ChatCompletionChunkDelta(TypedDict):
     role: NotRequired[str]
     content: NotRequired[str]
+    tool_calls: NotRequired[List[ToolCalls]]
 
 
 class ChatCompletionChunkChoice(TypedDict):
@@ -232,6 +245,8 @@ class LlamaCppModelConfig(TypedDict, total=False):
     n_ctx: int
     n_parts: int
     n_gpu_layers: int
+    split_mode: int
+    main_gpu: int
     seed: int
     f16_kv: bool
     logits_all: bool
@@ -355,21 +370,6 @@ try:
 except ImportError:
     CreateCompletionLlamaCpp = create_model("CreateCompletionLlamaCpp")
 
-CreateCompletionCTransformers: BaseModel
-try:
-    from ctransformers.llm import LLM
-
-    CreateCompletionCTransformers = get_pydantic_model_from_method(
-        LLM.generate,
-        exclude_fields=["tokens"],
-        include_fields={
-            "max_tokens": (Optional[int], max_tokens_field),
-            "stream": (Optional[bool], stream_field),
-        },
-    )
-except ImportError:
-    CreateCompletionCTransformers = create_model("CreateCompletionCTransformers")
-
 
 # This type is for openai API compatibility
 CreateCompletionOpenAI: BaseModel
@@ -415,7 +415,6 @@ class CreateCompletion(
     ModelAndPrompt,
     CreateCompletionTorch,
     CreateCompletionLlamaCpp,
-    CreateCompletionCTransformers,
     CreateCompletionOpenAI,
 ):
     pass
@@ -428,8 +427,6 @@ class CreateChatModel(BaseModel):
 # Currently, chat calls generates, so the params share the same one.
 CreateChatCompletionTorch = CreateCompletionTorch
 CreateChatCompletionLlamaCpp: BaseModel = CreateCompletionLlamaCpp
-CreateChatCompletionCTransformers: BaseModel = CreateCompletionCTransformers
-
 
 # This type is for openai API compatibility
 CreateChatCompletionOpenAI: BaseModel
@@ -450,7 +447,61 @@ class CreateChatCompletion(
     CreateChatModel,
     CreateChatCompletionTorch,
     CreateChatCompletionLlamaCpp,
-    CreateChatCompletionCTransformers,
     CreateChatCompletionOpenAI,
 ):
     pass
+
+
+class LoRA:
+    def __init__(self, lora_name: str, local_path: str):
+        self.lora_name = lora_name
+        self.local_path = local_path
+
+    def to_dict(self):
+        return {
+            "lora_name": self.lora_name,
+            "local_path": self.local_path,
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict):
+        return cls(
+            lora_name=data["lora_name"],
+            local_path=data["local_path"],
+        )
+
+
+class PeftModelConfig:
+    def __init__(
+        self,
+        peft_model: Optional[List[LoRA]] = None,
+        image_lora_load_kwargs: Optional[Dict] = None,
+        image_lora_fuse_kwargs: Optional[Dict] = None,
+    ):
+        self.peft_model = peft_model
+        self.image_lora_load_kwargs = image_lora_load_kwargs
+        self.image_lora_fuse_kwargs = image_lora_fuse_kwargs
+
+    def to_dict(self):
+        return {
+            "lora_list": [lora.to_dict() for lora in self.peft_model]
+            if self.peft_model
+            else None,
+            "image_lora_load_kwargs": self.image_lora_load_kwargs,
+            "image_lora_fuse_kwargs": self.image_lora_fuse_kwargs,
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict):
+        peft_model_list = data.get("lora_list", None)
+        peft_model = (
+            [LoRA.from_dict(lora_dict) for lora_dict in peft_model_list]
+            if peft_model_list is not None
+            else None
+        )
+
+        return cls(
+            peft_model=peft_model,
+            image_lora_load_kwargs=data.get("image_lora_load_kwargs"),
+            image_lora_fuse_kwargs=data.get("image_lora_fuse_kwargs"),
+        )
xinference/utils.py
CHANGED
xinference/web/ui/build/asset-manifest.json
CHANGED
@@ -1,11 +1,11 @@
 {
   "files": {
-    "main.js": "./static/js/main.98516614.js",
+    "main.js": "./static/js/main.26fdbfbe.js",
     "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
     "index.html": "./index.html",
-    "main.98516614.js.map": "./static/js/main.98516614.js.map"
+    "main.26fdbfbe.js.map": "./static/js/main.26fdbfbe.js.map"
   },
   "entrypoints": [
-    "static/js/main.98516614.js"
+    "static/js/main.26fdbfbe.js"
   ]
 }
xinference/web/ui/build/index.html
CHANGED
@@ -1 +1 @@
-<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.98516614.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
+<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.26fdbfbe.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>