xinference 0.7.4__py3-none-any.whl → 0.7.5__py3-none-any.whl
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +22 -8
- xinference/client/oscar/actor_client.py +78 -8
- xinference/client/restful/restful_client.py +86 -0
- xinference/core/model.py +14 -7
- xinference/core/supervisor.py +12 -0
- xinference/deploy/cmdline.py +16 -0
- xinference/deploy/test/test_cmdline.py +1 -0
- xinference/model/embedding/model_spec.json +40 -0
- xinference/model/llm/__init__.py +14 -1
- xinference/model/llm/llm_family.json +10 -1
- xinference/model/llm/llm_family.py +38 -2
- xinference/model/llm/llm_family_modelscope.json +10 -1
- xinference/model/llm/pytorch/chatglm.py +1 -0
- xinference/model/llm/pytorch/core.py +1 -1
- xinference/model/llm/pytorch/utils.py +50 -18
- xinference/model/llm/utils.py +2 -2
- xinference/model/llm/vllm/core.py +13 -4
- xinference/model/multimodal/core.py +1 -1
- xinference/model/multimodal/qwen_vl.py +34 -2
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/{main.31d347d8.js → main.236e72e7.js} +3 -3
- xinference/web/ui/build/static/js/main.236e72e7.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/78f2521da2e2a98b075a2666cb782c7e2c019cd3c72199eecd5901c82d8655df.json +1 -0
- {xinference-0.7.4.dist-info → xinference-0.7.5.dist-info}/METADATA +9 -2
- {xinference-0.7.4.dist-info → xinference-0.7.5.dist-info}/RECORD +32 -32
- xinference/web/ui/build/static/js/main.31d347d8.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/ca8515ecefb4a06c5305417bfd9c04e13cf6b9103f52a47c925921b26c0a9f9d.json +0 -1
- /xinference/web/ui/build/static/js/{main.31d347d8.js.LICENSE.txt → main.236e72e7.js.LICENSE.txt} +0 -0
- {xinference-0.7.4.dist-info → xinference-0.7.5.dist-info}/LICENSE +0 -0
- {xinference-0.7.4.dist-info → xinference-0.7.5.dist-info}/WHEEL +0 -0
- {xinference-0.7.4.dist-info → xinference-0.7.5.dist-info}/entry_points.txt +0 -0
- {xinference-0.7.4.dist-info → xinference-0.7.5.dist-info}/top_level.txt +0 -0
xinference/model/llm/llm_family.py
CHANGED

@@ -17,7 +17,7 @@ import os
 import platform
 import shutil
 from threading import Lock
-from typing import Any, Dict, List, Optional, Tuple, Type, Union
+from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union

 from pydantic import BaseModel, Field, Protocol, ValidationError, validator
 from pydantic.error_wrappers import ErrorWrapper

@@ -41,6 +41,8 @@ logger = logging.getLogger(__name__)

 DEFAULT_CONTEXT_LENGTH = 2048
 BUILTIN_LLM_PROMPT_STYLE: Dict[str, "PromptStyleV1"] = {}
+BUILTIN_LLM_MODEL_CHAT_FAMILIES: Set[str] = set()
+BUILTIN_LLM_MODEL_GENERATE_FAMILIES: Set[str] = set()


 class GgmlLLMSpecV1(BaseModel):

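The companion change in xinference/model/llm/__init__.py (+14 −1) is not shown in this rendering; presumably it fills these two sets while the builtin families are registered, so custom registrations can later be validated against them. A rough sketch under that assumption (BUILTIN_LLM_FAMILIES already exists in llm_family.py):

```python
# Assumed sketch only -- the actual __init__.py hunk is not shown in this diff.
# Idea: record every builtin family name so CustomLLMFamilyV1 can check a
# custom model's `model_family` against the builtin chat/generate families.
from xinference.model.llm.llm_family import (
    BUILTIN_LLM_FAMILIES,
    BUILTIN_LLM_MODEL_CHAT_FAMILIES,
    BUILTIN_LLM_MODEL_GENERATE_FAMILIES,
)

for family in BUILTIN_LLM_FAMILIES:
    if "chat" in family.model_ability:
        BUILTIN_LLM_MODEL_CHAT_FAMILIES.add(family.model_name)
    else:
        BUILTIN_LLM_MODEL_GENERATE_FAMILIES.add(family.model_name)
```
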
@@ -105,6 +107,8 @@ class LLMFamilyV1(BaseModel):
     model_lang: List[str]
     model_ability: List[Literal["embed", "generate", "chat"]]
     model_description: Optional[str]
+    # reason for not required str here: legacy registration
+    model_family: Optional[str]
     model_specs: List["LLMSpecV1"]
     prompt_style: Optional["PromptStyleV1"]

@@ -134,7 +138,39 @@ class CustomLLMFamilyV1(LLMFamilyV1):
             )
         except (ValueError, TypeError, UnicodeDecodeError) as e:
             raise ValidationError([ErrorWrapper(e, loc=ROOT_KEY)], cls)
-        llm_spec = cls.parse_obj(obj)
+        llm_spec: CustomLLMFamilyV1 = cls.parse_obj(obj)
+
+        # check model_family
+        if llm_spec.model_family is None:
+            raise ValueError(
+                f"You must specify `model_family` when registering custom LLM models."
+            )
+        assert isinstance(llm_spec.model_family, str)
+        if (
+            llm_spec.model_family != "other"
+            and "chat" in llm_spec.model_ability
+            and llm_spec.model_family not in BUILTIN_LLM_MODEL_CHAT_FAMILIES
+        ):
+            raise ValueError(
+                f"`model_family` for chat model must be `other` or one of the following values: \n"
+                f"{', '.join(list(BUILTIN_LLM_MODEL_CHAT_FAMILIES))}"
+            )
+        if (
+            llm_spec.model_family != "other"
+            and "chat" not in llm_spec.model_ability
+            and llm_spec.model_family not in BUILTIN_LLM_MODEL_GENERATE_FAMILIES
+        ):
+            raise ValueError(
+                f"`model_family` for generate model must be `other` or one of the following values: \n"
+                f"{', '.join(list(BUILTIN_LLM_MODEL_GENERATE_FAMILIES))}"
+            )
+        # set prompt style when it is the builtin model family
+        if (
+            llm_spec.prompt_style is None
+            and llm_spec.model_family != "other"
+            and "chat" in llm_spec.model_ability
+        ):
+            llm_spec.prompt_style = llm_spec.model_family

         # handle prompt style when user choose existing style
         if llm_spec.prompt_style is not None and isinstance(llm_spec.prompt_style, str):

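In practice, a custom LLM registration now has to declare its lineage. Below is a minimal, hypothetical spec that should pass the new checks, assuming `llama-2-chat` is among the builtin chat families; the model name, path, and size are made up:

```python
import json

from xinference.model.llm.llm_family import CustomLLMFamilyV1

custom_spec = {
    "version": 1,
    "context_length": 4096,
    "model_name": "my-llama-2-chat",            # made-up name
    "model_lang": ["en"],
    "model_ability": ["chat"],
    "model_family": "llama-2-chat",             # must be "other" or a builtin family
    "model_specs": [
        {
            "model_format": "pytorch",
            "model_size_in_billions": 7,
            "quantizations": ["none"],
            "model_uri": "file:///path/to/llama-2-chat-hf",  # made-up path
        }
    ],
    # prompt_style is omitted on purpose: for a builtin chat family the new
    # code above fills it in from the family name.
}

family = CustomLLMFamilyV1.parse_raw(json.dumps(custom_spec))
# family.prompt_style should now carry the builtin llama-2 chat style.
```
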
xinference/model/llm/llm_family_modelscope.json
CHANGED

@@ -331,6 +331,15 @@
       "roles": [
         "user",
         "assistant"
+      ],
+      "stop_token_ids": [
+        64795,
+        64797,
+        2
+      ],
+      "stop":[
+        "<|user|>",
+        "<|observation|>"
       ]
     }
   },

@@ -357,7 +366,7 @@
       ],
       "model_hub": "modelscope",
       "model_id": "ZhipuAI/chatglm3-6b-32k",
-      "model_revision": "
+      "model_revision": "master"
     }
   ],
   "prompt_style": {

xinference/model/llm/pytorch/chatglm.py
CHANGED

@@ -58,6 +58,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
         tokenizer = AutoTokenizer.from_pretrained(
             self.model_path,
             trust_remote_code=kwargs["trust_remote_code"],
+            encode_special_tokens=True,
             revision=kwargs["revision"],
         )
         model = AutoModel.from_pretrained(

xinference/model/llm/pytorch/core.py
CHANGED

@@ -409,7 +409,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
     ) -> PytorchGenerateConfig:
         generate_config = super()._sanitize_generate_config(generate_config)
         if (
-            generate_config.get("stop"
+            (not generate_config.get("stop"))
             and self.model_family.prompt_style
             and self.model_family.prompt_style.stop
         ):

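The old condition (truncated in this rendering) only kicked in when `stop` was absent; the rewritten check also falls back to the family defaults when a caller passes an empty value. A standalone illustration of the intended behaviour, not the exact surrounding code:

```python
# Illustration only: the sanitized config now falls back to the prompt style's
# stop words for any falsy user-supplied value (missing, None, "", []).
def apply_default_stop(generate_config: dict, default_stop: list) -> dict:
    if (not generate_config.get("stop")) and default_stop:
        generate_config["stop"] = default_stop.copy()
    return generate_config

print(apply_default_stop({"stop": []}, ["<|user|>", "<|observation|>"]))
# {'stop': ['<|user|>', '<|observation|>']}
print(apply_default_stop({"stop": ["###"]}, ["<|user|>"]))
# {'stop': ['###']} -- an explicit stop list is left untouched
```
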
xinference/model/llm/pytorch/utils.py
CHANGED

@@ -527,10 +527,12 @@ def generate_stream_chatglm(
     top_p = float(generate_config.get("top_p", 1.0))
     max_new_tokens = int(generate_config.get("max_tokens", 256))
     echo = generate_config.get("echo", False)
+    stop_str = generate_config.get("stop", None)
+    eos_token_id = generate_config.get("stop_token_ids", [])
+    eos_token_id.append(tokenizer.eos_token_id)

     inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
     input_echo_len = len(inputs["input_ids"][0])
-
     gen_kwargs = {
         "max_length": max_new_tokens + input_echo_len,
         "do_sample": True if temperature > 1e-5 else False,

@@ -543,7 +545,9 @@ def generate_stream_chatglm(

     total_len = 0
     last_response_length = 0
-    for total_ids in model.stream_generate(**inputs, **gen_kwargs):
+    for total_ids in model.stream_generate(
+        **inputs, eos_token_id=eos_token_id, **gen_kwargs
+    ):
         total_ids = total_ids.tolist()[0]
         total_len = len(total_ids)
         if echo:

@@ -553,29 +557,57 @@ def generate_stream_chatglm(
         response = tokenizer.decode(output_ids)
         response = process_response(response)

+        partially_stopped = False
+        stopped = False
+        if stop_str:
+            if isinstance(stop_str, str):
+                pos = response.rfind(stop_str, 0)
+                if pos != -1:
+                    response = response[:pos]
+                    stopped = True
+                else:
+                    partially_stopped = is_partial_stop(response, stop_str)
+            elif isinstance(stop_str, Iterable):
+                for each_stop in stop_str:
+                    pos = response.rfind(each_stop, 0)
+                    if pos != -1:
+                        response = response[:pos]
+                        stopped = True
+                        break
+                    else:
+                        partially_stopped = is_partial_stop(response, each_stop)
+                        if partially_stopped:
+                            break
+            else:
+                raise ValueError("Invalid stop field type.")
+
         if stream:
             response = response.strip("�")
             tmp_response_length = len(response)
             response = response[last_response_length:]
             last_response_length = tmp_response_length

-        completion_choice = CompletionChoice(
-            text=response, index=0, logprobs=None, finish_reason=None
-        )
-        completion_chunk = CompletionChunk(
-            id=str(uuid.uuid1()),
-            object="text_completion",
-            created=int(time.time()),
-            model=model_uid,
-            choices=[completion_choice],
-        )
-        completion_usage = CompletionUsage(
-            prompt_tokens=input_echo_len,
-            completion_tokens=(total_len - input_echo_len),
-            total_tokens=total_len,
-        )
+        if not partially_stopped:
+            completion_choice = CompletionChoice(
+                text=response, index=0, logprobs=None, finish_reason=None
+            )
+            completion_chunk = CompletionChunk(
+                id=str(uuid.uuid1()),
+                object="text_completion",
+                created=int(time.time()),
+                model=model_uid,
+                choices=[completion_choice],
+            )
+            completion_usage = CompletionUsage(
+                prompt_tokens=input_echo_len,
+                completion_tokens=(total_len - input_echo_len),
+                total_tokens=total_len,
+            )
+
+            yield completion_chunk, completion_usage

-        yield completion_chunk, completion_usage
+        if stopped:
+            break

     if total_len - input_echo_len == max_new_tokens - 1:
         finish_reason = "length"

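The new stop handling leans on an `is_partial_stop` helper to hold back chunks that might be the beginning of a stop sequence. Its definition is not part of this hunk; a typical FastChat-style implementation looks roughly like this:

```python
def is_partial_stop(output: str, stop_str: str) -> bool:
    """Return True if `output` ends with a non-empty prefix of `stop_str`.

    Sketch of the helper assumed by the streaming loop above: while the tail
    of the decoded text could still grow into the full stop string, the chunk
    is withheld instead of being yielded.
    """
    for i in range(min(len(output), len(stop_str)), 0, -1):
        if stop_str.startswith(output[-i:]):
            return True
    return False


print(is_partial_stop("Hello <|us", "<|user|>"))   # True  -- may complete to the stop string
print(is_partial_stop("Hello world", "<|user|>"))  # False
```
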
xinference/model/llm/utils.py
CHANGED

@@ -141,7 +141,7 @@ class ChatModelMixin:
             return ret
         elif prompt_style.style_name == "CHATGLM3":
             prompts = (
-                [f"<|system|>\n{prompt_style.system_prompt}"]
+                [f"<|system|>\n {prompt_style.system_prompt}"]
                 if prompt_style.system_prompt
                 else []
             )

@@ -155,7 +155,7 @@ class ChatModelMixin:
                 if content:
                     if role == "tool":
                         role = "observation"
-                    prompts.append(f"<|{role}|>\n{content}")
+                    prompts.append(f"<|{role}|>\n {content}")
                 else:
                     prompts.append(f"<|{role}|>")
             return "\n".join(prompts)

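Both edits add a space after the newline that follows each role tag, lining the prompt up with ChatGLM3's chat format (together with `encode_special_tokens=True` above). A rough, self-contained illustration of the resulting prompt text; the sample history and the trailing `<|assistant|>` tag are illustrative assumptions:

```python
# Illustrative reconstruction of the CHATGLM3 prompt layout after this change.
system_prompt = "You are a helpful assistant."
history = [("user", "What is 2 + 2?"), ("assistant", "4"), ("user", "And 3 + 3?")]

prompts = [f"<|system|>\n {system_prompt}"] if system_prompt else []
for role, content in history:
    prompts.append(f"<|{role}|>\n {content}" if content else f"<|{role}|>")
prompts.append("<|assistant|>")  # assumption: generation is primed with a bare role tag

print("\n".join(prompts))
# <|system|>
#  You are a helpful assistant.
# <|user|>
#  What is 2 + 2?
# <|assistant|>
#  4
# ...
```
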
xinference/model/llm/vllm/core.py
CHANGED

@@ -37,6 +37,7 @@ from ....types import (
     CompletionUsage,
 )
 from .. import LLM, LLMFamilyV1, LLMSpecV1
+from ..llm_family import CustomLLMFamilyV1
 from ..utils import ChatModelMixin

 logger = logging.getLogger(__name__)

@@ -197,8 +198,12 @@ class VLLMModel(LLM):
         # Currently, only 4-bit weight quantization is supported for GPTQ, but got 8 bits.
         if "4" not in quantization:
             return False
-        if llm_family.model_name not in VLLM_SUPPORTED_MODELS:
-            return False
+        if isinstance(llm_family, CustomLLMFamilyV1):
+            if llm_family.model_family not in VLLM_SUPPORTED_MODELS:
+                return False
+        else:
+            if llm_family.model_name not in VLLM_SUPPORTED_MODELS:
+                return False
         if "generate" not in llm_family.model_ability:
             return False
         return VLLM_INSTALLED

@@ -329,8 +334,12 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         # Currently, only 4-bit weight quantization is supported for GPTQ, but got 8 bits.
         if "4" not in quantization:
             return False
-        if llm_family.model_name not in VLLM_SUPPORTED_CHAT_MODELS:
-            return False
+        if isinstance(llm_family, CustomLLMFamilyV1):
+            if llm_family.model_family not in VLLM_SUPPORTED_CHAT_MODELS:
+                return False
+        else:
+            if llm_family.model_name not in VLLM_SUPPORTED_CHAT_MODELS:
+                return False
         if "chat" not in llm_family.model_ability:
             return False
         return VLLM_INSTALLED

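The practical effect of both hunks: a custom registration is matched on its declared `model_family` rather than its arbitrary `model_name`, so a renamed derivative of a vLLM-supported family can still be served by the vLLM backend. A simplified sketch of the decision, not the exact `match` method:

```python
from xinference.model.llm.llm_family import CustomLLMFamilyV1

# Simplified sketch of the vLLM match logic after this change.
def vllm_can_serve(llm_family, supported_names) -> bool:
    # Custom registrations are judged by the family they declare; builtin
    # models keep matching on their own name.
    if isinstance(llm_family, CustomLLMFamilyV1):
        return llm_family.model_family in supported_names
    return llm_family.model_name in supported_names
```
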
xinference/model/multimodal/qwen_vl.py
CHANGED

@@ -11,11 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import base64
+import logging
 import operator
+import tempfile
 import time
 import uuid
 from typing import Dict, Iterator, List, Optional, Union
+from urllib.parse import urlparse

 from ...types import (
     ChatCompletion,

@@ -26,6 +29,8 @@ from ...types import (
 from ..utils import select_device
 from .core import LVLM, LVLMFamilyV1, LVLMSpecV1

+logger = logging.getLogger(__name__)
+

 class QwenVLChat(LVLM):
     def __init__(self, *args, **kwargs):

@@ -67,9 +72,32 @@ class QwenVLChat(LVLM):
         )

     def _message_content_to_qwen(self, content) -> str:
+        def _ensure_url(_url):
+            try:
+                if _url.startswith("data:"):
+                    raise "Not a valid url."
+                parsed = urlparse(_url)
+                if not parsed.scheme:
+                    raise "Not a valid url."
+                return _url
+            except Exception:
+                logging.info("Parse url by base64 decoder.")
+                # https://platform.openai.com/docs/guides/vision/uploading-base-64-encoded-images
+                # e.g. f"data:image/jpeg;base64,{base64_image}"
+                _type, data = _url.split(";")
+                _, ext = _type.split("/")
+                data = data[len("base64,") :]
+                data = base64.b64decode(data.encode("utf-8"))
+
+                with tempfile.NamedTemporaryFile(suffix=f".{ext}", delete=False) as f:
+                    f.write(data)
+                    logging.info("Dump base64 data to %s", f.name)
+                    return f.name
+
         if not isinstance(content, str):
+            # TODO(codingl2k1): Optimize _ensure_url
             content = [
-                {"image": c["image_url"]["url"], "type": "image"}
+                {"image": _ensure_url(c["image_url"]["url"]), "type": "image"}
                 if c.get("type") == "image_url"
                 else c
                 for c in content

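With `_ensure_url` in place, qwen-vl-chat accepts both plain image URLs and OpenAI-style base64 data URLs, the latter being decoded into a temporary file before reaching the model. A hypothetical client-side message showing both forms (the file name, URL, and model handle are assumptions; streaming is rejected by the chat() change in the next hunk):

```python
import base64

# Hypothetical local image encoded as an OpenAI-style data URL.
with open("cat.png", "rb") as f:
    b64_image = base64.b64encode(f.read()).decode("utf-8")

prompt = [
    {"type": "text", "text": "What is in these two images?"},
    {"type": "image_url", "image_url": {"url": "https://example.com/dog.jpg"}},
    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64_image}"}},
]

# `model` stands for a handle to a launched qwen-vl-chat model.
# completion = model.chat(prompt=prompt)
```
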
@@ -85,6 +113,10 @@ class QwenVLChat(LVLM):
         chat_history: Optional[List[Dict]] = None,
         generate_config: Optional[Dict] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        if generate_config and generate_config.get("stream"):
+            raise Exception(
+                f"Chat with model {self.model_family.model_name} does not support stream."
+            )
         prompt = self._message_content_to_qwen(prompt)
         # Convert openai history to qwen vl history
         qwen_history = []

xinference/web/ui/build/asset-manifest.json
CHANGED

@@ -1,11 +1,11 @@
 {
   "files": {
-    "main.js": "./static/js/main.31d347d8.js",
+    "main.js": "./static/js/main.236e72e7.js",
     "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
     "index.html": "./index.html",
-    "main.31d347d8.js.map": "./static/js/main.31d347d8.js.map"
+    "main.236e72e7.js.map": "./static/js/main.236e72e7.js.map"
   },
   "entrypoints": [
-    "static/js/main.31d347d8.js"
+    "static/js/main.236e72e7.js"
   ]
 }

xinference/web/ui/build/index.html
CHANGED

@@ -1 +1 @@
-<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.31d347d8.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
+<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.236e72e7.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>