xinference 1.7.0__py3-none-any.whl → 1.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +3 -4
- xinference/client/__init__.py +2 -0
- xinference/client/common.py +49 -2
- xinference/client/handlers.py +18 -0
- xinference/client/restful/async_restful_client.py +1760 -0
- xinference/client/restful/restful_client.py +74 -78
- xinference/core/media_interface.py +3 -1
- xinference/core/model.py +5 -4
- xinference/core/supervisor.py +10 -5
- xinference/core/worker.py +15 -14
- xinference/deploy/local.py +51 -9
- xinference/deploy/worker.py +5 -3
- xinference/device_utils.py +22 -3
- xinference/model/audio/fish_speech.py +23 -34
- xinference/model/audio/model_spec.json +4 -2
- xinference/model/audio/model_spec_modelscope.json +4 -2
- xinference/model/audio/utils.py +2 -2
- xinference/model/core.py +1 -0
- xinference/model/embedding/__init__.py +8 -8
- xinference/model/embedding/custom.py +6 -1
- xinference/model/embedding/embed_family.py +0 -41
- xinference/model/embedding/model_spec.json +10 -1
- xinference/model/embedding/model_spec_modelscope.json +10 -1
- xinference/model/embedding/sentence_transformers/core.py +30 -15
- xinference/model/flexible/core.py +1 -1
- xinference/model/flexible/launchers/__init__.py +2 -0
- xinference/model/flexible/launchers/image_process_launcher.py +1 -1
- xinference/model/flexible/launchers/modelscope_launcher.py +47 -0
- xinference/model/flexible/launchers/transformers_launcher.py +5 -5
- xinference/model/flexible/launchers/yolo_launcher.py +62 -0
- xinference/model/llm/__init__.py +7 -0
- xinference/model/llm/core.py +18 -1
- xinference/model/llm/llama_cpp/core.py +1 -1
- xinference/model/llm/llm_family.json +43 -3
- xinference/model/llm/llm_family.py +6 -0
- xinference/model/llm/llm_family_modelscope.json +45 -3
- xinference/model/llm/mlx/core.py +271 -18
- xinference/model/llm/mlx/distributed_models/__init__.py +13 -0
- xinference/model/llm/mlx/distributed_models/core.py +164 -0
- xinference/model/llm/mlx/distributed_models/deepseek_v3.py +75 -0
- xinference/model/llm/mlx/distributed_models/qwen2.py +82 -0
- xinference/model/llm/mlx/distributed_models/qwen3.py +82 -0
- xinference/model/llm/mlx/distributed_models/qwen3_moe.py +76 -0
- xinference/model/llm/reasoning_parser.py +12 -6
- xinference/model/llm/sglang/core.py +8 -4
- xinference/model/llm/transformers/chatglm.py +4 -1
- xinference/model/llm/transformers/core.py +4 -2
- xinference/model/llm/transformers/multimodal/cogagent.py +10 -4
- xinference/model/llm/transformers/multimodal/intern_vl.py +1 -1
- xinference/model/llm/utils.py +36 -17
- xinference/model/llm/vllm/core.py +142 -34
- xinference/model/llm/vllm/distributed_executor.py +96 -21
- xinference/model/llm/vllm/xavier/transfer.py +2 -2
- xinference/model/rerank/core.py +26 -9
- xinference/model/rerank/model_spec.json +3 -3
- xinference/model/rerank/model_spec_modelscope.json +3 -3
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.9b12b7f9.js +3 -0
- xinference/web/ui/build/static/js/main.9b12b7f9.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0fd4820d93f99509e80d8702dc3f6f8272424acab5608fa7c0e82cb1d3250a87.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/1460361af6975e63576708039f1cb732faf9c672d97c494d4055fc6331460be0.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4efd8dda58fda83ed9546bf2f587df67f8d98e639117bee2d9326a9a1d9bebb2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5b2dafe5aa9e1105e0244a2b6751807342fa86aa0144b4e84d947a1686102715.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f75545479c17fdfe2a00235fa4a0e9da1ae95e6b3caafba87ded92de6b0240e4.json +1 -0
- xinference/web/ui/src/locales/en.json +3 -0
- xinference/web/ui/src/locales/ja.json +3 -0
- xinference/web/ui/src/locales/ko.json +3 -0
- xinference/web/ui/src/locales/zh.json +3 -0
- {xinference-1.7.0.dist-info → xinference-1.7.1.dist-info}/METADATA +4 -3
- {xinference-1.7.0.dist-info → xinference-1.7.1.dist-info}/RECORD +77 -67
- xinference/web/ui/build/static/js/main.8a9e3ba0.js +0 -3
- xinference/web/ui/build/static/js/main.8a9e3ba0.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/26b8c9f34b0bed789b3a833767672e39302d1e0c09b4276f4d58d1df7b6bd93b.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/34cfbfb7836e136ba3261cfd411cc554bf99ba24b35dcceebeaa4f008cb3c9dc.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/c5c7c2cd1b863ce41adff2c4737bba06eef3a1acf28288cb83d992060f6b8923.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/cc97b49285d7717c63374766c789141a4329a04582ab32756d7e0e614d4c5c7f.json +0 -1
- /xinference/web/ui/build/static/js/{main.8a9e3ba0.js.LICENSE.txt → main.9b12b7f9.js.LICENSE.txt} +0 -0
- {xinference-1.7.0.dist-info → xinference-1.7.1.dist-info}/WHEEL +0 -0
- {xinference-1.7.0.dist-info → xinference-1.7.1.dist-info}/entry_points.txt +0 -0
- {xinference-1.7.0.dist-info → xinference-1.7.1.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.7.0.dist-info → xinference-1.7.1.dist-info}/top_level.txt +0 -0
xinference/client/restful/restful_client.py
CHANGED

@@ -12,18 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import json
-import typing
 from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Union

 import requests

-from ..common import streaming_response_iterator
+from ..common import convert_float_to_int_or_str, streaming_response_iterator

 if TYPE_CHECKING:
     from ...types import (
         ChatCompletion,
         ChatCompletionChunk,
-        ChatCompletionMessage,
         Completion,
         CompletionChunk,
         Embedding,

@@ -33,17 +31,6 @@ if TYPE_CHECKING:
     )


-def convert_float_to_int_or_str(model_size: float) -> Union[int, str]:
-    """convert float to int or string
-
-    if float can be presented as int, convert it to int, otherwise convert it to string
-    """
-    if int(model_size) == model_size:
-        return int(model_size)
-    else:
-        return str(model_size)
-
-
 def _get_error_string(response: requests.Response) -> str:
     try:
         if response.content:

@@ -57,25 +44,6 @@ def _get_error_string(response: requests.Response) -> str:
     return "Unknown error"


-@typing.no_type_check
-def handle_system_prompts(
-    chat_history: List["ChatCompletionMessage"], system_prompt: Optional[str]
-) -> List["ChatCompletionMessage"]:
-    history_system_prompts = [
-        ch["content"] for ch in chat_history if ch["role"] == "system"
-    ]
-    if system_prompt is not None:
-        history_system_prompts.append(system_prompt)
-
-    # remove all the system prompt in the chat_history
-    chat_history = list(filter(lambda x: x["role"] != "system", chat_history))
-    # insert all system prompts at the beginning
-    chat_history.insert(
-        0, {"role": "system", "content": ". ".join(history_system_prompts)}
-    )
-    return chat_history
-
-
 class RESTfulModelHandle:
     """
     A sync model interface (for RESTful client) which provides type hints that makes it much easier to use xinference
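Note: `convert_float_to_int_or_str` is deleted here because it moved into `xinference/client/common.py` (see `+49 -2` in the file list), presumably so the new `async_restful_client.py` can share it; this file now imports it from `..common` instead. The unused `handle_system_prompts` helper is dropped outright, along with the `typing` and `ChatCompletionMessage` imports it needed. The relocated helper's behavior is unchanged; a standalone copy of its contract for reference:

from typing import Union

def convert_float_to_int_or_str(model_size: float) -> Union[int, str]:
    """If the float can be represented as an int, return an int; otherwise a string."""
    if int(model_size) == model_size:
        return int(model_size)
    return str(model_size)

assert convert_float_to_int_or_str(7.0) == 7      # e.g. a 7B model size
assert convert_float_to_int_or_str(0.5) == "0.5"  # e.g. a 0.5B model size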
@@ -86,6 +54,19 @@ class RESTfulModelHandle:
         self._model_uid = model_uid
         self._base_url = base_url
         self.auth_headers = auth_headers
+        self.session = requests.Session()
+
+    def close(self):
+        """
+        Close the session.
+        """
+        if self.session:
+            self.session.close()
+            self.session = None
+
+    def __del__(self):
+        if self.session:
+            self.close()


 class RESTfulEmbeddingModelHandle(RESTfulModelHandle):
@@ -116,7 +97,7 @@ class RESTfulEmbeddingModelHandle(RESTfulModelHandle):
             "input": input,
         }
         request_body.update(kwargs)
-        response = requests.post(url, json=request_body, headers=self.auth_headers)
+        response = self.session.post(url, json=request_body, headers=self.auth_headers)
         if response.status_code != 200:
             raise RuntimeError(
                 f"Failed to create the embeddings, detail: {_get_error_string(response)}"

@@ -154,7 +135,7 @@ class RESTfulEmbeddingModelHandle(RESTfulModelHandle):
             "input": input,
         }
         request_body.update(kwargs)
-        response = requests.post(url, json=request_body, headers=self.auth_headers)
+        response = self.session.post(url, json=request_body, headers=self.auth_headers)
         if response.status_code != 200:
             raise RuntimeError(
                 f"Failed to decode token ids, detail: {_get_error_string(response)}"

@@ -213,7 +194,7 @@ class RESTfulRerankModelHandle(RESTfulModelHandle):
             "kwargs": json.dumps(kwargs),
         }
         request_body.update(kwargs)
-        response = requests.post(url, json=request_body, headers=self.auth_headers)
+        response = self.session.post(url, json=request_body, headers=self.auth_headers)
         if response.status_code != 200:
             raise RuntimeError(
                 f"Failed to rerank documents, detail: {response.json()['detail']}"

@@ -258,7 +239,7 @@ class RESTfulImageModelHandle(RESTfulModelHandle):
             "response_format": response_format,
             "kwargs": json.dumps(kwargs),
         }
-        response = requests.post(url, json=request_body, headers=self.auth_headers)
+        response = self.session.post(url, json=request_body, headers=self.auth_headers)
         if response.status_code != 200:
             raise RuntimeError(
                 f"Failed to create the images, detail: {_get_error_string(response)}"

@@ -322,7 +303,7 @@ class RESTfulImageModelHandle(RESTfulModelHandle):
         for key, value in params.items():
             files.append((key, (None, value)))
         files.append(("image", ("image", image, "application/octet-stream")))
-        response = requests.post(url, files=files, headers=self.auth_headers)
+        response = self.session.post(url, files=files, headers=self.auth_headers)
         if response.status_code != 200:
             raise RuntimeError(
                 f"Failed to variants the images, detail: {_get_error_string(response)}"

@@ -397,7 +378,7 @@ class RESTfulImageModelHandle(RESTfulModelHandle):
         files.append(
             ("mask_image", ("mask_image", mask_image, "application/octet-stream"))
         )
-        response = requests.post(url, files=files, headers=self.auth_headers)
+        response = self.session.post(url, files=files, headers=self.auth_headers)
         if response.status_code != 200:
             raise RuntimeError(
                 f"Failed to inpaint the images, detail: {_get_error_string(response)}"

@@ -416,7 +397,7 @@ class RESTfulImageModelHandle(RESTfulModelHandle):
         for key, value in params.items():
             files.append((key, (None, value)))
         files.append(("image", ("image", image, "application/octet-stream")))
-        response = requests.post(url, files=files, headers=self.auth_headers)
+        response = self.session.post(url, files=files, headers=self.auth_headers)
         if response.status_code != 200:
             raise RuntimeError(
                 f"Failed to ocr the images, detail: {_get_error_string(response)}"

@@ -454,7 +435,7 @@ class RESTfulVideoModelHandle(RESTfulModelHandle):
             "n": n,
             "kwargs": json.dumps(kwargs),
         }
-        response = requests.post(url, json=request_body, headers=self.auth_headers)
+        response = self.session.post(url, json=request_body, headers=self.auth_headers)
         if response.status_code != 200:
             raise RuntimeError(
                 f"Failed to create the video, detail: {_get_error_string(response)}"

@@ -501,7 +482,7 @@ class RESTfulVideoModelHandle(RESTfulModelHandle):
         for key, value in params.items():
             files.append((key, (None, value)))
         files.append(("image", ("image", image, "application/octet-stream")))
-        response = requests.post(url, files=files, headers=self.auth_headers)
+        response = self.session.post(url, files=files, headers=self.auth_headers)
         if response.status_code != 200:
             raise RuntimeError(
                 f"Failed to create the video from image, detail: {_get_error_string(response)}"

@@ -554,7 +535,7 @@ class RESTfulVideoModelHandle(RESTfulModelHandle):
             ("first_frame", ("image", first_frame, "application/octet-stream"))
         )
         files.append(("last_frame", ("image", last_frame, "application/octet-stream")))
-        response = requests.post(url, files=files, headers=self.auth_headers)
+        response = self.session.post(url, files=files, headers=self.auth_headers)
         if response.status_code != 200:
             raise RuntimeError(
                 f"Failed to create the video from image, detail: {_get_error_string(response)}"

@@ -604,7 +585,7 @@ class RESTfulGenerateModelHandle(RESTfulModelHandle):

         stream = bool(generate_config and generate_config.get("stream"))

-        response = requests.post(
+        response = self.session.post(
             url, json=request_body, stream=stream, headers=self.auth_headers
         )
         if response.status_code != 200:

@@ -665,7 +646,7 @@ class RESTfulChatModelHandle(RESTfulGenerateModelHandle):
                 request_body[key] = value

         stream = bool(generate_config and generate_config.get("stream"))
-        response = requests.post(
+        response = self.session.post(
             url, json=request_body, stream=stream, headers=self.auth_headers
         )

@@ -736,7 +717,7 @@ class RESTfulAudioModelHandle(RESTfulModelHandle):
         }
         files: List[Any] = []
         files.append(("file", ("file", audio, "application/octet-stream")))
-        response = requests.post(
+        response = self.session.post(
             url, data=params, files=files, headers=self.auth_headers
         )
         if response.status_code != 200:

@@ -799,7 +780,7 @@ class RESTfulAudioModelHandle(RESTfulModelHandle):
         }
         files: List[Any] = []
         files.append(("file", ("file", audio, "application/octet-stream")))
-        response = requests.post(
+        response = self.session.post(
             url, data=params, files=files, headers=self.auth_headers
         )
         if response.status_code != 200:

@@ -873,11 +854,11 @@ class RESTfulAudioModelHandle(RESTfulModelHandle):
             )
         )
         if files:
-            response = requests.post(
+            response = self.session.post(
                 url, data=params, files=files, headers=self.auth_headers, stream=stream
             )
         else:
-            response = requests.post(
+            response = self.session.post(
                 url, json=params, headers=self.auth_headers, stream=stream
             )
         if response.status_code != 200:
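All of the handle methods above swap module-level `requests.*` calls for `self.session.*` on the handle's own `requests.Session`, keeping arguments and error handling identical while reusing pooled connections. The generate/chat/speech paths also pass `stream` straight into `session.post`, and streamed bodies are consumed via the `streaming_response_iterator` imported at the top of the file. A hedged usage sketch (the endpoint and model uid are made up, and the exact `chat` keyword names are assumed from the current client API):

from xinference.client import Client

client = Client("http://localhost:9997")   # assumed local endpoint
model = client.get_model("my-chat-model")  # hypothetical chat model uid

# generate_config={"stream": True} becomes session.post(..., stream=True)
for chunk in model.chat(
    messages=[{"role": "user", "content": "Hello"}],
    generate_config={"stream": True},
):
    print(chunk)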
@@ -894,6 +875,7 @@ class RESTfulAudioModelHandle(RESTfulModelHandle):
 class RESTfulFlexibleModelHandle(RESTfulModelHandle):
     def infer(
         self,
+        *args,
         **kwargs,
     ):
         """

@@ -914,16 +896,17 @@ class RESTfulFlexibleModelHandle(RESTfulModelHandle):
         url = f"{self._base_url}/v1/flexible/infers"
         params = {
             "model": self._model_uid,
+            "args": args,
         }
         params.update(kwargs)

-        response = requests.post(url, json=params, headers=self.auth_headers)
+        response = self.session.post(url, json=params, headers=self.auth_headers)
         if response.status_code != 200:
             raise RuntimeError(
                 f"Failed to predict, detail: {_get_error_string(response)}"
             )

-        return response.
+        return response.json()


 class Client:
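`infer` now accepts positional arguments: they travel in the JSON body under `"args"`, and server-side `ModelActor.infer` (see xinference/core/model.py below) unpacks them into the flexible model's own `infer(*args, **kwargs)`. The method also now explicitly returns `response.json()`. This pairs with the new flexible launchers in the file list (`yolo_launcher.py`, `modelscope_launcher.py`), though the call below is purely illustrative; the model uid and parameters are invented:

from xinference.client import Client

client = Client("http://localhost:9997")  # assumed endpoint
yolo = client.get_model("my-yolo-model")  # hypothetical flexible model uid

# Payload sent: {"model": "my-yolo-model", "args": ["image.jpg"], "conf": 0.25}
result = yolo.infer("image.jpg", conf=0.25)  # illustrative arguments
print(result)  # parsed JSON, per the new `return response.json()`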
@@ -931,10 +914,23 @@ class Client:
         self.base_url = base_url
         self._headers: Dict[str, str] = {}
         self._cluster_authed = False
+        self.session = requests.Session()
         self._check_cluster_authenticated()
         if api_key is not None and self._cluster_authed:
             self._headers["Authorization"] = f"Bearer {api_key}"

+    def close(self):
+        """
+        Close the session.
+        """
+        if self.session:
+            self.session.close()
+            self.session = None
+
+    def __del__(self):
+        if self.session:
+            self.close()
+
     def _set_token(self, token: Optional[str]):
         if not self._cluster_authed or token is None:
             return
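`Client`, like `RESTfulModelHandle` above, now owns a `requests.Session`, so every call below reuses pooled HTTP connections instead of opening one per request. `close()` nulls the session, which is why the `__del__` guard cannot double-close. A minimal lifecycle sketch, assuming a local endpoint; `contextlib.closing` works because only `close()` is defined, not a context manager:

from contextlib import closing

from xinference.client import Client

with closing(Client("http://localhost:9997")) as client:  # assumed endpoint
    print(client.list_models())  # issued through client.session
# close() already ran: client.session is None, so interpreter-exit __del__ is a no-op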
@@ -949,7 +945,7 @@ class Client:

     def _check_cluster_authenticated(self):
         url = f"{self.base_url}/v1/cluster/auth"
-        response = requests.get(url)
+        response = self.session.get(url)
         # compatible with old version of xinference
         if response.status_code == 404:
             self._cluster_authed = False

@@ -963,7 +959,7 @@ class Client:

     def vllm_models(self) -> Dict[str, Any]:
         url = f"{self.base_url}/v1/models/vllm-supported"
-        response = requests.get(url, headers=self._headers)
+        response = self.session.get(url, headers=self._headers)
         if response.status_code != 200:
             raise RuntimeError(
                 f"Failed to fetch VLLM models. detail: {response.json()['detail']}"

@@ -981,7 +977,7 @@ class Client:

         payload = {"username": username, "password": password}

-        response = requests.post(url, json=payload)
+        response = self.session.post(url, json=payload)
         if response.status_code != 200:
             raise RuntimeError(f"Failed to login, detail: {response.json()['detail']}")

@@ -1003,7 +999,7 @@ class Client:

         url = f"{self.base_url}/v1/models"

-        response = requests.get(url, headers=self._headers)
+        response = self.session.get(url, headers=self._headers)
         if response.status_code != 200:
             raise RuntimeError(
                 f"Failed to list model, detail: {_get_error_string(response)}"

@@ -1111,9 +1107,9 @@ class Client:
                 payload[str(key)] = value

         if wait_ready:
-            response = requests.post(url, json=payload, headers=self._headers)
+            response = self.session.post(url, json=payload, headers=self._headers)
         else:
-            response = requests.post(
+            response = self.session.post(
                 url, json=payload, headers=self._headers, params={"wait_ready": False}
             )
         if response.status_code != 200:

@@ -1142,7 +1138,7 @@ class Client:

         url = f"{self.base_url}/v1/models/{model_uid}"

-        response = requests.delete(url, headers=self._headers)
+        response = self.session.delete(url, headers=self._headers)
         if response.status_code != 200:
             raise RuntimeError(
                 f"Failed to terminate model, detail: {_get_error_string(response)}"

@@ -1169,7 +1165,7 @@ class Client:
         """
         url = f"{self.base_url}/v1/models/{model_uid}/progress"

-        response = requests.get(url, headers=self._headers)
+        response = self.session.get(url, headers=self._headers)
         if response.status_code != 200:
             raise RuntimeError(
                 f"Fail to get model launching progress, detail: {_get_error_string(response)}"

@@ -1192,7 +1188,7 @@ class Client:
         """
         url = f"{self.base_url}/v1/models/{model_uid}/cancel"

-        response = requests.post(url, headers=self._headers)
+        response = self.session.post(url, headers=self._headers)
         if response.status_code != 200:
             raise RuntimeError(
                 f"Fail to cancel launching model, detail: {_get_error_string(response)}"

@@ -1200,7 +1196,7 @@ class Client:

     def get_instance_info(self, model_name: str, model_uid: str):
         url = f"{self.base_url}/v1/models/instances"
-        response = requests.get(
+        response = self.session.get(
             url,
             headers=self._headers,
             params={"model_name": model_name, "model_uid": model_uid},

@@ -1212,9 +1208,9 @@ class Client:

     def _get_supervisor_internal_address(self):
         url = f"{self.base_url}/v1/address"
-        response = requests.get(url, headers=self._headers)
+        response = self.session.get(url, headers=self._headers)
         if response.status_code != 200:
-            raise RuntimeError(
+            raise RuntimeError("Failed to get supervisor internal address")
         response_data = response.json()
         return response_data

@@ -1243,7 +1239,7 @@ class Client:
         """

         url = f"{self.base_url}/v1/models/{model_uid}"
-        response = requests.get(url, headers=self._headers)
+        response = self.session.get(url, headers=self._headers)
         if response.status_code != 200:
             raise RuntimeError(
                 f"Failed to get the model description, detail: {_get_error_string(response)}"

@@ -1331,7 +1327,7 @@ class Client:
         """

         url = f"{self.base_url}/v1/models/{model_uid}"
-        response = requests.get(url, headers=self._headers)
+        response = self.session.get(url, headers=self._headers)
         if response.status_code != 200:
             raise RuntimeError(
                 f"Failed to get the model description, detail: {_get_error_string(response)}"

@@ -1366,7 +1362,7 @@ class Client:
         """
         url = f"{self.base_url}/v1/model_registrations/{model_type}"
         request_body = {"model": model, "worker_ip": worker_ip, "persist": persist}
-        response = requests.post(url, json=request_body, headers=self._headers)
+        response = self.session.post(url, json=request_body, headers=self._headers)
         if response.status_code != 200:
             raise RuntimeError(
                 f"Failed to register model, detail: {_get_error_string(response)}"

@@ -1392,7 +1388,7 @@ class Client:
             Report failure to unregister the custom model. Provide details of failure through error message.
         """
         url = f"{self.base_url}/v1/model_registrations/{model_type}/{model_name}"
-        response = requests.delete(url, headers=self._headers)
+        response = self.session.delete(url, headers=self._headers)
         if response.status_code != 200:
             raise RuntimeError(
                 f"Failed to register model, detail: {_get_error_string(response)}"

@@ -1422,7 +1418,7 @@ class Client:

         """
         url = f"{self.base_url}/v1/model_registrations/{model_type}"
-        response = requests.get(url, headers=self._headers)
+        response = self.session.get(url, headers=self._headers)
         if response.status_code != 200:
             raise RuntimeError(
                 f"Failed to list model registration, detail: {_get_error_string(response)}"

@@ -1459,7 +1455,7 @@ class Client:
             "model_name": model_name,
             "worker_ip": worker_ip,
         }
-        response = requests.get(url, headers=self._headers, params=params)
+        response = self.session.get(url, headers=self._headers, params=params)
         if response.status_code != 200:
             raise RuntimeError(
                 f"Failed to list cached model, detail: {_get_error_string(response)}"

@@ -1490,7 +1486,7 @@ class Client:
             "model_version": model_version,
             "worker_ip": worker_ip,
         }
-        response = requests.get(url, headers=self._headers, params=params)
+        response = self.session.get(url, headers=self._headers, params=params)
         if response.status_code != 200:
             raise RuntimeError(
                 f"Failed to get paths by model name, detail: {_get_error_string(response)}"

@@ -1520,7 +1516,7 @@ class Client:
             "model_version": model_version,
             "worker_ip": worker_ip,
         }
-        response = requests.delete(url, headers=self._headers, params=params)
+        response = self.session.delete(url, headers=self._headers, params=params)
         if response.status_code != 200:
             raise RuntimeError(
                 f"Failed to remove cached models, detail: {_get_error_string(response)}"

@@ -1548,7 +1544,7 @@ class Client:
             The collection of registered models on the server.
         """
         url = f"{self.base_url}/v1/model_registrations/{model_type}/{model_name}"
-        response = requests.get(url, headers=self._headers)
+        response = self.session.get(url, headers=self._headers)
         if response.status_code != 200:
             raise RuntimeError(
                 f"Failed to list model registration, detail: {_get_error_string(response)}"

@@ -1578,7 +1574,7 @@ class Client:
             url = f"{self.base_url}/v1/engines/{model_name}"
         else:
             url = f"{self.base_url}/v1/engines/{model_type}/{model_name}"
-        response = requests.get(url, headers=self._headers)
+        response = self.session.get(url, headers=self._headers)
         if response.status_code != 200:
             raise RuntimeError(
                 f"Failed to query engine parameters by model name, detail: {_get_error_string(response)}"

@@ -1608,7 +1604,7 @@ class Client:
             Return empty dict.
         """
         url = f"{self.base_url}/v1/models/{model_uid}/requests/{request_id}/abort"
-        response = requests.post(
+        response = self.session.post(
             url, headers=self._headers, json={"block_duration": block_duration}
         )
         if response.status_code != 200:

@@ -1621,7 +1617,7 @@ class Client:

     def get_workers_info(self):
         url = f"{self.base_url}/v1/workers"
-        response = requests.get(url, headers=self._headers)
+        response = self.session.get(url, headers=self._headers)
         if response.status_code != 200:
             raise RuntimeError(
                 f"Failed to get workers info, detail: {_get_error_string(response)}"

@@ -1631,7 +1627,7 @@ class Client:

     def get_supervisor_info(self):
         url = f"{self.base_url}/v1/supervisor"
-        response = requests.get(url, headers=self._headers)
+        response = self.session.get(url, headers=self._headers)
         if response.status_code != 200:
             raise RuntimeError(
                 f"Failed to get supervisor info, detail: {_get_error_string(response)}"

@@ -1641,7 +1637,7 @@ class Client:

     def get_progress(self, request_id: str):
         url = f"{self.base_url}/v1/requests/{request_id}/progress"
-        response = requests.get(url, headers=self._headers)
+        response = self.session.get(url, headers=self._headers)
         if response.status_code != 200:
             raise RuntimeError(
                 f"Failed to get progress, detail: {_get_error_string(response)}"

@@ -1651,7 +1647,7 @@ class Client:

     def abort_cluster(self):
         url = f"{self.base_url}/v1/clusters"
-        response = requests.delete(url, headers=self._headers)
+        response = self.session.delete(url, headers=self._headers)
         if response.status_code != 200:
             raise RuntimeError(
                 f"Failed to abort cluster, detail: {_get_error_string(response)}"
xinference/core/media_interface.py
CHANGED

@@ -16,6 +16,7 @@ import base64
 import io
 import logging
 import os
+import tempfile
 import threading
 import time
 import uuid

@@ -784,7 +785,8 @@ class MediaInterface:
         )

         # Write to a temp .mp3 file and return its path
-
+        temp_dir = tempfile.gettempdir()
+        audio_path = os.path.join(temp_dir, f"{uuid.uuid4()}.mp3")
         with open(audio_path, "wb") as f:
             f.write(response)
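The temp path for the synthesized `.mp3` is now built from `tempfile.gettempdir()`, which honors `TMPDIR`/`TEMP` and works on Windows rather than assuming a fixed location. The same idea in isolation:

import os
import tempfile
import uuid

# Portable equivalent of the new code path: a unique .mp3 in the system temp dir.
audio_path = os.path.join(tempfile.gettempdir(), f"{uuid.uuid4()}.mp3")
with open(audio_path, "wb") as f:
    f.write(b"\xff\xfb")  # placeholder bytes standing in for the synthesized audio
print(audio_path)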
xinference/core/model.py
CHANGED

@@ -160,10 +160,6 @@ def oom_check(fn):
 class ModelActor(xo.StatelessActor, CancelMixin):
     _replica_model_uid: Optional[str]

-    @classmethod
-    def gen_uid(cls, model: "LLM"):
-        return f"{model.__class__}-model-actor"
-
     async def __pre_destroy__(self):
         from ..model.embedding.core import EmbeddingModel
         from ..model.llm.sglang.core import SGLANGModel

@@ -318,6 +314,9 @@ class ModelActor(xo.StatelessActor, CancelMixin):
     def __repr__(self) -> str:
         return f"ModelActor({self._replica_model_uid})"

+    def __getattr__(self, attr: str):
+        return getattr(self._model, attr)
+
     def decrease_serve_count(self):
         self._serve_count -= 1

@@ -1223,12 +1222,14 @@ class ModelActor(xo.StatelessActor, CancelMixin):
     @log_async(logger=logger, ignore_kwargs=["image"])
     async def infer(
         self,
+        *args,
         **kwargs,
     ):
         kwargs.pop("request_id", None)
         if hasattr(self._model, "infer"):
             return await self._call_wrapper_json(
                 self._model.infer,
+                *args,
                 **kwargs,
             )
         raise AttributeError(
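Two things happen in `model.py`: the unused `gen_uid` classmethod is dropped, and `ModelActor.__getattr__` now delegates unknown attribute lookups to the wrapped model, so callers can reach model attributes through the actor without one wrapper method per attribute. `infer` also gains `*args` to match the client-side change above. A stripped-down sketch of the delegation pattern (a toy stand-in, not the real actor class):

class ActorLike:
    """Toy stand-in: attributes missing on the actor fall through to the model."""

    def __init__(self, model):
        self._model = model

    def __repr__(self):  # defined here, so __getattr__ is never consulted for it
        return "ActorLike"

    def __getattr__(self, attr: str):
        # Python calls this only after normal attribute lookup fails.
        return getattr(self._model, attr)


class ToyModel:
    model_family = "qwen3"  # illustrative attribute


assert ActorLike(ToyModel()).model_family == "qwen3"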
xinference/core/supervisor.py
CHANGED

@@ -348,15 +348,20 @@ class SupervisorActor(xo.StatelessActor):
             BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES,
         )

+        to_filter_abilities = ["vision", "reasoning", "audio", "omni", "hybrid"]
+        ability_to_names: Dict[str, List[str]] = {
+            ability: [] for ability in to_filter_abilities
+        }
+        for family in BUILTIN_LLM_FAMILIES:
+            for ability in to_filter_abilities:
+                if ability in family.model_ability:
+                    ability_to_names[ability].append(family.model_name)
+
         return {
             "chat": list(BUILTIN_LLM_MODEL_CHAT_FAMILIES),
             "generate": list(BUILTIN_LLM_MODEL_GENERATE_FAMILIES),
             "tools": list(BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES),
-            "vision": [
-                family.model_name
-                for family in BUILTIN_LLM_FAMILIES
-                if "vision" in family.model_ability
-            ],
+            **ability_to_names,
         }

     async def get_devices_count(self) -> int:
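The hardcoded `"vision"` list generalizes into an ability-to-model-names map covering `vision`, `reasoning`, `audio`, `omni`, and `hybrid`, splatted into the response via `**ability_to_names`. The aggregation in isolation, with made-up records standing in for `BUILTIN_LLM_FAMILIES`:

from typing import Dict, List

families = [  # hypothetical stand-ins for BUILTIN_LLM_FAMILIES entries
    {"model_name": "qwen2.5-vl", "model_ability": ["chat", "vision"]},
    {"model_name": "qwq", "model_ability": ["chat", "reasoning"]},
]

to_filter_abilities = ["vision", "reasoning", "audio", "omni", "hybrid"]
ability_to_names: Dict[str, List[str]] = {a: [] for a in to_filter_abilities}
for family in families:
    for ability in to_filter_abilities:
        if ability in family["model_ability"]:
            ability_to_names[ability].append(family["model_name"])

print(ability_to_names["vision"], ability_to_names["reasoning"])
# ['qwen2.5-vl'] ['qwq']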
xinference/core/worker.py
CHANGED

@@ -15,10 +15,12 @@
 import asyncio
 import logging
 import os
+import pathlib
 import platform
 import queue
 import shutil
 import signal
+import sys
 import threading
 import time
 from collections import defaultdict
@@ -809,7 +811,13 @@ class WorkerActor(xo.StatelessActor):
             virtual_env_name or "uv", env_path
         )
         # create env
-        virtual_env_manager.create_env()
+        python_path = None
+        if not hasattr(sys, "_MEIPASS"):
+            # not in pyinstaller
+            # we specify python_path explicitly
+            # sometimes uv would find other versions.
+            python_path = pathlib.Path(sys.executable)
+        virtual_env_manager.create_env(python_path=python_path)
         return virtual_env_manager

     @classmethod
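When the worker is not running from a PyInstaller bundle (PyInstaller injects a `sys._MEIPASS` attribute into frozen apps), the virtualenv is now created with an explicit `python_path` pinned to the running interpreter, so uv cannot resolve some other installed Python. The detection idiom on its own:

import pathlib
import sys

python_path = None
if not hasattr(sys, "_MEIPASS"):
    # Not frozen by PyInstaller: pin the env to the interpreter running this worker.
    python_path = pathlib.Path(sys.executable)
print(python_path or "frozen: defer to the env manager's default interpreter")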
@@ -829,25 +837,18 @@ class WorkerActor(xo.StatelessActor):
         if hasattr(settings, k) and not getattr(settings, k):
             setattr(settings, k, v)

+        conf = dict(settings)
         packages = settings.packages
-        index_url = settings.index_url
-        extra_index_url = settings.extra_index_url
-        find_links = settings.find_links
-        trusted_host = settings.trusted_host
+        conf.pop("packages", None)
+        conf.pop("inherit_pip_config", None)

         logger.info(
-            "Installing packages %s in virtual env %s, with settings(
+            "Installing packages %s in virtual env %s, with settings(%s)",
             packages,
             virtual_env_manager.env_path,
-
-        )
-        virtual_env_manager.install_packages(
-            packages,
-            index_url=index_url,
-            extra_index_url=extra_index_url,
-            find_links=find_links,
-            trusted_host=trusted_host,
+            ", ".join([f"{k}={v}" for k, v in conf.items() if v]),
         )
+        virtual_env_manager.install_packages(packages, **conf)

     async def _get_progressor(self, request_id: str):
         from .progress_tracker import Progressor, ProgressTrackerActor
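Rather than hand-unpacking `index_url`/`extra_index_url`/`find_links`/`trusted_host`, the settings object is dumped to a dict, the non-installer keys are popped, and the remainder is forwarded as `**conf`, so newly supported pip options flow through without touching this call site. A sketch of the shape, using a hypothetical plain-dict `settings` in place of the real settings object:

settings = {  # hypothetical merged virtual-env/pip settings
    "packages": ["torch==2.3.0"],
    "inherit_pip_config": True,
    "index_url": "https://pypi.org/simple",
    "extra_index_url": None,
    "find_links": None,
    "trusted_host": None,
}

packages = settings["packages"]
conf = dict(settings)
conf.pop("packages", None)
conf.pop("inherit_pip_config", None)  # consumed upstream, not an installer kwarg

# Log only the options that are actually set, then forward the rest verbatim:
print("with settings(%s)" % ", ".join(f"{k}={v}" for k, v in conf.items() if v))
# virtual_env_manager.install_packages(packages, **conf)  # as in the diff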