xinference 1.7.0.post1__py3-none-any.whl → 1.7.1.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xinference might be problematic. Click here for more details.

Files changed (83):
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +3 -4
  3. xinference/client/__init__.py +2 -0
  4. xinference/client/common.py +49 -2
  5. xinference/client/handlers.py +18 -0
  6. xinference/client/restful/async_restful_client.py +1760 -0
  7. xinference/client/restful/restful_client.py +74 -78
  8. xinference/core/media_interface.py +3 -1
  9. xinference/core/model.py +5 -4
  10. xinference/core/supervisor.py +10 -5
  11. xinference/core/worker.py +15 -14
  12. xinference/deploy/local.py +51 -9
  13. xinference/deploy/worker.py +5 -3
  14. xinference/device_utils.py +22 -3
  15. xinference/model/audio/fish_speech.py +23 -34
  16. xinference/model/audio/model_spec.json +4 -2
  17. xinference/model/audio/model_spec_modelscope.json +4 -2
  18. xinference/model/audio/utils.py +2 -2
  19. xinference/model/core.py +1 -0
  20. xinference/model/embedding/__init__.py +8 -8
  21. xinference/model/embedding/custom.py +6 -1
  22. xinference/model/embedding/embed_family.py +0 -41
  23. xinference/model/embedding/model_spec.json +10 -1
  24. xinference/model/embedding/model_spec_modelscope.json +10 -1
  25. xinference/model/embedding/sentence_transformers/core.py +30 -15
  26. xinference/model/flexible/core.py +1 -1
  27. xinference/model/flexible/launchers/__init__.py +2 -0
  28. xinference/model/flexible/launchers/image_process_launcher.py +1 -1
  29. xinference/model/flexible/launchers/modelscope_launcher.py +47 -0
  30. xinference/model/flexible/launchers/transformers_launcher.py +5 -5
  31. xinference/model/flexible/launchers/yolo_launcher.py +62 -0
  32. xinference/model/llm/__init__.py +7 -0
  33. xinference/model/llm/core.py +18 -1
  34. xinference/model/llm/llama_cpp/core.py +1 -1
  35. xinference/model/llm/llm_family.json +41 -1
  36. xinference/model/llm/llm_family.py +6 -0
  37. xinference/model/llm/llm_family_modelscope.json +43 -1
  38. xinference/model/llm/mlx/core.py +271 -18
  39. xinference/model/llm/mlx/distributed_models/__init__.py +13 -0
  40. xinference/model/llm/mlx/distributed_models/core.py +164 -0
  41. xinference/model/llm/mlx/distributed_models/deepseek_v3.py +75 -0
  42. xinference/model/llm/mlx/distributed_models/qwen2.py +82 -0
  43. xinference/model/llm/mlx/distributed_models/qwen3.py +82 -0
  44. xinference/model/llm/mlx/distributed_models/qwen3_moe.py +76 -0
  45. xinference/model/llm/reasoning_parser.py +12 -6
  46. xinference/model/llm/sglang/core.py +8 -4
  47. xinference/model/llm/transformers/chatglm.py +4 -1
  48. xinference/model/llm/transformers/core.py +4 -2
  49. xinference/model/llm/transformers/multimodal/cogagent.py +10 -4
  50. xinference/model/llm/transformers/multimodal/intern_vl.py +1 -1
  51. xinference/model/llm/utils.py +36 -17
  52. xinference/model/llm/vllm/core.py +142 -34
  53. xinference/model/llm/vllm/distributed_executor.py +96 -21
  54. xinference/model/llm/vllm/xavier/transfer.py +2 -2
  55. xinference/model/rerank/core.py +16 -9
  56. xinference/model/rerank/model_spec.json +3 -3
  57. xinference/model/rerank/model_spec_modelscope.json +3 -3
  58. xinference/web/ui/build/asset-manifest.json +3 -3
  59. xinference/web/ui/build/index.html +1 -1
  60. xinference/web/ui/build/static/js/main.9b12b7f9.js +3 -0
  61. xinference/web/ui/build/static/js/main.9b12b7f9.js.map +1 -0
  62. xinference/web/ui/node_modules/.cache/babel-loader/0fd4820d93f99509e80d8702dc3f6f8272424acab5608fa7c0e82cb1d3250a87.json +1 -0
  63. xinference/web/ui/node_modules/.cache/babel-loader/1460361af6975e63576708039f1cb732faf9c672d97c494d4055fc6331460be0.json +1 -0
  64. xinference/web/ui/node_modules/.cache/babel-loader/4efd8dda58fda83ed9546bf2f587df67f8d98e639117bee2d9326a9a1d9bebb2.json +1 -0
  65. xinference/web/ui/node_modules/.cache/babel-loader/5b2dafe5aa9e1105e0244a2b6751807342fa86aa0144b4e84d947a1686102715.json +1 -0
  66. xinference/web/ui/node_modules/.cache/babel-loader/f75545479c17fdfe2a00235fa4a0e9da1ae95e6b3caafba87ded92de6b0240e4.json +1 -0
  67. xinference/web/ui/src/locales/en.json +3 -0
  68. xinference/web/ui/src/locales/ja.json +3 -0
  69. xinference/web/ui/src/locales/ko.json +3 -0
  70. xinference/web/ui/src/locales/zh.json +3 -0
  71. {xinference-1.7.0.post1.dist-info → xinference-1.7.1.post1.dist-info}/METADATA +4 -3
  72. {xinference-1.7.0.post1.dist-info → xinference-1.7.1.post1.dist-info}/RECORD +77 -67
  73. xinference/web/ui/build/static/js/main.8a9e3ba0.js +0 -3
  74. xinference/web/ui/build/static/js/main.8a9e3ba0.js.map +0 -1
  75. xinference/web/ui/node_modules/.cache/babel-loader/26b8c9f34b0bed789b3a833767672e39302d1e0c09b4276f4d58d1df7b6bd93b.json +0 -1
  76. xinference/web/ui/node_modules/.cache/babel-loader/34cfbfb7836e136ba3261cfd411cc554bf99ba24b35dcceebeaa4f008cb3c9dc.json +0 -1
  77. xinference/web/ui/node_modules/.cache/babel-loader/c5c7c2cd1b863ce41adff2c4737bba06eef3a1acf28288cb83d992060f6b8923.json +0 -1
  78. xinference/web/ui/node_modules/.cache/babel-loader/cc97b49285d7717c63374766c789141a4329a04582ab32756d7e0e614d4c5c7f.json +0 -1
  79. xinference/web/ui/build/static/js/{main.8a9e3ba0.js.LICENSE.txt → main.9b12b7f9.js.LICENSE.txt} +0 -0
  80. {xinference-1.7.0.post1.dist-info → xinference-1.7.1.post1.dist-info}/WHEEL +0 -0
  81. {xinference-1.7.0.post1.dist-info → xinference-1.7.1.post1.dist-info}/entry_points.txt +0 -0
  82. {xinference-1.7.0.post1.dist-info → xinference-1.7.1.post1.dist-info}/licenses/LICENSE +0 -0
  83. {xinference-1.7.0.post1.dist-info → xinference-1.7.1.post1.dist-info}/top_level.txt +0 -0
xinference/client/restful/restful_client.py CHANGED
@@ -12,18 +12,16 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
  import json
15
- import typing
16
15
  from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Union
17
16
 
18
17
  import requests
19
18
 
20
- from ..common import streaming_response_iterator
19
+ from ..common import convert_float_to_int_or_str, streaming_response_iterator
21
20
 
22
21
  if TYPE_CHECKING:
23
22
  from ...types import (
24
23
  ChatCompletion,
25
24
  ChatCompletionChunk,
26
- ChatCompletionMessage,
27
25
  Completion,
28
26
  CompletionChunk,
29
27
  Embedding,
@@ -33,17 +31,6 @@ if TYPE_CHECKING:
33
31
  )
34
32
 
35
33
 
36
- def convert_float_to_int_or_str(model_size: float) -> Union[int, str]:
37
- """convert float to int or string
38
-
39
- if float can be presented as int, convert it to int, otherwise convert it to string
40
- """
41
- if int(model_size) == model_size:
42
- return int(model_size)
43
- else:
44
- return str(model_size)
45
-
46
-
47
34
  def _get_error_string(response: requests.Response) -> str:
48
35
  try:
49
36
  if response.content:
@@ -57,25 +44,6 @@ def _get_error_string(response: requests.Response) -> str:
57
44
  return "Unknown error"
58
45
 
59
46
 
60
- @typing.no_type_check
61
- def handle_system_prompts(
62
- chat_history: List["ChatCompletionMessage"], system_prompt: Optional[str]
63
- ) -> List["ChatCompletionMessage"]:
64
- history_system_prompts = [
65
- ch["content"] for ch in chat_history if ch["role"] == "system"
66
- ]
67
- if system_prompt is not None:
68
- history_system_prompts.append(system_prompt)
69
-
70
- # remove all the system prompt in the chat_history
71
- chat_history = list(filter(lambda x: x["role"] != "system", chat_history))
72
- # insert all system prompts at the beginning
73
- chat_history.insert(
74
- 0, {"role": "system", "content": ". ".join(history_system_prompts)}
75
- )
76
- return chat_history
77
-
78
-
79
47
  class RESTfulModelHandle:
80
48
  """
81
49
  A sync model interface (for RESTful client) which provides type hints that makes it much easier to use xinference
@@ -86,6 +54,19 @@ class RESTfulModelHandle:
86
54
  self._model_uid = model_uid
87
55
  self._base_url = base_url
88
56
  self.auth_headers = auth_headers
57
+ self.session = requests.Session()
58
+
59
+ def close(self):
60
+ """
61
+ Close the session.
62
+ """
63
+ if self.session:
64
+ self.session.close()
65
+ self.session = None
66
+
67
+ def __del__(self):
68
+ if self.session:
69
+ self.close()
89
70
 
90
71
 
91
72
  class RESTfulEmbeddingModelHandle(RESTfulModelHandle):
@@ -116,7 +97,7 @@ class RESTfulEmbeddingModelHandle(RESTfulModelHandle):
116
97
  "input": input,
117
98
  }
118
99
  request_body.update(kwargs)
119
- response = requests.post(url, json=request_body, headers=self.auth_headers)
100
+ response = self.session.post(url, json=request_body, headers=self.auth_headers)
120
101
  if response.status_code != 200:
121
102
  raise RuntimeError(
122
103
  f"Failed to create the embeddings, detail: {_get_error_string(response)}"
@@ -154,7 +135,7 @@ class RESTfulEmbeddingModelHandle(RESTfulModelHandle):
154
135
  "input": input,
155
136
  }
156
137
  request_body.update(kwargs)
157
- response = requests.post(url, json=request_body, headers=self.auth_headers)
138
+ response = self.session.post(url, json=request_body, headers=self.auth_headers)
158
139
  if response.status_code != 200:
159
140
  raise RuntimeError(
160
141
  f"Failed to decode token ids, detail: {_get_error_string(response)}"
@@ -213,7 +194,7 @@ class RESTfulRerankModelHandle(RESTfulModelHandle):
213
194
  "kwargs": json.dumps(kwargs),
214
195
  }
215
196
  request_body.update(kwargs)
216
- response = requests.post(url, json=request_body, headers=self.auth_headers)
197
+ response = self.session.post(url, json=request_body, headers=self.auth_headers)
217
198
  if response.status_code != 200:
218
199
  raise RuntimeError(
219
200
  f"Failed to rerank documents, detail: {response.json()['detail']}"
@@ -258,7 +239,7 @@ class RESTfulImageModelHandle(RESTfulModelHandle):
258
239
  "response_format": response_format,
259
240
  "kwargs": json.dumps(kwargs),
260
241
  }
261
- response = requests.post(url, json=request_body, headers=self.auth_headers)
242
+ response = self.session.post(url, json=request_body, headers=self.auth_headers)
262
243
  if response.status_code != 200:
263
244
  raise RuntimeError(
264
245
  f"Failed to create the images, detail: {_get_error_string(response)}"
@@ -322,7 +303,7 @@ class RESTfulImageModelHandle(RESTfulModelHandle):
322
303
  for key, value in params.items():
323
304
  files.append((key, (None, value)))
324
305
  files.append(("image", ("image", image, "application/octet-stream")))
325
- response = requests.post(url, files=files, headers=self.auth_headers)
306
+ response = self.session.post(url, files=files, headers=self.auth_headers)
326
307
  if response.status_code != 200:
327
308
  raise RuntimeError(
328
309
  f"Failed to variants the images, detail: {_get_error_string(response)}"
@@ -397,7 +378,7 @@ class RESTfulImageModelHandle(RESTfulModelHandle):
397
378
  files.append(
398
379
  ("mask_image", ("mask_image", mask_image, "application/octet-stream"))
399
380
  )
400
- response = requests.post(url, files=files, headers=self.auth_headers)
381
+ response = self.session.post(url, files=files, headers=self.auth_headers)
401
382
  if response.status_code != 200:
402
383
  raise RuntimeError(
403
384
  f"Failed to inpaint the images, detail: {_get_error_string(response)}"
@@ -416,7 +397,7 @@ class RESTfulImageModelHandle(RESTfulModelHandle):
416
397
  for key, value in params.items():
417
398
  files.append((key, (None, value)))
418
399
  files.append(("image", ("image", image, "application/octet-stream")))
419
- response = requests.post(url, files=files, headers=self.auth_headers)
400
+ response = self.session.post(url, files=files, headers=self.auth_headers)
420
401
  if response.status_code != 200:
421
402
  raise RuntimeError(
422
403
  f"Failed to ocr the images, detail: {_get_error_string(response)}"
@@ -454,7 +435,7 @@ class RESTfulVideoModelHandle(RESTfulModelHandle):
454
435
  "n": n,
455
436
  "kwargs": json.dumps(kwargs),
456
437
  }
457
- response = requests.post(url, json=request_body, headers=self.auth_headers)
438
+ response = self.session.post(url, json=request_body, headers=self.auth_headers)
458
439
  if response.status_code != 200:
459
440
  raise RuntimeError(
460
441
  f"Failed to create the video, detail: {_get_error_string(response)}"
@@ -501,7 +482,7 @@ class RESTfulVideoModelHandle(RESTfulModelHandle):
501
482
  for key, value in params.items():
502
483
  files.append((key, (None, value)))
503
484
  files.append(("image", ("image", image, "application/octet-stream")))
504
- response = requests.post(url, files=files, headers=self.auth_headers)
485
+ response = self.session.post(url, files=files, headers=self.auth_headers)
505
486
  if response.status_code != 200:
506
487
  raise RuntimeError(
507
488
  f"Failed to create the video from image, detail: {_get_error_string(response)}"
@@ -554,7 +535,7 @@ class RESTfulVideoModelHandle(RESTfulModelHandle):
554
535
  ("first_frame", ("image", first_frame, "application/octet-stream"))
555
536
  )
556
537
  files.append(("last_frame", ("image", last_frame, "application/octet-stream")))
557
- response = requests.post(url, files=files, headers=self.auth_headers)
538
+ response = self.session.post(url, files=files, headers=self.auth_headers)
558
539
  if response.status_code != 200:
559
540
  raise RuntimeError(
560
541
  f"Failed to create the video from image, detail: {_get_error_string(response)}"
@@ -604,7 +585,7 @@ class RESTfulGenerateModelHandle(RESTfulModelHandle):
604
585
 
605
586
  stream = bool(generate_config and generate_config.get("stream"))
606
587
 
607
- response = requests.post(
588
+ response = self.session.post(
608
589
  url, json=request_body, stream=stream, headers=self.auth_headers
609
590
  )
610
591
  if response.status_code != 200:
@@ -665,7 +646,7 @@ class RESTfulChatModelHandle(RESTfulGenerateModelHandle):
665
646
  request_body[key] = value
666
647
 
667
648
  stream = bool(generate_config and generate_config.get("stream"))
668
- response = requests.post(
649
+ response = self.session.post(
669
650
  url, json=request_body, stream=stream, headers=self.auth_headers
670
651
  )
671
652
 
@@ -736,7 +717,7 @@ class RESTfulAudioModelHandle(RESTfulModelHandle):
736
717
  }
737
718
  files: List[Any] = []
738
719
  files.append(("file", ("file", audio, "application/octet-stream")))
739
- response = requests.post(
720
+ response = self.session.post(
740
721
  url, data=params, files=files, headers=self.auth_headers
741
722
  )
742
723
  if response.status_code != 200:
@@ -799,7 +780,7 @@ class RESTfulAudioModelHandle(RESTfulModelHandle):
799
780
  }
800
781
  files: List[Any] = []
801
782
  files.append(("file", ("file", audio, "application/octet-stream")))
802
- response = requests.post(
783
+ response = self.session.post(
803
784
  url, data=params, files=files, headers=self.auth_headers
804
785
  )
805
786
  if response.status_code != 200:
@@ -873,11 +854,11 @@ class RESTfulAudioModelHandle(RESTfulModelHandle):
873
854
  )
874
855
  )
875
856
  if files:
876
- response = requests.post(
857
+ response = self.session.post(
877
858
  url, data=params, files=files, headers=self.auth_headers, stream=stream
878
859
  )
879
860
  else:
880
- response = requests.post(
861
+ response = self.session.post(
881
862
  url, json=params, headers=self.auth_headers, stream=stream
882
863
  )
883
864
  if response.status_code != 200:
@@ -894,6 +875,7 @@ class RESTfulAudioModelHandle(RESTfulModelHandle):
894
875
  class RESTfulFlexibleModelHandle(RESTfulModelHandle):
895
876
  def infer(
896
877
  self,
878
+ *args,
897
879
  **kwargs,
898
880
  ):
899
881
  """
@@ -914,16 +896,17 @@ class RESTfulFlexibleModelHandle(RESTfulModelHandle):
914
896
  url = f"{self._base_url}/v1/flexible/infers"
915
897
  params = {
916
898
  "model": self._model_uid,
899
+ "args": args,
917
900
  }
918
901
  params.update(kwargs)
919
902
 
920
- response = requests.post(url, json=params, headers=self.auth_headers)
903
+ response = self.session.post(url, json=params, headers=self.auth_headers)
921
904
  if response.status_code != 200:
922
905
  raise RuntimeError(
923
906
  f"Failed to predict, detail: {_get_error_string(response)}"
924
907
  )
925
908
 
926
- return response.content
909
+ return response.json()
927
910
 
928
911
 
929
912
  class Client:
@@ -931,10 +914,23 @@ class Client:
931
914
  self.base_url = base_url
932
915
  self._headers: Dict[str, str] = {}
933
916
  self._cluster_authed = False
917
+ self.session = requests.Session()
934
918
  self._check_cluster_authenticated()
935
919
  if api_key is not None and self._cluster_authed:
936
920
  self._headers["Authorization"] = f"Bearer {api_key}"
937
921
 
922
+ def close(self):
923
+ """
924
+ Close the session.
925
+ """
926
+ if self.session:
927
+ self.session.close()
928
+ self.session = None
929
+
930
+ def __del__(self):
931
+ if self.session:
932
+ self.close()
933
+
938
934
  def _set_token(self, token: Optional[str]):
939
935
  if not self._cluster_authed or token is None:
940
936
  return
@@ -949,7 +945,7 @@ class Client:
949
945
 
950
946
  def _check_cluster_authenticated(self):
951
947
  url = f"{self.base_url}/v1/cluster/auth"
952
- response = requests.get(url)
948
+ response = self.session.get(url)
953
949
  # compatible with old version of xinference
954
950
  if response.status_code == 404:
955
951
  self._cluster_authed = False
@@ -963,7 +959,7 @@ class Client:
963
959
 
964
960
  def vllm_models(self) -> Dict[str, Any]:
965
961
  url = f"{self.base_url}/v1/models/vllm-supported"
966
- response = requests.get(url, headers=self._headers)
962
+ response = self.session.get(url, headers=self._headers)
967
963
  if response.status_code != 200:
968
964
  raise RuntimeError(
969
965
  f"Failed to fetch VLLM models. detail: {response.json()['detail']}"
@@ -981,7 +977,7 @@ class Client:
981
977
 
982
978
  payload = {"username": username, "password": password}
983
979
 
984
- response = requests.post(url, json=payload)
980
+ response = self.session.post(url, json=payload)
985
981
  if response.status_code != 200:
986
982
  raise RuntimeError(f"Failed to login, detail: {response.json()['detail']}")
987
983
 
@@ -1003,7 +999,7 @@ class Client:
1003
999
 
1004
1000
  url = f"{self.base_url}/v1/models"
1005
1001
 
1006
- response = requests.get(url, headers=self._headers)
1002
+ response = self.session.get(url, headers=self._headers)
1007
1003
  if response.status_code != 200:
1008
1004
  raise RuntimeError(
1009
1005
  f"Failed to list model, detail: {_get_error_string(response)}"
@@ -1111,9 +1107,9 @@ class Client:
1111
1107
  payload[str(key)] = value
1112
1108
 
1113
1109
  if wait_ready:
1114
- response = requests.post(url, json=payload, headers=self._headers)
1110
+ response = self.session.post(url, json=payload, headers=self._headers)
1115
1111
  else:
1116
- response = requests.post(
1112
+ response = self.session.post(
1117
1113
  url, json=payload, headers=self._headers, params={"wait_ready": False}
1118
1114
  )
1119
1115
  if response.status_code != 200:
@@ -1142,7 +1138,7 @@ class Client:
1142
1138
 
1143
1139
  url = f"{self.base_url}/v1/models/{model_uid}"
1144
1140
 
1145
- response = requests.delete(url, headers=self._headers)
1141
+ response = self.session.delete(url, headers=self._headers)
1146
1142
  if response.status_code != 200:
1147
1143
  raise RuntimeError(
1148
1144
  f"Failed to terminate model, detail: {_get_error_string(response)}"
@@ -1169,7 +1165,7 @@ class Client:
1169
1165
  """
1170
1166
  url = f"{self.base_url}/v1/models/{model_uid}/progress"
1171
1167
 
1172
- response = requests.get(url, headers=self._headers)
1168
+ response = self.session.get(url, headers=self._headers)
1173
1169
  if response.status_code != 200:
1174
1170
  raise RuntimeError(
1175
1171
  f"Fail to get model launching progress, detail: {_get_error_string(response)}"
@@ -1192,7 +1188,7 @@ class Client:
1192
1188
  """
1193
1189
  url = f"{self.base_url}/v1/models/{model_uid}/cancel"
1194
1190
 
1195
- response = requests.post(url, headers=self._headers)
1191
+ response = self.session.post(url, headers=self._headers)
1196
1192
  if response.status_code != 200:
1197
1193
  raise RuntimeError(
1198
1194
  f"Fail to cancel launching model, detail: {_get_error_string(response)}"
@@ -1200,7 +1196,7 @@ class Client:
1200
1196
 
1201
1197
  def get_instance_info(self, model_name: str, model_uid: str):
1202
1198
  url = f"{self.base_url}/v1/models/instances"
1203
- response = requests.get(
1199
+ response = self.session.get(
1204
1200
  url,
1205
1201
  headers=self._headers,
1206
1202
  params={"model_name": model_name, "model_uid": model_uid},
@@ -1212,9 +1208,9 @@ class Client:
1212
1208
 
1213
1209
  def _get_supervisor_internal_address(self):
1214
1210
  url = f"{self.base_url}/v1/address"
1215
- response = requests.get(url, headers=self._headers)
1211
+ response = self.session.get(url, headers=self._headers)
1216
1212
  if response.status_code != 200:
1217
- raise RuntimeError(f"Failed to get supervisor internal address")
1213
+ raise RuntimeError("Failed to get supervisor internal address")
1218
1214
  response_data = response.json()
1219
1215
  return response_data
1220
1216
 
@@ -1243,7 +1239,7 @@ class Client:
1243
1239
  """
1244
1240
 
1245
1241
  url = f"{self.base_url}/v1/models/{model_uid}"
1246
- response = requests.get(url, headers=self._headers)
1242
+ response = self.session.get(url, headers=self._headers)
1247
1243
  if response.status_code != 200:
1248
1244
  raise RuntimeError(
1249
1245
  f"Failed to get the model description, detail: {_get_error_string(response)}"
@@ -1331,7 +1327,7 @@ class Client:
1331
1327
  """
1332
1328
 
1333
1329
  url = f"{self.base_url}/v1/models/{model_uid}"
1334
- response = requests.get(url, headers=self._headers)
1330
+ response = self.session.get(url, headers=self._headers)
1335
1331
  if response.status_code != 200:
1336
1332
  raise RuntimeError(
1337
1333
  f"Failed to get the model description, detail: {_get_error_string(response)}"
@@ -1366,7 +1362,7 @@ class Client:
1366
1362
  """
1367
1363
  url = f"{self.base_url}/v1/model_registrations/{model_type}"
1368
1364
  request_body = {"model": model, "worker_ip": worker_ip, "persist": persist}
1369
- response = requests.post(url, json=request_body, headers=self._headers)
1365
+ response = self.session.post(url, json=request_body, headers=self._headers)
1370
1366
  if response.status_code != 200:
1371
1367
  raise RuntimeError(
1372
1368
  f"Failed to register model, detail: {_get_error_string(response)}"
@@ -1392,7 +1388,7 @@ class Client:
1392
1388
  Report failure to unregister the custom model. Provide details of failure through error message.
1393
1389
  """
1394
1390
  url = f"{self.base_url}/v1/model_registrations/{model_type}/{model_name}"
1395
- response = requests.delete(url, headers=self._headers)
1391
+ response = self.session.delete(url, headers=self._headers)
1396
1392
  if response.status_code != 200:
1397
1393
  raise RuntimeError(
1398
1394
  f"Failed to register model, detail: {_get_error_string(response)}"
@@ -1422,7 +1418,7 @@ class Client:
1422
1418
 
1423
1419
  """
1424
1420
  url = f"{self.base_url}/v1/model_registrations/{model_type}"
1425
- response = requests.get(url, headers=self._headers)
1421
+ response = self.session.get(url, headers=self._headers)
1426
1422
  if response.status_code != 200:
1427
1423
  raise RuntimeError(
1428
1424
  f"Failed to list model registration, detail: {_get_error_string(response)}"
@@ -1459,7 +1455,7 @@ class Client:
1459
1455
  "model_name": model_name,
1460
1456
  "worker_ip": worker_ip,
1461
1457
  }
1462
- response = requests.get(url, headers=self._headers, params=params)
1458
+ response = self.session.get(url, headers=self._headers, params=params)
1463
1459
  if response.status_code != 200:
1464
1460
  raise RuntimeError(
1465
1461
  f"Failed to list cached model, detail: {_get_error_string(response)}"
@@ -1490,7 +1486,7 @@ class Client:
1490
1486
  "model_version": model_version,
1491
1487
  "worker_ip": worker_ip,
1492
1488
  }
1493
- response = requests.get(url, headers=self._headers, params=params)
1489
+ response = self.session.get(url, headers=self._headers, params=params)
1494
1490
  if response.status_code != 200:
1495
1491
  raise RuntimeError(
1496
1492
  f"Failed to get paths by model name, detail: {_get_error_string(response)}"
@@ -1520,7 +1516,7 @@ class Client:
1520
1516
  "model_version": model_version,
1521
1517
  "worker_ip": worker_ip,
1522
1518
  }
1523
- response = requests.delete(url, headers=self._headers, params=params)
1519
+ response = self.session.delete(url, headers=self._headers, params=params)
1524
1520
  if response.status_code != 200:
1525
1521
  raise RuntimeError(
1526
1522
  f"Failed to remove cached models, detail: {_get_error_string(response)}"
@@ -1548,7 +1544,7 @@ class Client:
1548
1544
  The collection of registered models on the server.
1549
1545
  """
1550
1546
  url = f"{self.base_url}/v1/model_registrations/{model_type}/{model_name}"
1551
- response = requests.get(url, headers=self._headers)
1547
+ response = self.session.get(url, headers=self._headers)
1552
1548
  if response.status_code != 200:
1553
1549
  raise RuntimeError(
1554
1550
  f"Failed to list model registration, detail: {_get_error_string(response)}"
@@ -1578,7 +1574,7 @@ class Client:
1578
1574
  url = f"{self.base_url}/v1/engines/{model_name}"
1579
1575
  else:
1580
1576
  url = f"{self.base_url}/v1/engines/{model_type}/{model_name}"
1581
- response = requests.get(url, headers=self._headers)
1577
+ response = self.session.get(url, headers=self._headers)
1582
1578
  if response.status_code != 200:
1583
1579
  raise RuntimeError(
1584
1580
  f"Failed to query engine parameters by model name, detail: {_get_error_string(response)}"
@@ -1608,7 +1604,7 @@ class Client:
1608
1604
  Return empty dict.
1609
1605
  """
1610
1606
  url = f"{self.base_url}/v1/models/{model_uid}/requests/{request_id}/abort"
1611
- response = requests.post(
1607
+ response = self.session.post(
1612
1608
  url, headers=self._headers, json={"block_duration": block_duration}
1613
1609
  )
1614
1610
  if response.status_code != 200:
@@ -1621,7 +1617,7 @@ class Client:
1621
1617
 
1622
1618
  def get_workers_info(self):
1623
1619
  url = f"{self.base_url}/v1/workers"
1624
- response = requests.get(url, headers=self._headers)
1620
+ response = self.session.get(url, headers=self._headers)
1625
1621
  if response.status_code != 200:
1626
1622
  raise RuntimeError(
1627
1623
  f"Failed to get workers info, detail: {_get_error_string(response)}"
@@ -1631,7 +1627,7 @@ class Client:
1631
1627
 
1632
1628
  def get_supervisor_info(self):
1633
1629
  url = f"{self.base_url}/v1/supervisor"
1634
- response = requests.get(url, headers=self._headers)
1630
+ response = self.session.get(url, headers=self._headers)
1635
1631
  if response.status_code != 200:
1636
1632
  raise RuntimeError(
1637
1633
  f"Failed to get supervisor info, detail: {_get_error_string(response)}"
@@ -1641,7 +1637,7 @@ class Client:
1641
1637
 
1642
1638
  def get_progress(self, request_id: str):
1643
1639
  url = f"{self.base_url}/v1/requests/{request_id}/progress"
1644
- response = requests.get(url, headers=self._headers)
1640
+ response = self.session.get(url, headers=self._headers)
1645
1641
  if response.status_code != 200:
1646
1642
  raise RuntimeError(
1647
1643
  f"Failed to get progress, detail: {_get_error_string(response)}"
@@ -1651,7 +1647,7 @@ class Client:
1651
1647
 
1652
1648
  def abort_cluster(self):
1653
1649
  url = f"{self.base_url}/v1/clusters"
1654
- response = requests.delete(url, headers=self._headers)
1650
+ response = self.session.delete(url, headers=self._headers)
1655
1651
  if response.status_code != 200:
1656
1652
  raise RuntimeError(
1657
1653
  f"Failed to abort cluster, detail: {_get_error_string(response)}"
xinference/core/media_interface.py CHANGED
@@ -16,6 +16,7 @@ import base64
16
16
  import io
17
17
  import logging
18
18
  import os
19
+ import tempfile
19
20
  import threading
20
21
  import time
21
22
  import uuid
@@ -784,7 +785,8 @@ class MediaInterface:
784
785
  )
785
786
 
786
787
  # Write to a temp .mp3 file and return its path
787
- audio_path = f"/tmp/{uuid.uuid4()}.mp3"
788
+ temp_dir = tempfile.gettempdir()
789
+ audio_path = os.path.join(temp_dir, f"{uuid.uuid4()}.mp3")
788
790
  with open(audio_path, "wb") as f:
789
791
  f.write(response)
790
792
 
xinference/core/model.py CHANGED
@@ -160,10 +160,6 @@ def oom_check(fn):
160
160
  class ModelActor(xo.StatelessActor, CancelMixin):
161
161
  _replica_model_uid: Optional[str]
162
162
 
163
- @classmethod
164
- def gen_uid(cls, model: "LLM"):
165
- return f"{model.__class__}-model-actor"
166
-
167
163
  async def __pre_destroy__(self):
168
164
  from ..model.embedding.core import EmbeddingModel
169
165
  from ..model.llm.sglang.core import SGLANGModel
@@ -318,6 +314,9 @@ class ModelActor(xo.StatelessActor, CancelMixin):
318
314
  def __repr__(self) -> str:
319
315
  return f"ModelActor({self._replica_model_uid})"
320
316
 
317
+ def __getattr__(self, attr: str):
318
+ return getattr(self._model, attr)
319
+
321
320
  def decrease_serve_count(self):
322
321
  self._serve_count -= 1
323
322
 
@@ -1223,12 +1222,14 @@ class ModelActor(xo.StatelessActor, CancelMixin):
1223
1222
  @log_async(logger=logger, ignore_kwargs=["image"])
1224
1223
  async def infer(
1225
1224
  self,
1225
+ *args,
1226
1226
  **kwargs,
1227
1227
  ):
1228
1228
  kwargs.pop("request_id", None)
1229
1229
  if hasattr(self._model, "infer"):
1230
1230
  return await self._call_wrapper_json(
1231
1231
  self._model.infer,
1232
+ *args,
1232
1233
  **kwargs,
1233
1234
  )
1234
1235
  raise AttributeError(
xinference/core/supervisor.py CHANGED
@@ -348,15 +348,20 @@ class SupervisorActor(xo.StatelessActor):
348
348
  BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES,
349
349
  )
350
350
 
351
+ to_filter_abilities = ["vision", "reasoning", "audio", "omni", "hybrid"]
352
+ ability_to_names: Dict[str, List[str]] = {
353
+ ability: [] for ability in to_filter_abilities
354
+ }
355
+ for family in BUILTIN_LLM_FAMILIES:
356
+ for ability in to_filter_abilities:
357
+ if ability in family.model_ability:
358
+ ability_to_names[ability].append(family.model_name)
359
+
351
360
  return {
352
361
  "chat": list(BUILTIN_LLM_MODEL_CHAT_FAMILIES),
353
362
  "generate": list(BUILTIN_LLM_MODEL_GENERATE_FAMILIES),
354
363
  "tools": list(BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES),
355
- "vision": [
356
- family.model_name
357
- for family in BUILTIN_LLM_FAMILIES
358
- if "vision" in family.model_ability
359
- ],
364
+ **ability_to_names,
360
365
  }
361
366
 
362
367
  async def get_devices_count(self) -> int:
xinference/core/worker.py CHANGED
@@ -15,10 +15,12 @@
15
15
  import asyncio
16
16
  import logging
17
17
  import os
18
+ import pathlib
18
19
  import platform
19
20
  import queue
20
21
  import shutil
21
22
  import signal
23
+ import sys
22
24
  import threading
23
25
  import time
24
26
  from collections import defaultdict
@@ -809,7 +811,13 @@ class WorkerActor(xo.StatelessActor):
809
811
  virtual_env_name or "uv", env_path
810
812
  )
811
813
  # create env
812
- virtual_env_manager.create_env()
814
+ python_path = None
815
+ if not hasattr(sys, "_MEIPASS"):
816
+ # not in pyinstaller
817
+ # we specify python_path explicitly
818
+ # sometimes uv would find other versions.
819
+ python_path = pathlib.Path(sys.executable)
820
+ virtual_env_manager.create_env(python_path=python_path)
813
821
  return virtual_env_manager
814
822
 
815
823
  @classmethod
@@ -829,25 +837,18 @@ class WorkerActor(xo.StatelessActor):
829
837
  if hasattr(settings, k) and not getattr(settings, k):
830
838
  setattr(settings, k, v)
831
839
 
840
+ conf = dict(settings)
832
841
  packages = settings.packages
833
- index_url = settings.index_url
834
- extra_index_url = settings.extra_index_url
835
- find_links = settings.find_links
836
- trusted_host = settings.trusted_host
842
+ conf.pop("packages", None)
843
+ conf.pop("inherit_pip_config", None)
837
844
 
838
845
  logger.info(
839
- "Installing packages %s in virtual env %s, with settings(index_url=%s)",
846
+ "Installing packages %s in virtual env %s, with settings(%s)",
840
847
  packages,
841
848
  virtual_env_manager.env_path,
842
- index_url,
843
- )
844
- virtual_env_manager.install_packages(
845
- packages,
846
- index_url=index_url,
847
- extra_index_url=extra_index_url,
848
- find_links=find_links,
849
- trusted_host=trusted_host,
849
+ ", ".join([f"{k}={v}" for k, v in conf.items() if v]),
850
850
  )
851
+ virtual_env_manager.install_packages(packages, **conf)
851
852
 
852
853
  async def _get_progressor(self, request_id: str):
853
854
  from .progress_tracker import Progressor, ProgressTrackerActor