xinference 0.14.4.post1__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (149)
  1. xinference/_compat.py +51 -0
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +5 -39
  4. xinference/client/restful/restful_client.py +3 -24
  5. xinference/conftest.py +1 -1
  6. xinference/constants.py +5 -0
  7. xinference/core/cache_tracker.py +1 -1
  8. xinference/core/chat_interface.py +8 -14
  9. xinference/core/event.py +1 -1
  10. xinference/core/model.py +82 -31
  11. xinference/core/scheduler.py +37 -37
  12. xinference/core/status_guard.py +1 -1
  13. xinference/core/supervisor.py +11 -10
  14. xinference/core/utils.py +80 -22
  15. xinference/core/worker.py +17 -16
  16. xinference/deploy/cmdline.py +8 -16
  17. xinference/deploy/local.py +1 -1
  18. xinference/deploy/supervisor.py +1 -1
  19. xinference/deploy/utils.py +1 -1
  20. xinference/deploy/worker.py +1 -1
  21. xinference/model/audio/cosyvoice.py +86 -41
  22. xinference/model/embedding/core.py +52 -31
  23. xinference/model/image/stable_diffusion/core.py +18 -1
  24. xinference/model/llm/__init__.py +21 -11
  25. xinference/model/llm/llama_cpp/core.py +16 -33
  26. xinference/model/llm/llm_family.json +619 -1297
  27. xinference/model/llm/llm_family.py +31 -52
  28. xinference/model/llm/llm_family_csghub.json +18 -35
  29. xinference/model/llm/llm_family_modelscope.json +573 -1119
  30. xinference/model/llm/lmdeploy/core.py +56 -88
  31. xinference/model/llm/mlx/core.py +46 -69
  32. xinference/model/llm/sglang/core.py +33 -18
  33. xinference/model/llm/transformers/chatglm.py +167 -305
  34. xinference/model/llm/transformers/cogvlm2.py +36 -63
  35. xinference/model/llm/transformers/cogvlm2_video.py +33 -223
  36. xinference/model/llm/transformers/core.py +49 -50
  37. xinference/model/llm/transformers/deepseek_vl.py +53 -96
  38. xinference/model/llm/transformers/glm4v.py +55 -111
  39. xinference/model/llm/transformers/intern_vl.py +39 -70
  40. xinference/model/llm/transformers/internlm2.py +32 -54
  41. xinference/model/llm/transformers/minicpmv25.py +22 -55
  42. xinference/model/llm/transformers/minicpmv26.py +158 -68
  43. xinference/model/llm/transformers/omnilmm.py +5 -28
  44. xinference/model/llm/transformers/qwen2_vl.py +208 -0
  45. xinference/model/llm/transformers/qwen_vl.py +34 -86
  46. xinference/model/llm/transformers/utils.py +32 -38
  47. xinference/model/llm/transformers/yi_vl.py +32 -72
  48. xinference/model/llm/utils.py +195 -489
  49. xinference/model/llm/vllm/core.py +153 -100
  50. xinference/model/rerank/core.py +41 -8
  51. xinference/model/rerank/model_spec.json +7 -0
  52. xinference/model/rerank/model_spec_modelscope.json +7 -1
  53. xinference/model/utils.py +1 -31
  54. xinference/thirdparty/cosyvoice/bin/export_jit.py +64 -0
  55. xinference/thirdparty/cosyvoice/bin/export_trt.py +8 -0
  56. xinference/thirdparty/cosyvoice/bin/inference.py +5 -2
  57. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +38 -22
  58. xinference/thirdparty/cosyvoice/cli/model.py +139 -26
  59. xinference/thirdparty/cosyvoice/flow/flow.py +15 -9
  60. xinference/thirdparty/cosyvoice/flow/length_regulator.py +20 -1
  61. xinference/thirdparty/cosyvoice/hifigan/generator.py +8 -4
  62. xinference/thirdparty/cosyvoice/llm/llm.py +14 -13
  63. xinference/thirdparty/cosyvoice/transformer/attention.py +7 -3
  64. xinference/thirdparty/cosyvoice/transformer/decoder.py +1 -1
  65. xinference/thirdparty/cosyvoice/transformer/embedding.py +4 -3
  66. xinference/thirdparty/cosyvoice/transformer/encoder.py +4 -2
  67. xinference/thirdparty/cosyvoice/utils/common.py +36 -0
  68. xinference/thirdparty/cosyvoice/utils/file_utils.py +16 -0
  69. xinference/thirdparty/deepseek_vl/serve/assets/Kelpy-Codos.js +100 -0
  70. xinference/thirdparty/deepseek_vl/serve/assets/avatar.png +0 -0
  71. xinference/thirdparty/deepseek_vl/serve/assets/custom.css +355 -0
  72. xinference/thirdparty/deepseek_vl/serve/assets/custom.js +22 -0
  73. xinference/thirdparty/deepseek_vl/serve/assets/favicon.ico +0 -0
  74. xinference/thirdparty/deepseek_vl/serve/examples/app.png +0 -0
  75. xinference/thirdparty/deepseek_vl/serve/examples/chart.png +0 -0
  76. xinference/thirdparty/deepseek_vl/serve/examples/mirror.png +0 -0
  77. xinference/thirdparty/deepseek_vl/serve/examples/pipeline.png +0 -0
  78. xinference/thirdparty/deepseek_vl/serve/examples/puzzle.png +0 -0
  79. xinference/thirdparty/deepseek_vl/serve/examples/rap.jpeg +0 -0
  80. xinference/thirdparty/fish_speech/fish_speech/configs/base.yaml +87 -0
  81. xinference/thirdparty/fish_speech/fish_speech/configs/firefly_gan_vq.yaml +34 -0
  82. xinference/thirdparty/fish_speech/fish_speech/configs/lora/r_8_alpha_16.yaml +4 -0
  83. xinference/thirdparty/fish_speech/fish_speech/configs/text2semantic_finetune.yaml +83 -0
  84. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text-data.proto +24 -0
  85. xinference/thirdparty/fish_speech/fish_speech/i18n/README.md +27 -0
  86. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/.gitignore +114 -0
  87. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/README.md +36 -0
  88. xinference/thirdparty/fish_speech/fish_speech/webui/css/style.css +161 -0
  89. xinference/thirdparty/fish_speech/fish_speech/webui/html/footer.html +11 -0
  90. xinference/thirdparty/fish_speech/fish_speech/webui/js/animate.js +69 -0
  91. xinference/thirdparty/fish_speech/tools/sensevoice/README.md +59 -0
  92. xinference/thirdparty/matcha/VERSION +1 -0
  93. xinference/thirdparty/matcha/hifigan/LICENSE +21 -0
  94. xinference/thirdparty/matcha/hifigan/README.md +101 -0
  95. xinference/thirdparty/omnilmm/LICENSE +201 -0
  96. xinference/thirdparty/whisper/__init__.py +156 -0
  97. xinference/thirdparty/whisper/__main__.py +3 -0
  98. xinference/thirdparty/whisper/assets/gpt2.tiktoken +50256 -0
  99. xinference/thirdparty/whisper/assets/mel_filters.npz +0 -0
  100. xinference/thirdparty/whisper/assets/multilingual.tiktoken +50257 -0
  101. xinference/thirdparty/whisper/audio.py +157 -0
  102. xinference/thirdparty/whisper/decoding.py +826 -0
  103. xinference/thirdparty/whisper/model.py +314 -0
  104. xinference/thirdparty/whisper/normalizers/__init__.py +2 -0
  105. xinference/thirdparty/whisper/normalizers/basic.py +76 -0
  106. xinference/thirdparty/whisper/normalizers/english.json +1741 -0
  107. xinference/thirdparty/whisper/normalizers/english.py +550 -0
  108. xinference/thirdparty/whisper/timing.py +386 -0
  109. xinference/thirdparty/whisper/tokenizer.py +395 -0
  110. xinference/thirdparty/whisper/transcribe.py +605 -0
  111. xinference/thirdparty/whisper/triton_ops.py +109 -0
  112. xinference/thirdparty/whisper/utils.py +316 -0
  113. xinference/thirdparty/whisper/version.py +1 -0
  114. xinference/types.py +7 -49
  115. xinference/web/ui/build/asset-manifest.json +6 -6
  116. xinference/web/ui/build/index.html +1 -1
  117. xinference/web/ui/build/static/css/{main.4bafd904.css → main.632e9148.css} +2 -2
  118. xinference/web/ui/build/static/css/main.632e9148.css.map +1 -0
  119. xinference/web/ui/build/static/js/main.9cfafbd6.js +3 -0
  120. xinference/web/ui/build/static/js/{main.eb13fe95.js.LICENSE.txt → main.9cfafbd6.js.LICENSE.txt} +2 -0
  121. xinference/web/ui/build/static/js/main.9cfafbd6.js.map +1 -0
  122. xinference/web/ui/node_modules/.cache/babel-loader/01d6d198156bacbd436c51435edbd4b2cacd47a79db929105eba30f74b67d48d.json +1 -0
  123. xinference/web/ui/node_modules/.cache/babel-loader/10c69dc7a296779fcffedeff9393d832dfcb0013c36824adf623d3c518b801ff.json +1 -0
  124. xinference/web/ui/node_modules/.cache/babel-loader/59eb25f514afcc4fefd1b309d192b2455f1e0aec68a9de598ca4b2333fe2c774.json +1 -0
  125. xinference/web/ui/node_modules/.cache/babel-loader/68bede6d95bb5ef0b35bbb3ec5b8c937eaf6862c6cdbddb5ef222a7776aaf336.json +1 -0
  126. xinference/web/ui/node_modules/.cache/babel-loader/77d50223f3e734d4485cca538cb098a8c3a7a0a1a9f01f58cdda3af42fe1adf5.json +1 -0
  127. xinference/web/ui/node_modules/.cache/babel-loader/a56d5a642409a84988891089c98ca28ad0546432dfbae8aaa51bc5a280e1cdd2.json +1 -0
  128. xinference/web/ui/node_modules/.cache/babel-loader/d9ff696a3e3471f01b46c63d18af32e491eb5dc0e43cb30202c96871466df57f.json +1 -0
  129. xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +1 -0
  130. xinference/web/ui/node_modules/.package-lock.json +37 -0
  131. xinference/web/ui/node_modules/a-sync-waterfall/package.json +21 -0
  132. xinference/web/ui/node_modules/nunjucks/node_modules/commander/package.json +48 -0
  133. xinference/web/ui/node_modules/nunjucks/package.json +112 -0
  134. xinference/web/ui/package-lock.json +38 -0
  135. xinference/web/ui/package.json +1 -0
  136. {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/METADATA +8 -8
  137. {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/RECORD +141 -87
  138. xinference/model/llm/transformers/llama_2.py +0 -108
  139. xinference/web/ui/build/static/css/main.4bafd904.css.map +0 -1
  140. xinference/web/ui/build/static/js/main.eb13fe95.js +0 -3
  141. xinference/web/ui/build/static/js/main.eb13fe95.js.map +0 -1
  142. xinference/web/ui/node_modules/.cache/babel-loader/0b11a5339468c13b2d31ac085e7effe4303259b2071abd46a0a8eb8529233a5e.json +0 -1
  143. xinference/web/ui/node_modules/.cache/babel-loader/213b5913e164773c2b0567455377765715f5f07225fbac77ad8e1e9dc9648a47.json +0 -1
  144. xinference/web/ui/node_modules/.cache/babel-loader/5c26a23b5eacf5b752a08531577ae3840bb247745ef9a39583dc2d05ba93a82a.json +0 -1
  145. xinference/web/ui/node_modules/.cache/babel-loader/978b57d1a04a701bc3fcfebc511f5f274eed6ed7eade67f6fb76c27d5fd9ecc8.json +0 -1
  146. {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/LICENSE +0 -0
  147. {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/WHEEL +0 -0
  148. {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/entry_points.txt +0 -0
  149. {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/top_level.txt +0 -0
xinference/core/scheduler.py CHANGED
@@ -18,7 +18,7 @@ import logging
 import uuid
 from collections import deque
 from enum import Enum
-from typing import List, Optional, Set, Tuple
+from typing import Dict, List, Optional, Set, Tuple, Union

 import xoscar as xo

@@ -37,13 +37,24 @@ class AbortRequestMessage(Enum):


 class InferenceRequest:
-    def __init__(self, prompt, future_or_queue, is_prefill, *args, **kwargs):
-        # original prompt
-        self._prompt = prompt
+    def __init__(
+        self,
+        prompt_or_messages,
+        future_or_queue,
+        is_prefill,
+        call_ability,
+        *args,
+        **kwargs,
+    ):
+        # original prompt, prompt(str) for generate model and messages(List[Dict]) for chat model
+        self._prompt = prompt_or_messages
         # full prompt that contains chat history and applies chat template
         self._full_prompt = None
         # whether the current request is in the prefill phase
         self._is_prefill = is_prefill
+        # the ability that the user calls this model for, that is `generate` / `chat` for now,
+        # which is for results formatting
+        self._call_ability = call_ability
         # full prompt tokens
         self._prompt_tokens = None
         # all new generated tokens during decode phase
@@ -88,38 +99,22 @@ class InferenceRequest:
         self._check_args()

     def _check_args(self):
-        # chat
-        if len(self._inference_args) == 3:
-            # system prompt
-            assert self._inference_args[0] is None or isinstance(
-                self._inference_args[0], str
-            )
-            # chat history
-            assert self._inference_args[1] is None or isinstance(
-                self._inference_args[1], list
-            )
-            # generate config
-            assert self._inference_args[2] is None or isinstance(
-                self._inference_args[2], dict
-            )
-        else:  # generate
-            assert len(self._inference_args) == 1
-            # generate config
-            assert self._inference_args[0] is None or isinstance(
-                self._inference_args[0], dict
-            )
+        assert len(self._inference_args) == 1
+        # generate config
+        assert self._inference_args[0] is None or isinstance(
+            self._inference_args[0], dict
+        )

     @property
     def prompt(self):
+        """
+        prompt for generate model and messages for chat model
+        """
         return self._prompt

     @property
-    def system_prompt(self):
-        return self._inference_args[0]
-
-    @property
-    def chat_history(self):
-        return self._inference_args[1]
+    def call_ability(self):
+        return self._call_ability

     @property
     def full_prompt(self):
@@ -162,11 +157,7 @@ class InferenceRequest:

     @property
     def generate_config(self):
-        return (
-            self._inference_args[2]
-            if len(self._inference_args) == 3
-            else self._inference_args[0]
-        )
+        return self._inference_args[0]

     @property
     def sanitized_generate_config(self):
@@ -423,8 +414,17 @@ class SchedulerActor(xo.StatelessActor):

         self._empty_cache()

-    async def add_request(self, prompt: str, future_or_queue, *args, **kwargs):
-        req = InferenceRequest(prompt, future_or_queue, True, *args, **kwargs)
+    async def add_request(
+        self,
+        prompt_or_messages: Union[str, List[Dict]],
+        future_or_queue,
+        call_ability,
+        *args,
+        **kwargs,
+    ):
+        req = InferenceRequest(
+            prompt_or_messages, future_or_queue, True, call_ability, *args, **kwargs
+        )
         rid = req.request_id
         if rid is not None:
             if rid in self._id_to_req:
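The net effect of the scheduler changes: one entry point for both abilities, with `call_ability` telling the request how to format results. A minimal sketch of a caller, assuming a scheduler reference obtained elsewhere; `submit` and its names are illustrative, not part of this release:

    import asyncio
    from typing import Dict, List, Union

    async def submit(
        scheduler_ref, prompt_or_messages: Union[str, List[Dict]], call_ability: str
    ) -> asyncio.Queue:
        # one queue per request; the scheduler streams results into it
        queue: asyncio.Queue = asyncio.Queue()
        # the generate config travels as the single remaining positional inference arg
        await scheduler_ref.add_request(
            prompt_or_messages, queue, call_ability, {"max_tokens": 64}
        )
        return queue

    # generate ability: a raw prompt string
    #   await submit(ref, "Once upon a time", "generate")
    # chat ability: an OpenAI-style message list
    #   await submit(ref, [{"role": "user", "content": "Hi"}], "chat")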
xinference/core/status_guard.py CHANGED
@@ -51,7 +51,7 @@ class StatusGuardActor(xo.StatelessActor):
         self._model_uid_to_info: Dict[str, InstanceInfo] = {}  # type: ignore

     @classmethod
-    def uid(cls) -> str:
+    def default_uid(cls) -> str:
         return "status_guard"

     @staticmethod
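The same `uid()` → `default_uid()` rename recurs for every actor in this release; plausibly it frees the name `uid` so the classmethod no longer shadows the per-instance uid that xoscar actors and actor refs carry. The pattern, with a made-up actor (`MyActor` is illustrative, not an xinference class):

    import xoscar as xo

    class MyActor(xo.StatelessActor):
        @classmethod
        def default_uid(cls) -> str:
            # a well-known name so other processes can look this actor up
            return "my_actor"

    # creation and lookup both reference the classmethod, e.g.:
    #   await xo.create_actor(MyActor, address=addr, uid=MyActor.default_uid())
    #   ref = await xo.actor_ref(address=addr, uid=MyActor.default_uid())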
xinference/core/supervisor.py CHANGED
@@ -105,7 +105,7 @@ class SupervisorActor(xo.StatelessActor):
         self._lock = asyncio.Lock()

     @classmethod
-    def uid(cls) -> str:
+    def default_uid(cls) -> str:
         return "supervisor"

     def _get_worker_ref_by_ip(
@@ -135,12 +135,12 @@ class SupervisorActor(xo.StatelessActor):
         self._status_guard_ref: xo.ActorRefType[  # type: ignore
             "StatusGuardActor"
         ] = await xo.create_actor(
-            StatusGuardActor, address=self.address, uid=StatusGuardActor.uid()
+            StatusGuardActor, address=self.address, uid=StatusGuardActor.default_uid()
         )
         self._cache_tracker_ref: xo.ActorRefType[  # type: ignore
             "CacheTrackerActor"
         ] = await xo.create_actor(
-            CacheTrackerActor, address=self.address, uid=CacheTrackerActor.uid()
+            CacheTrackerActor, address=self.address, uid=CacheTrackerActor.default_uid()
         )

         from .event import EventCollectorActor
@@ -148,7 +148,9 @@ class SupervisorActor(xo.StatelessActor):
         self._event_collector_ref: xo.ActorRefType[  # type: ignore
             EventCollectorActor
         ] = await xo.create_actor(
-            EventCollectorActor, address=self.address, uid=EventCollectorActor.uid()
+            EventCollectorActor,
+            address=self.address,
+            uid=EventCollectorActor.default_uid(),
         )

         from ..model.audio import (
@@ -308,10 +310,7 @@
     async def get_builtin_prompts() -> Dict[str, Any]:
         from ..model.llm.llm_family import BUILTIN_LLM_PROMPT_STYLE

-        data = {}
-        for k, v in BUILTIN_LLM_PROMPT_STYLE.items():
-            data[k] = v.dict()
-        return data
+        return {k: v for k, v in BUILTIN_LLM_PROMPT_STYLE.items()}

     @staticmethod
     async def get_builtin_families() -> Dict[str, List[str]]:
@@ -1028,7 +1027,7 @@
         else:
             task = asyncio.create_task(_launch_model())
             ASYNC_LAUNCH_TASKS[model_uid] = task
-            task.add_done_callback(lambda _: callback_for_async_launch(model_uid))
+            task.add_done_callback(lambda _: callback_for_async_launch(model_uid))  # type: ignore
         return model_uid

     async def get_instance_info(
@@ -1233,7 +1232,9 @@
             worker_address not in self._worker_address_to_worker
         ), f"Worker {worker_address} exists"

-        worker_ref = await xo.actor_ref(address=worker_address, uid=WorkerActor.uid())
+        worker_ref = await xo.actor_ref(
+            address=worker_address, uid=WorkerActor.default_uid()
+        )
         self._worker_address_to_worker[worker_address] = worker_ref
         logger.debug("Worker %s has been added successfully", worker_address)

xinference/core/utils.py CHANGED
@@ -11,62 +11,120 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import copy
 import logging
 import os
 import random
 import string
-from typing import Dict, Generator, List, Tuple, Union
+import uuid
+from typing import Dict, Generator, List, Optional, Tuple, Union

 import orjson
 from pynvml import nvmlDeviceGetCount, nvmlInit, nvmlShutdown

 from .._compat import BaseModel
+from ..constants import XINFERENCE_LOG_ARG_MAX_LENGTH

 logger = logging.getLogger(__name__)


-def log_async(logger, args_formatter=None):
+def truncate_log_arg(arg) -> str:
+    s = str(arg)
+    if len(s) > XINFERENCE_LOG_ARG_MAX_LENGTH:
+        s = s[0:XINFERENCE_LOG_ARG_MAX_LENGTH] + "..."
+    return s
+
+
+def log_async(
+    logger,
+    level=logging.DEBUG,
+    ignore_kwargs: Optional[List[str]] = None,
+    log_exception=True,
+):
     import time
     from functools import wraps

     def decorator(func):
+        func_name = func.__name__
+
         @wraps(func)
         async def wrapped(*args, **kwargs):
-            if args_formatter is not None:
-                formatted_args, formatted_kwargs = copy.copy(args), copy.copy(kwargs)
-                args_formatter(formatted_args, formatted_kwargs)
-            else:
-                formatted_args, formatted_kwargs = args, kwargs
-            logger.debug(
-                f"Enter {func.__name__}, args: {formatted_args}, kwargs: {formatted_kwargs}"
+            request_id_str = kwargs.get("request_id", "")
+            if not request_id_str:
+                request_id_str = uuid.uuid1()
+            request_id_str = f"[request {request_id_str}]"
+            formatted_args = ",".join(map(truncate_log_arg, args))
+            formatted_kwargs = ",".join(
+                [
+                    "%s=%s" % (k, truncate_log_arg(v))
+                    for k, v in kwargs.items()
+                    if ignore_kwargs is None or k not in ignore_kwargs
+                ]
             )
-            start = time.time()
-            ret = await func(*args, **kwargs)
-            logger.debug(
-                f"Leave {func.__name__}, elapsed time: {int(time.time() - start)} s"
+            logger.log(
+                level,
+                f"{request_id_str} Enter {func_name}, args: {formatted_args}, kwargs: {formatted_kwargs}",
             )
-            return ret
+            start = time.time()
+            try:
+                ret = await func(*args, **kwargs)
+                logger.log(
+                    level,
+                    f"{request_id_str} Leave {func_name}, elapsed time: {int(time.time() - start)} s",
+                )
+                return ret
+            except Exception as e:
+                if log_exception:
+                    logger.error(
+                        f"{request_id_str} Leave {func_name}, error: {e}, elapsed time: {int(time.time() - start)} s",
+                        exc_info=True,
+                    )
+                else:
+                    logger.log(
+                        level,
+                        f"{request_id_str} Leave {func_name}, error: {e}, elapsed time: {int(time.time() - start)} s",
+                    )
+                raise

         return wrapped

     return decorator


-def log_sync(logger):
+def log_sync(logger, level=logging.DEBUG, log_exception=True):
     import time
     from functools import wraps

     def decorator(func):
         @wraps(func)
         def wrapped(*args, **kwargs):
-            logger.debug(f"Enter {func.__name__}, args: {args}, kwargs: {kwargs}")
-            start = time.time()
-            ret = func(*args, **kwargs)
-            logger.debug(
-                f"Leave {func.__name__}, elapsed time: {int(time.time() - start)} s"
+            formatted_args = ",".join(map(truncate_log_arg, args))
+            formatted_kwargs = ",".join(
+                map(lambda x: "%s=%s" % (x[0], truncate_log_arg(x[1])), kwargs.items())
             )
-            return ret
+            logger.log(
+                level,
+                f"Enter {func.__name__}, args: {formatted_args}, kwargs: {formatted_kwargs}",
+            )
+            start = time.time()
+            try:
+                ret = func(*args, **kwargs)
+                logger.log(
+                    level,
+                    f"Leave {func.__name__}, elapsed time: {int(time.time() - start)} s",
+                )
+                return ret
+            except Exception as e:
+                if log_exception:
+                    logger.error(
+                        f"Leave {func.__name__}, error: {e}, elapsed time: {int(time.time() - start)} s",
+                        exc_info=True,
+                    )
+                else:
+                    logger.log(
+                        level,
+                        f"Leave {func.__name__}, error: {e}, elapsed time: {int(time.time() - start)} s",
+                    )
+                raise

         return wrapped
xinference/core/worker.py CHANGED
@@ -13,6 +13,7 @@
 # limitations under the License.

 import asyncio
+import logging
 import os
 import platform
 import queue
@@ -73,15 +74,15 @@ class WorkerActor(xo.StatelessActor):
         self._supervisor_ref: Optional[xo.ActorRefType] = None
         self._main_pool = main_pool
         self._main_pool.recover_sub_pool = self.recover_sub_pool
-        self._status_guard_ref: xo.ActorRefType["StatusGuardActor"] = (  # type: ignore
-            None
-        )
+        self._status_guard_ref: xo.ActorRefType[
+            "StatusGuardActor"
+        ] = None  # type: ignore
         self._event_collector_ref: xo.ActorRefType[  # type: ignore
             EventCollectorActor
         ] = None
-        self._cache_tracker_ref: xo.ActorRefType[CacheTrackerActor] = (  # type: ignore
-            None
-        )
+        self._cache_tracker_ref: xo.ActorRefType[
+            CacheTrackerActor
+        ] = None  # type: ignore

         # internal states.
         # temporary placeholder during model launch process:
@@ -185,7 +186,7 @@ class WorkerActor(xo.StatelessActor):
                 break

     @classmethod
-    def uid(cls) -> str:
+    def default_uid(cls) -> str:
         return "worker"

     async def __post_create__(self):
@@ -270,9 +271,9 @@ class WorkerActor(xo.StatelessActor):

         try:
             await self.get_supervisor_ref(add_worker=True)
-        except Exception as e:
+        except Exception:
             # Do not crash the worker if supervisor is down, auto re-connect later
-            logger.error(f"cannot connect to supervisor {e}")
+            logger.error(f"cannot connect to supervisor", exc_info=True)

         if not XINFERENCE_DISABLE_HEALTH_CHECK:
             from ..isolation import Isolation
@@ -324,7 +325,7 @@ class WorkerActor(xo.StatelessActor):
         if self._supervisor_ref is not None:
             return self._supervisor_ref
         supervisor_ref = await xo.actor_ref(  # type: ignore
-            address=self._supervisor_address, uid=SupervisorActor.uid()
+            address=self._supervisor_address, uid=SupervisorActor.default_uid()
         )
         # Prevent concurrent operations leads to double initialization, check again.
         if self._supervisor_ref is not None:
@@ -336,13 +337,13 @@ class WorkerActor(xo.StatelessActor):
         logger.info("Connected to supervisor as a fresh worker")

         self._status_guard_ref = await xo.actor_ref(
-            address=self._supervisor_address, uid=StatusGuardActor.uid()
+            address=self._supervisor_address, uid=StatusGuardActor.default_uid()
         )
         self._event_collector_ref = await xo.actor_ref(
-            address=self._supervisor_address, uid=EventCollectorActor.uid()
+            address=self._supervisor_address, uid=EventCollectorActor.default_uid()
         )
         self._cache_tracker_ref = await xo.actor_ref(
-            address=self._supervisor_address, uid=CacheTrackerActor.uid()
+            address=self._supervisor_address, uid=CacheTrackerActor.default_uid()
         )
         # cache_tracker is on supervisor
         from ..model.audio import get_audio_model_descriptions
@@ -770,7 +771,7 @@ class WorkerActor(xo.StatelessActor):
             version_info["model_file_location"],
         )

-    @log_async(logger=logger)
+    @log_async(logger=logger, level=logging.INFO)
     async def launch_builtin_model(
         self,
         model_uid: str,
@@ -814,7 +815,7 @@ class WorkerActor(xo.StatelessActor):
             )
         except Exception as e:
             # Report callback error can be log and ignore, should not interrupt the Process
-            logger.error("report_event error: %s" % (e))
+            logger.error("report_event error: %s" % (e), exc_info=True)

         if gpu_idx is not None:
             logger.info(
@@ -917,7 +918,7 @@ class WorkerActor(xo.StatelessActor):
             {"model_ability": abilities, "status": LaunchStatus.READY.name},
         )

-    @log_async(logger=logger)
+    @log_async(logger=logger, level=logging.INFO)
     async def terminate_model(self, model_uid: str, is_model_die=False):
         # Terminate model while its launching is not allow
         if model_uid in self._model_uid_launching_guard:
xinference/deploy/cmdline.py CHANGED
@@ -17,7 +17,7 @@ import logging
 import os
 import sys
 import warnings
-from typing import List, Optional, Sequence, Tuple, Union
+from typing import Dict, List, Optional, Sequence, Tuple, Union

 import click
 from xoscar.utils import get_next_port
@@ -38,7 +38,6 @@ from ..constants import (
     XINFERENCE_LOG_MAX_BYTES,
 )
 from ..isolation import Isolation
-from ..types import ChatCompletionMessage
 from .utils import (
     get_config_dict,
     get_log_file,
@@ -1210,13 +1209,12 @@ def model_chat(
     stream: bool,
     api_key: Optional[str],
 ):
-    # TODO: chat model roles may not be user and assistant.
     endpoint = get_endpoint(endpoint)
     client = RESTfulClient(base_url=endpoint, api_key=api_key)
     if api_key is None:
         client._set_token(get_stored_token(endpoint, client))

-    chat_history: "List[ChatCompletionMessage]" = []
+    messages: List[Dict] = []
     if stream:
         # TODO: when stream=True, RestfulClient cannot generate words one by one.
         # So use Client in temporary. The implementation needs to be changed to
@@ -1229,10 +1227,10 @@ def model_chat(
             if prompt == "":
                 break
             print("Assistant: ", end="", file=sys.stdout)
+            messages.append(dict(role="user", content=prompt))
             response_content = ""
             for chunk in model.chat(
-                prompt=prompt,
-                chat_history=chat_history,
+                messages,
                 generate_config={"stream": stream, "max_tokens": max_tokens},
             ):
                 delta = chunk["choices"][0]["delta"]
@@ -1242,10 +1240,7 @@ def model_chat(
                 response_content += delta["content"]
                 print(delta["content"], end="", flush=True, file=sys.stdout)
             print("", file=sys.stdout)
-            chat_history.append(ChatCompletionMessage(role="user", content=prompt))
-            chat_history.append(
-                ChatCompletionMessage(role="assistant", content=response_content)
-            )
+            messages.append(dict(role="assistant", content=response_content))

         model = client.get_model(model_uid=model_uid)

@@ -1274,20 +1269,17 @@ def model_chat(
         prompt = input("User: ")
         if prompt == "":
             break
-        chat_history.append(ChatCompletionMessage(role="user", content=prompt))
+        messages.append({"role": "user", "content": prompt})
         print("Assistant: ", end="", file=sys.stdout)
         response = restful_model.chat(
-            prompt=prompt,
-            chat_history=chat_history,
+            messages,
             generate_config={"stream": stream, "max_tokens": max_tokens},
         )
         if not isinstance(response, dict):
             raise ValueError("chat result is not valid")
         response_content = response["choices"][0]["message"]["content"]
         print(f"{response_content}\n", file=sys.stdout)
-        chat_history.append(
-            ChatCompletionMessage(role="assistant", content=response_content)
-        )
+        messages.append(dict(role="assistant", content=response_content))


 @cli.command("vllm-models", help="Query and display models compatible with vLLM.")
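For the non-streaming path the whole client-side migration fits in a few lines: build one OpenAI-style message list and pass it positionally, where 0.14.x took separate `prompt=` and `chat_history=` arguments. Endpoint and model uid below are placeholders:

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")
    model = client.get_model("my-llm-uid")  # placeholder uid

    messages = [{"role": "user", "content": "What is the capital of France?"}]
    response = model.chat(messages, generate_config={"max_tokens": 128})
    print(response["choices"][0]["message"]["content"])
    # carry the turn forward in the same list instead of a separate chat_history
    messages.append(response["choices"][0]["message"])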
xinference/deploy/local.py CHANGED
@@ -49,7 +49,7 @@ async def _start_local_cluster(
         address=address, logging_conf=logging_conf
     )
     await xo.create_actor(
-        SupervisorActor, address=address, uid=SupervisorActor.uid()
+        SupervisorActor, address=address, uid=SupervisorActor.default_uid()
     )
     await start_worker_components(
         address=address,
xinference/deploy/supervisor.py CHANGED
@@ -41,7 +41,7 @@ async def _start_supervisor(address: str, logging_conf: Optional[Dict] = None):
            address=address, n_process=0, logging_conf={"dict": logging_conf}
        )
        await xo.create_actor(
-            SupervisorActor, address=address, uid=SupervisorActor.uid()
+            SupervisorActor, address=address, uid=SupervisorActor.default_uid()
        )
        await pool.join()
    except asyncio.exceptions.CancelledError:
xinference/deploy/utils.py CHANGED
@@ -167,7 +167,7 @@ def health_check(address: str, max_attempts: int, sleep_interval: int = 3) -> bool:
         from ..core.supervisor import SupervisorActor

         supervisor_ref: xo.ActorRefType[SupervisorActor] = await xo.actor_ref(  # type: ignore
-            address=address, uid=SupervisorActor.uid()
+            address=address, uid=SupervisorActor.default_uid()
         )

         await supervisor_ref.get_status()
xinference/deploy/worker.py CHANGED
@@ -43,7 +43,7 @@ async def start_worker_components(
     await xo.create_actor(
         WorkerActor,
         address=address,
-        uid=WorkerActor.uid(),
+        uid=WorkerActor.default_uid(),
         supervisor_address=supervisor_address,
         main_pool=main_pool,
         gpu_devices=gpu_device_indices,