xinference 0.14.4.post1__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference has been flagged as potentially problematic.
- xinference/_compat.py +51 -0
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +5 -39
- xinference/client/restful/restful_client.py +3 -24
- xinference/conftest.py +1 -1
- xinference/constants.py +5 -0
- xinference/core/cache_tracker.py +1 -1
- xinference/core/chat_interface.py +8 -14
- xinference/core/event.py +1 -1
- xinference/core/model.py +82 -31
- xinference/core/scheduler.py +37 -37
- xinference/core/status_guard.py +1 -1
- xinference/core/supervisor.py +11 -10
- xinference/core/utils.py +80 -22
- xinference/core/worker.py +17 -16
- xinference/deploy/cmdline.py +8 -16
- xinference/deploy/local.py +1 -1
- xinference/deploy/supervisor.py +1 -1
- xinference/deploy/utils.py +1 -1
- xinference/deploy/worker.py +1 -1
- xinference/model/audio/cosyvoice.py +86 -41
- xinference/model/embedding/core.py +52 -31
- xinference/model/image/stable_diffusion/core.py +18 -1
- xinference/model/llm/__init__.py +21 -11
- xinference/model/llm/llama_cpp/core.py +16 -33
- xinference/model/llm/llm_family.json +619 -1297
- xinference/model/llm/llm_family.py +31 -52
- xinference/model/llm/llm_family_csghub.json +18 -35
- xinference/model/llm/llm_family_modelscope.json +573 -1119
- xinference/model/llm/lmdeploy/core.py +56 -88
- xinference/model/llm/mlx/core.py +46 -69
- xinference/model/llm/sglang/core.py +33 -18
- xinference/model/llm/transformers/chatglm.py +167 -305
- xinference/model/llm/transformers/cogvlm2.py +36 -63
- xinference/model/llm/transformers/cogvlm2_video.py +33 -223
- xinference/model/llm/transformers/core.py +49 -50
- xinference/model/llm/transformers/deepseek_vl.py +53 -96
- xinference/model/llm/transformers/glm4v.py +55 -111
- xinference/model/llm/transformers/intern_vl.py +39 -70
- xinference/model/llm/transformers/internlm2.py +32 -54
- xinference/model/llm/transformers/minicpmv25.py +22 -55
- xinference/model/llm/transformers/minicpmv26.py +158 -68
- xinference/model/llm/transformers/omnilmm.py +5 -28
- xinference/model/llm/transformers/qwen2_vl.py +208 -0
- xinference/model/llm/transformers/qwen_vl.py +34 -86
- xinference/model/llm/transformers/utils.py +32 -38
- xinference/model/llm/transformers/yi_vl.py +32 -72
- xinference/model/llm/utils.py +195 -489
- xinference/model/llm/vllm/core.py +153 -100
- xinference/model/rerank/core.py +41 -8
- xinference/model/rerank/model_spec.json +7 -0
- xinference/model/rerank/model_spec_modelscope.json +7 -1
- xinference/model/utils.py +1 -31
- xinference/thirdparty/cosyvoice/bin/export_jit.py +64 -0
- xinference/thirdparty/cosyvoice/bin/export_trt.py +8 -0
- xinference/thirdparty/cosyvoice/bin/inference.py +5 -2
- xinference/thirdparty/cosyvoice/cli/cosyvoice.py +38 -22
- xinference/thirdparty/cosyvoice/cli/model.py +139 -26
- xinference/thirdparty/cosyvoice/flow/flow.py +15 -9
- xinference/thirdparty/cosyvoice/flow/length_regulator.py +20 -1
- xinference/thirdparty/cosyvoice/hifigan/generator.py +8 -4
- xinference/thirdparty/cosyvoice/llm/llm.py +14 -13
- xinference/thirdparty/cosyvoice/transformer/attention.py +7 -3
- xinference/thirdparty/cosyvoice/transformer/decoder.py +1 -1
- xinference/thirdparty/cosyvoice/transformer/embedding.py +4 -3
- xinference/thirdparty/cosyvoice/transformer/encoder.py +4 -2
- xinference/thirdparty/cosyvoice/utils/common.py +36 -0
- xinference/thirdparty/cosyvoice/utils/file_utils.py +16 -0
- xinference/thirdparty/deepseek_vl/serve/assets/Kelpy-Codos.js +100 -0
- xinference/thirdparty/deepseek_vl/serve/assets/avatar.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/assets/custom.css +355 -0
- xinference/thirdparty/deepseek_vl/serve/assets/custom.js +22 -0
- xinference/thirdparty/deepseek_vl/serve/assets/favicon.ico +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/app.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/chart.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/mirror.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/pipeline.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/puzzle.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/rap.jpeg +0 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/base.yaml +87 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/firefly_gan_vq.yaml +34 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/lora/r_8_alpha_16.yaml +4 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/text2semantic_finetune.yaml +83 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text-data.proto +24 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/README.md +27 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/.gitignore +114 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/README.md +36 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/css/style.css +161 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/html/footer.html +11 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/js/animate.js +69 -0
- xinference/thirdparty/fish_speech/tools/sensevoice/README.md +59 -0
- xinference/thirdparty/matcha/VERSION +1 -0
- xinference/thirdparty/matcha/hifigan/LICENSE +21 -0
- xinference/thirdparty/matcha/hifigan/README.md +101 -0
- xinference/thirdparty/omnilmm/LICENSE +201 -0
- xinference/thirdparty/whisper/__init__.py +156 -0
- xinference/thirdparty/whisper/__main__.py +3 -0
- xinference/thirdparty/whisper/assets/gpt2.tiktoken +50256 -0
- xinference/thirdparty/whisper/assets/mel_filters.npz +0 -0
- xinference/thirdparty/whisper/assets/multilingual.tiktoken +50257 -0
- xinference/thirdparty/whisper/audio.py +157 -0
- xinference/thirdparty/whisper/decoding.py +826 -0
- xinference/thirdparty/whisper/model.py +314 -0
- xinference/thirdparty/whisper/normalizers/__init__.py +2 -0
- xinference/thirdparty/whisper/normalizers/basic.py +76 -0
- xinference/thirdparty/whisper/normalizers/english.json +1741 -0
- xinference/thirdparty/whisper/normalizers/english.py +550 -0
- xinference/thirdparty/whisper/timing.py +386 -0
- xinference/thirdparty/whisper/tokenizer.py +395 -0
- xinference/thirdparty/whisper/transcribe.py +605 -0
- xinference/thirdparty/whisper/triton_ops.py +109 -0
- xinference/thirdparty/whisper/utils.py +316 -0
- xinference/thirdparty/whisper/version.py +1 -0
- xinference/types.py +7 -49
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/{main.4bafd904.css → main.632e9148.css} +2 -2
- xinference/web/ui/build/static/css/main.632e9148.css.map +1 -0
- xinference/web/ui/build/static/js/main.9cfafbd6.js +3 -0
- xinference/web/ui/build/static/js/{main.eb13fe95.js.LICENSE.txt → main.9cfafbd6.js.LICENSE.txt} +2 -0
- xinference/web/ui/build/static/js/main.9cfafbd6.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/01d6d198156bacbd436c51435edbd4b2cacd47a79db929105eba30f74b67d48d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/10c69dc7a296779fcffedeff9393d832dfcb0013c36824adf623d3c518b801ff.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/59eb25f514afcc4fefd1b309d192b2455f1e0aec68a9de598ca4b2333fe2c774.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/68bede6d95bb5ef0b35bbb3ec5b8c937eaf6862c6cdbddb5ef222a7776aaf336.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/77d50223f3e734d4485cca538cb098a8c3a7a0a1a9f01f58cdda3af42fe1adf5.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a56d5a642409a84988891089c98ca28ad0546432dfbae8aaa51bc5a280e1cdd2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d9ff696a3e3471f01b46c63d18af32e491eb5dc0e43cb30202c96871466df57f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +1 -0
- xinference/web/ui/node_modules/.package-lock.json +37 -0
- xinference/web/ui/node_modules/a-sync-waterfall/package.json +21 -0
- xinference/web/ui/node_modules/nunjucks/node_modules/commander/package.json +48 -0
- xinference/web/ui/node_modules/nunjucks/package.json +112 -0
- xinference/web/ui/package-lock.json +38 -0
- xinference/web/ui/package.json +1 -0
- {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/METADATA +8 -8
- {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/RECORD +141 -87
- xinference/model/llm/transformers/llama_2.py +0 -108
- xinference/web/ui/build/static/css/main.4bafd904.css.map +0 -1
- xinference/web/ui/build/static/js/main.eb13fe95.js +0 -3
- xinference/web/ui/build/static/js/main.eb13fe95.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0b11a5339468c13b2d31ac085e7effe4303259b2071abd46a0a8eb8529233a5e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/213b5913e164773c2b0567455377765715f5f07225fbac77ad8e1e9dc9648a47.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5c26a23b5eacf5b752a08531577ae3840bb247745ef9a39583dc2d05ba93a82a.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/978b57d1a04a701bc3fcfebc511f5f274eed6ed7eade67f6fb76c27d5fd9ecc8.json +0 -1
- {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/LICENSE +0 -0
- {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/WHEEL +0 -0
- {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/entry_points.txt +0 -0
- {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/top_level.txt +0 -0
xinference/core/scheduler.py
CHANGED
@@ -18,7 +18,7 @@ import logging
 import uuid
 from collections import deque
 from enum import Enum
-from typing import List, Optional, Set, Tuple
+from typing import Dict, List, Optional, Set, Tuple, Union
 
 import xoscar as xo
 
@@ -37,13 +37,24 @@ class AbortRequestMessage(Enum):
 
 
 class InferenceRequest:
-    def __init__(self, prompt, future_or_queue, is_prefill, *args, **kwargs):
-        # original prompt
-        self._prompt = prompt
+    def __init__(
+        self,
+        prompt_or_messages,
+        future_or_queue,
+        is_prefill,
+        call_ability,
+        *args,
+        **kwargs,
+    ):
+        # original prompt, prompt(str) for generate model and messages(List[Dict]) for chat model
+        self._prompt = prompt_or_messages
         # full prompt that contains chat history and applies chat template
         self._full_prompt = None
         # whether the current request is in the prefill phase
         self._is_prefill = is_prefill
+        # the ability that the user calls this model for, that is `generate` / `chat` for now,
+        # which is for results formatting
+        self._call_ability = call_ability
         # full prompt tokens
         self._prompt_tokens = None
         # all new generated tokens during decode phase
@@ -88,38 +99,22 @@ class InferenceRequest:
         self._check_args()
 
     def _check_args(self):
-        …
-        …
-        …
-        …
-        …
-            )
-            # chat history
-            assert self._inference_args[1] is None or isinstance(
-                self._inference_args[1], list
-            )
-            # generate config
-            assert self._inference_args[2] is None or isinstance(
-                self._inference_args[2], dict
-            )
-        else:  # generate
-            assert len(self._inference_args) == 1
-            # generate config
-            assert self._inference_args[0] is None or isinstance(
-                self._inference_args[0], dict
-            )
+        assert len(self._inference_args) == 1
+        # generate config
+        assert self._inference_args[0] is None or isinstance(
+            self._inference_args[0], dict
+        )
 
     @property
     def prompt(self):
+        """
+        prompt for generate model and messages for chat model
+        """
         return self._prompt
 
     @property
-    def system_prompt(self):
-        return self._inference_args[0]
-
-    @property
-    def chat_history(self):
-        return self._inference_args[1]
+    def call_ability(self):
+        return self._call_ability
 
     @property
     def full_prompt(self):
@@ -162,11 +157,7 @@ class InferenceRequest:
 
     @property
     def generate_config(self):
-        return (
-            self._inference_args[2]
-            if len(self._inference_args) == 3
-            else self._inference_args[0]
-        )
+        return self._inference_args[0]
 
     @property
     def sanitized_generate_config(self):
@@ -423,8 +414,17 @@ class SchedulerActor(xo.StatelessActor):
 
         self._empty_cache()
 
-    async def add_request(self, prompt: str, future_or_queue, *args, **kwargs):
-        req = InferenceRequest(prompt, future_or_queue, True, *args, **kwargs)
+    async def add_request(
+        self,
+        prompt_or_messages: Union[str, List[Dict]],
+        future_or_queue,
+        call_ability,
+        *args,
+        **kwargs,
+    ):
+        req = InferenceRequest(
+            prompt_or_messages, future_or_queue, True, call_ability, *args, **kwargs
+        )
         rid = req.request_id
         if rid is not None:
             if rid in self._id_to_req:
xinference/core/status_guard.py
CHANGED
xinference/core/supervisor.py
CHANGED
@@ -105,7 +105,7 @@ class SupervisorActor(xo.StatelessActor):
         self._lock = asyncio.Lock()
 
     @classmethod
-    def uid(cls) -> str:
+    def default_uid(cls) -> str:
         return "supervisor"
 
     def _get_worker_ref_by_ip(
@@ -135,12 +135,12 @@ class SupervisorActor(xo.StatelessActor):
         self._status_guard_ref: xo.ActorRefType[  # type: ignore
             "StatusGuardActor"
         ] = await xo.create_actor(
-            StatusGuardActor, address=self.address, uid=StatusGuardActor.uid()
+            StatusGuardActor, address=self.address, uid=StatusGuardActor.default_uid()
         )
         self._cache_tracker_ref: xo.ActorRefType[  # type: ignore
             "CacheTrackerActor"
         ] = await xo.create_actor(
-            CacheTrackerActor, address=self.address, uid=CacheTrackerActor.uid()
+            CacheTrackerActor, address=self.address, uid=CacheTrackerActor.default_uid()
         )
 
         from .event import EventCollectorActor
@@ -148,7 +148,9 @@ class SupervisorActor(xo.StatelessActor):
         self._event_collector_ref: xo.ActorRefType[  # type: ignore
             EventCollectorActor
         ] = await xo.create_actor(
-            EventCollectorActor, address=self.address, uid=EventCollectorActor.uid()
+            EventCollectorActor,
+            address=self.address,
+            uid=EventCollectorActor.default_uid(),
         )
 
         from ..model.audio import (
@@ -308,10 +310,7 @@ class SupervisorActor(xo.StatelessActor):
         async def get_builtin_prompts() -> Dict[str, Any]:
             from ..model.llm.llm_family import BUILTIN_LLM_PROMPT_STYLE
 
-            data = {}
-            for k, v in BUILTIN_LLM_PROMPT_STYLE.items():
-                data[k] = v.dict()
-            return data
+            return {k: v for k, v in BUILTIN_LLM_PROMPT_STYLE.items()}
 
     @staticmethod
     async def get_builtin_families() -> Dict[str, List[str]]:
@@ -1028,7 +1027,7 @@ class SupervisorActor(xo.StatelessActor):
         else:
             task = asyncio.create_task(_launch_model())
             ASYNC_LAUNCH_TASKS[model_uid] = task
-            task.add_done_callback(lambda _: callback_for_async_launch(model_uid))
+            task.add_done_callback(lambda _: callback_for_async_launch(model_uid))  # type: ignore
         return model_uid
 
     async def get_instance_info(
@@ -1233,7 +1232,9 @@ class SupervisorActor(xo.StatelessActor):
             worker_address not in self._worker_address_to_worker
         ), f"Worker {worker_address} exists"
 
-        worker_ref = await xo.actor_ref(address=worker_address, uid=WorkerActor.uid())
+        worker_ref = await xo.actor_ref(
+            address=worker_address, uid=WorkerActor.default_uid()
+        )
         self._worker_address_to_worker[worker_address] = worker_ref
         logger.debug("Worker %s has been added successfully", worker_address)
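
The recurring edit in this file (and in worker.py and the deploy modules below) is the rename of each actor's well-known-uid classmethod from uid() to default_uid(); plausibly this avoids shadowing the uid attribute that xoscar actors already carry, though the diff itself does not state a motivation. A sketch of resolving the supervisor under the new name, assuming a cluster is already listening at the given address:

import asyncio

import xoscar as xo

from xinference.core.supervisor import SupervisorActor


async def get_supervisor(address: str):
    # 0.15.0 exposes default_uid(); 0.14.x used SupervisorActor.uid()
    return await xo.actor_ref(address=address, uid=SupervisorActor.default_uid())


# e.g. ref = asyncio.run(get_supervisor("127.0.0.1:9999"))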
xinference/core/utils.py
CHANGED
@@ -11,62 +11,120 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import copy
 import logging
 import os
 import random
 import string
-…
+import uuid
+from typing import Dict, Generator, List, Optional, Tuple, Union
 
 import orjson
 from pynvml import nvmlDeviceGetCount, nvmlInit, nvmlShutdown
 
 from .._compat import BaseModel
+from ..constants import XINFERENCE_LOG_ARG_MAX_LENGTH
 
 logger = logging.getLogger(__name__)
 
 
-def log_async(logger):
+def truncate_log_arg(arg) -> str:
+    s = str(arg)
+    if len(s) > XINFERENCE_LOG_ARG_MAX_LENGTH:
+        s = s[0:XINFERENCE_LOG_ARG_MAX_LENGTH] + "..."
+    return s
+
+
+def log_async(
+    logger,
+    level=logging.DEBUG,
+    ignore_kwargs: Optional[List[str]] = None,
+    log_exception=True,
+):
    import time
     from functools import wraps
 
     def decorator(func):
+        func_name = func.__name__
+
         @wraps(func)
         async def wrapped(*args, **kwargs):
-            …
-            …
-            …
-            …
-            …
-            …
-            …
+            request_id_str = kwargs.get("request_id", "")
+            if not request_id_str:
+                request_id_str = uuid.uuid1()
+            request_id_str = f"[request {request_id_str}]"
+            formatted_args = ",".join(map(truncate_log_arg, args))
+            formatted_kwargs = ",".join(
+                [
+                    "%s=%s" % (k, truncate_log_arg(v))
+                    for k, v in kwargs.items()
+                    if ignore_kwargs is None or k not in ignore_kwargs
+                ]
             )
-            …
-            …
-            …
-                f"Leave {func.__name__}, elapsed time: {int(time.time() - start)} s"
+            logger.log(
+                level,
+                f"{request_id_str} Enter {func_name}, args: {formatted_args}, kwargs: {formatted_kwargs}",
             )
-            …
+            start = time.time()
+            try:
+                ret = await func(*args, **kwargs)
+                logger.log(
+                    level,
+                    f"{request_id_str} Leave {func_name}, elapsed time: {int(time.time() - start)} s",
+                )
+                return ret
+            except Exception as e:
+                if log_exception:
+                    logger.error(
+                        f"{request_id_str} Leave {func_name}, error: {e}, elapsed time: {int(time.time() - start)} s",
+                        exc_info=True,
+                    )
+                else:
+                    logger.log(
+                        level,
+                        f"{request_id_str} Leave {func_name}, error: {e}, elapsed time: {int(time.time() - start)} s",
+                    )
+                raise
 
         return wrapped
 
     return decorator
 
 
-def log_sync(logger):
+def log_sync(logger, level=logging.DEBUG, log_exception=True):
     import time
     from functools import wraps
 
     def decorator(func):
         @wraps(func)
         def wrapped(*args, **kwargs):
-            …
-            …
-            …
-            logger.debug(
-                f"Leave {func.__name__}, elapsed time: {int(time.time() - start)} s"
+            formatted_args = ",".join(map(truncate_log_arg, args))
+            formatted_kwargs = ",".join(
+                map(lambda x: "%s=%s" % (x[0], truncate_log_arg(x[1])), kwargs.items())
             )
-            …
+            logger.log(
+                level,
+                f"Enter {func.__name__}, args: {formatted_args}, kwargs: {formatted_kwargs}",
+            )
+            start = time.time()
+            try:
+                ret = func(*args, **kwargs)
+                logger.log(
+                    level,
+                    f"Leave {func.__name__}, elapsed time: {int(time.time() - start)} s",
+                )
+                return ret
+            except Exception as e:
+                if log_exception:
+                    logger.error(
+                        f"Leave {func.__name__}, error: {e}, elapsed time: {int(time.time() - start)} s",
+                        exc_info=True,
+                    )
+                else:
+                    logger.log(
+                        level,
+                        f"Leave {func.__name__}, error: {e}, elapsed time: {int(time.time() - start)} s",
+                    )
+                raise
 
         return wrapped
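
The rewritten decorators add four things: a configurable log level, argument logging with truncation via the new XINFERENCE_LOG_ARG_MAX_LENGTH constant (see constants.py above), an optional request-id prefix pulled from the request_id kwarg, and error logging with tracebacks on exception. A usage sketch against the new log_async signature as it appears in this diff; fake_launch is a made-up coroutine for illustration:

import asyncio
import logging

from xinference.core.utils import log_async

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


@log_async(logger=logger, level=logging.INFO, ignore_kwargs=["api_key"])
async def fake_launch(model_name: str, api_key: str = "", request_id: str = ""):
    await asyncio.sleep(0.1)
    return f"launched {model_name}"


# Logs "[request abc-123] Enter fake_launch, ..." and a matching "Leave" line;
# api_key is filtered out of the logged kwargs, and any argument longer than
# XINFERENCE_LOG_ARG_MAX_LENGTH characters is truncated with a trailing "...".
asyncio.run(fake_launch("qwen2-instruct", api_key="secret", request_id="abc-123"))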
xinference/core/worker.py
CHANGED
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import asyncio
+import logging
 import os
 import platform
 import queue
@@ -73,15 +74,15 @@ class WorkerActor(xo.StatelessActor):
         self._supervisor_ref: Optional[xo.ActorRefType] = None
         self._main_pool = main_pool
         self._main_pool.recover_sub_pool = self.recover_sub_pool
-        self._status_guard_ref: xo.ActorRefType[  # type: ignore
-            "StatusGuardActor"
-        ] = None
+        self._status_guard_ref: xo.ActorRefType[
+            "StatusGuardActor"
+        ] = None  # type: ignore
         self._event_collector_ref: xo.ActorRefType[  # type: ignore
             EventCollectorActor
         ] = None
-        self._cache_tracker_ref: xo.ActorRefType[  # type: ignore
-            CacheTrackerActor
-        ] = None
+        self._cache_tracker_ref: xo.ActorRefType[
+            CacheTrackerActor
+        ] = None  # type: ignore
 
         # internal states.
         # temporary placeholder during model launch process:
@@ -185,7 +186,7 @@ class WorkerActor(xo.StatelessActor):
                 break
 
     @classmethod
-    def uid(cls) -> str:
+    def default_uid(cls) -> str:
         return "worker"
 
     async def __post_create__(self):
@@ -270,9 +271,9 @@ class WorkerActor(xo.StatelessActor):
 
         try:
             await self.get_supervisor_ref(add_worker=True)
-        except Exception as e:
+        except Exception:
             # Do not crash the worker if supervisor is down, auto re-connect later
-            logger.error(f"cannot connect to supervisor {e}")
+            logger.error(f"cannot connect to supervisor", exc_info=True)
 
         if not XINFERENCE_DISABLE_HEALTH_CHECK:
             from ..isolation import Isolation
@@ -324,7 +325,7 @@ class WorkerActor(xo.StatelessActor):
         if self._supervisor_ref is not None:
             return self._supervisor_ref
         supervisor_ref = await xo.actor_ref(  # type: ignore
-            address=self._supervisor_address, uid=SupervisorActor.uid()
+            address=self._supervisor_address, uid=SupervisorActor.default_uid()
         )
         # Prevent concurrent operations leads to double initialization, check again.
         if self._supervisor_ref is not None:
@@ -336,13 +337,13 @@ class WorkerActor(xo.StatelessActor):
         logger.info("Connected to supervisor as a fresh worker")
 
         self._status_guard_ref = await xo.actor_ref(
-            address=self._supervisor_address, uid=StatusGuardActor.uid()
+            address=self._supervisor_address, uid=StatusGuardActor.default_uid()
         )
         self._event_collector_ref = await xo.actor_ref(
-            address=self._supervisor_address, uid=EventCollectorActor.uid()
+            address=self._supervisor_address, uid=EventCollectorActor.default_uid()
         )
         self._cache_tracker_ref = await xo.actor_ref(
-            address=self._supervisor_address, uid=CacheTrackerActor.uid()
+            address=self._supervisor_address, uid=CacheTrackerActor.default_uid()
         )
         # cache_tracker is on supervisor
         from ..model.audio import get_audio_model_descriptions
@@ -770,7 +771,7 @@ class WorkerActor(xo.StatelessActor):
             version_info["model_file_location"],
         )
 
-    @log_async(logger=logger)
+    @log_async(logger=logger, level=logging.INFO)
     async def launch_builtin_model(
         self,
         model_uid: str,
@@ -814,7 +815,7 @@ class WorkerActor(xo.StatelessActor):
             )
         except Exception as e:
             # Report callback error can be log and ignore, should not interrupt the Process
-            logger.error("report_event error: %s" % (e))
+            logger.error("report_event error: %s" % (e), exc_info=True)
 
         if gpu_idx is not None:
             logger.info(
@@ -917,7 +918,7 @@ class WorkerActor(xo.StatelessActor):
             {"model_ability": abilities, "status": LaunchStatus.READY.name},
         )
 
-    @log_async(logger=logger)
+    @log_async(logger=logger, level=logging.INFO)
     async def terminate_model(self, model_uid: str, is_model_die=False):
         # Terminate model while its launching is not allow
         if model_uid in self._model_uid_launching_guard:
xinference/deploy/cmdline.py
CHANGED
@@ -17,7 +17,7 @@ import logging
 import os
 import sys
 import warnings
-from typing import List, Optional, Sequence, Tuple, Union
+from typing import Dict, List, Optional, Sequence, Tuple, Union
 
 import click
 from xoscar.utils import get_next_port
@@ -38,7 +38,6 @@ from ..constants import (
     XINFERENCE_LOG_MAX_BYTES,
 )
 from ..isolation import Isolation
-from ..types import ChatCompletionMessage
 from .utils import (
     get_config_dict,
     get_log_file,
@@ -1210,13 +1209,12 @@ def model_chat(
     stream: bool,
     api_key: Optional[str],
 ):
-    # TODO: chat model roles may not be user and assistant.
     endpoint = get_endpoint(endpoint)
     client = RESTfulClient(base_url=endpoint, api_key=api_key)
     if api_key is None:
         client._set_token(get_stored_token(endpoint, client))
 
-    chat_history: List[ChatCompletionMessage] = []
+    messages: List[Dict] = []
     if stream:
         # TODO: when stream=True, RestfulClient cannot generate words one by one.
         # So use Client in temporary. The implementation needs to be changed to
@@ -1229,10 +1227,10 @@ def model_chat(
             if prompt == "":
                 break
             print("Assistant: ", end="", file=sys.stdout)
+            messages.append(dict(role="user", content=prompt))
             response_content = ""
             for chunk in model.chat(
-                prompt=prompt,
-                chat_history=chat_history,
+                messages,
                 generate_config={"stream": stream, "max_tokens": max_tokens},
             ):
                 delta = chunk["choices"][0]["delta"]
@@ -1242,10 +1240,7 @@ def model_chat(
                 response_content += delta["content"]
                 print(delta["content"], end="", flush=True, file=sys.stdout)
             print("", file=sys.stdout)
-            chat_history.append(ChatCompletionMessage(role="user", content=prompt))
-            chat_history.append(
-                ChatCompletionMessage(role="assistant", content=response_content)
-            )
+            messages.append(dict(role="assistant", content=response_content))
 
         model = client.get_model(model_uid=model_uid)
 
@@ -1274,20 +1269,17 @@ def model_chat(
         prompt = input("User: ")
         if prompt == "":
             break
-        chat_history.append(ChatCompletionMessage(role="user", content=prompt))
+        messages.append({"role": "user", "content": prompt})
         print("Assistant: ", end="", file=sys.stdout)
         response = restful_model.chat(
-            prompt=prompt,
-            chat_history=chat_history,
+            messages,
            generate_config={"stream": stream, "max_tokens": max_tokens},
         )
         if not isinstance(response, dict):
            raise ValueError("chat result is not valid")
         response_content = response["choices"][0]["message"]["content"]
         print(f"{response_content}\n", file=sys.stdout)
-        chat_history.append(
-            ChatCompletionMessage(role="assistant", content=response_content)
-        )
+        messages.append(dict(role="assistant", content=response_content))
 
 
 @cli.command("vllm-models", help="Query and display models compatible with vLLM.")
xinference/deploy/local.py
CHANGED
@@ -49,7 +49,7 @@ async def _start_local_cluster(
         address=address, logging_conf=logging_conf
     )
     await xo.create_actor(
-        SupervisorActor, address=address, uid=SupervisorActor.uid()
+        SupervisorActor, address=address, uid=SupervisorActor.default_uid()
     )
     await start_worker_components(
         address=address,
xinference/deploy/supervisor.py
CHANGED
@@ -41,7 +41,7 @@ async def _start_supervisor(address: str, logging_conf: Optional[Dict] = None):
             address=address, n_process=0, logging_conf={"dict": logging_conf}
         )
         await xo.create_actor(
-            SupervisorActor, address=address, uid=SupervisorActor.uid()
+            SupervisorActor, address=address, uid=SupervisorActor.default_uid()
         )
         await pool.join()
     except asyncio.exceptions.CancelledError:
xinference/deploy/utils.py
CHANGED
@@ -167,7 +167,7 @@ def health_check(address: str, max_attempts: int, sleep_interval: int = 3) -> bool:
         from ..core.supervisor import SupervisorActor
 
         supervisor_ref: xo.ActorRefType[SupervisorActor] = await xo.actor_ref(  # type: ignore
-            address=address, uid=SupervisorActor.uid()
+            address=address, uid=SupervisorActor.default_uid()
         )
 
         await supervisor_ref.get_status()
xinference/deploy/worker.py
CHANGED