xinference 1.2.2__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xinference/_version.py +3 -3
- xinference/client/restful/restful_client.py +9 -1
- xinference/core/model.py +19 -0
- xinference/core/resource.py +7 -1
- xinference/core/status_guard.py +1 -0
- xinference/core/supervisor.py +228 -19
- xinference/core/utils.py +1 -29
- xinference/core/worker.py +28 -2
- xinference/deploy/cmdline.py +33 -3
- xinference/deploy/test/test_cmdline.py +32 -0
- xinference/device_utils.py +43 -1
- xinference/model/audio/kokoro.py +19 -36
- xinference/model/audio/model_spec.json +1 -1
- xinference/model/image/stable_diffusion/core.py +15 -6
- xinference/model/llm/llm_family.json +521 -6
- xinference/model/llm/llm_family.py +3 -1
- xinference/model/llm/llm_family_modelscope.json +559 -6
- xinference/model/llm/reasoning_parsers/__init__.py +13 -0
- xinference/model/llm/reasoning_parsers/abs_reasoning_parsers.py +98 -0
- xinference/model/llm/reasoning_parsers/deepseek_r1_reasoning_parser.py +140 -0
- xinference/model/llm/sglang/core.py +99 -11
- xinference/model/llm/transformers/intern_vl.py +23 -14
- xinference/model/llm/utils.py +53 -19
- xinference/model/llm/vllm/core.py +23 -2
- xinference/model/llm/vllm/xavier/executor.py +2 -2
- xinference/model/llm/vllm/xavier/scheduler.py +3 -3
- xinference/thirdparty/internvl/conversation.py +26 -17
- xinference/types.py +2 -0
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/main.f8177338.css +2 -0
- xinference/web/ui/build/static/css/main.f8177338.css.map +1 -0
- xinference/web/ui/build/static/js/main.ad42919c.js +3 -0
- xinference/web/ui/build/static/js/main.ad42919c.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/074a42304bbbaa79e1bfc3b28502457a390df55708de9006f4cc8e35c60aea87.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0acb065326560592b10888234242f94f67efe28458b90f273d4d4fba9daa0cd2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/279ace390216236a82b3d8995c78eca4d637ac9a523e9f521a2d9c76607a43d7.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/630a7bd592596cc6e291fc32238ce7c08238038a64ed8ccee0eb0c13c9902910.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/6cb9f6c62ab4042f0b11c5d75e51187188e9d6f5f08b1d63e796e051bafdb457.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8f9af2979e45d4648f0cfae108363e58ee421c29a9d4e7329b6f06d9adfd4133.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/914c33e91c1012e3bcd3e96f3a25884cbef148290632d0266dab972b8cc1e95f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9c8b1a86e7c65b2b2599a205e30920652d6c2105f926508ef5bcf29a3ef4ce76.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b7939cd3a48adf12fccfdd0803019b5cc235ff7de3a297dae70ce635e0eea13e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/efe7cd132c27a8f9fd5352a394c491fd5fb0da0348cf9fcbd923164a32365eab.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f04f666b77b44d7be3e16034d6b0074de2ba9c254f1fae15222b3148608fa8b3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/fecf076bcd198a458c2a6ab0e85e40dc1c99994c353164e79c469be162cb74c9.json +1 -0
- xinference/web/ui/src/locales/en.json +14 -1
- xinference/web/ui/src/locales/zh.json +14 -1
- {xinference-1.2.2.dist-info → xinference-1.3.0.dist-info}/METADATA +11 -11
- {xinference-1.2.2.dist-info → xinference-1.3.0.dist-info}/RECORD +55 -49
- xinference/web/ui/build/static/css/main.51a587ff.css +0 -2
- xinference/web/ui/build/static/css/main.51a587ff.css.map +0 -1
- xinference/web/ui/build/static/js/main.b0936c54.js +0 -3
- xinference/web/ui/build/static/js/main.b0936c54.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0c2fb5375667931c4a331c99e0d87dc145e8f327cea3f44d6e56f54c7c1d4020.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/185ceb8872d562e032b47e79df6a45670e06345b8ed70aad1a131e0476783c5c.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3eefb411b24c2b3ce053570ef50daccf154022f0e168be5ed0fec21394baf9f4.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/63c8e07687ea53a4f8a910ee5e42e0eb26cd1acbfbe820f3e3248a786ee51401.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/a3ff866acddf34917a7ee399e0e571a4dfd8ba66d5057db885f243e16a6eb17d.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/a7f1a71f6580dfe810c685a9c1d68e318f71e1fa258fbe50b87a6ac37cc0a598.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/bdee44abeadc4abc17d41c52eb49c6e19a4b1a267b6e16876ce91bdeeebfc52d.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d7664d18c4ddbad9c3a6a31b91f7c00fb0dde804608674a9860ee50f33e54708.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/ed57202cb79649bb716400436590245547df241988fc7c8e1d85d132299542d2.json +0 -1
- /xinference/web/ui/build/static/js/{main.b0936c54.js.LICENSE.txt → main.ad42919c.js.LICENSE.txt} +0 -0
- {xinference-1.2.2.dist-info → xinference-1.3.0.dist-info}/LICENSE +0 -0
- {xinference-1.2.2.dist-info → xinference-1.3.0.dist-info}/WHEEL +0 -0
- {xinference-1.2.2.dist-info → xinference-1.3.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.2.2.dist-info → xinference-1.3.0.dist-info}/top_level.txt +0 -0
xinference/deploy/cmdline.py
CHANGED
@@ -770,11 +770,17 @@ def remove_cache(
     type=int,
     help="The replica count of the model, default is 1.",
 )
+@click.option(
+    "--n-worker",
+    default=1,
+    type=int,
+    help="The number of workers used by the model, default is 1.",
+)
 @click.option(
     "--n-gpu",
     default="auto",
     type=str,
-    help='The number of GPUs used by the model, default is "auto".',
+    help='The number of GPUs used by the model, if n_worker>1, means number of GPUs per worker, default is "auto".',
 )
 @click.option(
     "--lora-modules",
@@ -815,6 +821,12 @@ def remove_cache(
     type=bool,
     help="Whether or not to allow for custom models defined on the Hub in their own modeling files.",
 )
+@click.option(
+    "--reasoning-content",
+    default=False,
+    type=bool,
+    help="Whether or not to enable reasoning content in model responses.",
+)
 @click.option(
     "--api-key",
     "-ak",
@@ -822,6 +834,7 @@ def remove_cache(
     type=str,
     help="Api-Key for access xinference api with authorization.",
 )
+@click.option("--model-path", "-mp", default=None, type=str, help="Model path to run.")
 @click.pass_context
 def model_launch(
     ctx,
@@ -834,6 +847,7 @@ def model_launch(
     model_format: str,
     quantization: str,
     replica: int,
+    n_worker: int,
     n_gpu: str,
     lora_modules: Optional[Tuple],
     image_lora_load_kwargs: Optional[Tuple],
@@ -841,15 +855,28 @@ def model_launch(
     worker_ip: Optional[str],
     gpu_idx: Optional[str],
     trust_remote_code: bool,
+    reasoning_content: bool,
     api_key: Optional[str],
+    model_path: Optional[str],
 ):
     kwargs = {}
     for i in range(0, len(ctx.args), 2):
         if not ctx.args[i].startswith("--"):
             raise ValueError(
-                f"You must specify extra kwargs with `--` prefix.
+                f"You must specify extra kwargs with `--` prefix. "
+                f"There is an error in parameter passing that is {ctx.args[i]}."
             )
-
+        param_name = ctx.args[i][2:]
+        param_value = handle_click_args_type(ctx.args[i + 1])
+        if param_name == "model_path":
+            # fix for --model_path which is the old fashion to set model_path,
+            # now model_path is a builtin option, try to make it compatible
+            if model_path is None:
+                model_path = param_value
+                continue
+            else:
+                raise ValueError("Cannot set both for --model-path and --model_path")
+        kwargs[param_name] = param_value
     print(f"Launch model name: {model_name} with kwargs: {kwargs}", file=sys.stderr)

     if model_type == "LLM" and model_engine is None:
@@ -914,11 +941,14 @@ def model_launch(
         model_format=model_format,
         quantization=quantization,
         replica=replica,
+        n_worker=n_worker,
         n_gpu=_n_gpu,
         peft_model_config=peft_model_config,
         worker_ip=worker_ip,
         gpu_idx=_gpu_idx,
         trust_remote_code=trust_remote_code,
+        model_path=model_path,
+        reasoning_content=reasoning_content,
         **kwargs,
     )
xinference/deploy/test/test_cmdline.py
CHANGED

@@ -147,6 +147,38 @@ def test_cmdline(setup, stream, model_uid):
     assert model_uid not in result.stdout


+def test_cmdline_model_path_error(setup):
+    endpoint, _ = setup
+    runner = CliRunner(mix_stderr=False)
+
+    # launch model
+    result = runner.invoke(
+        model_launch,
+        [
+            "--endpoint",
+            endpoint,
+            "--model-name",
+            "tiny-llama",
+            "--size-in-billions",
+            1,
+            "--model-format",
+            "ggufv2",
+            "--quantization",
+            "Q2_K",
+            "--model-path",
+            "/path/to/model",
+            "--model_path",
+            "/path/to/model",
+        ],
+    )
+    assert result.exit_code > 0
+    with pytest.raises(
+        ValueError, match="Cannot set both for --model-path and --model_path"
+    ):
+        t, e, tb = result.exc_info
+        raise e.with_traceback(tb)
+
+
 def test_cmdline_of_custom_model(setup):
     endpoint, _ = setup
     runner = CliRunner()
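
The new --n-worker, --reasoning-content and --model-path/-mp options added to cmdline.py above map onto the client-side launch call. Below is a minimal sketch of the equivalent launch through the Python client; it assumes launch_model forwards these fields the same way the CLI does, and the model name, engine and path are placeholders rather than values taken from this release.

# Hypothetical client-side equivalent of the new CLI flags; all concrete
# values (model name, engine, local path) are placeholders.
from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")

model_uid = client.launch_model(
    model_name="deepseek-r1-distill-qwen",       # placeholder model name
    model_engine="vllm",
    model_size_in_billions=7,
    model_format="pytorch",
    quantization="none",
    n_worker=2,                # new: spread the model across 2 workers
    n_gpu=2,                   # interpreted per worker when n_worker > 1
    reasoning_content=True,    # new: surface reasoning content separately
    model_path="/data/models/my-local-checkpoint",  # placeholder local path
)
print(model_uid)
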
xinference/device_utils.py
CHANGED
@@ -13,9 +13,9 @@
 # limitations under the License.

 import os
+from typing import Dict, Literal, Union

 import torch
-from typing_extensions import Literal, Union

 DeviceType = Literal["cuda", "mps", "xpu", "npu", "cpu"]
 DEVICE_TO_ENV_NAME = {
@@ -122,3 +122,45 @@ def gpu_count():
         return torch.npu.device_count()
     else:
         return 0
+
+
+def _get_nvidia_gpu_mem_info(gpu_id: int) -> Dict[str, float]:
+    from pynvml import (
+        nvmlDeviceGetHandleByIndex,
+        nvmlDeviceGetMemoryInfo,
+        nvmlDeviceGetName,
+        nvmlDeviceGetUtilizationRates,
+    )
+
+    handler = nvmlDeviceGetHandleByIndex(gpu_id)
+    gpu_name = nvmlDeviceGetName(handler)
+    mem_info = nvmlDeviceGetMemoryInfo(handler)
+    utilization = nvmlDeviceGetUtilizationRates(handler)
+    return {
+        "name": gpu_name,
+        "total": mem_info.total,
+        "used": mem_info.used,
+        "free": mem_info.free,
+        "util": utilization.gpu,
+    }
+
+
+def get_nvidia_gpu_info() -> Dict:
+    from pynvml import nvmlDeviceGetCount, nvmlInit, nvmlShutdown
+
+    try:
+        nvmlInit()
+        device_count = nvmlDeviceGetCount()
+        res = {}
+        for i in range(device_count):
+            res[f"gpu-{i}"] = _get_nvidia_gpu_mem_info(i)
+        return res
+    except:
+        # TODO: add log here
+        # logger.debug(f"Cannot init nvml. Maybe due to lack of NVIDIA GPUs or incorrect installation of CUDA.")
+        return {}
+    finally:
+        try:
+            nvmlShutdown()
+        except:
+            pass
xinference/model/audio/kokoro.py
CHANGED
@@ -26,36 +26,6 @@ logger = logging.getLogger(__name__)


 class KokoroModel:
-    # The available voices, should keep sync with https://huggingface.co/hexgrad/Kokoro-82M/tree/main/voices
-    VOICES = [
-        "af_alloy",
-        "af_aoede",
-        "af_bella",
-        "af_jessica",
-        "af_kore",
-        "af_nicole",
-        "af_nova",
-        "af_river",
-        "af_sarah",
-        "af_sky",
-        "am_adam",
-        "am_echo",
-        "am_eric",
-        "am_fenrir",
-        "am_liam",
-        "am_michael",
-        "am_onyx",
-        "am_puck",
-        "bf_alice",
-        "bf_emma",
-        "bf_isabella",
-        "bf_lily",
-        "bm_daniel",
-        "bm_fable",
-        "bm_george",
-        "bm_lewis",
-    ]
-
     def __init__(
         self,
         model_uid: str,
@@ -89,10 +59,25 @@ class KokoroModel:
         config_path = os.path.join(self._model_path, "config.json")
         model_path = os.path.join(self._model_path, "kokoro-v1_0.pth")
         # LANG_CODES = dict(
+        #     # pip install misaki[en]
         #     a='American English',
         #     b='British English',
+        #
+        #     # espeak-ng
+        #     e='es',
+        #     f='fr-fr',
+        #     h='hi',
+        #     i='it',
+        #     p='pt-br',
+        #
+        #     # pip install misaki[ja]
+        #     j='Japanese',
+        #
+        #     # pip install misaki[zh]
+        #     z='Mandarin Chinese',
         # )
         lang_code = self._kwargs.get("lang_code", "a")
+        logger.info("Launching Kokoro model with language code: %s", lang_code)
         self._model = KPipeline(
             lang_code=lang_code,
             model=KModel(config=config_path, model=model_path),
@@ -114,14 +99,12 @@ class KokoroModel:
             raise Exception("Kokoro does not support stream mode.")
         assert self._model is not None
         if not voice:
-            voice =
+            voice = "af_alloy"
             logger.info("Auto select speaker: %s", voice)
-        elif
-            raise ValueError(
-                f"Invalid voice: {voice}, available speakers: {self.VOICES}"
-            )
-        else:
+        elif voice.endswith(".pt"):
             logger.info("Using custom voice pt: %s", voice)
+        else:
+            logger.info("Using voice: %s", voice)
         logger.info("Speech kwargs: %s", kwargs)
         generator = self._model(text=input, voice=voice, speed=speed, **kwargs)
         results = list(generator)
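
A minimal sketch of exercising the new voice handling from the client; it assumes the usual audio-model workflow (launch_model with model_type="audio", then speech()), that lang_code is forwarded as a launch kwarg the way kokoro.py above reads it, and uses placeholder text and file names.

# Placeholder usage; the output encoding depends on the speech() defaults.
from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")
uid = client.launch_model(
    model_name="Kokoro-82M",
    model_type="audio",
    lang_code="a",   # read via self._kwargs.get("lang_code", "a") above
)
model = client.get_model(uid)

# No voice -> falls back to "af_alloy"; a "*.pt" value is treated as a custom
# voice tensor; any other string is passed straight through to KPipeline.
audio = model.speech("Hello from Xinference.", voice="af_bella", speed=1.0)
with open("kokoro_output.mp3", "wb") as f:   # placeholder file name
    f.write(audio)
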
xinference/model/audio/model_spec.json
CHANGED

@@ -343,7 +343,7 @@
     "model_name": "Kokoro-82M",
     "model_family": "Kokoro",
     "model_id": "hexgrad/Kokoro-82M",
-    "model_revision": "
+    "model_revision": "7884269d6fd3f9beabc271b6f1308e5699281fa9",
     "model_ability": "text-to-audio",
     "multilingual": true
 }
xinference/model/image/stable_diffusion/core.py
CHANGED

@@ -22,7 +22,6 @@ import logging
 import os
 import re
 import sys
-import warnings
 from glob import glob
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union

@@ -412,12 +411,22 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         else:
             raise ValueError(f"Unknown sampler: {sampler_name}")

-
+    def _need_set_scheduler(self, scheduler: Any) -> bool:
+        """Determine whether it is necessary to set up a scheduler"""
+        if self._model_spec is None:
+            return False
+        if scheduler is None:
+            return False
+        if "FLUX" in self._model_spec.model_name:
+            logger.warning("FLUX model, skipping scheduler setup")
+            return False
+        return True
+
     @contextlib.contextmanager
-    def _reset_when_done(model: Any, sampler_name: str):
-        assert model is not None
+    def _reset_when_done(self, model: Any, sampler_name: str):
         scheduler = DiffusionModel._get_scheduler(model, sampler_name)
-        if scheduler:
+        if self._need_set_scheduler(scheduler):
+            logger.debug("Use scheduler %s", scheduler)
             default_scheduler = model.scheduler
             model.scheduler = scheduler
             try:
@@ -517,7 +526,7 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         for key in list(kwargs):
             allow_key = model_accept_param(key, model)
             if not allow_key:
-
+                logger.warning(f"{type(model)} cannot accept `{key}`, will ignore it")
             kwargs.pop(key)

     def text_to_image(