xinference 0.15.4__py3-none-any.whl → 0.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference might be problematic.
- xinference/__init__.py +0 -4
- xinference/_version.py +3 -3
- xinference/constants.py +4 -4
- xinference/core/model.py +89 -18
- xinference/core/scheduler.py +10 -7
- xinference/core/utils.py +9 -0
- xinference/deploy/supervisor.py +4 -0
- xinference/model/__init__.py +4 -0
- xinference/model/image/scheduler/__init__.py +13 -0
- xinference/model/image/scheduler/flux.py +533 -0
- xinference/model/image/stable_diffusion/core.py +6 -31
- xinference/model/image/utils.py +39 -3
- xinference/model/llm/__init__.py +2 -0
- xinference/model/llm/llm_family.json +169 -1
- xinference/model/llm/llm_family_modelscope.json +108 -0
- xinference/model/llm/transformers/chatglm.py +104 -0
- xinference/model/llm/transformers/core.py +37 -111
- xinference/model/llm/transformers/deepseek_v2.py +0 -226
- xinference/model/llm/transformers/internlm2.py +3 -95
- xinference/model/llm/transformers/opt.py +68 -0
- xinference/model/llm/transformers/utils.py +4 -284
- xinference/model/llm/utils.py +2 -2
- xinference/model/llm/vllm/core.py +16 -1
- xinference/utils.py +2 -3
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/{main.e51a356d.js → main.f7da0140.js} +3 -3
- xinference/web/ui/build/static/js/main.f7da0140.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/331312668fa8bd3d7401818f4a25fa98135d7f61371cd6bfff78b18cf4fbdd92.json +1 -0
- {xinference-0.15.4.dist-info → xinference-0.16.0.dist-info}/METADATA +36 -4
- {xinference-0.15.4.dist-info → xinference-0.16.0.dist-info}/RECORD +36 -33
- xinference/web/ui/build/static/js/main.e51a356d.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/4385c1095eefbff0a8ec3b2964ba6e5a66a05ab31be721483ca2f43e2a91f6ff.json +0 -1
- /xinference/web/ui/build/static/js/{main.e51a356d.js.LICENSE.txt → main.f7da0140.js.LICENSE.txt} +0 -0
- {xinference-0.15.4.dist-info → xinference-0.16.0.dist-info}/LICENSE +0 -0
- {xinference-0.15.4.dist-info → xinference-0.16.0.dist-info}/WHEEL +0 -0
- {xinference-0.15.4.dist-info → xinference-0.16.0.dist-info}/entry_points.txt +0 -0
- {xinference-0.15.4.dist-info → xinference-0.16.0.dist-info}/top_level.txt +0 -0
xinference/__init__.py
CHANGED
@@ -26,13 +26,9 @@ except:
 def _install():
     from xoscar.backends.router import Router
 
-    from .model import _install as install_model
-
     default_router = Router.get_instance_or_empty()
     Router.set_instance(default_router)
 
-    install_model()
-
 
 _install()
 del _install
xinference/_version.py
CHANGED
@@ -8,11 +8,11 @@ import json
 
 version_json = '''
 {
- "date": "2024-10-
+ "date": "2024-10-18T12:49:02+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "
- "version": "0.15.4"
+ "full-revisionid": "5f7dea44832a1c41f887b9a01377191894550057",
+ "version": "0.16.0"
 }
 ''' # END VERSION_JSON
xinference/constants.py
CHANGED
@@ -27,8 +27,8 @@ XINFERENCE_ENV_HEALTH_CHECK_INTERVAL = "XINFERENCE_HEALTH_CHECK_INTERVAL"
 XINFERENCE_ENV_HEALTH_CHECK_TIMEOUT = "XINFERENCE_HEALTH_CHECK_TIMEOUT"
 XINFERENCE_ENV_DISABLE_HEALTH_CHECK = "XINFERENCE_DISABLE_HEALTH_CHECK"
 XINFERENCE_ENV_DISABLE_METRICS = "XINFERENCE_DISABLE_METRICS"
-XINFERENCE_ENV_TRANSFORMERS_ENABLE_BATCHING = "XINFERENCE_TRANSFORMERS_ENABLE_BATCHING"
 XINFERENCE_ENV_DOWNLOAD_MAX_ATTEMPTS = "XINFERENCE_DOWNLOAD_MAX_ATTEMPTS"
+XINFERENCE_ENV_TEXT_TO_IMAGE_BATCHING_SIZE = "XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE"
 
 
 def get_xinference_home() -> str:
@@ -80,9 +80,9 @@ XINFERENCE_DISABLE_HEALTH_CHECK = bool(
 XINFERENCE_DISABLE_METRICS = bool(
     int(os.environ.get(XINFERENCE_ENV_DISABLE_METRICS, 0))
 )
-XINFERENCE_TRANSFORMERS_ENABLE_BATCHING = bool(
-    int(os.environ.get(XINFERENCE_ENV_TRANSFORMERS_ENABLE_BATCHING, 0))
-)
 XINFERENCE_DOWNLOAD_MAX_ATTEMPTS = int(
     os.environ.get(XINFERENCE_ENV_DOWNLOAD_MAX_ATTEMPTS, 3)
 )
+XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE = os.environ.get(
+    XINFERENCE_ENV_TEXT_TO_IMAGE_BATCHING_SIZE, None
+)
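This change inverts two defaults: continuous batching for transformers models no longer needs a switch (see the allow_batching hunk in core/model.py below), while text-to-image batching is opt-in through the new size variable. A minimal sketch, assuming the "width*height" string convention that the image API's size parameter uses:

import os

# Pin one output size before starting xinference; with this set, eligible
# text-to-image models (FLUX.1-dev / FLUX.1-schnell) are served with batching,
# and every request must ask for exactly this size.
os.environ["XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE"] = "1024*1024"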
xinference/core/model.py
CHANGED
@@ -41,7 +41,7 @@ from typing import (
 import sse_starlette.sse
 import xoscar as xo
 
-from ..constants import XINFERENCE_TRANSFORMERS_ENABLE_BATCHING
+from ..constants import XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE
 
 if TYPE_CHECKING:
     from .progress_tracker import ProgressTrackerActor
@@ -74,6 +74,8 @@ XINFERENCE_BATCHING_ALLOWED_VISION_MODELS = [
     "MiniCPM-V-2.6",
 ]
 
+XINFERENCE_TEXT_TO_IMAGE_BATCHING_ALLOWED_MODELS = ["FLUX.1-dev", "FLUX.1-schnell"]
+
 
 def request_limit(fn):
     """
@@ -153,6 +155,16 @@ class ModelActor(xo.StatelessActor):
                 f"Destroy scheduler actor failed, address: {self.address}, error: {e}"
             )
 
+        if self.allow_batching_for_text_to_image():
+            try:
+                assert self._text_to_image_scheduler_ref is not None
+                await xo.destroy_actor(self._text_to_image_scheduler_ref)
+                del self._text_to_image_scheduler_ref
+            except Exception as e:
+                logger.debug(
+                    f"Destroy text_to_image scheduler actor failed, address: {self.address}, error: {e}"
+                )
+
         if hasattr(self._model, "stop") and callable(self._model.stop):
             self._model.stop()
 
@@ -220,6 +232,7 @@ class ModelActor(xo.StatelessActor):
         self._loop: Optional[asyncio.AbstractEventLoop] = None
 
         self._scheduler_ref = None
+        self._text_to_image_scheduler_ref = None
 
     async def __post_create__(self):
         self._loop = asyncio.get_running_loop()
@@ -233,6 +246,15 @@ class ModelActor(xo.StatelessActor):
                 uid=SchedulerActor.gen_uid(self.model_uid(), self._model.rep_id),
             )
 
+        if self.allow_batching_for_text_to_image():
+            from ..model.image.scheduler.flux import FluxBatchSchedulerActor
+
+            self._text_to_image_scheduler_ref = await xo.create_actor(
+                FluxBatchSchedulerActor,
+                address=self.address,
+                uid=FluxBatchSchedulerActor.gen_uid(self.model_uid()),
+            )
+
     async def _record_completion_metrics(
         self, duration, completion_tokens, prompt_tokens
     ):
@@ -311,10 +333,8 @@ class ModelActor(xo.StatelessActor):
 
         model_ability = self._model_description.get("model_ability", [])
 
-        condition = XINFERENCE_TRANSFORMERS_ENABLE_BATCHING and isinstance(
-            self._model, PytorchModel
-        )
-        if condition and "vision" in model_ability:
+        condition = isinstance(self._model, PytorchModel)
+        if condition and ("vision" in model_ability or "audio" in model_ability):
             if (
                 self._model.model_family.model_name
                 in XINFERENCE_BATCHING_ALLOWED_VISION_MODELS
@@ -331,6 +351,26 @@ class ModelActor(xo.StatelessActor):
             return False
         return condition
 
+    def allow_batching_for_text_to_image(self) -> bool:
+        from ..model.image.stable_diffusion.core import DiffusionModel
+
+        condition = XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE is not None and isinstance(
+            self._model, DiffusionModel
+        )
+
+        if condition:
+            model_name = self._model._model_spec.model_name  # type: ignore
+            if model_name in XINFERENCE_TEXT_TO_IMAGE_BATCHING_ALLOWED_MODELS:
+                return True
+            else:
+                logger.warning(
+                    f"Currently for image models with text_to_image ability, "
+                    f"xinference only supports {', '.join(XINFERENCE_TEXT_TO_IMAGE_BATCHING_ALLOWED_MODELS)} for batching. "
+                    f"Your model {model_name} is disqualified."
+                )
+                return False
+        return condition
+
     async def load(self):
         self._model.load()
         if self.allow_batching():
@@ -338,6 +378,11 @@ class ModelActor(xo.StatelessActor):
             logger.debug(
                 f"Batching enabled for model: {self.model_uid()}, max_num_seqs: {self._model.get_max_num_seqs()}"
             )
+        if self.allow_batching_for_text_to_image():
+            await self._text_to_image_scheduler_ref.set_model(self._model)
+            logger.debug(
+                f"Batching enabled for model: {self.model_uid()}, max_num_images: {self._model.get_max_num_images_for_batching()}"
+            )
 
     def model_uid(self):
         return (
@@ -617,12 +662,16 @@ class ModelActor(xo.StatelessActor):
         )
 
     async def abort_request(self, request_id: str) -> str:
-        from .scheduler import AbortRequestMessage
+        from .utils import AbortRequestMessage
 
         if self.allow_batching():
            if self._scheduler_ref is None:
                 return AbortRequestMessage.NOT_FOUND.name
             return await self._scheduler_ref.abort_request(request_id)
+        elif self.allow_batching_for_text_to_image():
+            if self._text_to_image_scheduler_ref is None:
+                return AbortRequestMessage.NOT_FOUND.name
+            return await self._text_to_image_scheduler_ref.abort_request(request_id)
         return AbortRequestMessage.NO_OP.name
 
     @request_limit
@@ -747,6 +796,22 @@ class ModelActor(xo.StatelessActor):
            f"Model {self._model.model_spec} is not for creating speech."
         )
 
+    async def handle_image_batching_request(self, unique_id, *args, **kwargs):
+        size = args[2]
+        if XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE != size:
+            raise RuntimeError(
+                f"The image size: {size} of text_to_image for batching "
+                f"must be the same as the environment variable: {XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE} you set."
+            )
+        assert self._loop is not None
+        future = ConcurrentFuture()
+        await self._text_to_image_scheduler_ref.add_request(
+            unique_id, future, *args, **kwargs
+        )
+        fut = asyncio.wrap_future(future, loop=self._loop)
+        result = await fut
+        return await asyncio.to_thread(json_dumps, result)
+
     @request_limit
     @log_async(logger=logger)
     async def text_to_image(
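handle_image_batching_request bridges the actor world and asyncio: the batch scheduler actor fulfills a concurrent.futures.Future, which the model actor's event loop awaits through asyncio.wrap_future. A self-contained sketch of that pattern, with call_later standing in for the scheduler (none of this is xinference API):

import asyncio
from concurrent.futures import Future as ConcurrentFuture

async def main():
    loop = asyncio.get_running_loop()
    future: ConcurrentFuture = ConcurrentFuture()
    # Stand-in for the scheduler actor finishing a queued request later:
    loop.call_later(0.1, future.set_result, ["base64-image-data"])
    # Bridge the thread-style future into the event loop and await it.
    result = await asyncio.wrap_future(future, loop=loop)
    print(result)

asyncio.run(main())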
@@ -759,19 +824,25 @@ class ModelActor(xo.StatelessActor):
         **kwargs,
     ):
         if hasattr(self._model, "text_to_image"):
-            progressor = kwargs["progressor"] = await self._get_progressor(
-                kwargs.pop("request_id", None)
-            )
-            with progressor:
-                return await self._call_wrapper_json(
-                    self._model.text_to_image,
-                    prompt,
-                    n,
-                    size,
-                    response_format,
-                    *args,
-                    **kwargs,
+            if self.allow_batching_for_text_to_image():
+                unique_id = kwargs.pop("request_id", None)
+                return await self.handle_image_batching_request(
+                    unique_id, prompt, n, size, response_format, *args, **kwargs
                 )
+            else:
+                progressor = kwargs["progressor"] = await self._get_progressor(
+                    kwargs.pop("request_id", None)
+                )
+                with progressor:
+                    return await self._call_wrapper_json(
+                        self._model.text_to_image,
+                        prompt,
+                        n,
+                        size,
+                        response_format,
+                        *args,
+                        **kwargs,
+                    )
         raise AttributeError(
             f"Model {self._model.model_spec} is not for creating image."
         )
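The net effect of the size check above: a worker doing batched FLUX serving accepts exactly one image size. A hedged client-side sketch, assuming the usual xinference Client API and a worker started with XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE="1024*1024":

from xinference.client import Client

client = Client("http://localhost:9997")
model = client.get_model("my-flux-model-uid")  # hypothetical uid of a launched FLUX.1-dev
# size must equal the pinned batching size, otherwise the server raises the
# RuntimeError from handle_image_batching_request above.
result = model.text_to_image(prompt="a watercolor fox", size="1024*1024")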
xinference/core/scheduler.py
CHANGED
@@ -17,11 +17,12 @@ import functools
 import logging
 import uuid
 from collections import deque
-from enum import Enum
 from typing import Dict, List, Optional, Set, Tuple, Union
 
 import xoscar as xo
 
+from .utils import AbortRequestMessage
+
 logger = logging.getLogger(__name__)
 
 XINFERENCE_STREAMING_DONE_FLAG = "<XINFERENCE_STREAMING_DONE>"
@@ -30,12 +31,6 @@ XINFERENCE_STREAMING_ABORT_FLAG = "<XINFERENCE_STREAMING_ABORT>"
 XINFERENCE_NON_STREAMING_ABORT_FLAG = "<XINFERENCE_NON_STREAMING_ABORT>"
 
 
-class AbortRequestMessage(Enum):
-    NOT_FOUND = 1
-    DONE = 2
-    NO_OP = 3
-
-
 class InferenceRequest:
     def __init__(
         self,
@@ -81,6 +76,10 @@ class InferenceRequest:
         self.padding_len = 0
         # Use in stream mode
         self.last_output_length = 0
+        # For tool call
+        self.tools = None
+        # Currently, for storing tool call streaming results.
+        self.outputs: List[str] = []
         # inference results,
         # it is a list type because when stream=True,
         # self.completion contains all the results in a decode round.
@@ -112,6 +111,10 @@ class InferenceRequest:
         """
         return self._prompt
 
+    @prompt.setter
+    def prompt(self, value: str):
+        self._prompt = value
+
     @property
     def call_ability(self):
         return self._call_ability
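The new fields plus the prompt setter make a queued request mutable enough for tool calling: the scheduler can rebuild the prompt with tool definitions and buffer streamed chunks until they parse as a tool call. A hedged illustration (both helpers and render_with_tools are hypothetical, not xinference code; only tools, outputs, and the prompt setter come from the diff above):

def prepare_tool_request(req, tools, render_with_tools):
    req.tools = tools                                  # remembered for decoding
    req.prompt = render_with_tools(req.prompt, tools)  # uses the new prompt.setter

def collect_stream_chunk(req, chunk: str):
    if req.tools:
        req.outputs.append(chunk)  # buffer; parse as a tool call once complete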
xinference/core/utils.py
CHANGED
@@ -16,6 +16,7 @@ import os
 import random
 import string
 import uuid
+from enum import Enum
 from typing import Dict, Generator, List, Optional, Tuple, Union
 
 import orjson
@@ -27,6 +28,12 @@ from ..constants import XINFERENCE_LOG_ARG_MAX_LENGTH
 logger = logging.getLogger(__name__)
 
 
+class AbortRequestMessage(Enum):
+    NOT_FOUND = 1
+    DONE = 2
+    NO_OP = 3
+
+
 def truncate_log_arg(arg) -> str:
     s = str(arg)
     if len(s) > XINFERENCE_LOG_ARG_MAX_LENGTH:
@@ -51,6 +58,8 @@ def log_async(
             request_id_str = kwargs.get("request_id", "")
             if not request_id_str:
                 request_id_str = uuid.uuid1()
+                if func_name == "text_to_image":
+                    kwargs["request_id"] = request_id_str
             request_id_str = f"[request {request_id_str}]"
             formatted_args = ",".join(map(truncate_log_arg, args))
             formatted_kwargs = ",".join(
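With the enum now in core/utils.py, both scheduler.py and core/model.py import it from a neutral module instead of model.py reaching into scheduler.py. Since abort_request returns the member's name as a string, interpreting a result looks roughly like this sketch:

from xinference.core.utils import AbortRequestMessage

def explain_abort(result: str) -> str:
    if result == AbortRequestMessage.NOT_FOUND.name:
        return "request not found in a scheduler queue"
    if result == AbortRequestMessage.DONE.name:
        return "request found and aborted"
    return "model is not batching; nothing to abort"  # NO_OP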
xinference/deploy/supervisor.py
CHANGED
@@ -31,6 +31,10 @@ from .utils import health_check
 
 logger = logging.getLogger(__name__)
 
+from ..model import _install as install_model
+
+install_model()
+
 
 async def _start_supervisor(address: str, logging_conf: Optional[Dict] = None):
     logging.config.dictConfig(logging_conf)  # type: ignore
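Together with the xinference/__init__.py hunk above, this moves built-in model registration from package import time to supervisor startup, keeping "import xinference" cheap. A hedged sketch of the resulting behavior:

import xinference  # no longer registers model families as a side effect

# Deploy entry points now register explicitly, as supervisor.py does above:
from xinference.model import _install as install_model

install_model()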
xinference/model/__init__.py
CHANGED
@@ -0,0 +1,13 @@
+# Copyright 2022-2024 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.