xinference 0.15.4__py3-none-any.whl → 0.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38)
  1. xinference/__init__.py +0 -4
  2. xinference/_version.py +3 -3
  3. xinference/constants.py +4 -4
  4. xinference/core/model.py +89 -18
  5. xinference/core/scheduler.py +10 -7
  6. xinference/core/utils.py +9 -0
  7. xinference/deploy/supervisor.py +4 -0
  8. xinference/model/__init__.py +4 -0
  9. xinference/model/image/scheduler/__init__.py +13 -0
  10. xinference/model/image/scheduler/flux.py +533 -0
  11. xinference/model/image/stable_diffusion/core.py +6 -31
  12. xinference/model/image/utils.py +39 -3
  13. xinference/model/llm/__init__.py +2 -0
  14. xinference/model/llm/llm_family.json +169 -1
  15. xinference/model/llm/llm_family_modelscope.json +108 -0
  16. xinference/model/llm/transformers/chatglm.py +104 -0
  17. xinference/model/llm/transformers/core.py +37 -111
  18. xinference/model/llm/transformers/deepseek_v2.py +0 -226
  19. xinference/model/llm/transformers/internlm2.py +3 -95
  20. xinference/model/llm/transformers/opt.py +68 -0
  21. xinference/model/llm/transformers/utils.py +4 -284
  22. xinference/model/llm/utils.py +2 -2
  23. xinference/model/llm/vllm/core.py +16 -1
  24. xinference/utils.py +2 -3
  25. xinference/web/ui/build/asset-manifest.json +3 -3
  26. xinference/web/ui/build/index.html +1 -1
  27. xinference/web/ui/build/static/js/{main.e51a356d.js → main.f7da0140.js} +3 -3
  28. xinference/web/ui/build/static/js/main.f7da0140.js.map +1 -0
  29. xinference/web/ui/node_modules/.cache/babel-loader/331312668fa8bd3d7401818f4a25fa98135d7f61371cd6bfff78b18cf4fbdd92.json +1 -0
  30. {xinference-0.15.4.dist-info → xinference-0.16.0.dist-info}/METADATA +36 -4
  31. {xinference-0.15.4.dist-info → xinference-0.16.0.dist-info}/RECORD +36 -33
  32. xinference/web/ui/build/static/js/main.e51a356d.js.map +0 -1
  33. xinference/web/ui/node_modules/.cache/babel-loader/4385c1095eefbff0a8ec3b2964ba6e5a66a05ab31be721483ca2f43e2a91f6ff.json +0 -1
  34. /xinference/web/ui/build/static/js/{main.e51a356d.js.LICENSE.txt → main.f7da0140.js.LICENSE.txt} +0 -0
  35. {xinference-0.15.4.dist-info → xinference-0.16.0.dist-info}/LICENSE +0 -0
  36. {xinference-0.15.4.dist-info → xinference-0.16.0.dist-info}/WHEEL +0 -0
  37. {xinference-0.15.4.dist-info → xinference-0.16.0.dist-info}/entry_points.txt +0 -0
  38. {xinference-0.15.4.dist-info → xinference-0.16.0.dist-info}/top_level.txt +0 -0
xinference/__init__.py CHANGED
@@ -26,13 +26,9 @@ except:
 def _install():
     from xoscar.backends.router import Router
 
-    from .model import _install as install_model
-
     default_router = Router.get_instance_or_empty()
     Router.set_instance(default_router)
 
-    install_model()
-
 
 _install()
 del _install
xinference/_version.py CHANGED
@@ -8,11 +8,11 @@ import json
 
 version_json = '''
 {
- "date": "2024-10-12T18:28:41+0800",
+ "date": "2024-10-18T12:49:02+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "c0be11504c70f6c392cbdb67c86cf12153353f70",
- "version": "0.15.4"
+ "full-revisionid": "5f7dea44832a1c41f887b9a01377191894550057",
+ "version": "0.16.0"
 }
 ''' # END VERSION_JSON
 
xinference/constants.py CHANGED
@@ -27,8 +27,8 @@ XINFERENCE_ENV_HEALTH_CHECK_INTERVAL = "XINFERENCE_HEALTH_CHECK_INTERVAL"
 XINFERENCE_ENV_HEALTH_CHECK_TIMEOUT = "XINFERENCE_HEALTH_CHECK_TIMEOUT"
 XINFERENCE_ENV_DISABLE_HEALTH_CHECK = "XINFERENCE_DISABLE_HEALTH_CHECK"
 XINFERENCE_ENV_DISABLE_METRICS = "XINFERENCE_DISABLE_METRICS"
-XINFERENCE_ENV_TRANSFORMERS_ENABLE_BATCHING = "XINFERENCE_TRANSFORMERS_ENABLE_BATCHING"
 XINFERENCE_ENV_DOWNLOAD_MAX_ATTEMPTS = "XINFERENCE_DOWNLOAD_MAX_ATTEMPTS"
+XINFERENCE_ENV_TEXT_TO_IMAGE_BATCHING_SIZE = "XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE"
 
 
 def get_xinference_home() -> str:
@@ -80,9 +80,9 @@ XINFERENCE_DISABLE_HEALTH_CHECK = bool(
 XINFERENCE_DISABLE_METRICS = bool(
     int(os.environ.get(XINFERENCE_ENV_DISABLE_METRICS, 0))
 )
-XINFERENCE_TRANSFORMERS_ENABLE_BATCHING = bool(
-    int(os.environ.get(XINFERENCE_ENV_TRANSFORMERS_ENABLE_BATCHING, 0))
-)
 XINFERENCE_DOWNLOAD_MAX_ATTEMPTS = int(
     os.environ.get(XINFERENCE_ENV_DOWNLOAD_MAX_ATTEMPTS, 3)
 )
+XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE = os.environ.get(
+    XINFERENCE_ENV_TEXT_TO_IMAGE_BATCHING_SIZE, None
+)
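
Note: the boolean XINFERENCE_TRANSFORMERS_ENABLE_BATCHING switch is gone (the model.py change below turns transformers batching on by default for PytorchModel), and the new XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE keeps the single image size the text-to-image batch scheduler accepts, stored as a raw string (None when unset). A minimal sketch of the gate this enables, assuming a size string such as "1024*1024" (the exact format is not shown in this diff):

    import os

    # Hypothetical setup: opt in to text-to-image batching before xinference starts.
    os.environ["XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE"] = "1024*1024"

    # Mirrors the new constants.py logic: the value stays a string, or None when unset.
    XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE = os.environ.get(
        "XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE", None
    )

    # Every batched text_to_image call must request exactly this size;
    # model.py (below) raises a RuntimeError otherwise.
    def check_size(requested: str) -> None:
        if XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE != requested:
            raise RuntimeError(
                f"size {requested!r} must match XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE"
            )
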
xinference/core/model.py CHANGED
@@ -41,7 +41,7 @@ from typing import (
 import sse_starlette.sse
 import xoscar as xo
 
-from ..constants import XINFERENCE_TRANSFORMERS_ENABLE_BATCHING
+from ..constants import XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE
 
 if TYPE_CHECKING:
     from .progress_tracker import ProgressTrackerActor
@@ -74,6 +74,8 @@ XINFERENCE_BATCHING_ALLOWED_VISION_MODELS = [
     "MiniCPM-V-2.6",
 ]
 
+XINFERENCE_TEXT_TO_IMAGE_BATCHING_ALLOWED_MODELS = ["FLUX.1-dev", "FLUX.1-schnell"]
+
 
 def request_limit(fn):
     """
@@ -153,6 +155,16 @@ class ModelActor(xo.StatelessActor):
                     f"Destroy scheduler actor failed, address: {self.address}, error: {e}"
                 )
 
+        if self.allow_batching_for_text_to_image():
+            try:
+                assert self._text_to_image_scheduler_ref is not None
+                await xo.destroy_actor(self._text_to_image_scheduler_ref)
+                del self._text_to_image_scheduler_ref
+            except Exception as e:
+                logger.debug(
+                    f"Destroy text_to_image scheduler actor failed, address: {self.address}, error: {e}"
+                )
+
         if hasattr(self._model, "stop") and callable(self._model.stop):
             self._model.stop()
 
@@ -220,6 +232,7 @@ class ModelActor(xo.StatelessActor):
         self._loop: Optional[asyncio.AbstractEventLoop] = None
 
         self._scheduler_ref = None
+        self._text_to_image_scheduler_ref = None
 
     async def __post_create__(self):
         self._loop = asyncio.get_running_loop()
@@ -233,6 +246,15 @@ class ModelActor(xo.StatelessActor):
                 uid=SchedulerActor.gen_uid(self.model_uid(), self._model.rep_id),
             )
 
+        if self.allow_batching_for_text_to_image():
+            from ..model.image.scheduler.flux import FluxBatchSchedulerActor
+
+            self._text_to_image_scheduler_ref = await xo.create_actor(
+                FluxBatchSchedulerActor,
+                address=self.address,
+                uid=FluxBatchSchedulerActor.gen_uid(self.model_uid()),
+            )
+
     async def _record_completion_metrics(
         self, duration, completion_tokens, prompt_tokens
     ):
@@ -311,10 +333,8 @@ class ModelActor(xo.StatelessActor):
 
         model_ability = self._model_description.get("model_ability", [])
 
-        condition = XINFERENCE_TRANSFORMERS_ENABLE_BATCHING and isinstance(
-            self._model, PytorchModel
-        )
-        if condition and "vision" in model_ability:
+        condition = isinstance(self._model, PytorchModel)
+        if condition and ("vision" in model_ability or "audio" in model_ability):
             if (
                 self._model.model_family.model_name
                 in XINFERENCE_BATCHING_ALLOWED_VISION_MODELS
@@ -331,6 +351,26 @@ class ModelActor(xo.StatelessActor):
                 return False
         return condition
 
+    def allow_batching_for_text_to_image(self) -> bool:
+        from ..model.image.stable_diffusion.core import DiffusionModel
+
+        condition = XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE is not None and isinstance(
+            self._model, DiffusionModel
+        )
+
+        if condition:
+            model_name = self._model._model_spec.model_name  # type: ignore
+            if model_name in XINFERENCE_TEXT_TO_IMAGE_BATCHING_ALLOWED_MODELS:
+                return True
+            else:
+                logger.warning(
+                    f"Currently for image models with text_to_image ability, "
+                    f"xinference only supports {', '.join(XINFERENCE_TEXT_TO_IMAGE_BATCHING_ALLOWED_MODELS)} for batching. "
+                    f"Your model {model_name} is disqualified."
+                )
+                return False
+        return condition
+
     async def load(self):
         self._model.load()
         if self.allow_batching():
@@ -338,6 +378,11 @@ class ModelActor(xo.StatelessActor):
             logger.debug(
                 f"Batching enabled for model: {self.model_uid()}, max_num_seqs: {self._model.get_max_num_seqs()}"
             )
+        if self.allow_batching_for_text_to_image():
+            await self._text_to_image_scheduler_ref.set_model(self._model)
+            logger.debug(
+                f"Batching enabled for model: {self.model_uid()}, max_num_images: {self._model.get_max_num_images_for_batching()}"
+            )
 
     def model_uid(self):
         return (
@@ -617,12 +662,16 @@ class ModelActor(xo.StatelessActor):
         )
 
     async def abort_request(self, request_id: str) -> str:
-        from .scheduler import AbortRequestMessage
+        from .utils import AbortRequestMessage
 
         if self.allow_batching():
             if self._scheduler_ref is None:
                 return AbortRequestMessage.NOT_FOUND.name
             return await self._scheduler_ref.abort_request(request_id)
+        elif self.allow_batching_for_text_to_image():
+            if self._text_to_image_scheduler_ref is None:
+                return AbortRequestMessage.NOT_FOUND.name
+            return await self._text_to_image_scheduler_ref.abort_request(request_id)
         return AbortRequestMessage.NO_OP.name
 
     @request_limit
@@ -747,6 +796,22 @@ class ModelActor(xo.StatelessActor):
                 f"Model {self._model.model_spec} is not for creating speech."
            )
 
+    async def handle_image_batching_request(self, unique_id, *args, **kwargs):
+        size = args[2]
+        if XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE != size:
+            raise RuntimeError(
+                f"The image size: {size} of text_to_image for batching "
+                f"must be the same as the environment variable: {XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE} you set."
+            )
+        assert self._loop is not None
+        future = ConcurrentFuture()
+        await self._text_to_image_scheduler_ref.add_request(
+            unique_id, future, *args, **kwargs
+        )
+        fut = asyncio.wrap_future(future, loop=self._loop)
+        result = await fut
+        return await asyncio.to_thread(json_dumps, result)
+
     @request_limit
     @log_async(logger=logger)
     async def text_to_image(
@@ -759,19 +824,25 @@ class ModelActor(xo.StatelessActor):
         **kwargs,
     ):
         if hasattr(self._model, "text_to_image"):
-            progressor = kwargs["progressor"] = await self._get_progressor(
-                kwargs.pop("request_id", None)
-            )
-            with progressor:
-                return await self._call_wrapper_json(
-                    self._model.text_to_image,
-                    prompt,
-                    n,
-                    size,
-                    response_format,
-                    *args,
-                    **kwargs,
+            if self.allow_batching_for_text_to_image():
+                unique_id = kwargs.pop("request_id", None)
+                return await self.handle_image_batching_request(
+                    unique_id, prompt, n, size, response_format, *args, **kwargs
                 )
+            else:
+                progressor = kwargs["progressor"] = await self._get_progressor(
+                    kwargs.pop("request_id", None)
+                )
+                with progressor:
+                    return await self._call_wrapper_json(
+                        self._model.text_to_image,
+                        prompt,
+                        n,
+                        size,
+                        response_format,
+                        *args,
+                        **kwargs,
+                    )
         raise AttributeError(
             f"Model {self._model.model_spec} is not for creating image."
         )
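
The mechanics worth noting above: handle_image_batching_request enqueues each request together with a concurrent.futures.Future, and the actor awaits it through asyncio.wrap_future, so the event loop stays free while the Flux scheduler batches work elsewhere. A self-contained sketch of that bridge (the scheduler thread here is a stand-in, not the xinference API):

    import asyncio
    import threading
    from concurrent.futures import Future as ConcurrentFuture

    def scheduler_stand_in(future: ConcurrentFuture) -> None:
        # Stand-in for FluxBatchSchedulerActor: fulfils the request once the
        # batch containing it has been generated.
        future.set_result({"data": ["<image bytes>"]})

    async def submit_request() -> dict:
        loop = asyncio.get_running_loop()
        future: ConcurrentFuture = ConcurrentFuture()
        threading.Thread(target=scheduler_stand_in, args=(future,)).start()
        # wrap_future turns the thread-safe future into an awaitable, so the
        # actor's loop never blocks while the batch runs.
        return await asyncio.wrap_future(future, loop=loop)

    print(asyncio.run(submit_request()))
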
xinference/core/scheduler.py CHANGED
@@ -17,11 +17,12 @@ import functools
 import logging
 import uuid
 from collections import deque
-from enum import Enum
 from typing import Dict, List, Optional, Set, Tuple, Union
 
 import xoscar as xo
 
+from .utils import AbortRequestMessage
+
 logger = logging.getLogger(__name__)
 
 XINFERENCE_STREAMING_DONE_FLAG = "<XINFERENCE_STREAMING_DONE>"
@@ -30,12 +31,6 @@ XINFERENCE_STREAMING_ABORT_FLAG = "<XINFERENCE_STREAMING_ABORT>"
 XINFERENCE_NON_STREAMING_ABORT_FLAG = "<XINFERENCE_NON_STREAMING_ABORT>"
 
 
-class AbortRequestMessage(Enum):
-    NOT_FOUND = 1
-    DONE = 2
-    NO_OP = 3
-
-
 class InferenceRequest:
     def __init__(
         self,
@@ -81,6 +76,10 @@ class InferenceRequest:
         self.padding_len = 0
         # Use in stream mode
         self.last_output_length = 0
+        # For tool call
+        self.tools = None
+        # Currently, for storing tool call streaming results.
+        self.outputs: List[str] = []
         # inference results,
         # it is a list type because when stream=True,
         # self.completion contains all the results in a decode round.
@@ -112,6 +111,10 @@ class InferenceRequest:
         """
         return self._prompt
 
+    @prompt.setter
+    def prompt(self, value: str):
+        self._prompt = value
+
     @property
     def call_ability(self):
         return self._call_ability
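
The new InferenceRequest fields back streamed tool calls: tools carries the request's tool specs, outputs buffers streamed chunks until they form a parseable call, and the prompt setter lets chat templating rewrite the prompt after the request is built. A minimal stand-in illustrating the intended usage (not the actual scheduler code):

    from typing import List, Optional

    class Request:
        """Stand-in for InferenceRequest's new tool-call fields."""

        def __init__(self, prompt: str, tools: Optional[list] = None):
            self._prompt = prompt
            self.tools = tools            # tool specs attached to this request
            self.outputs: List[str] = []  # streamed chunks, buffered for parsing

        @property
        def prompt(self) -> str:
            return self._prompt

        @prompt.setter
        def prompt(self, value: str) -> None:
            # The setter lets templating replace the prompt in place.
            self._prompt = value

    req = Request("What is the weather?", tools=[{"name": "get_weather"}])
    req.prompt = "<prompt rewritten by the chat template, tools included>"
    req.outputs.extend(['{"name": "get_w', 'eather", "args": {}}'])
    tool_call = "".join(req.outputs)  # reassembled once streaming finishes
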
xinference/core/utils.py CHANGED
@@ -16,6 +16,7 @@ import os
 import random
 import string
 import uuid
+from enum import Enum
 from typing import Dict, Generator, List, Optional, Tuple, Union
 
 import orjson
@@ -27,6 +28,12 @@ from ..constants import XINFERENCE_LOG_ARG_MAX_LENGTH
 logger = logging.getLogger(__name__)
 
 
+class AbortRequestMessage(Enum):
+    NOT_FOUND = 1
+    DONE = 2
+    NO_OP = 3
+
+
 def truncate_log_arg(arg) -> str:
     s = str(arg)
     if len(s) > XINFERENCE_LOG_ARG_MAX_LENGTH:
@@ -51,6 +58,8 @@ def log_async(
             request_id_str = kwargs.get("request_id", "")
             if not request_id_str:
                 request_id_str = uuid.uuid1()
+            if func_name == "text_to_image":
+                kwargs["request_id"] = request_id_str
             request_id_str = f"[request {request_id_str}]"
             formatted_args = ",".join(map(truncate_log_arg, args))
             formatted_kwargs = ",".join(
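
Moving AbortRequestMessage into core/utils.py removes model.py's import of scheduler.py, and log_async now writes the id it generates back into kwargs for text_to_image, so the image batch scheduler can abort that request by id later. A simplified sketch of the decorator's new behaviour (the real log_async takes a logger and logs arguments; only the request_id handling is shown):

    import functools
    import uuid

    def log_async(func):
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            request_id = kwargs.get("request_id", "") or str(uuid.uuid1())
            if func.__name__ == "text_to_image":
                # Feed the generated id back so the callee (and the batch
                # scheduler behind it) can reference this request.
                kwargs["request_id"] = request_id
            print(f"[request {request_id}] enter {func.__name__}")
            return await func(*args, **kwargs)

        return wrapper

    @log_async
    async def text_to_image(prompt, **kwargs):
        return kwargs["request_id"]  # always present now
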
xinference/deploy/supervisor.py CHANGED
@@ -31,6 +31,10 @@ from .utils import health_check
 
 logger = logging.getLogger(__name__)
 
+from ..model import _install as install_model
+
+install_model()
+
 
 async def _start_supervisor(address: str, logging_conf: Optional[Dict] = None):
     logging.config.dictConfig(logging_conf)  # type: ignore
xinference/model/__init__.py CHANGED
@@ -29,3 +29,7 @@ def _install():
     image_install()
     rerank_install()
     video_install()
+
+
+_install()
+del _install
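
Together with the __init__.py and supervisor.py hunks above, model registration now runs when xinference.model itself is first imported (and explicitly at supervisor startup) rather than on every top-level import xinference. The self-installing module pattern, reduced to its core (a simplified shape, not the real file):

    def _install():
        # stand-in for the real llm/image/rerank/video install hooks
        print("registering model families")

    _install()    # runs once, on first import of the module
    del _install  # keep the module namespace clean afterwards
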
xinference/model/image/scheduler/__init__.py ADDED
@@ -0,0 +1,13 @@
+# Copyright 2022-2024 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.