xinference 0.15.4__py3-none-any.whl → 0.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38)
  1. xinference/__init__.py +0 -4
  2. xinference/_version.py +3 -3
  3. xinference/constants.py +4 -4
  4. xinference/core/model.py +89 -18
  5. xinference/core/scheduler.py +10 -7
  6. xinference/core/utils.py +9 -0
  7. xinference/deploy/supervisor.py +4 -0
  8. xinference/model/__init__.py +4 -0
  9. xinference/model/image/scheduler/__init__.py +13 -0
  10. xinference/model/image/scheduler/flux.py +533 -0
  11. xinference/model/image/stable_diffusion/core.py +6 -31
  12. xinference/model/image/utils.py +39 -3
  13. xinference/model/llm/__init__.py +2 -0
  14. xinference/model/llm/llm_family.json +169 -1
  15. xinference/model/llm/llm_family_modelscope.json +108 -0
  16. xinference/model/llm/transformers/chatglm.py +104 -0
  17. xinference/model/llm/transformers/core.py +37 -111
  18. xinference/model/llm/transformers/deepseek_v2.py +0 -226
  19. xinference/model/llm/transformers/internlm2.py +3 -95
  20. xinference/model/llm/transformers/opt.py +68 -0
  21. xinference/model/llm/transformers/utils.py +4 -284
  22. xinference/model/llm/utils.py +2 -2
  23. xinference/model/llm/vllm/core.py +16 -1
  24. xinference/utils.py +2 -3
  25. xinference/web/ui/build/asset-manifest.json +3 -3
  26. xinference/web/ui/build/index.html +1 -1
  27. xinference/web/ui/build/static/js/{main.e51a356d.js → main.f7da0140.js} +3 -3
  28. xinference/web/ui/build/static/js/main.f7da0140.js.map +1 -0
  29. xinference/web/ui/node_modules/.cache/babel-loader/331312668fa8bd3d7401818f4a25fa98135d7f61371cd6bfff78b18cf4fbdd92.json +1 -0
  30. {xinference-0.15.4.dist-info → xinference-0.16.0.dist-info}/METADATA +36 -4
  31. {xinference-0.15.4.dist-info → xinference-0.16.0.dist-info}/RECORD +36 -33
  32. xinference/web/ui/build/static/js/main.e51a356d.js.map +0 -1
  33. xinference/web/ui/node_modules/.cache/babel-loader/4385c1095eefbff0a8ec3b2964ba6e5a66a05ab31be721483ca2f43e2a91f6ff.json +0 -1
  34. /xinference/web/ui/build/static/js/{main.e51a356d.js.LICENSE.txt → main.f7da0140.js.LICENSE.txt} +0 -0
  35. {xinference-0.15.4.dist-info → xinference-0.16.0.dist-info}/LICENSE +0 -0
  36. {xinference-0.15.4.dist-info → xinference-0.16.0.dist-info}/WHEEL +0 -0
  37. {xinference-0.15.4.dist-info → xinference-0.16.0.dist-info}/entry_points.txt +0 -0
  38. {xinference-0.15.4.dist-info → xinference-0.16.0.dist-info}/top_level.txt +0 -0
xinference/__init__.py CHANGED
@@ -26,13 +26,9 @@ except:
 def _install():
     from xoscar.backends.router import Router
 
-    from .model import _install as install_model
-
     default_router = Router.get_instance_or_empty()
     Router.set_instance(default_router)
 
-    install_model()
-
 
 _install()
 del _install
xinference/_version.py CHANGED
@@ -8,11 +8,11 @@ import json
 
 version_json = '''
 {
- "date": "2024-10-12T18:28:41+0800",
+ "date": "2024-10-18T12:49:02+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "c0be11504c70f6c392cbdb67c86cf12153353f70",
- "version": "0.15.4"
+ "full-revisionid": "5f7dea44832a1c41f887b9a01377191894550057",
+ "version": "0.16.0"
 }
 ''' # END VERSION_JSON
 
xinference/constants.py CHANGED
@@ -27,8 +27,8 @@ XINFERENCE_ENV_HEALTH_CHECK_INTERVAL = "XINFERENCE_HEALTH_CHECK_INTERVAL"
 XINFERENCE_ENV_HEALTH_CHECK_TIMEOUT = "XINFERENCE_HEALTH_CHECK_TIMEOUT"
 XINFERENCE_ENV_DISABLE_HEALTH_CHECK = "XINFERENCE_DISABLE_HEALTH_CHECK"
 XINFERENCE_ENV_DISABLE_METRICS = "XINFERENCE_DISABLE_METRICS"
-XINFERENCE_ENV_TRANSFORMERS_ENABLE_BATCHING = "XINFERENCE_TRANSFORMERS_ENABLE_BATCHING"
 XINFERENCE_ENV_DOWNLOAD_MAX_ATTEMPTS = "XINFERENCE_DOWNLOAD_MAX_ATTEMPTS"
+XINFERENCE_ENV_TEXT_TO_IMAGE_BATCHING_SIZE = "XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE"
 
 
 def get_xinference_home() -> str:
@@ -80,9 +80,9 @@ XINFERENCE_DISABLE_HEALTH_CHECK = bool(
 XINFERENCE_DISABLE_METRICS = bool(
     int(os.environ.get(XINFERENCE_ENV_DISABLE_METRICS, 0))
 )
-XINFERENCE_TRANSFORMERS_ENABLE_BATCHING = bool(
-    int(os.environ.get(XINFERENCE_ENV_TRANSFORMERS_ENABLE_BATCHING, 0))
-)
 XINFERENCE_DOWNLOAD_MAX_ATTEMPTS = int(
     os.environ.get(XINFERENCE_ENV_DOWNLOAD_MAX_ATTEMPTS, 3)
 )
+XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE = os.environ.get(
+    XINFERENCE_ENV_TEXT_TO_IMAGE_BATCHING_SIZE, None
+)
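
Note: the boolean XINFERENCE_TRANSFORMERS_ENABLE_BATCHING switch is gone (the model.py change below turns transformers batching on by default for PytorchModel), and the new XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE keeps the single image size the text-to-image batch scheduler accepts, stored as a raw string (None when unset). A minimal sketch of the gate this enables, assuming a size string such as "1024*1024" (the exact format is not shown in this diff):

    import os

    # Hypothetical setup: opt in to text-to-image batching before xinference starts.
    os.environ["XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE"] = "1024*1024"

    # Mirrors the new constants.py logic: the value stays a string, or None when unset.
    XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE = os.environ.get(
        "XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE", None
    )

    # Every batched text_to_image call must request exactly this size;
    # model.py (below) raises a RuntimeError otherwise.
    def check_size(requested: str) -> None:
        if XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE != requested:
            raise RuntimeError(
                f"size {requested!r} must match XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE"
            )
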
xinference/core/model.py CHANGED
@@ -41,7 +41,7 @@ from typing import (
 import sse_starlette.sse
 import xoscar as xo
 
-from ..constants import XINFERENCE_TRANSFORMERS_ENABLE_BATCHING
+from ..constants import XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE
 
 if TYPE_CHECKING:
     from .progress_tracker import ProgressTrackerActor
@@ -74,6 +74,8 @@ XINFERENCE_BATCHING_ALLOWED_VISION_MODELS = [
     "MiniCPM-V-2.6",
 ]
 
+XINFERENCE_TEXT_TO_IMAGE_BATCHING_ALLOWED_MODELS = ["FLUX.1-dev", "FLUX.1-schnell"]
+
 
 def request_limit(fn):
     """
@@ -153,6 +155,16 @@ class ModelActor(xo.StatelessActor):
                     f"Destroy scheduler actor failed, address: {self.address}, error: {e}"
                 )
 
+        if self.allow_batching_for_text_to_image():
+            try:
+                assert self._text_to_image_scheduler_ref is not None
+                await xo.destroy_actor(self._text_to_image_scheduler_ref)
+                del self._text_to_image_scheduler_ref
+            except Exception as e:
+                logger.debug(
+                    f"Destroy text_to_image scheduler actor failed, address: {self.address}, error: {e}"
+                )
+
         if hasattr(self._model, "stop") and callable(self._model.stop):
             self._model.stop()
 
@@ -220,6 +232,7 @@ class ModelActor(xo.StatelessActor):
         self._loop: Optional[asyncio.AbstractEventLoop] = None
 
         self._scheduler_ref = None
+        self._text_to_image_scheduler_ref = None
 
     async def __post_create__(self):
         self._loop = asyncio.get_running_loop()
@@ -233,6 +246,15 @@ class ModelActor(xo.StatelessActor):
                 uid=SchedulerActor.gen_uid(self.model_uid(), self._model.rep_id),
             )
 
+        if self.allow_batching_for_text_to_image():
+            from ..model.image.scheduler.flux import FluxBatchSchedulerActor
+
+            self._text_to_image_scheduler_ref = await xo.create_actor(
+                FluxBatchSchedulerActor,
+                address=self.address,
+                uid=FluxBatchSchedulerActor.gen_uid(self.model_uid()),
+            )
+
     async def _record_completion_metrics(
         self, duration, completion_tokens, prompt_tokens
     ):
@@ -311,10 +333,8 @@ class ModelActor(xo.StatelessActor):
 
         model_ability = self._model_description.get("model_ability", [])
 
-        condition = XINFERENCE_TRANSFORMERS_ENABLE_BATCHING and isinstance(
-            self._model, PytorchModel
-        )
-        if condition and "vision" in model_ability:
+        condition = isinstance(self._model, PytorchModel)
+        if condition and ("vision" in model_ability or "audio" in model_ability):
             if (
                 self._model.model_family.model_name
                 in XINFERENCE_BATCHING_ALLOWED_VISION_MODELS
@@ -331,6 +351,26 @@ class ModelActor(xo.StatelessActor):
                 return False
         return condition
 
+    def allow_batching_for_text_to_image(self) -> bool:
+        from ..model.image.stable_diffusion.core import DiffusionModel
+
+        condition = XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE is not None and isinstance(
+            self._model, DiffusionModel
+        )
+
+        if condition:
+            model_name = self._model._model_spec.model_name  # type: ignore
+            if model_name in XINFERENCE_TEXT_TO_IMAGE_BATCHING_ALLOWED_MODELS:
+                return True
+            else:
+                logger.warning(
+                    f"Currently for image models with text_to_image ability, "
+                    f"xinference only supports {', '.join(XINFERENCE_TEXT_TO_IMAGE_BATCHING_ALLOWED_MODELS)} for batching. "
+                    f"Your model {model_name} is disqualified."
+                )
+                return False
+        return condition
+
     async def load(self):
         self._model.load()
         if self.allow_batching():
@@ -338,6 +378,11 @@ class ModelActor(xo.StatelessActor):
             logger.debug(
                 f"Batching enabled for model: {self.model_uid()}, max_num_seqs: {self._model.get_max_num_seqs()}"
             )
+        if self.allow_batching_for_text_to_image():
+            await self._text_to_image_scheduler_ref.set_model(self._model)
+            logger.debug(
+                f"Batching enabled for model: {self.model_uid()}, max_num_images: {self._model.get_max_num_images_for_batching()}"
+            )
 
     def model_uid(self):
         return (
@@ -617,12 +662,16 @@ class ModelActor(xo.StatelessActor):
         )
 
     async def abort_request(self, request_id: str) -> str:
-        from .scheduler import AbortRequestMessage
+        from .utils import AbortRequestMessage
 
         if self.allow_batching():
             if self._scheduler_ref is None:
                 return AbortRequestMessage.NOT_FOUND.name
             return await self._scheduler_ref.abort_request(request_id)
+        elif self.allow_batching_for_text_to_image():
+            if self._text_to_image_scheduler_ref is None:
+                return AbortRequestMessage.NOT_FOUND.name
+            return await self._text_to_image_scheduler_ref.abort_request(request_id)
         return AbortRequestMessage.NO_OP.name
 
     @request_limit
@@ -747,6 +796,22 @@ class ModelActor(xo.StatelessActor):
                 f"Model {self._model.model_spec} is not for creating speech."
            )
 
+    async def handle_image_batching_request(self, unique_id, *args, **kwargs):
+        size = args[2]
+        if XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE != size:
+            raise RuntimeError(
+                f"The image size: {size} of text_to_image for batching "
+                f"must be the same as the environment variable: {XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE} you set."
+            )
+        assert self._loop is not None
+        future = ConcurrentFuture()
+        await self._text_to_image_scheduler_ref.add_request(
+            unique_id, future, *args, **kwargs
+        )
+        fut = asyncio.wrap_future(future, loop=self._loop)
+        result = await fut
+        return await asyncio.to_thread(json_dumps, result)
+
     @request_limit
     @log_async(logger=logger)
     async def text_to_image(
@@ -759,19 +824,25 @@ class ModelActor(xo.StatelessActor):
         **kwargs,
     ):
         if hasattr(self._model, "text_to_image"):
-            progressor = kwargs["progressor"] = await self._get_progressor(
-                kwargs.pop("request_id", None)
-            )
-            with progressor:
-                return await self._call_wrapper_json(
-                    self._model.text_to_image,
-                    prompt,
-                    n,
-                    size,
-                    response_format,
-                    *args,
-                    **kwargs,
+            if self.allow_batching_for_text_to_image():
+                unique_id = kwargs.pop("request_id", None)
+                return await self.handle_image_batching_request(
+                    unique_id, prompt, n, size, response_format, *args, **kwargs
                 )
+            else:
+                progressor = kwargs["progressor"] = await self._get_progressor(
+                    kwargs.pop("request_id", None)
+                )
+                with progressor:
+                    return await self._call_wrapper_json(
+                        self._model.text_to_image,
+                        prompt,
+                        n,
+                        size,
+                        response_format,
+                        *args,
+                        **kwargs,
+                    )
         raise AttributeError(
             f"Model {self._model.model_spec} is not for creating image."
         )
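
The mechanics worth noting above: handle_image_batching_request enqueues each request together with a concurrent.futures.Future, and the actor awaits it through asyncio.wrap_future, so the event loop stays free while the Flux scheduler batches work elsewhere. A self-contained sketch of that bridge (the scheduler thread here is a stand-in, not the xinference API):

    import asyncio
    import threading
    from concurrent.futures import Future as ConcurrentFuture

    def scheduler_stand_in(future: ConcurrentFuture) -> None:
        # Stand-in for FluxBatchSchedulerActor: fulfils the request once the
        # batch containing it has been generated.
        future.set_result({"data": ["<image bytes>"]})

    async def submit_request() -> dict:
        loop = asyncio.get_running_loop()
        future: ConcurrentFuture = ConcurrentFuture()
        threading.Thread(target=scheduler_stand_in, args=(future,)).start()
        # wrap_future turns the thread-safe future into an awaitable, so the
        # actor's loop never blocks while the batch runs.
        return await asyncio.wrap_future(future, loop=loop)

    print(asyncio.run(submit_request()))
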
xinference/core/scheduler.py CHANGED
@@ -17,11 +17,12 @@ import functools
 import logging
 import uuid
 from collections import deque
-from enum import Enum
 from typing import Dict, List, Optional, Set, Tuple, Union
 
 import xoscar as xo
 
+from .utils import AbortRequestMessage
+
 logger = logging.getLogger(__name__)
 
 XINFERENCE_STREAMING_DONE_FLAG = "<XINFERENCE_STREAMING_DONE>"
@@ -30,12 +31,6 @@ XINFERENCE_STREAMING_ABORT_FLAG = "<XINFERENCE_STREAMING_ABORT>"
 XINFERENCE_NON_STREAMING_ABORT_FLAG = "<XINFERENCE_NON_STREAMING_ABORT>"
 
 
-class AbortRequestMessage(Enum):
-    NOT_FOUND = 1
-    DONE = 2
-    NO_OP = 3
-
-
 class InferenceRequest:
     def __init__(
         self,
@@ -81,6 +76,10 @@ class InferenceRequest:
         self.padding_len = 0
         # Use in stream mode
         self.last_output_length = 0
+        # For tool call
+        self.tools = None
+        # Currently, for storing tool call streaming results.
+        self.outputs: List[str] = []
         # inference results,
         # it is a list type because when stream=True,
         # self.completion contains all the results in a decode round.
@@ -112,6 +111,10 @@ class InferenceRequest:
         """
         return self._prompt
 
+    @prompt.setter
+    def prompt(self, value: str):
+        self._prompt = value
+
     @property
     def call_ability(self):
         return self._call_ability
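
The new InferenceRequest fields back streamed tool calls: tools carries the request's tool specs, outputs buffers streamed chunks until they form a parseable call, and the prompt setter lets chat templating rewrite the prompt after the request is built. A minimal stand-in illustrating the intended usage (not the actual scheduler code):

    from typing import List, Optional

    class Request:
        """Stand-in for InferenceRequest's new tool-call fields."""

        def __init__(self, prompt: str, tools: Optional[list] = None):
            self._prompt = prompt
            self.tools = tools            # tool specs attached to this request
            self.outputs: List[str] = []  # streamed chunks, buffered for parsing

        @property
        def prompt(self) -> str:
            return self._prompt

        @prompt.setter
        def prompt(self, value: str) -> None:
            # The setter lets templating replace the prompt in place.
            self._prompt = value

    req = Request("What is the weather?", tools=[{"name": "get_weather"}])
    req.prompt = "<prompt rewritten by the chat template, tools included>"
    req.outputs.extend(['{"name": "get_w', 'eather", "args": {}}'])
    tool_call = "".join(req.outputs)  # reassembled once streaming finishes
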
xinference/core/utils.py CHANGED
@@ -16,6 +16,7 @@ import os
 import random
 import string
 import uuid
+from enum import Enum
 from typing import Dict, Generator, List, Optional, Tuple, Union
 
 import orjson
@@ -27,6 +28,12 @@ from ..constants import XINFERENCE_LOG_ARG_MAX_LENGTH
 logger = logging.getLogger(__name__)
 
 
+class AbortRequestMessage(Enum):
+    NOT_FOUND = 1
+    DONE = 2
+    NO_OP = 3
+
+
 def truncate_log_arg(arg) -> str:
     s = str(arg)
     if len(s) > XINFERENCE_LOG_ARG_MAX_LENGTH:
@@ -51,6 +58,8 @@ def log_async(
             request_id_str = kwargs.get("request_id", "")
             if not request_id_str:
                 request_id_str = uuid.uuid1()
+            if func_name == "text_to_image":
+                kwargs["request_id"] = request_id_str
             request_id_str = f"[request {request_id_str}]"
             formatted_args = ",".join(map(truncate_log_arg, args))
             formatted_kwargs = ",".join(
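
Moving AbortRequestMessage into core/utils.py removes model.py's import of scheduler.py, and log_async now writes the id it generates back into kwargs for text_to_image, so the image batch scheduler can abort that request by id later. A simplified sketch of the decorator's new behaviour (the real log_async takes a logger and logs arguments; only the request_id handling is shown):

    import functools
    import uuid

    def log_async(func):
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            request_id = kwargs.get("request_id", "") or str(uuid.uuid1())
            if func.__name__ == "text_to_image":
                # Feed the generated id back so the callee (and the batch
                # scheduler behind it) can reference this request.
                kwargs["request_id"] = request_id
            print(f"[request {request_id}] enter {func.__name__}")
            return await func(*args, **kwargs)

        return wrapper

    @log_async
    async def text_to_image(prompt, **kwargs):
        return kwargs["request_id"]  # always present now
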
xinference/deploy/supervisor.py CHANGED
@@ -31,6 +31,10 @@ from .utils import health_check
 
 logger = logging.getLogger(__name__)
 
+from ..model import _install as install_model
+
+install_model()
+
 
 async def _start_supervisor(address: str, logging_conf: Optional[Dict] = None):
     logging.config.dictConfig(logging_conf)  # type: ignore
xinference/model/__init__.py CHANGED
@@ -29,3 +29,7 @@ def _install():
     image_install()
     rerank_install()
     video_install()
+
+
+_install()
+del _install
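
Together with the __init__.py and supervisor.py hunks above, model registration now runs when xinference.model itself is first imported (and explicitly at supervisor startup) rather than on every top-level import xinference. The self-installing module pattern, reduced to its core (a simplified shape, not the real file):

    def _install():
        # stand-in for the real llm/image/rerank/video install hooks
        print("registering model families")

    _install()    # runs once, on first import of the module
    del _install  # keep the module namespace clean afterwards
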
xinference/model/image/scheduler/__init__.py ADDED
@@ -0,0 +1,13 @@
+# Copyright 2022-2024 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.