xinference 1.7.1.post1__py3-none-any.whl → 1.8.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only.

Note: this version of xinference has been flagged as potentially problematic.

Files changed (136)
  1. xinference/_version.py +3 -3
  2. xinference/client/restful/async_restful_client.py +8 -13
  3. xinference/client/restful/restful_client.py +6 -2
  4. xinference/core/chat_interface.py +6 -4
  5. xinference/core/media_interface.py +5 -0
  6. xinference/core/model.py +1 -5
  7. xinference/core/supervisor.py +117 -68
  8. xinference/core/worker.py +49 -37
  9. xinference/deploy/test/test_cmdline.py +2 -6
  10. xinference/model/audio/__init__.py +26 -23
  11. xinference/model/audio/chattts.py +3 -2
  12. xinference/model/audio/core.py +49 -98
  13. xinference/model/audio/cosyvoice.py +3 -2
  14. xinference/model/audio/custom.py +28 -73
  15. xinference/model/audio/f5tts.py +3 -2
  16. xinference/model/audio/f5tts_mlx.py +3 -2
  17. xinference/model/audio/fish_speech.py +3 -2
  18. xinference/model/audio/funasr.py +17 -4
  19. xinference/model/audio/kokoro.py +3 -2
  20. xinference/model/audio/megatts.py +3 -2
  21. xinference/model/audio/melotts.py +3 -2
  22. xinference/model/audio/model_spec.json +572 -171
  23. xinference/model/audio/utils.py +0 -6
  24. xinference/model/audio/whisper.py +3 -2
  25. xinference/model/audio/whisper_mlx.py +3 -2
  26. xinference/model/cache_manager.py +141 -0
  27. xinference/model/core.py +6 -49
  28. xinference/model/custom.py +174 -0
  29. xinference/model/embedding/__init__.py +67 -56
  30. xinference/model/embedding/cache_manager.py +35 -0
  31. xinference/model/embedding/core.py +104 -84
  32. xinference/model/embedding/custom.py +55 -78
  33. xinference/model/embedding/embed_family.py +80 -31
  34. xinference/model/embedding/flag/core.py +21 -5
  35. xinference/model/embedding/llama_cpp/__init__.py +0 -0
  36. xinference/model/embedding/llama_cpp/core.py +234 -0
  37. xinference/model/embedding/model_spec.json +968 -103
  38. xinference/model/embedding/sentence_transformers/core.py +30 -20
  39. xinference/model/embedding/vllm/core.py +11 -5
  40. xinference/model/flexible/__init__.py +8 -2
  41. xinference/model/flexible/core.py +26 -119
  42. xinference/model/flexible/custom.py +69 -0
  43. xinference/model/flexible/launchers/image_process_launcher.py +1 -0
  44. xinference/model/flexible/launchers/modelscope_launcher.py +5 -1
  45. xinference/model/flexible/launchers/transformers_launcher.py +15 -3
  46. xinference/model/flexible/launchers/yolo_launcher.py +5 -1
  47. xinference/model/image/__init__.py +20 -20
  48. xinference/model/image/cache_manager.py +62 -0
  49. xinference/model/image/core.py +70 -182
  50. xinference/model/image/custom.py +28 -72
  51. xinference/model/image/model_spec.json +402 -119
  52. xinference/model/image/ocr/got_ocr2.py +3 -2
  53. xinference/model/image/stable_diffusion/core.py +22 -7
  54. xinference/model/image/stable_diffusion/mlx.py +6 -6
  55. xinference/model/image/utils.py +2 -2
  56. xinference/model/llm/__init__.py +71 -94
  57. xinference/model/llm/cache_manager.py +292 -0
  58. xinference/model/llm/core.py +37 -111
  59. xinference/model/llm/custom.py +88 -0
  60. xinference/model/llm/llama_cpp/core.py +5 -7
  61. xinference/model/llm/llm_family.json +16260 -8151
  62. xinference/model/llm/llm_family.py +138 -839
  63. xinference/model/llm/lmdeploy/core.py +5 -7
  64. xinference/model/llm/memory.py +3 -4
  65. xinference/model/llm/mlx/core.py +6 -8
  66. xinference/model/llm/reasoning_parser.py +3 -1
  67. xinference/model/llm/sglang/core.py +32 -14
  68. xinference/model/llm/transformers/chatglm.py +3 -7
  69. xinference/model/llm/transformers/core.py +49 -27
  70. xinference/model/llm/transformers/deepseek_v2.py +2 -2
  71. xinference/model/llm/transformers/gemma3.py +2 -2
  72. xinference/model/llm/transformers/multimodal/cogagent.py +2 -2
  73. xinference/model/llm/transformers/multimodal/deepseek_vl2.py +2 -2
  74. xinference/model/llm/transformers/multimodal/gemma3.py +2 -2
  75. xinference/model/llm/transformers/multimodal/glm4_1v.py +167 -0
  76. xinference/model/llm/transformers/multimodal/glm4v.py +2 -2
  77. xinference/model/llm/transformers/multimodal/intern_vl.py +2 -2
  78. xinference/model/llm/transformers/multimodal/minicpmv26.py +3 -3
  79. xinference/model/llm/transformers/multimodal/ovis2.py +2 -2
  80. xinference/model/llm/transformers/multimodal/qwen-omni.py +2 -2
  81. xinference/model/llm/transformers/multimodal/qwen2_audio.py +2 -2
  82. xinference/model/llm/transformers/multimodal/qwen2_vl.py +2 -2
  83. xinference/model/llm/transformers/opt.py +3 -7
  84. xinference/model/llm/utils.py +34 -49
  85. xinference/model/llm/vllm/core.py +77 -27
  86. xinference/model/llm/vllm/xavier/engine.py +5 -3
  87. xinference/model/llm/vllm/xavier/scheduler.py +10 -6
  88. xinference/model/llm/vllm/xavier/transfer.py +1 -1
  89. xinference/model/rerank/__init__.py +26 -25
  90. xinference/model/rerank/core.py +47 -87
  91. xinference/model/rerank/custom.py +25 -71
  92. xinference/model/rerank/model_spec.json +158 -33
  93. xinference/model/rerank/utils.py +2 -2
  94. xinference/model/utils.py +115 -54
  95. xinference/model/video/__init__.py +13 -17
  96. xinference/model/video/core.py +44 -102
  97. xinference/model/video/diffusers.py +4 -3
  98. xinference/model/video/model_spec.json +90 -21
  99. xinference/types.py +5 -3
  100. xinference/web/ui/build/asset-manifest.json +3 -3
  101. xinference/web/ui/build/index.html +1 -1
  102. xinference/web/ui/build/static/js/main.7d24df53.js +3 -0
  103. xinference/web/ui/build/static/js/main.7d24df53.js.map +1 -0
  104. xinference/web/ui/node_modules/.cache/babel-loader/2704ff66a5f73ca78b341eb3edec60154369df9d87fbc8c6dd60121abc5e1b0a.json +1 -0
  105. xinference/web/ui/node_modules/.cache/babel-loader/607dfef23d33e6b594518c0c6434567639f24f356b877c80c60575184ec50ed0.json +1 -0
  106. xinference/web/ui/node_modules/.cache/babel-loader/9be3d56173aacc3efd0b497bcb13c4f6365de30069176ee9403b40e717542326.json +1 -0
  107. xinference/web/ui/node_modules/.cache/babel-loader/9f9dd6c32c78a222d07da5987ae902effe16bcf20aac00774acdccc4de3c9ff2.json +1 -0
  108. xinference/web/ui/node_modules/.cache/babel-loader/b2ab5ee972c60d15eb9abf5845705f8ab7e1d125d324d9a9b1bcae5d6fd7ffb2.json +1 -0
  109. xinference/web/ui/src/locales/en.json +0 -1
  110. xinference/web/ui/src/locales/ja.json +0 -1
  111. xinference/web/ui/src/locales/ko.json +0 -1
  112. xinference/web/ui/src/locales/zh.json +0 -1
  113. {xinference-1.7.1.post1.dist-info → xinference-1.8.0.dist-info}/METADATA +9 -11
  114. {xinference-1.7.1.post1.dist-info → xinference-1.8.0.dist-info}/RECORD +119 -119
  115. xinference/model/audio/model_spec_modelscope.json +0 -231
  116. xinference/model/embedding/model_spec_modelscope.json +0 -293
  117. xinference/model/embedding/utils.py +0 -18
  118. xinference/model/image/model_spec_modelscope.json +0 -375
  119. xinference/model/llm/llama_cpp/memory.py +0 -457
  120. xinference/model/llm/llm_family_csghub.json +0 -56
  121. xinference/model/llm/llm_family_modelscope.json +0 -8700
  122. xinference/model/llm/llm_family_openmind_hub.json +0 -1019
  123. xinference/model/rerank/model_spec_modelscope.json +0 -85
  124. xinference/model/video/model_spec_modelscope.json +0 -184
  125. xinference/web/ui/build/static/js/main.9b12b7f9.js +0 -3
  126. xinference/web/ui/build/static/js/main.9b12b7f9.js.map +0 -1
  127. xinference/web/ui/node_modules/.cache/babel-loader/1460361af6975e63576708039f1cb732faf9c672d97c494d4055fc6331460be0.json +0 -1
  128. xinference/web/ui/node_modules/.cache/babel-loader/4efd8dda58fda83ed9546bf2f587df67f8d98e639117bee2d9326a9a1d9bebb2.json +0 -1
  129. xinference/web/ui/node_modules/.cache/babel-loader/55b9fb40b57fa926e8f05f31c2f96467e76e5ad62f033dca97c03f9e8c4eb4fe.json +0 -1
  130. xinference/web/ui/node_modules/.cache/babel-loader/5b2dafe5aa9e1105e0244a2b6751807342fa86aa0144b4e84d947a1686102715.json +0 -1
  131. xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +0 -1
  132. /xinference/web/ui/build/static/js/{main.9b12b7f9.js.LICENSE.txt → main.7d24df53.js.LICENSE.txt} +0 -0
  133. {xinference-1.7.1.post1.dist-info → xinference-1.8.0.dist-info}/WHEEL +0 -0
  134. {xinference-1.7.1.post1.dist-info → xinference-1.8.0.dist-info}/entry_points.txt +0 -0
  135. {xinference-1.7.1.post1.dist-info → xinference-1.8.0.dist-info}/licenses/LICENSE +0 -0
  136. {xinference-1.7.1.post1.dist-info → xinference-1.8.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/core.py
@@ -22,35 +22,32 @@ from abc import abstractmethod
 from collections import defaultdict
 from contextvars import ContextVar
 from functools import lru_cache
-from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Union
 
 from ...core.utils import parse_replica_model_uid
 from ...types import PeftModelConfig
-from ..core import ModelDescription
 from .reasoning_parser import ReasoningParser
 
 if TYPE_CHECKING:
-    from .llm_family import LLMFamilyV1, LLMSpecV1
+    from .llm_family import LLMFamilyV2, LLMSpecV1
 
 logger = logging.getLogger(__name__)
 
 
-LLM_MODEL_DESCRIPTIONS: Dict[str, List[Dict]] = defaultdict(list)
+LLM_VERSION_INFOS: Dict[str, List[Dict]] = defaultdict(list)
 
 
-def get_llm_model_descriptions():
+def get_llm_version_infos():
     import copy
 
-    return copy.deepcopy(LLM_MODEL_DESCRIPTIONS)
+    return copy.deepcopy(LLM_VERSION_INFOS)
 
 
 class LLM(abc.ABC):
     def __init__(
         self,
         replica_model_uid: str,
-        model_family: "LLMFamilyV1",
-        model_spec: "LLMSpecV1",
-        quantization: str,
+        model_family: "LLMFamilyV2",
         model_path: str,
         *args,
         **kwargs,
@@ -58,8 +55,8 @@ class LLM(abc.ABC):
         self.model_uid, self.rep_id = parse_replica_model_uid(replica_model_uid)
         self.raw_model_uid = replica_model_uid
         self.model_family = model_family
-        self.model_spec = model_spec
-        self.quantization = quantization
+        self.model_spec = model_family.model_specs[0]
+        self.quantization = model_family.model_specs[0].quantization
         self.model_path = model_path
         self.reasoning_parser = None
         if args:
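
With this change, engine implementations no longer receive `model_spec` and `quantization` as separate constructor arguments: both are derived from the single spec carried by the matched `LLMFamilyV2`. A minimal sketch of the new contract (the engine class is hypothetical and its remaining abstract methods are elided):

    # Hypothetical engine subclass; only match_json is shown.
    class MyEngine(LLM):
        @classmethod
        def match_json(cls, llm_family, llm_spec, quantization) -> bool:
            return llm_spec.model_format == "pytorch"

    # `family` is an LLMFamilyV2 already narrowed to the single matched
    # spec, as match_llm() now returns (see the hunks below).
    model = MyEngine("my-model-0", family, "/path/to/model")
    # Populated by LLM.__init__ from model_family.model_specs[0]:
    assert model.model_spec is family.model_specs[0]
    assert model.quantization == family.model_specs[0].quantization
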
@@ -128,7 +125,7 @@ class LLM(abc.ABC):
 
     @classmethod
     def match(
-        cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if not cls.check_lib():
             return False
@@ -137,7 +134,7 @@ class LLM(abc.ABC):
     @classmethod
     @abstractmethod
     def match_json(
-        cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         raise NotImplementedError
 
@@ -169,89 +166,26 @@ class LLM(abc.ABC):
 chat_context_var: ContextVar[dict] = ContextVar("chat_context_var", default={})
 
 
-class LLMDescription(ModelDescription):
-    def __init__(
-        self,
-        address: Optional[str],
-        devices: Optional[List[str]],
-        llm_family: "LLMFamilyV1",
-        llm_spec: "LLMSpecV1",
-        quantization: Optional[str],
-        multimodal_projector: Optional[str] = None,
-        model_path: Optional[str] = None,
-    ):
-        super().__init__(address, devices, model_path=model_path)
-        self._llm_family = llm_family
-        self._llm_spec = llm_spec
-        self._quantization = quantization
-        self._multimodal_projector = multimodal_projector
-
-    @property
-    def spec(self):
-        return self._llm_family
-
-    def to_dict(self):
-        return {
-            "model_type": "LLM",
-            "address": self.address,
-            "accelerators": self.devices,
-            "model_name": self._llm_family.model_name,
-            "model_lang": self._llm_family.model_lang,
-            "model_ability": self._llm_family.model_ability,
-            "model_description": self._llm_family.model_description,
-            "model_format": self._llm_spec.model_format,
-            "model_size_in_billions": self._llm_spec.model_size_in_billions,
-            "model_family": self._llm_family.model_family
-            or self._llm_family.model_name,
-            "quantization": self._quantization,
-            "multimodal_projector": self._multimodal_projector,
-            "model_hub": self._llm_spec.model_hub,
-            "revision": self._llm_spec.model_revision,
-            "context_length": self._llm_family.context_length,
-        }
-
-    def to_version_info(self):
-        from .utils import get_file_location, get_model_version
-
-        model_file_location, cache_status = get_file_location(
-            self._llm_family, self._llm_spec, self._quantization
-        )
-
-        return {
-            "model_version": get_model_version(
-                self._llm_family, self._llm_spec, self._quantization
-            ),
-            "model_file_location": model_file_location,
-            "cache_status": cache_status,
-            "quantization": self._quantization,
-            "multimodal_projector": self._multimodal_projector,
-            "model_format": self._llm_spec.model_format,
-            "model_size_in_billions": self._llm_spec.model_size_in_billions,
-        }
-
-
-def generate_llm_description(llm_family: "LLMFamilyV1") -> Dict[str, List[Dict]]:
+def generate_llm_version_info(llm_family: "LLMFamilyV2") -> Dict[str, List[Dict]]:
     res = defaultdict(list)
-    for spec in llm_family.model_specs:
+    # Use model_specs from huggingface, as HuggingFace is the most comprehensive.
+    hf_specs = [
+        spec for spec in llm_family.model_specs if spec.model_hub == "huggingface"
+    ]
+    for spec in hf_specs:
+        _llm_family = llm_family.copy()
+        _llm_family.model_specs = [spec]
         multimodal_projectors = getattr(spec, "multimodal_projectors", None)
-        for q in spec.quantizations:
-            if multimodal_projectors:
-                for mmproj in multimodal_projectors:
-                    res[llm_family.model_name].append(
-                        LLMDescription(
-                            None, None, llm_family, spec, q, mmproj
-                        ).to_version_info()
-                    )
-            else:
-                res[llm_family.model_name].append(
-                    LLMDescription(None, None, llm_family, spec, q).to_version_info()
-                )
+        if multimodal_projectors:
+            for mmproj in multimodal_projectors:
+                _llm_family.multimodal_projector = mmproj
+                res[_llm_family.model_name].append(_llm_family.to_version_info())
+        else:
+            res[_llm_family.model_name].append(_llm_family.to_version_info())
     return res
 
 
 def create_llm_model_instance(
-    subpool_addr: str,
-    devices: List[str],
     model_uid: str,
     model_name: str,
     model_engine: Optional[str],
@@ -264,35 +198,35 @@ def create_llm_model_instance(
     ] = None,
     model_path: Optional[str] = None,
     **kwargs,
-) -> Tuple[LLM, LLMDescription]:
-    from .llm_family import cache, check_engine_by_spec_parameters, match_llm
+) -> LLM:
+    from .cache_manager import LLMCacheManager
+    from .llm_family import check_engine_by_spec_parameters, match_llm
 
     if model_engine is None:
         raise ValueError("model_engine is required for LLM model")
-    match_result = match_llm(
+    llm_family = match_llm(
         model_name, model_format, model_size_in_billions, quantization, download_hub
     )
 
-    if not match_result:
+    if not llm_family:
         raise ValueError(
             f"Model not found, name: {model_name}, format: {model_format},"
             f" size: {model_size_in_billions}, quantization: {quantization}"
         )
-    llm_family, llm_spec, quantization = match_result
-    assert quantization is not None
 
     llm_cls = check_engine_by_spec_parameters(
         model_engine,
         llm_family.model_name,
-        llm_spec.model_format,
-        llm_spec.model_size_in_billions,
-        quantization,
+        llm_family.model_specs[0].model_format,
+        llm_family.model_specs[0].model_size_in_billions,
+        llm_family.model_specs[0].quantization,
     )
     logger.debug(f"Launching {model_uid} with {llm_cls.__name__}")
 
     multimodal_projector = kwargs.get("multimodal_projector")
     if not model_path:
-        model_path = cache(llm_family, llm_spec, quantization, multimodal_projector)
+        cache_manager = LLMCacheManager(llm_family, multimodal_projector)
+        model_path = cache_manager.cache()
 
     peft_model = peft_model_config.peft_model if peft_model_config else None
     if peft_model is not None:
@@ -300,8 +234,6 @@ def create_llm_model_instance(
         model = llm_cls(
             model_uid,
             llm_family,
-            llm_spec,
-            quantization,
             model_path,
             kwargs,
             peft_model,
@@ -311,13 +243,7 @@ def create_llm_model_instance(
             f"Model not supported with lora, name: {model_name}, format: {model_format}, engine: {model_engine}. "
             f"Load this without lora."
         )
-        model = llm_cls(
-            model_uid, llm_family, llm_spec, quantization, model_path, kwargs
-        )
+        model = llm_cls(model_uid, llm_family, model_path, kwargs)
     else:
-        model = llm_cls(
-            model_uid, llm_family, llm_spec, quantization, model_path, kwargs
-        )
-    return model, LLMDescription(
-        subpool_addr, devices, llm_family, llm_spec, quantization, multimodal_projector
-    )
+        model = llm_cls(model_uid, llm_family, model_path, kwargs)
+    return model
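
After these hunks, `create_llm_model_instance` returns only the model object: the `(model, LLMDescription)` tuple and the `subpool_addr`/`devices` parameters are gone, and caching goes through the new `LLMCacheManager`. A sketch of how a 1.7.x call site migrates (argument values are illustrative):

    # 1.7.x: placement info went in, a (model, description) pair came out.
    # model, desc = create_llm_model_instance(
    #     subpool_addr, devices, model_uid, "qwen2.5-instruct", "vllm"
    # )

    # 1.8.0: only the model comes back; version info is produced from the
    # LLMFamilyV2 itself (see generate_llm_version_info above).
    model = create_llm_model_instance(
        model_uid="my-model-0",
        model_name="qwen2.5-instruct",
        model_engine="vllm",
    )
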
xinference/model/llm/custom.py (new file)
@@ -0,0 +1,88 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from typing import TYPE_CHECKING, List
+
+from ..custom import ModelRegistry
+
+if TYPE_CHECKING:
+    from .llm_family import LLMFamilyV2
+
+
+logger = logging.getLogger(__name__)
+
+
+UD_LLM_FAMILIES: List["LLMFamilyV2"] = []
+
+
+class LLMModelRegistry(ModelRegistry):
+    model_type = "llm"
+
+    def __init__(self):
+        from .llm_family import BUILTIN_LLM_FAMILIES
+
+        super().__init__()
+        self.models = UD_LLM_FAMILIES
+        self.builtin_models = [x.model_name for x in BUILTIN_LLM_FAMILIES]
+
+    def add_ud_model(self, model_spec):
+        from . import generate_engine_config_by_model_family
+
+        self.models.append(model_spec)
+        generate_engine_config_by_model_family(model_spec)
+
+    def check_model_uri(self, llm_family: "LLMFamilyV2"):
+        from ..utils import is_valid_model_uri
+
+        for spec in llm_family.model_specs:
+            model_uri = spec.model_uri
+            if model_uri and not is_valid_model_uri(model_uri):
+                raise ValueError(f"Invalid model URI {model_uri}.")
+
+    def remove_ud_model(self, llm_family: "LLMFamilyV2"):
+        from .llm_family import LLM_ENGINES
+
+        UD_LLM_FAMILIES.remove(llm_family)
+        del LLM_ENGINES[llm_family.model_name]
+
+    def remove_ud_model_files(self, llm_family: "LLMFamilyV2"):
+        from .cache_manager import LLMCacheManager
+
+        _llm_family = llm_family.copy()
+        for spec in llm_family.model_specs:
+            _llm_family.model_specs = [spec]
+            cache_manager = LLMCacheManager(_llm_family)
+            cache_manager.unregister_custom_model(self.model_type)
+
+
+def get_user_defined_llm_families():
+    from ..custom import RegistryManager
+
+    registry = RegistryManager.get_registry("llm")
+    return registry.get_custom_models()
+
+
+def register_llm(llm_family: "LLMFamilyV2", persist: bool):
+    from ..custom import RegistryManager
+
+    registry = RegistryManager.get_registry("llm")
+    registry.register(llm_family, persist)
+
+
+def unregister_llm(model_name: str, raise_error: bool = True):
+    from ..custom import RegistryManager
+
+    registry = RegistryManager.get_registry("llm")
+    registry.unregister(model_name, raise_error)
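
This new module routes custom-model bookkeeping through the shared `ModelRegistry`/`RegistryManager` machinery in `xinference/model/custom.py` (also added in this release, file 28 above). A sketch of the intended flow, assuming `my_family` is an `LLMFamilyV2` for a user-defined model (construction not shown):

    from xinference.model.llm.custom import (
        get_user_defined_llm_families,
        register_llm,
        unregister_llm,
    )

    register_llm(my_family, persist=True)  # register; persist=True keeps it across restarts
    assert my_family in get_user_defined_llm_families()
    unregister_llm(my_family.model_name)   # raises on unknown names by default
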
xinference/model/llm/llama_cpp/core.py
@@ -23,9 +23,8 @@ import orjson
 
 from ....types import ChatCompletion, ChatCompletionChunk, Completion, CompletionChunk
 from ..core import LLM
-from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..llm_family import LLMFamilyV2, LLMSpecV1
 from ..utils import ChatModelMixin
-from .memory import estimate_gpu_layers
 
 logger = logging.getLogger(__name__)
 
@@ -43,13 +42,11 @@ class XllamaCppModel(LLM, ChatModelMixin):
     def __init__(
         self,
         model_uid: str,
-        model_family: "LLMFamilyV1",
-        model_spec: "LLMSpecV1",
-        quantization: str,
+        model_family: "LLMFamilyV2",
         model_path: str,
         llamacpp_model_config: Optional[dict] = None,
     ):
-        super().__init__(model_uid, model_family, model_spec, quantization, model_path)
+        super().__init__(model_uid, model_family, model_path)
         self._llamacpp_model_config = self._sanitize_model_config(llamacpp_model_config)
         self._llm = None
         self._executor: Optional[concurrent.futures.ThreadPoolExecutor] = None
@@ -84,7 +81,7 @@ class XllamaCppModel(LLM, ChatModelMixin):
 
     @classmethod
     def match_json(
-        cls, llm_family: LLMFamilyV1, llm_spec: LLMSpecV1, quantization: str
+        cls, llm_family: LLMFamilyV2, llm_spec: LLMSpecV1, quantization: str
     ) -> bool:
         if llm_spec.model_format not in ["ggufv2"]:
             return False
@@ -100,6 +97,7 @@ class XllamaCppModel(LLM, ChatModelMixin):
         from xllamacpp import (
             CommonParams,
             Server,
+            estimate_gpu_layers,
             get_device_info,
             ggml_backend_dev_type,
         )
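
This last hunk pairs with the removal of `xinference/model/llm/llama_cpp/memory.py` (file 119 in the list above, -457 lines): the GPU-layer estimator is no longer vendored inside xinference and is imported from the `xllamacpp` package instead.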