xinference 1.7.1__py3-none-any.whl → 1.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (136)
  1. xinference/_version.py +3 -3
  2. xinference/client/restful/async_restful_client.py +8 -13
  3. xinference/client/restful/restful_client.py +6 -2
  4. xinference/core/chat_interface.py +6 -4
  5. xinference/core/media_interface.py +5 -0
  6. xinference/core/model.py +1 -5
  7. xinference/core/supervisor.py +117 -68
  8. xinference/core/worker.py +49 -37
  9. xinference/deploy/test/test_cmdline.py +2 -6
  10. xinference/model/audio/__init__.py +26 -23
  11. xinference/model/audio/chattts.py +3 -2
  12. xinference/model/audio/core.py +49 -98
  13. xinference/model/audio/cosyvoice.py +3 -2
  14. xinference/model/audio/custom.py +28 -73
  15. xinference/model/audio/f5tts.py +3 -2
  16. xinference/model/audio/f5tts_mlx.py +3 -2
  17. xinference/model/audio/fish_speech.py +3 -2
  18. xinference/model/audio/funasr.py +17 -4
  19. xinference/model/audio/kokoro.py +3 -2
  20. xinference/model/audio/megatts.py +3 -2
  21. xinference/model/audio/melotts.py +3 -2
  22. xinference/model/audio/model_spec.json +572 -171
  23. xinference/model/audio/utils.py +0 -6
  24. xinference/model/audio/whisper.py +3 -2
  25. xinference/model/audio/whisper_mlx.py +3 -2
  26. xinference/model/cache_manager.py +141 -0
  27. xinference/model/core.py +6 -49
  28. xinference/model/custom.py +174 -0
  29. xinference/model/embedding/__init__.py +67 -56
  30. xinference/model/embedding/cache_manager.py +35 -0
  31. xinference/model/embedding/core.py +104 -84
  32. xinference/model/embedding/custom.py +55 -78
  33. xinference/model/embedding/embed_family.py +80 -31
  34. xinference/model/embedding/flag/core.py +21 -5
  35. xinference/model/embedding/llama_cpp/__init__.py +0 -0
  36. xinference/model/embedding/llama_cpp/core.py +234 -0
  37. xinference/model/embedding/model_spec.json +968 -103
  38. xinference/model/embedding/sentence_transformers/core.py +30 -20
  39. xinference/model/embedding/vllm/core.py +11 -5
  40. xinference/model/flexible/__init__.py +8 -2
  41. xinference/model/flexible/core.py +26 -119
  42. xinference/model/flexible/custom.py +69 -0
  43. xinference/model/flexible/launchers/image_process_launcher.py +1 -0
  44. xinference/model/flexible/launchers/modelscope_launcher.py +5 -1
  45. xinference/model/flexible/launchers/transformers_launcher.py +15 -3
  46. xinference/model/flexible/launchers/yolo_launcher.py +5 -1
  47. xinference/model/image/__init__.py +20 -20
  48. xinference/model/image/cache_manager.py +62 -0
  49. xinference/model/image/core.py +70 -182
  50. xinference/model/image/custom.py +28 -72
  51. xinference/model/image/model_spec.json +402 -119
  52. xinference/model/image/ocr/got_ocr2.py +3 -2
  53. xinference/model/image/stable_diffusion/core.py +22 -7
  54. xinference/model/image/stable_diffusion/mlx.py +6 -6
  55. xinference/model/image/utils.py +2 -2
  56. xinference/model/llm/__init__.py +71 -94
  57. xinference/model/llm/cache_manager.py +292 -0
  58. xinference/model/llm/core.py +37 -111
  59. xinference/model/llm/custom.py +88 -0
  60. xinference/model/llm/llama_cpp/core.py +5 -7
  61. xinference/model/llm/llm_family.json +16260 -8151
  62. xinference/model/llm/llm_family.py +138 -839
  63. xinference/model/llm/lmdeploy/core.py +5 -7
  64. xinference/model/llm/memory.py +3 -4
  65. xinference/model/llm/mlx/core.py +6 -8
  66. xinference/model/llm/reasoning_parser.py +3 -1
  67. xinference/model/llm/sglang/core.py +32 -14
  68. xinference/model/llm/transformers/chatglm.py +3 -7
  69. xinference/model/llm/transformers/core.py +49 -27
  70. xinference/model/llm/transformers/deepseek_v2.py +2 -2
  71. xinference/model/llm/transformers/gemma3.py +2 -2
  72. xinference/model/llm/transformers/multimodal/cogagent.py +2 -2
  73. xinference/model/llm/transformers/multimodal/deepseek_vl2.py +2 -2
  74. xinference/model/llm/transformers/multimodal/gemma3.py +2 -2
  75. xinference/model/llm/transformers/multimodal/glm4_1v.py +167 -0
  76. xinference/model/llm/transformers/multimodal/glm4v.py +2 -2
  77. xinference/model/llm/transformers/multimodal/intern_vl.py +2 -2
  78. xinference/model/llm/transformers/multimodal/minicpmv26.py +3 -3
  79. xinference/model/llm/transformers/multimodal/ovis2.py +2 -2
  80. xinference/model/llm/transformers/multimodal/qwen-omni.py +2 -2
  81. xinference/model/llm/transformers/multimodal/qwen2_audio.py +2 -2
  82. xinference/model/llm/transformers/multimodal/qwen2_vl.py +2 -2
  83. xinference/model/llm/transformers/opt.py +3 -7
  84. xinference/model/llm/utils.py +34 -49
  85. xinference/model/llm/vllm/core.py +77 -27
  86. xinference/model/llm/vllm/xavier/engine.py +5 -3
  87. xinference/model/llm/vllm/xavier/scheduler.py +10 -6
  88. xinference/model/llm/vllm/xavier/transfer.py +1 -1
  89. xinference/model/rerank/__init__.py +26 -25
  90. xinference/model/rerank/core.py +47 -87
  91. xinference/model/rerank/custom.py +25 -71
  92. xinference/model/rerank/model_spec.json +158 -33
  93. xinference/model/rerank/utils.py +2 -2
  94. xinference/model/utils.py +115 -54
  95. xinference/model/video/__init__.py +13 -17
  96. xinference/model/video/core.py +44 -102
  97. xinference/model/video/diffusers.py +4 -3
  98. xinference/model/video/model_spec.json +90 -21
  99. xinference/types.py +5 -3
  100. xinference/web/ui/build/asset-manifest.json +3 -3
  101. xinference/web/ui/build/index.html +1 -1
  102. xinference/web/ui/build/static/js/main.7d24df53.js +3 -0
  103. xinference/web/ui/build/static/js/main.7d24df53.js.map +1 -0
  104. xinference/web/ui/node_modules/.cache/babel-loader/2704ff66a5f73ca78b341eb3edec60154369df9d87fbc8c6dd60121abc5e1b0a.json +1 -0
  105. xinference/web/ui/node_modules/.cache/babel-loader/607dfef23d33e6b594518c0c6434567639f24f356b877c80c60575184ec50ed0.json +1 -0
  106. xinference/web/ui/node_modules/.cache/babel-loader/9be3d56173aacc3efd0b497bcb13c4f6365de30069176ee9403b40e717542326.json +1 -0
  107. xinference/web/ui/node_modules/.cache/babel-loader/9f9dd6c32c78a222d07da5987ae902effe16bcf20aac00774acdccc4de3c9ff2.json +1 -0
  108. xinference/web/ui/node_modules/.cache/babel-loader/b2ab5ee972c60d15eb9abf5845705f8ab7e1d125d324d9a9b1bcae5d6fd7ffb2.json +1 -0
  109. xinference/web/ui/src/locales/en.json +0 -1
  110. xinference/web/ui/src/locales/ja.json +0 -1
  111. xinference/web/ui/src/locales/ko.json +0 -1
  112. xinference/web/ui/src/locales/zh.json +0 -1
  113. {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/METADATA +9 -11
  114. {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/RECORD +119 -119
  115. xinference/model/audio/model_spec_modelscope.json +0 -231
  116. xinference/model/embedding/model_spec_modelscope.json +0 -293
  117. xinference/model/embedding/utils.py +0 -18
  118. xinference/model/image/model_spec_modelscope.json +0 -375
  119. xinference/model/llm/llama_cpp/memory.py +0 -457
  120. xinference/model/llm/llm_family_csghub.json +0 -56
  121. xinference/model/llm/llm_family_modelscope.json +0 -8700
  122. xinference/model/llm/llm_family_openmind_hub.json +0 -1019
  123. xinference/model/rerank/model_spec_modelscope.json +0 -85
  124. xinference/model/video/model_spec_modelscope.json +0 -184
  125. xinference/web/ui/build/static/js/main.9b12b7f9.js +0 -3
  126. xinference/web/ui/build/static/js/main.9b12b7f9.js.map +0 -1
  127. xinference/web/ui/node_modules/.cache/babel-loader/1460361af6975e63576708039f1cb732faf9c672d97c494d4055fc6331460be0.json +0 -1
  128. xinference/web/ui/node_modules/.cache/babel-loader/4efd8dda58fda83ed9546bf2f587df67f8d98e639117bee2d9326a9a1d9bebb2.json +0 -1
  129. xinference/web/ui/node_modules/.cache/babel-loader/55b9fb40b57fa926e8f05f31c2f96467e76e5ad62f033dca97c03f9e8c4eb4fe.json +0 -1
  130. xinference/web/ui/node_modules/.cache/babel-loader/5b2dafe5aa9e1105e0244a2b6751807342fa86aa0144b4e84d947a1686102715.json +0 -1
  131. xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +0 -1
  132. /xinference/web/ui/build/static/js/{main.9b12b7f9.js.LICENSE.txt → main.7d24df53.js.LICENSE.txt} +0 -0
  133. {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/WHEEL +0 -0
  134. {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/entry_points.txt +0 -0
  135. {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/licenses/LICENSE +0 -0
  136. {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/top_level.txt +0 -0

xinference/model/image/ocr/got_ocr2.py

@@ -18,7 +18,7 @@ from typing import TYPE_CHECKING, Optional
 import PIL.Image
 
 if TYPE_CHECKING:
-    from ..core import ImageModelFamilyV1
+    from ..core import ImageModelFamilyV2
 
 logger = logging.getLogger(__name__)
 
@@ -29,9 +29,10 @@ class GotOCR2Model:
         model_uid: str,
         model_path: Optional[str] = None,
         device: Optional[str] = None,
-        model_spec: Optional["ImageModelFamilyV1"] = None,
+        model_spec: Optional["ImageModelFamilyV2"] = None,
         **kwargs,
     ):
+        self.model_family = model_spec
         self._model_uid = model_uid
         self._model_path = model_path
         self._device = device

xinference/model/image/stable_diffusion/core.py

@@ -37,7 +37,7 @@ from ..utils import handle_image_result
 
 if TYPE_CHECKING:
     from ....core.progress_tracker import Progressor
-    from ..core import ImageModelFamilyV1
+    from ..core import ImageModelFamilyV2
 
 logger = logging.getLogger(__name__)
 
@@ -87,10 +87,11 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         lora_model: Optional[List[LoRA]] = None,
         lora_load_kwargs: Optional[Dict] = None,
         lora_fuse_kwargs: Optional[Dict] = None,
-        model_spec: Optional["ImageModelFamilyV1"] = None,
+        model_spec: Optional["ImageModelFamilyV2"] = None,
         gguf_model_path: Optional[str] = None,
         **kwargs,
     ):
+        self.model_family = model_spec
         self._model_uid = model_uid
         self._model_path = model_path
         self._device = device
@@ -239,10 +240,22 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         logger.debug(
             "Loading model from %s, kwargs: %s", self._model_path, self._kwargs
         )
-        self._model = AutoPipelineModel.from_pretrained(
-            self._model_path,
-            **self._kwargs,
-        )
+        try:
+            self._model = AutoPipelineModel.from_pretrained(
+                self._model_path,
+                **self._kwargs,
+            )
+        except ValueError:
+            if "kontext" in self._model_spec.model_name.lower():
+                # TODO: remove this branch when auto pipeline supports
+                # flux.1-kontext-dev
+                from diffusers import FluxKontextPipeline
+
+                self._model = FluxKontextPipeline.from_pretrained(
+                    self._model_path, **self._kwargs
+                )
+            else:
+                raise
         self._load_to_device(self._model)
         self._apply_lora()
 
@@ -657,7 +670,9 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         response_format: str = "url",
         **kwargs,
    ):
-        if self._kwargs.get("controlnet"):
+        if self._kwargs.get("controlnet") or self._model_spec.model_ability == [  # type: ignore
+            "image2image"
+        ]:
            model = self._model
         else:
             ability = "image2image"

xinference/model/image/stable_diffusion/mlx.py

@@ -20,7 +20,6 @@ from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
 
 import numpy as np
 from PIL import Image
-from xoscar.utils import classproperty
 
 from ....types import LoRA
 from ..sdapi import SDAPIDiffusionModelMixin
@@ -28,7 +27,7 @@ from ..utils import handle_image_result
 
 if TYPE_CHECKING:
     from ....core.progress_tracker import Progressor
-    from ..core import ImageModelFamilyV1
+    from ..core import ImageModelFamilyV2
 
 
 logger = logging.getLogger(__name__)
@@ -61,9 +60,10 @@ class MLXDiffusionModel(SDAPIDiffusionModelMixin):
         lora_model: Optional[List[LoRA]] = None,
         lora_load_kwargs: Optional[Dict] = None,
         lora_fuse_kwargs: Optional[Dict] = None,
-        model_spec: Optional["ImageModelFamilyV1"] = None,
+        model_spec: Optional["ImageModelFamilyV2"] = None,
         **kwargs,
     ):
+        self.model_family = model_spec
         self._model_uid = model_uid
         self._model_path = model_path
         self._device = device
@@ -81,9 +81,9 @@ class MLXDiffusionModel(SDAPIDiffusionModelMixin):
     def model_ability(self):
         return self._abilities
 
-    @classproperty
-    def supported_models(self):
-        return ["FLUX.1-schnell", "FLUX.1-dev"]
+    @staticmethod
+    def support_model(model_name: str) -> bool:
+        return "flux" in model_name.lower()
 
     def load(self):
         try:
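
Note: the MLX image backend drops its hard-coded whitelist (`supported_models`) in favor of a name-based predicate, so any FLUX-family model name is accepted. A minimal sketch of the new check, assuming the module path shown in the file list above (the model names are illustrative only):

    # Illustrative only: support_model() does a case-insensitive substring match on "flux".
    from xinference.model.image.stable_diffusion.mlx import MLXDiffusionModel

    assert MLXDiffusionModel.support_model("FLUX.1-schnell") is True
    assert MLXDiffusionModel.support_model("FLUX.1-dev") is True
    assert MLXDiffusionModel.support_model("sd3.5-medium") is False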

xinference/model/image/utils.py

@@ -24,11 +24,11 @@ from ...constants import XINFERENCE_IMAGE_DIR
 from ...types import Image, ImageList
 
 if TYPE_CHECKING:
-    from .core import ImageModelFamilyV1
+    from .core import ImageModelFamilyV2
 
 
 def get_model_version(
-    image_model: "ImageModelFamilyV1", controlnet: Optional["ImageModelFamilyV1"]
+    image_model: "ImageModelFamilyV2", controlnet: Optional["ImageModelFamilyV2"]
 ) -> str:
     return (
         image_model.model_name

xinference/model/llm/__init__.py

@@ -11,28 +11,25 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import codecs
 import json
 import os
 import warnings
 
+from ..utils import flatten_quantizations
 from .core import (
     LLM,
-    LLM_MODEL_DESCRIPTIONS,
-    LLMDescription,
-    generate_llm_description,
-    get_llm_model_descriptions,
+    LLM_VERSION_INFOS,
+    generate_llm_version_info,
+    get_llm_version_infos,
 )
+from .custom import get_user_defined_llm_families, register_llm, unregister_llm
 from .llm_family import (
-    BUILTIN_CSGHUB_LLM_FAMILIES,
     BUILTIN_LLM_FAMILIES,
     BUILTIN_LLM_MODEL_CHAT_FAMILIES,
     BUILTIN_LLM_MODEL_GENERATE_FAMILIES,
     BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES,
     BUILTIN_LLM_PROMPT_STYLE,
-    BUILTIN_MODELSCOPE_LLM_FAMILIES,
-    BUILTIN_OPENMIND_HUB_LLM_FAMILIES,
     LLAMA_CLASSES,
     LLM_ENGINES,
     LMDEPLOY_CLASSES,
@@ -41,17 +38,13 @@ from .llm_family import (
     SUPPORTED_ENGINES,
     TRANSFORMERS_CLASSES,
     VLLM_CLASSES,
-    CustomLLMFamilyV1,
-    LlamaCppLLMSpecV1,
-    LLMFamilyV1,
+    CustomLLMFamilyV2,
+    LlamaCppLLMSpecV2,
+    LLMFamilyV2,
     LLMSpecV1,
-    MLXLLMSpecV1,
-    PytorchLLMSpecV1,
-    get_cache_status,
-    get_user_defined_llm_families,
+    MLXLLMSpecV2,
+    PytorchLLMSpecV2,
     match_llm,
-    register_llm,
-    unregister_llm,
 )
 
 
@@ -64,69 +57,72 @@ def check_format_with_engine(model_format, engine):
     return True
 
 
-def generate_engine_config_by_model_family(model_family):
+def generate_engine_config_by_model_family(model_family: "LLMFamilyV2"):
     model_name = model_family.model_name
     specs = model_family.model_specs
     engines = LLM_ENGINES.get(model_name, {})  # structure for engine query
     for spec in specs:
         model_format = spec.model_format
         model_size_in_billions = spec.model_size_in_billions
-        quantizations = spec.quantizations
-        for quantization in quantizations:
-            # traverse all supported engines to match the name, format, size in billions and quantization of model
-            for engine in SUPPORTED_ENGINES:
-                if not check_format_with_engine(
-                    model_format, engine
-                ):  # match the format of model with engine
-                    continue
-                CLASSES = SUPPORTED_ENGINES[engine]
-                for cls in CLASSES:
-                    if cls.match(model_family, spec, quantization):
-                        engine_params = engines.get(engine, [])
-                        already_exists = False
-                        # if the name, format and size in billions of model already exists in the structure, add the new quantization
-                        for param in engine_params:
-                            if (
-                                model_name == param["model_name"]
-                                and model_format == param["model_format"]
-                                and model_size_in_billions
-                                == param["model_size_in_billions"]
-                            ):
-                                if quantization not in param["quantizations"]:
-                                    param["quantizations"].append(quantization)
-                                already_exists = True
-                                break
-                        # successfully match the params for the first time, add to the structure
-                        if not already_exists:
-                            engine_params.append(
-                                {
-                                    "model_name": model_name,
-                                    "model_format": model_format,
-                                    "model_size_in_billions": model_size_in_billions,
-                                    "quantizations": [quantization],
-                                    "llm_class": cls,
-                                }
-                            )
-                            if hasattr(spec, "multimodal_projectors"):
-                                engine_params[-1][
-                                    "multimodal_projectors"
-                                ] = spec.multimodal_projectors
-                        engines[engine] = engine_params
-                        break
+        quantization = spec.quantization
+        # traverse all supported engines to match the name, format, size in billions and quantization of model
+        for engine in SUPPORTED_ENGINES:
+            if not check_format_with_engine(
+                model_format, engine
+            ):  # match the format of model with engine
+                continue
+            CLASSES = SUPPORTED_ENGINES[engine]
+            for cls in CLASSES:
+                if cls.match(model_family, spec, quantization):
+                    engine_params = engines.get(engine, [])
+                    already_exists = False
+                    # if the name, format and size in billions of model already exists in the structure, add the new quantization
+                    for param in engine_params:
+                        if (
+                            model_name == param["model_name"]
+                            and model_format == param["model_format"]
+                            and model_size_in_billions
+                            == param["model_size_in_billions"]
+                        ):
+                            if quantization not in param["quantizations"]:
+                                param["quantizations"].append(quantization)
+                            already_exists = True
+                            break
+                    # successfully match the params for the first time, add to the structure
+                    if not already_exists:
+                        engine_params.append(
+                            {
+                                "model_name": model_name,
+                                "model_format": model_format,
+                                "model_size_in_billions": model_size_in_billions,
+                                "quantizations": [quantization],
+                                "llm_class": cls,
+                            }
+                        )
+                        if hasattr(spec, "multimodal_projectors"):
+                            engine_params[-1][
+                                "multimodal_projectors"
+                            ] = spec.multimodal_projectors
+                    engines[engine] = engine_params
+                    break
     LLM_ENGINES[model_name] = engines
 
 
 def register_custom_model():
     from ...constants import XINFERENCE_MODEL_DIR
+    from ..custom import migrate_from_v1_to_v2
+
+    # migrate from v1 to v2 first
+    migrate_from_v1_to_v2("llm", CustomLLMFamilyV2)
 
-    user_defined_llm_dir = os.path.join(XINFERENCE_MODEL_DIR, "llm")
+    user_defined_llm_dir = os.path.join(XINFERENCE_MODEL_DIR, "v2", "llm")
     if os.path.isdir(user_defined_llm_dir):
         for f in os.listdir(user_defined_llm_dir):
             try:
                 with codecs.open(
                     os.path.join(user_defined_llm_dir, f), encoding="utf-8"
                 ) as fd:
-                    user_defined_llm_family = CustomLLMFamilyV1.parse_raw(fd.read())
+                    user_defined_llm_family = CustomLLMFamilyV2.parse_raw(fd.read())
                     register_llm(user_defined_llm_family, persist=False)
             except Exception as e:
                 warnings.warn(f"{user_defined_llm_dir}/{f} has error, {e}")
@@ -135,7 +131,11 @@ def register_custom_model():
 def load_model_family_from_json(json_filename, target_families):
     json_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), json_filename)
     for json_obj in json.load(codecs.open(json_path, "r", encoding="utf-8")):
-        model_spec = LLMFamilyV1.parse_obj(json_obj)
+        flattened = []
+        for spec in json_obj["model_specs"]:
+            flattened.extend(flatten_quantizations(spec))
+        json_obj["model_specs"] = flattened
+        model_spec = LLMFamilyV2.parse_obj(json_obj)
         target_families.append(model_spec)
 
     # register chat_template
@@ -178,11 +178,7 @@ def _install():
     from .vllm.core import VLLMChatModel, VLLMModel, VLLMVisionModel
 
     # register llm classes.
-    LLAMA_CLASSES.extend(
-        [
-            XllamaCppModel,
-        ]
-    )
+    LLAMA_CLASSES.extend([XllamaCppModel])
    SGLANG_CLASSES.extend([SGLANGModel, SGLANGChatModel, SGLANGVisionModel])
     VLLM_CLASSES.extend([VLLMModel, VLLMChatModel, VLLMVisionModel])
     MLX_CLASSES.extend([MLXModel, MLXChatModel, MLXVisionModel])
@@ -198,36 +194,17 @@ def _install():
     SUPPORTED_ENGINES["LMDEPLOY"] = LMDEPLOY_CLASSES
 
     load_model_family_from_json("llm_family.json", BUILTIN_LLM_FAMILIES)
-    load_model_family_from_json(
-        "llm_family_modelscope.json", BUILTIN_MODELSCOPE_LLM_FAMILIES
-    )
-    load_model_family_from_json(
-        "llm_family_openmind_hub.json", BUILTIN_OPENMIND_HUB_LLM_FAMILIES
-    )
-    load_model_family_from_json("llm_family_csghub.json", BUILTIN_CSGHUB_LLM_FAMILIES)
-
-    for llm_specs in [
-        BUILTIN_LLM_FAMILIES,
-        BUILTIN_MODELSCOPE_LLM_FAMILIES,
-        BUILTIN_OPENMIND_HUB_LLM_FAMILIES,
-        BUILTIN_CSGHUB_LLM_FAMILIES,
-    ]:
-        for llm_spec in llm_specs:
-            if llm_spec.model_name not in LLM_MODEL_DESCRIPTIONS:
-                LLM_MODEL_DESCRIPTIONS.update(generate_llm_description(llm_spec))
+
+    for family in BUILTIN_LLM_FAMILIES:
+        if family.model_name not in LLM_VERSION_INFOS:
+            LLM_VERSION_INFOS.update(generate_llm_version_info(family))
 
     # traverse all families and add engine parameters corresponding to the model name
-    for families in [
-        BUILTIN_LLM_FAMILIES,
-        BUILTIN_MODELSCOPE_LLM_FAMILIES,
-        BUILTIN_OPENMIND_HUB_LLM_FAMILIES,
-        BUILTIN_CSGHUB_LLM_FAMILIES,
-    ]:
-        for family in families:
-            generate_engine_config_by_model_family(family)
+    for family in BUILTIN_LLM_FAMILIES:
+        generate_engine_config_by_model_family(family)
 
     register_custom_model()
 
     # register model description
     for ud_llm in get_user_defined_llm_families():
-        LLM_MODEL_DESCRIPTIONS.update(generate_llm_description(ud_llm))
+        LLM_VERSION_INFOS.update(generate_llm_version_info(ud_llm))
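
In the v2 family schema each model spec carries a single `quantization`, so `load_model_family_from_json` above flattens the older list-valued `quantizations` field before parsing `LLMFamilyV2`. The helper comes from `xinference/model/utils.py` (also changed in this release, but its body is not shown in this diff); a hedged sketch of the behavior implied by the calling code:

    # Hypothetical sketch, not the actual implementation: expand a v1-style spec dict
    # carrying "quantizations": [...] into one v2-style spec per quantization.
    def flatten_quantizations(spec: dict) -> list[dict]:
        quantizations = spec.pop("quantizations", None)
        if not quantizations:
            return [spec]  # already v2-style: a single "quantization" value
        return [{**spec, "quantization": q} for q in quantizations]

This is consistent with `generate_engine_config_by_model_family` now reading `spec.quantization` (singular) instead of iterating over `spec.quantizations`.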

xinference/model/llm/cache_manager.py (new file)

@@ -0,0 +1,292 @@
+import logging
+import os
+from typing import TYPE_CHECKING, Optional
+
+from ..cache_manager import CacheManager
+
+if TYPE_CHECKING:
+    from .llm_family import LLMFamilyV2
+
+
+logger = logging.getLogger(__name__)
+
+
+class LLMCacheManager(CacheManager):
+    def __init__(
+        self, llm_family: "LLMFamilyV2", multimodal_projector: Optional[str] = None
+    ):
+        super().__init__(llm_family)
+        self._llm_family = llm_family
+        self._model_name = llm_family.model_name
+        self._model_format = llm_family.model_specs[0].model_format
+        self._model_size_in_billions = getattr(
+            llm_family.model_specs[0], "model_size_in_billions", None
+        )
+        self._quantization = llm_family.model_specs[0].quantization
+        self._model_uri = llm_family.model_specs[0].model_uri
+        self._multimodal_projector = multimodal_projector
+        self._model_id = llm_family.model_specs[0].model_id
+        self._model_hub = llm_family.model_specs[0].model_hub
+        self._model_revision = llm_family.model_specs[0].model_revision
+        self._cache_dir = os.path.join(
+            self._v2_cache_dir_prefix,
+            f"{self._model_name.replace('.', '_')}-{self._model_format}-"
+            f"{self._model_size_in_billions}b-{self._quantization}",
+        )
+
+    def cache_uri(self) -> str:
+        from ..utils import parse_uri
+
+        cache_dir = self.get_cache_dir()
+        assert self._model_uri is not None
+        src_scheme, src_root = parse_uri(self._model_uri)
+        if src_root.endswith("/"):
+            # remove trailing path separator.
+            src_root = src_root[:-1]
+
+        if src_scheme == "file":
+            if not os.path.isabs(src_root):
+                raise ValueError(
+                    f"Model URI cannot be a relative path: {self._model_uri}"
+                )
+            if os.path.exists(cache_dir):
+                logger.info(f"Cache {cache_dir} exists")
+                return cache_dir
+            else:
+                os.symlink(src_root, cache_dir, target_is_directory=True)
+            return cache_dir
+        else:
+            raise ValueError(f"Unsupported URL scheme: {src_scheme}")
+
+    def cache_from_huggingface(self) -> str:
+        """
+        Cache model from Hugging Face. Return the cache directory.
+        """
+        import huggingface_hub
+
+        from ..utils import (
+            IS_NEW_HUGGINGFACE_HUB,
+            create_symlink,
+            generate_model_file_names_with_quantization_parts,
+            merge_cached_files,
+            retry_download,
+            symlink_local_file,
+        )
+
+        cache_dir = self.get_cache_dir()
+        if self.get_cache_status():
+            return cache_dir
+
+        use_symlinks = {}
+        if not IS_NEW_HUGGINGFACE_HUB:
+            use_symlinks = {"local_dir_use_symlinks": True, "local_dir": cache_dir}
+
+        if self._model_format in ["pytorch", "gptq", "awq", "fp8", "mlx"]:
+            download_dir = retry_download(
+                huggingface_hub.snapshot_download,
+                self._model_name,
+                {
+                    "model_size": self._model_size_in_billions,
+                    "model_format": self._model_format,
+                },
+                self._model_id,
+                revision=self._model_revision,
+                **use_symlinks,
+            )
+            if IS_NEW_HUGGINGFACE_HUB:
+                create_symlink(download_dir, cache_dir)
+        elif self._model_format in ["ggufv2"]:
+            file_names, final_file_name, need_merge = (
+                generate_model_file_names_with_quantization_parts(
+                    self._llm_family.model_specs[0], self._multimodal_projector
+                )
+            )
+
+            for file_name in file_names:
+                download_file_path = retry_download(
+                    huggingface_hub.hf_hub_download,
+                    self._model_name,
+                    {
+                        "model_size": self._model_size_in_billions,
+                        "model_format": self._model_format,
+                    },
+                    self._model_id,
+                    revision=self._model_revision,
+                    filename=file_name,
+                    **use_symlinks,
+                )
+                if IS_NEW_HUGGINGFACE_HUB:
+                    symlink_local_file(download_file_path, cache_dir, file_name)
+
+            if need_merge:
+                merge_cached_files(cache_dir, file_names, final_file_name)
+        else:
+            raise ValueError(f"Unsupported model format: {self._model_format}")
+
+        return cache_dir
+
+    def cache_from_modelscope(self) -> str:
+        """
+        Cache model from Modelscope. Return the cache directory.
+        """
+        from modelscope.hub.file_download import model_file_download
+        from modelscope.hub.snapshot_download import snapshot_download
+
+        from ..utils import (
+            create_symlink,
+            generate_model_file_names_with_quantization_parts,
+            merge_cached_files,
+            retry_download,
+            symlink_local_file,
+        )
+
+        cache_dir = self.get_cache_dir()
+        if self.get_cache_status():
+            return cache_dir
+
+        if self._model_format in ["pytorch", "gptq", "awq", "fp8", "mlx"]:
+            download_dir = retry_download(
+                snapshot_download,
+                self._model_name,
+                {
+                    "model_size": self._model_size_in_billions,
+                    "model_format": self._model_format,
+                },
+                self._model_id,
+                revision=self._model_revision,
+            )
+            create_symlink(download_dir, cache_dir)
+
+        elif self._model_format in ["ggufv2"]:
+            file_names, final_file_name, need_merge = (
+                generate_model_file_names_with_quantization_parts(
+                    self._llm_family.model_specs[0], self._multimodal_projector
+                )
+            )
+
+            for filename in file_names:
+                download_path = retry_download(
+                    model_file_download,
+                    self._model_name,
+                    {
+                        "model_size": self._model_size_in_billions,
+                        "model_format": self._model_format,
+                    },
+                    self._model_id,
+                    filename,
+                    revision=self._model_revision,
+                )
+                symlink_local_file(download_path, cache_dir, filename)
+
+            if need_merge:
+                merge_cached_files(cache_dir, file_names, final_file_name)
+        else:
+            raise ValueError(f"Unsupported format: {self._model_format}")
+
+        return cache_dir
+
+    def cache_from_openmind_hub(self) -> str:
+        """
+        Cache model from openmind_hub. Return the cache directory.
+        """
+        from openmind_hub import snapshot_download
+
+        from ..utils import create_symlink, retry_download
+
+        cache_dir = self.get_cache_dir()
+        if self.get_cache_status():
+            return cache_dir
+
+        if self._model_format in ["pytorch", "mindspore"]:
+            download_dir = retry_download(
+                snapshot_download,
+                self._model_name,
+                {
+                    "model_size": self._model_size_in_billions,
+                    "model_format": self._model_format,
+                },
+                self._model_id,
+                revision=self._model_revision,
+            )
+            create_symlink(download_dir, cache_dir)
+
+        else:
+            raise ValueError(f"Unsupported format: {self._model_format}")
+        return cache_dir
+
+    def cache_from_csghub(self) -> str:
+        """
+        Cache model from CSGHub. Return the cache directory.
+        """
+        from pycsghub.file_download import file_download
+        from pycsghub.snapshot_download import snapshot_download
+
+        from ...constants import XINFERENCE_CSG_ENDPOINT, XINFERENCE_ENV_CSG_TOKEN
+        from ..utils import (
+            create_symlink,
+            generate_model_file_names_with_quantization_parts,
+            merge_cached_files,
+            retry_download,
+            symlink_local_file,
+        )
+
+        cache_dir = self.get_cache_dir()
+        if self.get_cache_status():
+            return cache_dir
+
+        if self._model_format in ["pytorch", "gptq", "awq", "fp8", "mlx"]:
+            download_dir = retry_download(
+                snapshot_download,
+                self._model_name,
+                {
+                    "model_size": self._model_size_in_billions,
+                    "model_format": self._model_format,
+                },
+                self._model_id,
+                endpoint=XINFERENCE_CSG_ENDPOINT,
+                token=os.environ.get(XINFERENCE_ENV_CSG_TOKEN),
+            )
+            create_symlink(download_dir, cache_dir)
+        elif self._model_format in ["ggufv2"]:
+            file_names, final_file_name, need_merge = (
+                generate_model_file_names_with_quantization_parts(
+                    self._llm_family.model_specs[0], self._multimodal_projector
+                )
+            )
+
+            for filename in file_names:
+                download_path = retry_download(
+                    file_download,
+                    self._model_name,
+                    {
+                        "model_size": self._model_size_in_billions,
+                        "model_format": self._model_format,
+                    },
+                    self._model_id,
+                    file_name=filename,
+                    endpoint=XINFERENCE_CSG_ENDPOINT,
+                    token=os.environ.get(XINFERENCE_ENV_CSG_TOKEN),
+                )
+                symlink_local_file(download_path, cache_dir, filename)
+
+            if need_merge:
+                merge_cached_files(cache_dir, file_names, final_file_name)
+        else:
+            raise ValueError(f"Unsupported format: {self._model_format}")
+
+        return cache_dir
+
+    def cache(self) -> str:
+        if self._model_uri is not None:
+            return self.cache_uri()
+        else:
+            if self._model_hub == "huggingface":
+                return self.cache_from_huggingface()
+            elif self._model_hub == "modelscope":
+                return self.cache_from_modelscope()
+            elif self._model_hub == "openmind_hub":
+                return self.cache_from_openmind_hub()
+            elif self._model_hub == "csghub":
+                return self.cache_from_csghub()
+            else:
+                raise ValueError(f"Unknown model hub: {self._model_hub}")
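
The new `LLMCacheManager` consolidates the per-hub download logic that previously lived as free functions in `llm_family.py`. A minimal usage sketch, assuming `family` is an `LLMFamilyV2` already narrowed to a single spec (format, size and quantization chosen, e.g. by `match_llm`):

    # Hypothetical usage; the constructor and cache() signature are taken from the file above.
    from xinference.model.llm.cache_manager import LLMCacheManager

    manager = LLMCacheManager(family)  # pass multimodal_projector=... for ggufv2 vision models
    local_dir = manager.cache()        # dispatches on model_uri, then model_hub, and returns the cache dir
    print("model cached at", local_dir)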