xinference 1.7.1__py3-none-any.whl → 1.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xinference might be problematic.

Files changed (136)
  1. xinference/_version.py +3 -3
  2. xinference/client/restful/async_restful_client.py +8 -13
  3. xinference/client/restful/restful_client.py +6 -2
  4. xinference/core/chat_interface.py +6 -4
  5. xinference/core/media_interface.py +5 -0
  6. xinference/core/model.py +1 -5
  7. xinference/core/supervisor.py +117 -68
  8. xinference/core/worker.py +49 -37
  9. xinference/deploy/test/test_cmdline.py +2 -6
  10. xinference/model/audio/__init__.py +26 -23
  11. xinference/model/audio/chattts.py +3 -2
  12. xinference/model/audio/core.py +49 -98
  13. xinference/model/audio/cosyvoice.py +3 -2
  14. xinference/model/audio/custom.py +28 -73
  15. xinference/model/audio/f5tts.py +3 -2
  16. xinference/model/audio/f5tts_mlx.py +3 -2
  17. xinference/model/audio/fish_speech.py +3 -2
  18. xinference/model/audio/funasr.py +17 -4
  19. xinference/model/audio/kokoro.py +3 -2
  20. xinference/model/audio/megatts.py +3 -2
  21. xinference/model/audio/melotts.py +3 -2
  22. xinference/model/audio/model_spec.json +572 -171
  23. xinference/model/audio/utils.py +0 -6
  24. xinference/model/audio/whisper.py +3 -2
  25. xinference/model/audio/whisper_mlx.py +3 -2
  26. xinference/model/cache_manager.py +141 -0
  27. xinference/model/core.py +6 -49
  28. xinference/model/custom.py +174 -0
  29. xinference/model/embedding/__init__.py +67 -56
  30. xinference/model/embedding/cache_manager.py +35 -0
  31. xinference/model/embedding/core.py +104 -84
  32. xinference/model/embedding/custom.py +55 -78
  33. xinference/model/embedding/embed_family.py +80 -31
  34. xinference/model/embedding/flag/core.py +21 -5
  35. xinference/model/embedding/llama_cpp/__init__.py +0 -0
  36. xinference/model/embedding/llama_cpp/core.py +234 -0
  37. xinference/model/embedding/model_spec.json +968 -103
  38. xinference/model/embedding/sentence_transformers/core.py +30 -20
  39. xinference/model/embedding/vllm/core.py +11 -5
  40. xinference/model/flexible/__init__.py +8 -2
  41. xinference/model/flexible/core.py +26 -119
  42. xinference/model/flexible/custom.py +69 -0
  43. xinference/model/flexible/launchers/image_process_launcher.py +1 -0
  44. xinference/model/flexible/launchers/modelscope_launcher.py +5 -1
  45. xinference/model/flexible/launchers/transformers_launcher.py +15 -3
  46. xinference/model/flexible/launchers/yolo_launcher.py +5 -1
  47. xinference/model/image/__init__.py +20 -20
  48. xinference/model/image/cache_manager.py +62 -0
  49. xinference/model/image/core.py +70 -182
  50. xinference/model/image/custom.py +28 -72
  51. xinference/model/image/model_spec.json +402 -119
  52. xinference/model/image/ocr/got_ocr2.py +3 -2
  53. xinference/model/image/stable_diffusion/core.py +22 -7
  54. xinference/model/image/stable_diffusion/mlx.py +6 -6
  55. xinference/model/image/utils.py +2 -2
  56. xinference/model/llm/__init__.py +71 -94
  57. xinference/model/llm/cache_manager.py +292 -0
  58. xinference/model/llm/core.py +37 -111
  59. xinference/model/llm/custom.py +88 -0
  60. xinference/model/llm/llama_cpp/core.py +5 -7
  61. xinference/model/llm/llm_family.json +16260 -8151
  62. xinference/model/llm/llm_family.py +138 -839
  63. xinference/model/llm/lmdeploy/core.py +5 -7
  64. xinference/model/llm/memory.py +3 -4
  65. xinference/model/llm/mlx/core.py +6 -8
  66. xinference/model/llm/reasoning_parser.py +3 -1
  67. xinference/model/llm/sglang/core.py +32 -14
  68. xinference/model/llm/transformers/chatglm.py +3 -7
  69. xinference/model/llm/transformers/core.py +49 -27
  70. xinference/model/llm/transformers/deepseek_v2.py +2 -2
  71. xinference/model/llm/transformers/gemma3.py +2 -2
  72. xinference/model/llm/transformers/multimodal/cogagent.py +2 -2
  73. xinference/model/llm/transformers/multimodal/deepseek_vl2.py +2 -2
  74. xinference/model/llm/transformers/multimodal/gemma3.py +2 -2
  75. xinference/model/llm/transformers/multimodal/glm4_1v.py +167 -0
  76. xinference/model/llm/transformers/multimodal/glm4v.py +2 -2
  77. xinference/model/llm/transformers/multimodal/intern_vl.py +2 -2
  78. xinference/model/llm/transformers/multimodal/minicpmv26.py +3 -3
  79. xinference/model/llm/transformers/multimodal/ovis2.py +2 -2
  80. xinference/model/llm/transformers/multimodal/qwen-omni.py +2 -2
  81. xinference/model/llm/transformers/multimodal/qwen2_audio.py +2 -2
  82. xinference/model/llm/transformers/multimodal/qwen2_vl.py +2 -2
  83. xinference/model/llm/transformers/opt.py +3 -7
  84. xinference/model/llm/utils.py +34 -49
  85. xinference/model/llm/vllm/core.py +77 -27
  86. xinference/model/llm/vllm/xavier/engine.py +5 -3
  87. xinference/model/llm/vllm/xavier/scheduler.py +10 -6
  88. xinference/model/llm/vllm/xavier/transfer.py +1 -1
  89. xinference/model/rerank/__init__.py +26 -25
  90. xinference/model/rerank/core.py +47 -87
  91. xinference/model/rerank/custom.py +25 -71
  92. xinference/model/rerank/model_spec.json +158 -33
  93. xinference/model/rerank/utils.py +2 -2
  94. xinference/model/utils.py +115 -54
  95. xinference/model/video/__init__.py +13 -17
  96. xinference/model/video/core.py +44 -102
  97. xinference/model/video/diffusers.py +4 -3
  98. xinference/model/video/model_spec.json +90 -21
  99. xinference/types.py +5 -3
  100. xinference/web/ui/build/asset-manifest.json +3 -3
  101. xinference/web/ui/build/index.html +1 -1
  102. xinference/web/ui/build/static/js/main.7d24df53.js +3 -0
  103. xinference/web/ui/build/static/js/main.7d24df53.js.map +1 -0
  104. xinference/web/ui/node_modules/.cache/babel-loader/2704ff66a5f73ca78b341eb3edec60154369df9d87fbc8c6dd60121abc5e1b0a.json +1 -0
  105. xinference/web/ui/node_modules/.cache/babel-loader/607dfef23d33e6b594518c0c6434567639f24f356b877c80c60575184ec50ed0.json +1 -0
  106. xinference/web/ui/node_modules/.cache/babel-loader/9be3d56173aacc3efd0b497bcb13c4f6365de30069176ee9403b40e717542326.json +1 -0
  107. xinference/web/ui/node_modules/.cache/babel-loader/9f9dd6c32c78a222d07da5987ae902effe16bcf20aac00774acdccc4de3c9ff2.json +1 -0
  108. xinference/web/ui/node_modules/.cache/babel-loader/b2ab5ee972c60d15eb9abf5845705f8ab7e1d125d324d9a9b1bcae5d6fd7ffb2.json +1 -0
  109. xinference/web/ui/src/locales/en.json +0 -1
  110. xinference/web/ui/src/locales/ja.json +0 -1
  111. xinference/web/ui/src/locales/ko.json +0 -1
  112. xinference/web/ui/src/locales/zh.json +0 -1
  113. {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/METADATA +9 -11
  114. {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/RECORD +119 -119
  115. xinference/model/audio/model_spec_modelscope.json +0 -231
  116. xinference/model/embedding/model_spec_modelscope.json +0 -293
  117. xinference/model/embedding/utils.py +0 -18
  118. xinference/model/image/model_spec_modelscope.json +0 -375
  119. xinference/model/llm/llama_cpp/memory.py +0 -457
  120. xinference/model/llm/llm_family_csghub.json +0 -56
  121. xinference/model/llm/llm_family_modelscope.json +0 -8700
  122. xinference/model/llm/llm_family_openmind_hub.json +0 -1019
  123. xinference/model/rerank/model_spec_modelscope.json +0 -85
  124. xinference/model/video/model_spec_modelscope.json +0 -184
  125. xinference/web/ui/build/static/js/main.9b12b7f9.js +0 -3
  126. xinference/web/ui/build/static/js/main.9b12b7f9.js.map +0 -1
  127. xinference/web/ui/node_modules/.cache/babel-loader/1460361af6975e63576708039f1cb732faf9c672d97c494d4055fc6331460be0.json +0 -1
  128. xinference/web/ui/node_modules/.cache/babel-loader/4efd8dda58fda83ed9546bf2f587df67f8d98e639117bee2d9326a9a1d9bebb2.json +0 -1
  129. xinference/web/ui/node_modules/.cache/babel-loader/55b9fb40b57fa926e8f05f31c2f96467e76e5ad62f033dca97c03f9e8c4eb4fe.json +0 -1
  130. xinference/web/ui/node_modules/.cache/babel-loader/5b2dafe5aa9e1105e0244a2b6751807342fa86aa0144b4e84d947a1686102715.json +0 -1
  131. xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +0 -1
  132. /xinference/web/ui/build/static/js/{main.9b12b7f9.js.LICENSE.txt → main.7d24df53.js.LICENSE.txt} +0 -0
  133. {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/WHEEL +0 -0
  134. {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/entry_points.txt +0 -0
  135. {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/licenses/LICENSE +0 -0
  136. {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/top_level.txt +0 -0
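
Most of the line-count churn in llm_family.json and the various model_spec.json files lines up with the schema change visible in the llm_family.py hunks below: a spec now carries a single `quantization: str` instead of a `quantizations: List[str]`, the hub-specific spec files (llm_family_modelscope.json, llm_family_csghub.json, llm_family_openmind_hub.json and the model_spec_modelscope.json variants) are removed, and specs for different hubs appear to live side by side in the main JSON, distinguished by their model_hub field. A minimal sketch of what a V2-style GGUF spec entry might look like, expressed as a Python dict; the model name, id, size, and quantization values are hypothetical, not taken from the shipped JSON:

# Hedged sketch only: an illustrative V2-style spec entry as a Python dict.
# Field names mirror LlamaCppLLMSpecV2 in the diff below; the concrete values
# are made up for illustration.
example_v2_spec = {
    "model_format": "ggufv2",
    "model_size_in_billions": 7,
    # V1 carried a list instead: "quantizations": ["Q4_K_M", "Q8_0", ...]
    "quantization": "Q4_K_M",
    "model_id": "example-org/example-llm-7B-GGUF",
    "model_hub": "huggingface",
    "model_file_name_template": "example-llm-7b.{quantization}.gguf",
}

Under that layout, offering N quantizations of a model means N spec entries rather than one entry with an N-element list, which would account for much of the JSON growth.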
@@ -14,8 +14,7 @@
 
 import logging
 import os
-from threading import Lock
-from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union
+from typing import Any, Dict, List, Optional, Set, Type, Union
 
 from typing_extensions import Annotated, Literal
 
@@ -30,24 +29,14 @@ from ..._compat import (
     load_str_bytes,
     validator,
 )
-from ...constants import (
-    XINFERENCE_CACHE_DIR,
-    XINFERENCE_CSG_ENDPOINT,
-    XINFERENCE_ENV_CSG_TOKEN,
-    XINFERENCE_MODEL_DIR,
-)
+from ...constants import XINFERENCE_CACHE_DIR
 from ..core import VirtualEnvSettings
 from ..utils import (
-    IS_NEW_HUGGINGFACE_HUB,
-    create_symlink,
+    ModelInstanceInfoMixin,
     download_from_csghub,
     download_from_modelscope,
     download_from_openmind_hub,
-    is_valid_model_uri,
-    parse_uri,
     retry_download,
-    symlink_local_file,
-    valid_model_revision,
 )
 from . import LLM
 
@@ -60,11 +49,11 @@ BUILTIN_LLM_MODEL_GENERATE_FAMILIES: Set[str] = set()
 BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES: Set[str] = set()
 
 
-class LlamaCppLLMSpecV1(BaseModel):
+class LlamaCppLLMSpecV2(BaseModel):
     model_format: Literal["ggufv2"]
     # Must in order that `str` first, then `int`
     model_size_in_billions: Union[str, int]
-    quantizations: List[str]
+    quantization: str
     multimodal_projectors: Optional[List[str]]
     model_id: Optional[str]
     model_file_name_template: str
@@ -88,11 +77,11 @@ class LlamaCppLLMSpecV1(BaseModel):
         return v
 
 
-class PytorchLLMSpecV1(BaseModel):
+class PytorchLLMSpecV2(BaseModel):
     model_format: Literal["pytorch", "gptq", "awq", "fp8"]
     # Must in order that `str` first, then `int`
     model_size_in_billions: Union[str, int]
-    quantizations: List[str]
+    quantization: str
     model_id: Optional[str]
     model_hub: str = "huggingface"
     model_uri: Optional[str]
@@ -112,11 +101,11 @@ class PytorchLLMSpecV1(BaseModel):
         return v
 
 
-class MLXLLMSpecV1(BaseModel):
+class MLXLLMSpecV2(BaseModel):
     model_format: Literal["mlx"]
     # Must in order that `str` first, then `int`
    model_size_in_billions: Union[str, int]
-    quantizations: List[str]
+    quantization: str
     model_id: Optional[str]
     model_hub: str = "huggingface"
     model_uri: Optional[str]
@@ -136,8 +125,8 @@ class MLXLLMSpecV1(BaseModel):
         return v
 
 
-class LLMFamilyV1(BaseModel):
-    version: Literal[1]
+class LLMFamilyV2(BaseModel, ModelInstanceInfoMixin):
+    version: Literal[2]
     context_length: Optional[int] = DEFAULT_CONTEXT_LENGTH
     model_name: str
     model_lang: List[str]
@@ -163,10 +152,61 @@ class LLMFamilyV1(BaseModel):
     stop: Optional[List[str]]
     reasoning_start_tag: Optional[str]
     reasoning_end_tag: Optional[str]
+    cache_config: Optional[dict]
     virtualenv: Optional[VirtualEnvSettings]
 
+    class Config:
+        extra = "allow"
+
+    def to_description(self):
+        spec = self.model_specs[0]
+        return {
+            "model_type": "LLM",
+            "address": getattr(self, "address", None),
+            "accelerators": getattr(self, "accelerators", None),
+            "model_name": self.model_name,
+            "model_lang": self.model_lang,
+            "model_ability": self.model_ability,
+            "model_description": self.model_description,
+            "model_format": spec.model_format,
+            "model_size_in_billions": spec.model_size_in_billions,
+            "model_family": self.model_family or self.model_name,
+            "quantization": spec.quantization,
+            "multimodal_projector": getattr(self, "multimodal_projector", None),
+            "model_hub": spec.model_hub,
+            "revision": spec.model_revision,
+            "context_length": self.context_length,
+        }
+
+    def to_version_info(self):
+        """
+        Entering this function means it is already bound to a model instance,
+        so there is only one spec.
+        """
+        from .cache_manager import LLMCacheManager
+        from .utils import get_model_version
+
+        spec = self.model_specs[0]
+        multimodal_projector = getattr(self, "multimodal_projector", None)
+        cache_manager = LLMCacheManager(self, multimodal_projector)
+
+        return {
+            "model_version": get_model_version(
+                self.model_name,
+                spec.model_format,
+                spec.model_size_in_billions,
+                spec.quantization,
+            ),
+            "model_file_location": cache_manager.get_cache_dir(),
+            "cache_status": cache_manager.get_cache_status(),
+            "quantization": spec.quantization,
+            "multimodal_projector": multimodal_projector,
+            "model_format": spec.model_format,
+            "model_size_in_billions": spec.model_size_in_billions,
+        }
 
-class CustomLLMFamilyV1(LLMFamilyV1):
+
+class CustomLLMFamilyV2(LLMFamilyV2):
     @classmethod
     def parse_raw(
         cls: Any,
@@ -176,7 +216,7 @@ class CustomLLMFamilyV1(LLMFamilyV1):
         encoding: str = "utf8",
         proto: Protocol = None,
         allow_pickle: bool = False,
-    ) -> LLMFamilyV1:
+    ) -> LLMFamilyV2:
         # See source code of BaseModel.parse_raw
         try:
             obj = load_str_bytes(
@@ -189,7 +229,7 @@ class CustomLLMFamilyV1(LLMFamilyV1):
             )
         except (ValueError, TypeError, UnicodeDecodeError) as e:
             raise ValidationError([ErrorWrapper(e, loc=ROOT_KEY)], cls)
-        llm_spec: CustomLLMFamilyV1 = cls.parse_obj(obj)
+        llm_spec: CustomLLMFamilyV2 = cls.parse_obj(obj)
         vision_model_names: Set[str] = {
             family.model_name
             for family in BUILTIN_LLM_FAMILIES
@@ -255,39 +295,27 @@ class CustomLLMFamilyV1(LLMFamilyV1):
 
 
 LLMSpecV1 = Annotated[
-    Union[LlamaCppLLMSpecV1, PytorchLLMSpecV1, MLXLLMSpecV1],
+    Union[LlamaCppLLMSpecV2, PytorchLLMSpecV2, MLXLLMSpecV2],
     Field(discriminator="model_format"),
 ]
 
-LLMFamilyV1.update_forward_refs()
-CustomLLMFamilyV1.update_forward_refs()
+LLMFamilyV2.update_forward_refs()
+CustomLLMFamilyV2.update_forward_refs()
 
 
 LLAMA_CLASSES: List[Type[LLM]] = []
 
-BUILTIN_LLM_FAMILIES: List["LLMFamilyV1"] = []
-BUILTIN_MODELSCOPE_LLM_FAMILIES: List["LLMFamilyV1"] = []
-BUILTIN_OPENMIND_HUB_LLM_FAMILIES: List["LLMFamilyV1"] = []
-BUILTIN_CSGHUB_LLM_FAMILIES: List["LLMFamilyV1"] = []
+BUILTIN_LLM_FAMILIES: List["LLMFamilyV2"] = []
 
 SGLANG_CLASSES: List[Type[LLM]] = []
 TRANSFORMERS_CLASSES: List[Type[LLM]] = []
-
-UD_LLM_FAMILIES: List["LLMFamilyV1"] = []
-
-UD_LLM_FAMILIES_LOCK = Lock()
-
 VLLM_CLASSES: List[Type[LLM]] = []
-
 MLX_CLASSES: List[Type[LLM]] = []
-
 LMDEPLOY_CLASSES: List[Type[LLM]] = []
 
 LLM_ENGINES: Dict[str, Dict[str, List[Dict[str, Any]]]] = {}
 SUPPORTED_ENGINES: Dict[str, List[Type[LLM]]] = {}
 
-LLM_LAUNCH_VERSIONS: Dict[str, List[str]] = {}
-
 
 # Add decorator definition
 def register_transformer(cls):
@@ -308,107 +336,16 @@ def register_transformer(cls):
     return cls
 
 
-def download_from_self_hosted_storage() -> bool:
-    from ...constants import XINFERENCE_ENV_MODEL_SRC
-
-    return os.environ.get(XINFERENCE_ENV_MODEL_SRC) == "xorbits"
-
-
-def get_legacy_cache_path(
-    model_name: str,
-    model_format: str,
-    model_size_in_billions: Optional[Union[str, int]] = None,
-    quantization: Optional[str] = None,
-) -> str:
-    full_name = f"{model_name}-{model_format}-{model_size_in_billions}b-{quantization}"
-    return os.path.join(XINFERENCE_CACHE_DIR, full_name, "model.bin")
-
-
-def cache(
-    llm_family: LLMFamilyV1,
-    llm_spec: "LLMSpecV1",
-    quantization: Optional[str] = None,
-    multimodal_projector: Optional[str] = None,
-) -> str:
-    legacy_cache_path = get_legacy_cache_path(
-        llm_family.model_name,
-        llm_spec.model_format,
-        llm_spec.model_size_in_billions,
-        quantization,
-    )
-    if os.path.exists(legacy_cache_path):
-        logger.info("Legacy cache path exists: %s", legacy_cache_path)
-        return os.path.dirname(legacy_cache_path)
-    else:
-        if llm_spec.model_uri is not None:
-            logger.info(f"Caching from URI: {llm_spec.model_uri}")
-            return cache_from_uri(llm_family, llm_spec)
-        else:
-            if llm_spec.model_hub == "huggingface":
-                logger.info(f"Caching from Hugging Face: {llm_spec.model_id}")
-                return cache_from_huggingface(
-                    llm_family, llm_spec, quantization, multimodal_projector
-                )
-            elif llm_spec.model_hub == "modelscope":
-                logger.info(f"Caching from Modelscope: {llm_spec.model_id}")
-                return cache_from_modelscope(
-                    llm_family, llm_spec, quantization, multimodal_projector
-                )
-            elif llm_spec.model_hub == "openmind_hub":
-                logger.info(f"Caching from openmind_hub: {llm_spec.model_id}")
-                return cache_from_openmind_hub(
-                    llm_family, llm_spec, quantization, multimodal_projector
-                )
-            elif llm_spec.model_hub == "csghub":
-                logger.info(f"Caching from CSGHub: {llm_spec.model_id}")
-                return cache_from_csghub(
-                    llm_family, llm_spec, quantization, multimodal_projector
-                )
-            else:
-                raise ValueError(f"Unknown model hub: {llm_spec.model_hub}")
-
-
-def cache_from_uri(
-    llm_family: LLMFamilyV1,
-    llm_spec: "LLMSpecV1",
-) -> str:
-    cache_dir_name = (
-        f"{llm_family.model_name}-{llm_spec.model_format}"
-        f"-{llm_spec.model_size_in_billions}b"
-    )
-    cache_dir = os.path.realpath(os.path.join(XINFERENCE_CACHE_DIR, cache_dir_name))
-
-    assert llm_spec.model_uri is not None
-    src_scheme, src_root = parse_uri(llm_spec.model_uri)
-    if src_root.endswith("/"):
-        # remove trailing path separator.
-        src_root = src_root[:-1]
-
-    if src_scheme == "file":
-        if not os.path.isabs(src_root):
-            raise ValueError(
-                f"Model URI cannot be a relative path: {llm_spec.model_uri}"
-            )
-        os.makedirs(XINFERENCE_CACHE_DIR, exist_ok=True)
-        if os.path.exists(cache_dir):
-            logger.info(f"Cache {cache_dir} exists")
-            return cache_dir
-        else:
-            os.symlink(src_root, cache_dir, target_is_directory=True)
-            return cache_dir
-    else:
-        raise ValueError(f"Unsupported URL scheme: {src_scheme}")
-
-
 def cache_model_tokenizer_and_config(
-    llm_family: LLMFamilyV1,
-    llm_spec: "LLMSpecV1",
+    llm_family: LLMFamilyV2,
 ) -> str:
     """
     Download model config.json and tokenizers only
     """
+    llm_spec = llm_family.model_specs[0]
     cache_dir = _get_cache_dir_for_model_mem(llm_family, llm_spec, "tokenizer_config")
     os.makedirs(cache_dir, exist_ok=True)
+    patterns = ["tokenizer*", "config.json", "configuration*", "tokenization*"]
     if llm_spec.model_hub == "huggingface":
         from huggingface_hub import snapshot_download
 
@@ -421,7 +358,7 @@ def cache_model_tokenizer_and_config(
             },
             llm_spec.model_id,
             revision=llm_spec.model_revision,
-            allow_patterns=["tokenizer*", "config.json"],
+            allow_patterns=patterns,
             local_dir=cache_dir,
         )
     elif llm_spec.model_hub == "modelscope":
@@ -436,7 +373,7 @@ def cache_model_tokenizer_and_config(
             },
             llm_spec.model_id,
             revision=llm_spec.model_revision,
-            allow_patterns=["tokenizer*", "config.json"],
+            allow_patterns=patterns,
             local_dir=cache_dir,
         )
     else:
@@ -447,13 +384,11 @@ def cache_model_tokenizer_and_config(
     return download_dir
 
 
-def cache_model_config(
-    llm_family: LLMFamilyV1,
-    llm_spec: "LLMSpecV1",
-):
+def cache_model_config(llm_family: LLMFamilyV2):
    """Download model config.json into cache_dir,
     returns local filepath
     """
+    llm_spec = llm_family.model_specs[0]
     cache_dir = _get_cache_dir_for_model_mem(llm_family, llm_spec, "model_mem")
     config_file = os.path.join(cache_dir, "config.json")
     if not os.path.islink(config_file) and not os.path.exists(config_file):
@@ -475,7 +410,7 @@ def cache_model_config(
 
 
 def _get_cache_dir_for_model_mem(
-    llm_family: LLMFamilyV1,
+    llm_family: LLMFamilyV2,
     llm_spec: "LLMSpecV1",
     category: str,
     create_if_not_exist=True,
@@ -486,597 +421,18 @@
     e.g. for cal-model-mem, (might called from supervisor / cli)
     Temporary use separate dir from worker's cache_dir, due to issue of different style of symlink.
     """
-    quant_suffix = ""
-    for q in llm_spec.quantizations:
-        if llm_spec.model_id and q in llm_spec.model_id:
-            quant_suffix = q
-            break
     cache_dir_name = (
         f"{llm_family.model_name}-{llm_spec.model_format}"
-        f"-{llm_spec.model_size_in_billions}b"
+        f"-{llm_spec.model_size_in_billions}b-{llm_spec.quantization}"
     )
-    if quant_suffix:
-        cache_dir_name += f"-{quant_suffix}"
     cache_dir = os.path.realpath(
-        os.path.join(XINFERENCE_CACHE_DIR, category, cache_dir_name)
+        os.path.join(XINFERENCE_CACHE_DIR, "v2", category, cache_dir_name)
     )
     if create_if_not_exist and not os.path.exists(cache_dir):
         os.makedirs(cache_dir, exist_ok=True)
     return cache_dir
 
 
-def _get_cache_dir(
-    llm_family: LLMFamilyV1,
-    llm_spec: "LLMSpecV1",
-    quantization: Optional[str] = None,
-    create_if_not_exist=True,
-):
-    # If the model id contains quantization, then we should give each
-    # quantization a dedicated cache dir.
-    quant_suffix = ""
-    if llm_spec.model_id and "{" in llm_spec.model_id and quantization is not None:
-        quant_suffix = quantization
-    else:
-        for q in llm_spec.quantizations:
-            if llm_spec.model_id and q in llm_spec.model_id:
-                quant_suffix = q
-                break
-
-    # some model name includes ".", e.g. qwen1.5-chat
-    # if the model does not require trust_remote_code, it's OK
-    # because no need to import modeling_xxx.py from the path
-    # but when the model need to trust_remote_code,
-    # e.g. internlm2.5-chat, the import will fail,
-    # but before the model may have been downloaded,
-    # thus we check it first, if exist, return it,
-    # otherwise, we replace the "." with "_" in model name
-    old_cache_dir_name = (
-        f"{llm_family.model_name}-{llm_spec.model_format}"
-        f"-{llm_spec.model_size_in_billions}b"
-    )
-    if quant_suffix:
-        old_cache_dir_name += f"-{quant_suffix}"
-    old_cache_dir = os.path.realpath(
-        os.path.join(XINFERENCE_CACHE_DIR, old_cache_dir_name)
-    )
-    if os.path.exists(old_cache_dir):
-        return old_cache_dir
-    else:
-        cache_dir_name = (
-            f"{llm_family.model_name.replace('.', '_')}-{llm_spec.model_format}"
-            f"-{llm_spec.model_size_in_billions}b"
-        )
-        if quant_suffix:
-            cache_dir_name += f"-{quant_suffix}"
-        cache_dir = os.path.realpath(os.path.join(XINFERENCE_CACHE_DIR, cache_dir_name))
-        if create_if_not_exist and not os.path.exists(cache_dir):
-            os.makedirs(cache_dir, exist_ok=True)
-        return cache_dir
-
-
-def _get_meta_path(
-    cache_dir: str,
-    model_format: str,
-    model_hub: str,
-    quantization: Optional[str] = None,
-    multimodal_projector: Optional[str] = None,
-):
-    if model_format == "pytorch":
-        if model_hub == "huggingface":
-            return os.path.join(cache_dir, "__valid_download")
-        else:
-            return os.path.join(cache_dir, f"__valid_download_{model_hub}")
-    elif model_format == "ggufv2":
-        assert quantization is not None
-        if multimodal_projector is None:
-            # Compatible with old cache file to avoid re-download model.
-            if model_hub == "huggingface":
-                return os.path.join(cache_dir, f"__valid_download_{quantization}")
-            else:
-                return os.path.join(
-                    cache_dir, f"__valid_download_{model_hub}_{quantization}"
-                )
-        else:
-            if model_hub == "huggingface":
-                return os.path.join(
-                    cache_dir, f"__valid_download_{quantization}_{multimodal_projector}"
-                )
-            else:
-                return os.path.join(
-                    cache_dir,
-                    f"__valid_download_{model_hub}_{quantization}_{multimodal_projector}",
-                )
-    elif model_format in ["gptq", "awq", "fp8", "mlx"]:
-        assert quantization is not None
-        if model_hub == "huggingface":
-            return os.path.join(cache_dir, f"__valid_download_{quantization}")
-        else:
-            return os.path.join(
-                cache_dir, f"__valid_download_{model_hub}_{quantization}"
-            )
-    else:
-        raise ValueError(f"Unsupported format: {model_format}")
-
-
-def _skip_download(
-    cache_dir: str,
-    model_format: str,
-    model_hub: str,
-    model_revision: Optional[str],
-    quantization: Optional[str] = None,
-    multimodal_projector: Optional[str] = None,
-) -> bool:
-    if model_format in ["pytorch", "mindspore"]:
-        model_hub_to_meta_path = {
-            "huggingface": _get_meta_path(
-                cache_dir, model_format, "huggingface", quantization
-            ),
-            "modelscope": _get_meta_path(
-                cache_dir, model_format, "modelscope", quantization
-            ),
-            "openmind_hub": _get_meta_path(
-                cache_dir, model_format, "openmind_hub", quantization
-            ),
-            "csghub": _get_meta_path(cache_dir, model_format, "csghub", quantization),
-        }
-        if valid_model_revision(model_hub_to_meta_path[model_hub], model_revision):
-            logger.info(f"Cache {cache_dir} exists")
-            return True
-        else:
-            for hub, meta_path in model_hub_to_meta_path.items():
-                if hub != model_hub and os.path.exists(meta_path):
-                    # PyTorch models from modelscope can also be loaded by transformers.
-                    logger.warning(f"Cache {cache_dir} exists, but it was from {hub}")
-                    return True
-            return False
-    elif model_format == "ggufv2":
-        assert quantization is not None
-        return os.path.exists(
-            _get_meta_path(
-                cache_dir, model_format, model_hub, quantization, multimodal_projector
-            )
-        )
-    elif model_format in ["gptq", "awq", "fp8", "mlx"]:
-        assert quantization is not None
-        return os.path.exists(
-            _get_meta_path(cache_dir, model_format, model_hub, quantization)
-        )
-    else:
-        raise ValueError(f"Unsupported format: {model_format}")
-
-
-def _generate_meta_file(
-    meta_path: str,
-    llm_family: "LLMFamilyV1",
-    llm_spec: "LLMSpecV1",
-    quantization: Optional[str] = None,
-    multimodal_projector: Optional[str] = None,
-):
-    assert not valid_model_revision(
-        meta_path, llm_spec.model_revision
-    ), f"meta file {meta_path} should not be valid"
-    with open(meta_path, "w") as f:
-        import json
-
-        from .core import LLMDescription
-
-        desc = LLMDescription(
-            None, None, llm_family, llm_spec, quantization, multimodal_projector
-        )
-        json.dump(desc.to_dict(), f)
-
-
-def _generate_model_file_names(
-    llm_spec: "LLMSpecV1",
-    quantization: Optional[str] = None,
-    multimodal_projector: Optional[str] = None,
-) -> Tuple[List[str], str, bool]:
-    file_names = []
-    final_file_name = llm_spec.model_file_name_template.format(
-        quantization=quantization
-    )
-    need_merge = False
-
-    if (
-        llm_spec.quantization_parts is None
-        or quantization not in llm_spec.quantization_parts
-    ):
-        file_names.append(final_file_name)
-    elif quantization is not None and quantization in llm_spec.quantization_parts:
-        parts = llm_spec.quantization_parts[quantization]
-        need_merge = True
-
-        logger.info(
-            f"Model {llm_spec.model_id} {llm_spec.model_format} {quantization} has {len(parts)} parts."
-        )
-
-        if llm_spec.model_file_name_split_template is None:
-            raise ValueError(
-                f"No model_file_name_split_template for model spec {llm_spec.model_id}"
-            )
-
-        for part in parts:
-            file_name = llm_spec.model_file_name_split_template.format(
-                quantization=quantization, part=part
-            )
-            file_names.append(file_name)
-    if multimodal_projector:
-        file_names.append(multimodal_projector)
-
-    return file_names, final_file_name, need_merge
-
-
-def _merge_cached_files(
-    cache_dir: str, input_file_names: List[str], output_file_name: str
-):
-    # now llama.cpp can find the gguf parts automatically
-    # we only need to provide the first part
-    # thus we create the symlink to the first part
-    symlink_local_file(
-        os.path.join(cache_dir, input_file_names[0]), cache_dir, output_file_name
-    )
-
-    logger.info(f"Merge complete.")
-
-
-def cache_from_csghub(
-    llm_family: LLMFamilyV1,
-    llm_spec: "LLMSpecV1",
-    quantization: Optional[str] = None,
-    multimodal_projector: Optional[str] = None,
-) -> str:
-    """
-    Cache model from CSGHub. Return the cache directory.
-    """
-    from pycsghub.file_download import file_download
-    from pycsghub.snapshot_download import snapshot_download
-
-    cache_dir = _get_cache_dir(llm_family, llm_spec)
-
-    if _skip_download(
-        cache_dir,
-        llm_spec.model_format,
-        llm_spec.model_hub,
-        llm_spec.model_revision,
-        quantization,
-        multimodal_projector,
-    ):
-        return cache_dir
-
-    if llm_spec.model_format in ["pytorch", "gptq", "awq", "fp8", "mlx"]:
-        download_dir = retry_download(
-            snapshot_download,
-            llm_family.model_name,
-            {
-                "model_size": llm_spec.model_size_in_billions,
-                "model_format": llm_spec.model_format,
-            },
-            llm_spec.model_id,
-            endpoint=XINFERENCE_CSG_ENDPOINT,
-            token=os.environ.get(XINFERENCE_ENV_CSG_TOKEN),
-        )
-        create_symlink(download_dir, cache_dir)
-
-    elif llm_spec.model_format in ["ggufv2"]:
-        file_names, final_file_name, need_merge = _generate_model_file_names(
-            llm_spec, quantization, multimodal_projector
-        )
-
-        for filename in file_names:
-            download_path = retry_download(
-                file_download,
-                llm_family.model_name,
-                {
-                    "model_size": llm_spec.model_size_in_billions,
-                    "model_format": llm_spec.model_format,
-                },
-                llm_spec.model_id,
-                file_name=filename,
-                endpoint=XINFERENCE_CSG_ENDPOINT,
-                token=os.environ.get(XINFERENCE_ENV_CSG_TOKEN),
-            )
-            symlink_local_file(download_path, cache_dir, filename)
-
-        if need_merge:
-            _merge_cached_files(cache_dir, file_names, final_file_name)
-    else:
-        raise ValueError(f"Unsupported format: {llm_spec.model_format}")
-
-    meta_path = _get_meta_path(
-        cache_dir,
-        llm_spec.model_format,
-        llm_spec.model_hub,
-        quantization,
-        multimodal_projector,
-    )
-    _generate_meta_file(
-        meta_path, llm_family, llm_spec, quantization, multimodal_projector
-    )
-
-    return cache_dir
-
-
-def cache_from_modelscope(
-    llm_family: LLMFamilyV1,
-    llm_spec: "LLMSpecV1",
-    quantization: Optional[str] = None,
-    multimodal_projector: Optional[str] = None,
-) -> str:
-    """
-    Cache model from Modelscope. Return the cache directory.
-    """
-    from modelscope.hub.file_download import model_file_download
-    from modelscope.hub.snapshot_download import snapshot_download
-
-    cache_dir = _get_cache_dir(llm_family, llm_spec)
-    if _skip_download(
-        cache_dir,
-        llm_spec.model_format,
-        llm_spec.model_hub,
-        llm_spec.model_revision,
-        quantization,
-        multimodal_projector,
-    ):
-        return cache_dir
-
-    if llm_spec.model_format in ["pytorch", "gptq", "awq", "fp8", "mlx"]:
-        download_dir = retry_download(
-            snapshot_download,
-            llm_family.model_name,
-            {
-                "model_size": llm_spec.model_size_in_billions,
-                "model_format": llm_spec.model_format,
-            },
-            llm_spec.model_id,
-            revision=llm_spec.model_revision,
-        )
-        create_symlink(download_dir, cache_dir)
-
-    elif llm_spec.model_format in ["ggufv2"]:
-        file_names, final_file_name, need_merge = _generate_model_file_names(
-            llm_spec, quantization, multimodal_projector
-        )
-
-        for filename in file_names:
-            download_path = retry_download(
-                model_file_download,
-                llm_family.model_name,
-                {
-                    "model_size": llm_spec.model_size_in_billions,
-                    "model_format": llm_spec.model_format,
-                },
-                llm_spec.model_id,
-                filename,
-                revision=llm_spec.model_revision,
-            )
-            symlink_local_file(download_path, cache_dir, filename)
-
-        if need_merge:
-            _merge_cached_files(cache_dir, file_names, final_file_name)
-    else:
-        raise ValueError(f"Unsupported format: {llm_spec.model_format}")
-
-    meta_path = _get_meta_path(
-        cache_dir,
-        llm_spec.model_format,
-        llm_spec.model_hub,
-        quantization,
-        multimodal_projector,
-    )
-    _generate_meta_file(meta_path, llm_family, llm_spec, quantization)
-
-    return cache_dir
-
-
-def cache_from_openmind_hub(
-    llm_family: LLMFamilyV1,
-    llm_spec: "LLMSpecV1",
-    quantization: Optional[str] = None,
-    multimodal_projector: Optional[str] = None,
-) -> str:
-    """
-    Cache model from openmind_hub. Return the cache directory.
-    """
-    from openmind_hub import snapshot_download
-
-    cache_dir = _get_cache_dir(llm_family, llm_spec)
-    if _skip_download(
-        cache_dir,
-        llm_spec.model_format,
-        llm_spec.model_hub,
-        llm_spec.model_revision,
-        quantization,
-        multimodal_projector,
-    ):
-        return cache_dir
-
-    if llm_spec.model_format in ["pytorch", "mindspore"]:
-        download_dir = retry_download(
-            snapshot_download,
-            llm_family.model_name,
-            {
-                "model_size": llm_spec.model_size_in_billions,
-                "model_format": llm_spec.model_format,
-            },
-            llm_spec.model_id,
-            revision=llm_spec.model_revision,
-        )
-        create_symlink(download_dir, cache_dir)
-
-    else:
-        raise ValueError(f"Unsupported format: {llm_spec.model_format}")
-
-    meta_path = _get_meta_path(
-        cache_dir,
-        llm_spec.model_format,
-        llm_spec.model_hub,
-        quantization,
-        multimodal_projector,
-    )
-    _generate_meta_file(meta_path, llm_family, llm_spec, quantization)
-
-    return cache_dir
-
-
-def cache_from_huggingface(
-    llm_family: LLMFamilyV1,
-    llm_spec: "LLMSpecV1",
-    quantization: Optional[str] = None,
-    multimodal_projector: Optional[str] = None,
-) -> str:
-    """
-    Cache model from Hugging Face. Return the cache directory.
-    """
-    import huggingface_hub
-
-    cache_dir = _get_cache_dir(llm_family, llm_spec)
-    if _skip_download(
-        cache_dir,
-        llm_spec.model_format,
-        llm_spec.model_hub,
-        llm_spec.model_revision,
-        quantization,
-        multimodal_projector,
-    ):
-        return cache_dir
-
-    use_symlinks = {}
-    if not IS_NEW_HUGGINGFACE_HUB:
-        use_symlinks = {"local_dir_use_symlinks": True, "local_dir": cache_dir}
-
-    if llm_spec.model_format in ["pytorch", "gptq", "awq", "fp8", "mlx"]:
-        assert isinstance(llm_spec, (PytorchLLMSpecV1, MLXLLMSpecV1))
-        download_dir = retry_download(
-            huggingface_hub.snapshot_download,
-            llm_family.model_name,
-            {
-                "model_size": llm_spec.model_size_in_billions,
-                "model_format": llm_spec.model_format,
-            },
-            llm_spec.model_id,
-            revision=llm_spec.model_revision,
-            **use_symlinks,
-        )
-        if IS_NEW_HUGGINGFACE_HUB:
-            create_symlink(download_dir, cache_dir)
-
-    elif llm_spec.model_format in ["ggufv2"]:
-        assert isinstance(llm_spec, LlamaCppLLMSpecV1)
-        file_names, final_file_name, need_merge = _generate_model_file_names(
-            llm_spec, quantization, multimodal_projector
-        )
-
-        for file_name in file_names:
-            download_file_path = retry_download(
-                huggingface_hub.hf_hub_download,
-                llm_family.model_name,
-                {
-                    "model_size": llm_spec.model_size_in_billions,
-                    "model_format": llm_spec.model_format,
-                },
-                llm_spec.model_id,
-                revision=llm_spec.model_revision,
-                filename=file_name,
-                **use_symlinks,
-            )
-            if IS_NEW_HUGGINGFACE_HUB:
-                symlink_local_file(download_file_path, cache_dir, file_name)
-
-        if need_merge:
-            _merge_cached_files(cache_dir, file_names, final_file_name)
-    else:
-        raise ValueError(f"Unsupported model format: {llm_spec.model_format}")
-
-    meta_path = _get_meta_path(
-        cache_dir,
-        llm_spec.model_format,
-        llm_spec.model_hub,
-        quantization,
-        multimodal_projector,
-    )
-    _generate_meta_file(meta_path, llm_family, llm_spec, quantization)
-
-    return cache_dir
-
-
-def _check_revision(
-    llm_family: LLMFamilyV1,
-    llm_spec: "LLMSpecV1",
-    builtin: list,
-    meta_path: str,
-    quantization: Optional[str] = None,
-) -> bool:
-    for family in builtin:
-        if llm_family.model_name == family.model_name:
-            specs = family.model_specs
-            for spec in specs:
-                if (
-                    spec.model_format == "pytorch"
-                    and spec.model_size_in_billions == llm_spec.model_size_in_billions
-                    and (quantization is None or quantization in spec.quantizations)
-                ):
-                    return valid_model_revision(meta_path, spec.model_revision)
-    return False
-
-
-def get_cache_status(
-    llm_family: LLMFamilyV1, llm_spec: "LLMSpecV1", quantization: Optional[str] = None
-) -> Union[bool, List[bool]]:
-    """
-    Checks if a model's cache status is available based on the model format and quantization.
-    Supports different directories and model formats.
-    """
-
-    def check_file_status(meta_path: str) -> bool:
-        return os.path.exists(meta_path)
-
-    def check_revision_status(
-        meta_path: str, families: list, quantization: Optional[str] = None
-    ) -> bool:
-        return _check_revision(llm_family, llm_spec, families, meta_path, quantization)
-
-    def handle_quantization(q: Union[str, None]) -> bool:
-        specific_cache_dir = _get_cache_dir(
-            llm_family, llm_spec, q, create_if_not_exist=False
-        )
-        meta_paths = {
-            "huggingface": _get_meta_path(
-                specific_cache_dir, llm_spec.model_format, "huggingface", q
-            ),
-            "modelscope": _get_meta_path(
-                specific_cache_dir, llm_spec.model_format, "modelscope", q
-            ),
-        }
-        if llm_spec.model_format == "pytorch":
-            return check_revision_status(
-                meta_paths["huggingface"], BUILTIN_LLM_FAMILIES, q
-            ) or check_revision_status(
-                meta_paths["modelscope"], BUILTIN_MODELSCOPE_LLM_FAMILIES, q
-            )
-        else:
-            return check_file_status(meta_paths["huggingface"]) or check_file_status(
-                meta_paths["modelscope"]
-            )
-
-    if llm_spec.model_id and "{" in llm_spec.model_id:
-        return (
-            [handle_quantization(q) for q in llm_spec.quantizations]
-            if quantization is None
-            else handle_quantization(quantization)
-        )
-    else:
-        return (
-            [handle_quantization(q) for q in llm_spec.quantizations]
-            if llm_spec.model_format != "pytorch"
-            else handle_quantization(None)
-        )
-
-
-def get_user_defined_llm_families():
-    with UD_LLM_FAMILIES_LOCK:
-        return UD_LLM_FAMILIES.copy()
-
-
 def match_model_size(
     model_size: Union[int, str], spec_model_size: Union[int, str]
 ) -> bool:
@@ -1097,7 +453,7 @@ def match_model_size(
 
 
 def convert_model_size_to_float(
-    model_size_in_billions: Union[float, int, str]
+    model_size_in_billions: Union[float, int, str],
 ) -> float:
     if isinstance(model_size_in_billions, str):
         if "_" in model_size_in_billions:
@@ -1118,55 +474,68 @@ def match_llm(
     download_hub: Optional[
         Literal["huggingface", "modelscope", "openmind_hub", "csghub"]
     ] = None,
-) -> Optional[Tuple[LLMFamilyV1, LLMSpecV1, str]]:
+) -> Optional[LLMFamilyV2]:
     """
     Find an LLM family, spec, and quantization that satisfy given criteria.
     """
+    from .custom import get_user_defined_llm_families
+
     user_defined_llm_families = get_user_defined_llm_families()
 
-    def _match_quantization(q: Union[str, None], quantizations: List[str]):
+    def _match_quantization(q: Union[str, None], quant: str):
         # Currently, the quantization name could include both uppercase and lowercase letters,
         # so it is necessary to ensure that the case sensitivity does not
         # affect the matching results.
-        if q is None:
-            return q
-        for quant in quantizations:
-            if q.lower() == quant.lower():
-                return quant
+        if q is None or q.lower() != quant.lower():
+            return None
+        return quant
 
-    def _apply_format_to_model_id(spec: LLMSpecV1, q: str) -> LLMSpecV1:
+    def _apply_format_to_model_id(_spec: "LLMSpecV1", q: str) -> "LLMSpecV1":
         # Different quantized versions of some models use different model ids,
         # Here we check the `{}` in the model id to format the id.
-        if spec.model_id and "{" in spec.model_id:
-            spec.model_id = spec.model_id.format(quantization=q)
-        return spec
+        if _spec.model_id and "{" in _spec.model_id:
+            _spec.model_id = _spec.model_id.format(quantization=q)
+        return _spec
+
+    def _get_model_specs(
+        _model_specs: List["LLMSpecV1"], hub: str
+    ) -> List["LLMSpecV1"]:
+        return [x for x in _model_specs if x.model_hub == hub]
 
     # priority: download_hub > download_from_modelscope() and download_from_csghub()
     # set base model
-    base_families = BUILTIN_LLM_FAMILIES + user_defined_llm_families
-    hub_families_map = {
-        "modelscope": BUILTIN_MODELSCOPE_LLM_FAMILIES,
-        "openmind_hub": BUILTIN_OPENMIND_HUB_LLM_FAMILIES,
-        "csghub": BUILTIN_CSGHUB_LLM_FAMILIES,
-    }
-    if download_hub == "huggingface":
-        all_families = base_families
-    elif download_hub in hub_families_map:
-        all_families = hub_families_map[download_hub] + base_families
-    elif download_from_modelscope():
-        all_families = BUILTIN_MODELSCOPE_LLM_FAMILIES + base_families
-    elif download_from_openmind_hub():
-        all_families = BUILTIN_OPENMIND_HUB_LLM_FAMILIES + base_families
-    elif download_from_csghub():
-        all_families = BUILTIN_CSGHUB_LLM_FAMILIES + base_families
-    else:
-        all_families = base_families
+    families = BUILTIN_LLM_FAMILIES + user_defined_llm_families
 
-    for family in all_families:
+    for family in families:
         if model_name != family.model_name:
             continue
-        for spec in family.model_specs:
-            matched_quantization = _match_quantization(quantization, spec.quantizations)
+
+        # prepare possible quantization matching options
+        if download_hub is not None:
+            if download_hub == "huggingface":
+                model_specs = _get_model_specs(family.model_specs, download_hub)
+            else:
+                model_specs = _get_model_specs(
+                    family.model_specs, download_hub
+                ) + _get_model_specs(family.model_specs, "huggingface")
+        else:
+            if download_from_modelscope():
+                model_specs = _get_model_specs(
+                    family.model_specs, "modelscope"
+                ) + _get_model_specs(family.model_specs, "huggingface")
+            elif download_from_openmind_hub():
+                model_specs = _get_model_specs(
+                    family.model_specs, "openmind_hub"
+                ) + _get_model_specs(family.model_specs, "huggingface")
+            elif download_from_csghub():
+                model_specs = _get_model_specs(
+                    family.model_specs, "csghub"
+                ) + _get_model_specs(family.model_specs, "huggingface")
+            else:
+                model_specs = _get_model_specs(family.model_specs, "huggingface")
+
+        for spec in model_specs:
+            # check model_format and model_size_in_billions
             if (
                 model_format
                 and model_format != spec.model_format
@@ -1174,97 +543,27 @@ def match_llm(
                 and not match_model_size(
                     model_size_in_billions, spec.model_size_in_billions
                 )
-                or quantization
-                and matched_quantization is None
             ):
                 continue
-            # Copy spec to avoid _apply_format_to_model_id modify the original spec.
-            spec = spec.copy()
+
+            # Check quantization
+            matched_quantization = _match_quantization(quantization, spec.quantization)
+            if quantization and matched_quantization is None:
+                continue
+            _llm_family = family.copy()
             if quantization:
-                return (
-                    family,
-                    _apply_format_to_model_id(spec, matched_quantization),
-                    matched_quantization,
-                )
+                _llm_family.model_specs = [
+                    _apply_format_to_model_id(spec, matched_quantization)
+                ]
+                return _llm_family
            else:
                 # TODO: If user does not specify quantization, just use the first one
-                _q = "none" if spec.model_format == "pytorch" else spec.quantizations[0]
-                return family, _apply_format_to_model_id(spec, _q), _q
+                _q = "none" if spec.model_format == "pytorch" else spec.quantization
+                _llm_family.model_specs = [_apply_format_to_model_id(spec, _q)]
+                return _llm_family
     return None
 
 
-def register_llm(llm_family: LLMFamilyV1, persist: bool):
-    from ..utils import is_valid_model_name
-    from . import generate_engine_config_by_model_family
-
-    if not is_valid_model_name(llm_family.model_name):
-        raise ValueError(f"Invalid model name {llm_family.model_name}.")
-
-    for spec in llm_family.model_specs:
-        model_uri = spec.model_uri
-        if model_uri and not is_valid_model_uri(model_uri):
-            raise ValueError(f"Invalid model URI {model_uri}.")
-
-    with UD_LLM_FAMILIES_LOCK:
-        for family in BUILTIN_LLM_FAMILIES + UD_LLM_FAMILIES:
-            if llm_family.model_name == family.model_name:
-                raise ValueError(
-                    f"Model name conflicts with existing model {family.model_name}"
-                )
-
-        UD_LLM_FAMILIES.append(llm_family)
-        generate_engine_config_by_model_family(llm_family)
-
-    if persist:
-        persist_path = os.path.join(
-            XINFERENCE_MODEL_DIR, "llm", f"{llm_family.model_name}.json"
-        )
-        os.makedirs(os.path.dirname(persist_path), exist_ok=True)
-        with open(persist_path, mode="w") as fd:
-            fd.write(llm_family.json())
-
-
-def unregister_llm(model_name: str, raise_error: bool = True):
-    with UD_LLM_FAMILIES_LOCK:
-        llm_family = None
-        for i, f in enumerate(UD_LLM_FAMILIES):
-            if f.model_name == model_name:
-                llm_family = f
-                break
-        if llm_family:
-            UD_LLM_FAMILIES.remove(llm_family)
-            del LLM_ENGINES[model_name]
-
-            persist_path = os.path.join(
-                XINFERENCE_MODEL_DIR, "llm", f"{llm_family.model_name}.json"
-            )
-            if os.path.exists(persist_path):
-                os.remove(persist_path)
-
-            llm_spec = llm_family.model_specs[0]
-            cache_dir_name = (
-                f"{llm_family.model_name}-{llm_spec.model_format}"
-                f"-{llm_spec.model_size_in_billions}b"
-            )
-            cache_dir = os.path.join(XINFERENCE_CACHE_DIR, cache_dir_name)
-            if os.path.exists(cache_dir):
-                logger.warning(
-                    f"Remove the cache of user-defined model {llm_family.model_name}. "
-                    f"Cache directory: {cache_dir}"
-                )
-                if os.path.islink(cache_dir):
-                    os.remove(cache_dir)
-                else:
-                    logger.warning(
-                        f"Cache directory is not a soft link, please remove it manually."
-                    )
-        else:
-            if raise_error:
-                raise ValueError(f"Model {model_name} not found")
-            else:
-                logger.warning(f"Custom model {model_name} not found")
-
-
 def check_engine_by_spec_parameters(
     model_engine: str,
     model_name: str,
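
For code that consumed this module directly, the most visible change in the hunks above is the return type of match_llm: it now returns a single LLMFamilyV2 with the matched spec bound as the only element of model_specs, instead of the old (family, spec, quantization) tuple. User-defined family registration (register_llm, unregister_llm, get_user_defined_llm_families) appears to have moved to the new custom.py, and download/cache handling to the new cache_manager.py. A hedged usage sketch, assuming xinference 1.8.0 is installed; the model name used here is hypothetical and would need to match a built-in or registered family:

from xinference.model.llm.llm_family import match_llm

# 1.7.x returned a (family, spec, quantization) tuple; in 1.8.0 the matched
# spec is carried inside the returned family itself.
family = match_llm("example-llm", model_format="ggufv2", quantization="Q4_K_M")
if family is not None:
    spec = family.model_specs[0]
    print(spec.model_format, spec.model_size_in_billions, spec.quantization)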