xinference 0.10.3__py3-none-any.whl → 0.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/oauth2/auth_service.py +1 -1
- xinference/api/restful_api.py +53 -61
- xinference/client/restful/restful_client.py +52 -57
- xinference/conftest.py +1 -1
- xinference/core/cache_tracker.py +1 -1
- xinference/core/event.py +1 -1
- xinference/core/model.py +15 -4
- xinference/core/status_guard.py +1 -1
- xinference/core/supervisor.py +58 -72
- xinference/core/worker.py +68 -101
- xinference/deploy/cmdline.py +166 -1
- xinference/deploy/test/test_cmdline.py +2 -0
- xinference/deploy/utils.py +1 -1
- xinference/device_utils.py +29 -3
- xinference/fields.py +5 -1
- xinference/model/audio/whisper.py +88 -12
- xinference/model/core.py +2 -2
- xinference/model/image/__init__.py +29 -0
- xinference/model/image/core.py +6 -0
- xinference/model/image/custom.py +109 -0
- xinference/model/llm/__init__.py +92 -32
- xinference/model/llm/core.py +57 -102
- xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py +2 -2
- xinference/model/llm/llm_family.json +306 -4
- xinference/model/llm/llm_family.py +45 -41
- xinference/model/llm/llm_family_modelscope.json +119 -2
- xinference/model/llm/pytorch/deepseek_vl.py +89 -33
- xinference/model/llm/pytorch/qwen_vl.py +67 -12
- xinference/model/llm/pytorch/yi_vl.py +62 -45
- xinference/model/llm/utils.py +29 -15
- xinference/model/llm/vllm/core.py +19 -4
- xinference/thirdparty/omnilmm/chat.py +2 -1
- xinference/thirdparty/omnilmm/model/omnilmm.py +2 -1
- xinference/types.py +2 -0
- xinference/web/ui/build/asset-manifest.json +6 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/main.54bca460.css +2 -0
- xinference/web/ui/build/static/css/main.54bca460.css.map +1 -0
- xinference/web/ui/build/static/js/main.8e44da4b.js +3 -0
- xinference/web/ui/build/static/js/{main.26fdbfbe.js.LICENSE.txt → main.8e44da4b.js.LICENSE.txt} +7 -0
- xinference/web/ui/build/static/js/main.8e44da4b.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0b11a5339468c13b2d31ac085e7effe4303259b2071abd46a0a8eb8529233a5e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/29dda700ab913cf7f2cfabe450ddabfb283e96adfa3ec9d315b2fa6c63cd375c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2c63e940b945fd5817157e08a42b889b30d668ea4c91332f48ef2b1b9d26f520.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4135fe8745434cbce6438d1ebfa47422e0c77d884db4edc75c8bf32ea1d50621.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/46b6dd1f6d1109cd0e2455a0ea0be3e9bda1097cd4ebec9c4040070372671cfc.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4de0a71074f9cbe1e7862750dcdd08cbc1bae7d9d9849a78b1783ca670017b3c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/53f6c0c0afb51265cd8fb940daeb65523501879ac2a8c03a1ead22b9793c5041.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8ccbb839002bc5bc03e0a0e7612362bf92f6ae64f87e094f8682d6a6fe4619bb.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/97ed30d6e22cf76f0733651e2c18364689a01665d0b5fe811c1b7ca3eb713c82.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9c0c70f1838913aaa792a0d2260f17f90fd177b95698ed46b7bc3050eb712c1c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9cfd33238ca43e5bf9fc7e442690e8cc6027c73553db36de87e3597ed524ee4b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/ada71518a429f821a9b1dea38bc951447f03c8db509887e0980b893acac938f3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b6c9558d28b5972bb8b2691c5a76a2c8814a815eb3443126da9f49f7d6a0c118.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/bb0f721c084a4d85c09201c984f02ee8437d3b6c5c38a57cb4a101f653daef1b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/ddaec68b88e5eff792df1e39a4b4b8b737bfc832293c015660c3c69334e3cf5c.json +1 -0
- xinference/web/ui/node_modules/.package-lock.json +33 -0
- xinference/web/ui/node_modules/clipboard/.babelrc.json +11 -0
- xinference/web/ui/node_modules/clipboard/.eslintrc.json +24 -0
- xinference/web/ui/node_modules/clipboard/.prettierrc.json +9 -0
- xinference/web/ui/node_modules/clipboard/bower.json +18 -0
- xinference/web/ui/node_modules/clipboard/composer.json +25 -0
- xinference/web/ui/node_modules/clipboard/package.json +63 -0
- xinference/web/ui/node_modules/delegate/package.json +31 -0
- xinference/web/ui/node_modules/good-listener/bower.json +11 -0
- xinference/web/ui/node_modules/good-listener/package.json +35 -0
- xinference/web/ui/node_modules/select/bower.json +13 -0
- xinference/web/ui/node_modules/select/package.json +29 -0
- xinference/web/ui/node_modules/tiny-emitter/package.json +53 -0
- xinference/web/ui/package-lock.json +34 -0
- xinference/web/ui/package.json +1 -0
- {xinference-0.10.3.dist-info → xinference-0.11.0.dist-info}/METADATA +11 -11
- {xinference-0.10.3.dist-info → xinference-0.11.0.dist-info}/RECORD +78 -57
- xinference/client/oscar/__init__.py +0 -13
- xinference/client/oscar/actor_client.py +0 -611
- xinference/model/llm/pytorch/spec_decoding_utils.py +0 -531
- xinference/model/llm/pytorch/spec_model.py +0 -186
- xinference/web/ui/build/static/js/main.26fdbfbe.js +0 -3
- xinference/web/ui/build/static/js/main.26fdbfbe.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/63a4c48f0326d071c7772c46598215c006ae41fd3d4ff3577fe717de66ad6e89.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/de0299226173b0662b573f49e3992220f6611947073bd66ac079728a8bc8837d.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e9b52d171223bb59fb918316297a051cdfd42dd453e8260fd918e90bc0a4ebdf.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f4d5d1a41892a754c1ee0237450d804b20612d1b657945b59e564161ea47aa7a.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/fad4cd70de36ef6e6d5f8fd74a10ded58d964a8a91ef7681693fbb8376552da7.json +0 -1
- {xinference-0.10.3.dist-info → xinference-0.11.0.dist-info}/LICENSE +0 -0
- {xinference-0.10.3.dist-info → xinference-0.11.0.dist-info}/WHEEL +0 -0
- {xinference-0.10.3.dist-info → xinference-0.11.0.dist-info}/entry_points.txt +0 -0
- {xinference-0.10.3.dist-info → xinference-0.11.0.dist-info}/top_level.txt +0 -0

xinference/model/image/custom.py
ADDED

@@ -0,0 +1,109 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+from threading import Lock
+from typing import List, Optional
+
+from ...constants import XINFERENCE_CACHE_DIR, XINFERENCE_MODEL_DIR
+from .core import ImageModelFamilyV1
+
+logger = logging.getLogger(__name__)
+
+UD_IMAGE_LOCK = Lock()
+
+
+class CustomImageModelFamilyV1(ImageModelFamilyV1):
+    model_id: Optional[str]  # type: ignore
+    model_revision: Optional[str]  # type: ignore
+    model_uri: Optional[str]
+    controlnet: Optional[List["CustomImageModelFamilyV1"]]
+
+
+UD_IMAGES: List[CustomImageModelFamilyV1] = []
+
+
+def get_user_defined_images() -> List[ImageModelFamilyV1]:
+    with UD_IMAGE_LOCK:
+        return UD_IMAGES.copy()
+
+
+def register_image(model_spec: CustomImageModelFamilyV1, persist: bool):
+    from ..utils import is_valid_model_name, is_valid_model_uri
+    from . import BUILTIN_IMAGE_MODELS, MODELSCOPE_IMAGE_MODELS
+
+    if not is_valid_model_name(model_spec.model_name):
+        raise ValueError(f"Invalid model name {model_spec.model_name}.")
+
+    with UD_IMAGE_LOCK:
+        for model_name in (
+            list(BUILTIN_IMAGE_MODELS.keys())
+            + list(MODELSCOPE_IMAGE_MODELS.keys())
+            + [spec.model_name for spec in UD_IMAGES]
+        ):
+            if model_spec.model_name == model_name:
+                raise ValueError(
+                    f"Model name conflicts with existing model {model_spec.model_name}"
+                )
+        UD_IMAGES.append(model_spec)
+
+    if persist:
+        # We only validate model URL when persist is True.
+        model_uri = model_spec.model_uri
+        if model_uri and not is_valid_model_uri(model_uri):
+            raise ValueError(f"Invalid model URI {model_uri}")
+
+        persist_path = os.path.join(
+            XINFERENCE_MODEL_DIR, "image", f"{model_spec.model_id}.json"
+        )
+        os.makedirs(os.path.dirname(persist_path), exist_ok=True)
+        with open(persist_path, "w") as f:
+            f.write(model_spec.json())
+
+
+def unregister_image(model_name: str, raise_error: bool = True):
+    with UD_IMAGE_LOCK:
+        model_spec = None
+        for i, f in enumerate(UD_IMAGES):
+            if f.model_name == model_name:
+                model_spec = f
+                break
+        if model_spec:
+            UD_IMAGES.remove(model_spec)
+
+            persist_path = os.path.join(
+                XINFERENCE_MODEL_DIR, "image", f"{model_spec.model_id}.json"
+            )
+
+            if os.path.exists(persist_path):
+                os.remove(persist_path)
+
+            cache_dir = os.path.join(XINFERENCE_CACHE_DIR, model_spec.model_name)
+            if os.path.exists(cache_dir):
+                logger.warning(
+                    f"Remove the cache of user-defined model {model_spec.model_name}. "
+                    f"Cache directory: {cache_dir}"
+                )
+                if os.path.islink(cache_dir):
+                    os.remove(cache_dir)
+                else:
+                    logger.warning(
+                        f"Cache directory is not a soft link, please remove it manually."
+                    )
+        else:
+            if raise_error:
+                raise ValueError(f"Model {model_name} not found.")
+            else:
+                logger.warning(f"Custom image model {model_name} not found.")
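Taken together, these functions give image models the same user-defined registration flow that LLMs already have: an in-memory registry guarded by a lock, optional JSON persistence under XINFERENCE_MODEL_DIR, and cache cleanup on unregistration. A minimal usage sketch follows; the model name, id, and URI are hypothetical, and any base-class fields beyond those shown in the diff are assumptions:

    # Hypothetical sketch: registering and removing a custom image model.
    from xinference.model.image.custom import (
        CustomImageModelFamilyV1,
        register_image,
        unregister_image,
    )

    spec = CustomImageModelFamilyV1(
        model_name="my-sd-finetune",        # must pass is_valid_model_name
        model_id="my-org/my-sd-finetune",   # also names the persisted JSON file
        model_uri="file:///models/my-sd-finetune",
    )

    # persist=True additionally validates the URI and writes
    # <XINFERENCE_MODEL_DIR>/image/<model_id>.json
    register_image(spec, persist=True)

    # Removes the in-memory spec, its JSON file, and a symlinked cache dir.
    unregister_image("my-sd-finetune")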
xinference/model/llm/__init__.py
CHANGED
@@ -15,6 +15,7 @@
 import codecs
 import json
 import os
+import warnings

 from .core import (
     LLM,
@@ -30,8 +31,12 @@ from .llm_family import (
     BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES,
     BUILTIN_LLM_PROMPT_STYLE,
     BUILTIN_MODELSCOPE_LLM_FAMILIES,
-
-
+    LLAMA_CLASSES,
+    LLM_ENGINES,
+    SGLANG_CLASSES,
+    SUPPORTED_ENGINES,
+    TRANSFORMERS_CLASSES,
+    VLLM_CLASSES,
     CustomLLMFamilyV1,
     GgmlLLMSpecV1,
     LLMFamilyV1,
@@ -41,12 +46,68 @@ from .llm_family import (
     get_cache_status,
     get_user_defined_llm_families,
     match_llm,
-    match_llm_cls,
     register_llm,
     unregister_llm,
 )


+def check_format_with_engine(model_format, engine):
+    # only llama-cpp-python support and only support ggufv2 and ggmlv3
+    if model_format in ["ggufv2", "ggmlv3"] and engine != "llama.cpp":
+        return False
+    if model_format not in ["ggufv2", "ggmlv3"] and engine == "llama.cpp":
+        return False
+    return True
+
+
+def generate_engine_config_by_model_family(model_family):
+    model_name = model_family.model_name
+    specs = model_family.model_specs
+    engines = LLM_ENGINES.get(model_name, {})  # structure for engine query
+    for spec in specs:
+        model_format = spec.model_format
+        model_size_in_billions = spec.model_size_in_billions
+        quantizations = spec.quantizations
+        for quantization in quantizations:
+            # traverse all supported engines to match the name, format, size in billions and quatization of model
+            for engine in SUPPORTED_ENGINES:
+                if not check_format_with_engine(
+                    model_format, engine
+                ):  # match the format of model with engine
+                    continue
+                CLASSES = SUPPORTED_ENGINES[engine]
+                for cls in CLASSES:
+                    if cls.match(model_family, spec, quantization):
+                        engine_params = engines.get(engine, [])
+                        already_exists = False
+                        # if the name, format and size in billions of model already exists in the structure, add the new quantization
+                        for param in engine_params:
+                            if (
+                                model_name == param["model_name"]
+                                and model_format == param["model_format"]
+                                and model_size_in_billions
+                                == param["model_size_in_billions"]
+                            ):
+                                if quantization not in param["quantizations"]:
+                                    param["quantizations"].append(quantization)
+                                already_exists = True
+                                break
+                        # successfully match the params for the first time, add to the structure
+                        if not already_exists:
+                            engine_params.append(
+                                {
+                                    "model_name": model_name,
+                                    "model_format": model_format,
+                                    "model_size_in_billions": model_size_in_billions,
+                                    "quantizations": [quantization],
+                                    "llm_class": cls,
+                                }
+                            )
+                            engines[engine] = engine_params
+                        break
+    LLM_ENGINES[model_name] = engines
+
+
 def _install():
     from .ggml.chatglm import ChatglmCppChatModel
     from .ggml.llamacpp import LlamaCppChatModel, LlamaCppModel
@@ -57,28 +118,31 @@ def _install():
     from .pytorch.falcon import FalconPytorchChatModel, FalconPytorchModel
     from .pytorch.internlm2 import Internlm2PytorchChatModel
     from .pytorch.llama_2 import LlamaPytorchChatModel, LlamaPytorchModel
-    from .pytorch.omnilmm import OmniLMMModel
     from .pytorch.qwen_vl import QwenVLChatModel
     from .pytorch.vicuna import VicunaPytorchChatModel
     from .pytorch.yi_vl import YiVLChatModel
     from .sglang.core import SGLANGChatModel, SGLANGModel
     from .vllm.core import VLLMChatModel, VLLMModel

+    try:
+        from .pytorch.omnilmm import OmniLMMModel
+    except ImportError as e:
+        # For quite old transformers version,
+        # import will generate error
+        OmniLMMModel = None
+        warnings.warn(f"Cannot import OmniLLMModel due to reason: {e}")
+
     # register llm classes.
-
+    LLAMA_CLASSES.extend(
         [
+            ChatglmCppChatModel,
             LlamaCppChatModel,
             LlamaCppModel,
         ]
     )
-
-
-
-        ]
-    )
-    LLM_CLASSES.extend([SGLANGModel, SGLANGChatModel])
-    LLM_CLASSES.extend([VLLMModel, VLLMChatModel])
-    LLM_CLASSES.extend(
+    SGLANG_CLASSES.extend([SGLANGModel, SGLANGChatModel])
+    VLLM_CLASSES.extend([VLLMModel, VLLMChatModel])
+    TRANSFORMERS_CLASSES.extend(
         [
             BaichuanPytorchChatModel,
             VicunaPytorchChatModel,
@@ -90,28 +154,19 @@ def _install():
             FalconPytorchModel,
             Internlm2PytorchChatModel,
             QwenVLChatModel,
-            OmniLMMModel,
             YiVLChatModel,
             DeepSeekVLChatModel,
             PytorchModel,
         ]
     )
-
-
-
-
-
-
-
-
-            PytorchChatModel,
-            FalconPytorchModel,
-            Internlm2PytorchChatModel,
-            QwenVLChatModel,
-            YiVLChatModel,
-            PytorchModel,
-        ]
-    )
+    if OmniLMMModel:  # type: ignore
+        TRANSFORMERS_CLASSES.append(OmniLMMModel)
+
+    # support 4 engines for now
+    SUPPORTED_ENGINES["vLLM"] = VLLM_CLASSES
+    SUPPORTED_ENGINES["SGLang"] = SGLANG_CLASSES
+    SUPPORTED_ENGINES["Transformers"] = TRANSFORMERS_CLASSES
+    SUPPORTED_ENGINES["llama.cpp"] = LLAMA_CLASSES

     json_path = os.path.join(
         os.path.dirname(os.path.abspath(__file__)), "llm_family.json"
@@ -132,7 +187,7 @@ def _install():
             BUILTIN_LLM_MODEL_CHAT_FAMILIES.add(model_spec.model_name)
         else:
             BUILTIN_LLM_MODEL_GENERATE_FAMILIES.add(model_spec.model_name)
-        if "
+        if "tools" in model_spec.model_ability:
             BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES.add(model_spec.model_name)

     modelscope_json_path = os.path.join(
@@ -155,7 +210,7 @@ def _install():
             BUILTIN_LLM_MODEL_CHAT_FAMILIES.add(model_spec.model_name)
         else:
             BUILTIN_LLM_MODEL_GENERATE_FAMILIES.add(model_spec.model_name)
-        if "
+        if "tools" in model_spec.model_ability:
             BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES.add(model_spec.model_name)

     for llm_specs in [BUILTIN_LLM_FAMILIES, BUILTIN_MODELSCOPE_LLM_FAMILIES]:
@@ -163,6 +218,11 @@ def _install():
             if llm_spec.model_name not in LLM_MODEL_DESCRIPTIONS:
                 LLM_MODEL_DESCRIPTIONS.update(generate_llm_description(llm_spec))

+    # traverse all families and add engine parameters corresponding to the model name
+    for families in [BUILTIN_LLM_FAMILIES, BUILTIN_MODELSCOPE_LLM_FAMILIES]:
+        for family in families:
+            generate_engine_config_by_model_family(family)
+
     from ...constants import XINFERENCE_MODEL_DIR

     user_defined_llm_dir = os.path.join(XINFERENCE_MODEL_DIR, "llm")
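The net effect of generate_engine_config_by_model_family is a queryable index: LLM_ENGINES maps each model name to the engines that can serve it, and each engine to the (format, size, quantizations, class) combinations it accepts. A sketch of the resulting shape for one hypothetical model; the entry below is illustrative, not taken from llm_family.json:

    # Illustrative shape of LLM_ENGINES after _install().
    class VLLMChatModel: ...      # stand-in for xinference's real class
    class LlamaCppChatModel: ...  # stand-in

    LLM_ENGINES = {
        "llama-2-chat": {
            "vLLM": [
                {
                    "model_name": "llama-2-chat",
                    "model_format": "pytorch",
                    "model_size_in_billions": 7,
                    "quantizations": ["none"],
                    "llm_class": VLLMChatModel,
                },
            ],
            "llama.cpp": [
                {
                    "model_name": "llama-2-chat",
                    "model_format": "ggufv2",
                    "model_size_in_billions": 7,
                    "quantizations": ["Q4_K_M", "Q8_0"],
                    "llm_class": LlamaCppChatModel,
                },
            ],
        },
    }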
xinference/model/llm/core.py
CHANGED
@@ -13,11 +13,13 @@
 # limitations under the License.

 import abc
+import inspect
 import logging
 import os
 import platform
 from abc import abstractmethod
 from collections import defaultdict
+from functools import lru_cache
 from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union

 from ...core.utils import parse_replica_model_uid
@@ -62,16 +64,6 @@ class LLM(abc.ABC):
         if kwargs:
             raise ValueError(f"Unrecognized keyword arguments: {kwargs}")

-    @staticmethod
-    def handle_model_size(model_size_in_billions: Union[str, int]) -> Union[int, float]:
-        if isinstance(model_size_in_billions, str):
-            if "_" in model_size_in_billions:
-                ms = model_size_in_billions.replace("_", ".")
-                return float(ms)
-            else:
-                raise ValueError("Invalid format for `model_size_in_billions`")
-        return model_size_in_billions
-
     @staticmethod
     def _is_darwin_and_apple_silicon():
         return platform.system() == "Darwin" and platform.processor() == "arm"
@@ -81,12 +73,30 @@ class LLM(abc.ABC):
         return platform.system() == "Linux"

     @staticmethod
+    @lru_cache
     def _has_cuda_device():
-
-
-
+        """
+        Use pynvml to impl this interface.
+        DO NOT USE torch to impl this, which will lead to some unexpected errors.
+        """
+        from pynvml import nvmlDeviceGetCount, nvmlInit, nvmlShutdown
+
+        device_count = 0
+        try:
+            nvmlInit()
+            device_count = nvmlDeviceGetCount()
+        except:
+            pass
+        finally:
+            try:
+                nvmlShutdown()
+            except:
+                pass
+
+        return device_count > 0

     @staticmethod
+    @lru_cache
     def _get_cuda_count():
         from ...utils import cuda_count

@@ -178,47 +188,60 @@ def create_llm_model_instance(
     devices: List[str],
     model_uid: str,
     model_name: str,
+    model_engine: Optional[str],
     model_format: Optional[str] = None,
     model_size_in_billions: Optional[Union[int, str]] = None,
     quantization: Optional[str] = None,
     peft_model_config: Optional[PeftModelConfig] = None,
-    is_local_deployment: bool = False,
     **kwargs,
 ) -> Tuple[LLM, LLMDescription]:
-    from . import
-    from .llm_family import cache
+    from .llm_family import cache, check_engine_by_spec_parameters, match_llm

+    if model_engine is None:
+        raise ValueError("model_engine is required for LLM model")
     match_result = match_llm(
-        model_name,
-        model_format,
-        model_size_in_billions,
-        quantization,
-        is_local_deployment,
+        model_name, model_format, model_size_in_billions, quantization
     )
+
     if not match_result:
         raise ValueError(
             f"Model not found, name: {model_name}, format: {model_format},"
             f" size: {model_size_in_billions}, quantization: {quantization}"
         )
     llm_family, llm_spec, quantization = match_result
-
     assert quantization is not None
-    save_path = cache(llm_family, llm_spec, quantization)
-
-    peft_model = peft_model_config.peft_model if peft_model_config else None

-    llm_cls =
-
-
-
-
-
+    llm_cls = check_engine_by_spec_parameters(
+        model_engine,
+        llm_family.model_name,
+        llm_spec.model_format,
+        llm_spec.model_size_in_billions,
+        quantization,
+    )
     logger.debug(f"Launching {model_uid} with {llm_cls.__name__}")

+    save_path = cache(llm_family, llm_spec, quantization)
+
+    peft_model = peft_model_config.peft_model if peft_model_config else None
     if peft_model is not None:
-
-
-
+        if "peft_model" in inspect.signature(llm_cls.__init__).parameters:
+            model = llm_cls(
+                model_uid,
+                llm_family,
+                llm_spec,
+                quantization,
+                save_path,
+                kwargs,
+                peft_model,
+            )
+        else:
+            logger.warning(
+                f"Model not supported with lora, name: {model_name}, format: {model_format}, engine: {model_engine}. "
+                f"Load this without lora."
+            )
+            model = llm_cls(
+                model_uid, llm_family, llm_spec, quantization, save_path, kwargs
+            )
     else:
         model = llm_cls(
             model_uid, llm_family, llm_spec, quantization, save_path, kwargs
@@ -226,71 +249,3 @@ def create_llm_model_instance(
     return model, LLMDescription(
         subpool_addr, devices, llm_family, llm_spec, quantization
     )
-
-
-def create_speculative_llm_model_instance(
-    subpool_addr: str,
-    devices: List[str],
-    model_uid: str,
-    model_name: str,
-    model_size_in_billions: Optional[Union[int, str]],
-    quantization: Optional[str],
-    draft_model_name: str,
-    draft_model_size_in_billions: Optional[int],
-    draft_quantization: Optional[str],
-    is_local_deployment: bool = False,
-) -> Tuple[LLM, LLMDescription]:
-    from . import match_llm
-    from .llm_family import cache
-
-    match_result = match_llm(
-        model_name,
-        "pytorch",
-        model_size_in_billions,
-        quantization,
-        is_local_deployment,
-    )
-
-    if not match_result:
-        raise ValueError(
-            f"Model not found, name: {model_name}, format: pytorch,"
-            f" size: {model_size_in_billions}, quantization: {quantization}"
-        )
-    llm_family, llm_spec, quantization = match_result
-    assert quantization is not None
-    save_path = cache(llm_family, llm_spec, quantization)
-
-    draft_match_result = match_llm(
-        draft_model_name,
-        "pytorch",
-        draft_model_size_in_billions,
-        draft_quantization,
-        is_local_deployment,
-    )
-
-    if not draft_match_result:
-        raise ValueError(
-            f"Model not found, name: {draft_model_name}, format: pytorch,"
-            f" size: {draft_model_size_in_billions}, quantization: {draft_quantization}"
-        )
-    draft_llm_family, draft_llm_spec, draft_quantization = draft_match_result
-    assert draft_quantization is not None
-    draft_save_path = cache(draft_llm_family, draft_llm_spec, draft_quantization)
-
-    from .pytorch.spec_model import SpeculativeModel
-
-    model = SpeculativeModel(
-        model_uid,
-        model_family=llm_family,
-        model_spec=llm_spec,
-        quantization=quantization,
-        model_path=save_path,
-        draft_model_family=draft_llm_family,
-        draft_model_spec=draft_llm_spec,
-        draft_quantization=draft_quantization,
-        draft_model_path=draft_save_path,
-    )
-
-    return model, LLMDescription(
-        subpool_addr, devices, llm_family, llm_spec, quantization
-    )
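create_llm_model_instance therefore no longer guesses a class via match_llm_cls: the caller must name an engine, and check_engine_by_spec_parameters resolves it to a concrete LLM class before any weights are cached. On the client side this surfaces as a new required choice when launching an LLM. A hedged sketch against the 0.11.0 RESTful client, where the endpoint and parameter values are examples only:

    # Example values only; assumes a local xinference 0.11.0 server.
    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")
    model_uid = client.launch_model(
        model_name="llama-2-chat",
        model_engine="vLLM",   # new in 0.11.0: the engine must be chosen explicitly
        model_format="pytorch",
        model_size_in_billions=7,
        quantization="none",
    )
    model = client.get_model(model_uid)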
xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py
CHANGED

@@ -116,7 +116,7 @@ class Vocab:
 class Tensor:
     def __init__(self, use_padding=True):
         self.name = None
-        self.dims: tuple[int, ...] = ()
+        self.dims: tuple[int, ...] = ()  # type: ignore
         self.dtype = None
         self.start_offset = 0
         self.len_bytes = np.int64(0)
@@ -211,7 +211,7 @@ class GGMLModel:
         self.validate_conversion(hp.ftype)
         vocab = Vocab(load_scores=self.file_format > GGMLFormat.GGML)
         offset += vocab.load(data, offset, hp.n_vocab)
-        tensors: list[Tensor] = []
+        tensors: list[Tensor] = []  # type: ignore
         tensor_map = {}
         while offset < len(data):
             tensor = Tensor(use_padding=self.file_format > GGMLFormat.GGMF)