xinference 1.5.1__py3-none-any.whl → 1.6.0.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +97 -8
- xinference/client/restful/restful_client.py +51 -11
- xinference/core/media_interface.py +758 -0
- xinference/core/model.py +49 -9
- xinference/core/worker.py +31 -37
- xinference/deploy/utils.py +0 -3
- xinference/model/audio/__init__.py +16 -27
- xinference/model/audio/core.py +1 -0
- xinference/model/audio/cosyvoice.py +4 -2
- xinference/model/audio/model_spec.json +20 -3
- xinference/model/audio/model_spec_modelscope.json +18 -1
- xinference/model/embedding/__init__.py +16 -24
- xinference/model/image/__init__.py +15 -25
- xinference/model/llm/__init__.py +37 -110
- xinference/model/llm/core.py +15 -6
- xinference/model/llm/llama_cpp/core.py +25 -353
- xinference/model/llm/llm_family.json +613 -89
- xinference/model/llm/llm_family.py +9 -1
- xinference/model/llm/llm_family_modelscope.json +540 -90
- xinference/model/llm/mlx/core.py +6 -3
- xinference/model/llm/reasoning_parser.py +281 -5
- xinference/model/llm/sglang/core.py +16 -3
- xinference/model/llm/transformers/chatglm.py +2 -2
- xinference/model/llm/transformers/cogagent.py +1 -1
- xinference/model/llm/transformers/cogvlm2.py +1 -1
- xinference/model/llm/transformers/core.py +9 -3
- xinference/model/llm/transformers/glm4v.py +1 -1
- xinference/model/llm/transformers/minicpmv26.py +1 -1
- xinference/model/llm/transformers/qwen-omni.py +6 -0
- xinference/model/llm/transformers/qwen_vl.py +1 -1
- xinference/model/llm/utils.py +68 -45
- xinference/model/llm/vllm/core.py +38 -18
- xinference/model/llm/vllm/xavier/test/test_xavier.py +1 -10
- xinference/model/rerank/__init__.py +13 -24
- xinference/model/video/__init__.py +15 -25
- xinference/model/video/core.py +3 -3
- xinference/model/video/diffusers.py +133 -16
- xinference/model/video/model_spec.json +54 -0
- xinference/model/video/model_spec_modelscope.json +56 -0
- xinference/thirdparty/cosyvoice/bin/average_model.py +5 -4
- xinference/thirdparty/cosyvoice/bin/export_jit.py +50 -20
- xinference/thirdparty/cosyvoice/bin/export_onnx.py +136 -51
- xinference/thirdparty/cosyvoice/bin/inference.py +15 -5
- xinference/thirdparty/cosyvoice/bin/train.py +7 -2
- xinference/thirdparty/cosyvoice/cli/cosyvoice.py +72 -52
- xinference/thirdparty/cosyvoice/cli/frontend.py +58 -58
- xinference/thirdparty/cosyvoice/cli/model.py +140 -155
- xinference/thirdparty/cosyvoice/dataset/processor.py +9 -5
- xinference/thirdparty/cosyvoice/flow/decoder.py +656 -54
- xinference/thirdparty/cosyvoice/flow/flow.py +69 -11
- xinference/thirdparty/cosyvoice/flow/flow_matching.py +167 -63
- xinference/thirdparty/cosyvoice/flow/length_regulator.py +1 -0
- xinference/thirdparty/cosyvoice/hifigan/discriminator.py +91 -1
- xinference/thirdparty/cosyvoice/hifigan/f0_predictor.py +4 -1
- xinference/thirdparty/cosyvoice/hifigan/generator.py +4 -1
- xinference/thirdparty/cosyvoice/hifigan/hifigan.py +2 -2
- xinference/thirdparty/cosyvoice/llm/llm.py +198 -18
- xinference/thirdparty/cosyvoice/transformer/embedding.py +12 -4
- xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +124 -21
- xinference/thirdparty/cosyvoice/utils/class_utils.py +13 -0
- xinference/thirdparty/cosyvoice/utils/common.py +1 -1
- xinference/thirdparty/cosyvoice/utils/file_utils.py +40 -2
- xinference/thirdparty/cosyvoice/utils/frontend_utils.py +7 -0
- xinference/thirdparty/cosyvoice/utils/mask.py +4 -0
- xinference/thirdparty/cosyvoice/utils/train_utils.py +5 -1
- xinference/thirdparty/matcha/hifigan/xutils.py +3 -3
- xinference/types.py +0 -71
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.ae579a97.js +3 -0
- xinference/web/ui/build/static/js/main.ae579a97.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0196a4b09e3264614e54360d5f832c46b31d964ec58296765ebff191ace6adbf.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/12e02ee790dbf57ead09a241a93bb5f893393aa36628ca741d44390e836a103f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/18fa271456b31cded36c05c4c71c6b2b1cf4e4128c1e32f0e45d8b9f21764397.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +1 -0
- xinference/web/ui/src/locales/en.json +6 -4
- xinference/web/ui/src/locales/zh.json +6 -4
- {xinference-1.5.1.dist-info → xinference-1.6.0.post1.dist-info}/METADATA +59 -39
- {xinference-1.5.1.dist-info → xinference-1.6.0.post1.dist-info}/RECORD +87 -87
- {xinference-1.5.1.dist-info → xinference-1.6.0.post1.dist-info}/WHEEL +1 -1
- xinference/core/image_interface.py +0 -377
- xinference/thirdparty/cosyvoice/bin/export_trt.sh +0 -9
- xinference/web/ui/build/static/js/main.91e77b5c.js +0 -3
- xinference/web/ui/build/static/js/main.91e77b5c.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0f0adb2283a8f469d097a7a0ebb754624fa52414c83b83696c41f2e6a737ceda.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5e6edb0fb87e3798f142e9abf8dd2dc46bab33a60d31dff525797c0c99887097.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/6087820be1bd5c02c42dff797e7df365448ef35ab26dd5d6bd33e967e05cbfd4.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8157db83995c671eb57abc316c337f867d1dc63fb83520bb4ff351fee57dcce2.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f04f666b77b44d7be3e16034d6b0074de2ba9c254f1fae15222b3148608fa8b3.json +0 -1
- /xinference/web/ui/build/static/js/{main.91e77b5c.js.LICENSE.txt → main.ae579a97.js.LICENSE.txt} +0 -0
- {xinference-1.5.1.dist-info → xinference-1.6.0.post1.dist-info}/entry_points.txt +0 -0
- {xinference-1.5.1.dist-info → xinference-1.6.0.post1.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.5.1.dist-info → xinference-1.6.0.post1.dist-info}/top_level.txt +0 -0
xinference/model/llm/__init__.py
CHANGED
@@ -128,8 +128,38 @@ def register_custom_model():
                 warnings.warn(f"{user_defined_llm_dir}/{f} has error, {e}")


+def load_model_family_from_json(json_filename, target_families):
+    json_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), json_filename)
+    for json_obj in json.load(codecs.open(json_path, "r", encoding="utf-8")):
+        model_spec = LLMFamilyV1.parse_obj(json_obj)
+        target_families.append(model_spec)
+
+        # register chat_template
+        if (
+            "chat" in model_spec.model_ability
+            and isinstance(model_spec.chat_template, str)
+            and model_spec.model_name not in BUILTIN_LLM_PROMPT_STYLE
+        ):
+            # note that the key is the model name,
+            # since there are multiple representations of the same prompt style name in json.
+            if model_spec.model_name not in BUILTIN_LLM_PROMPT_STYLE:
+                BUILTIN_LLM_PROMPT_STYLE[model_spec.model_name] = {
+                    "chat_template": model_spec.chat_template,
+                    "stop_token_ids": model_spec.stop_token_ids,
+                    "stop": model_spec.stop,
+                }
+
+        # register model family
+        if "chat" in model_spec.model_ability:
+            BUILTIN_LLM_MODEL_CHAT_FAMILIES.add(model_spec.model_name)
+        else:
+            BUILTIN_LLM_MODEL_GENERATE_FAMILIES.add(model_spec.model_name)
+        if "tools" in model_spec.model_ability:
+            BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES.add(model_spec.model_name)
+
+
 def _install():
-    from .llama_cpp.core import
+    from .llama_cpp.core import XllamaCppModel
     from .lmdeploy.core import LMDeployChatModel, LMDeployModel
     from .mlx.core import MLXChatModel, MLXModel, MLXVisionModel
     from .sglang.core import SGLANGChatModel, SGLANGModel, SGLANGVisionModel
@@ -166,8 +196,6 @@ def _install():
     # register llm classes.
     LLAMA_CLASSES.extend(
         [
-            LlamaCppChatModel,
-            LlamaCppModel,
             XllamaCppModel,
         ]
     )
@@ -210,115 +238,14 @@ def _install():
     SUPPORTED_ENGINES["MLX"] = MLX_CLASSES
     SUPPORTED_ENGINES["LMDEPLOY"] = LMDEPLOY_CLASSES

-
-
+    load_model_family_from_json("llm_family.json", BUILTIN_LLM_FAMILIES)
+    load_model_family_from_json(
+        "llm_family_modelscope.json", BUILTIN_MODELSCOPE_LLM_FAMILIES
     )
-
-
-        BUILTIN_LLM_FAMILIES.append(model_spec)
-
-        # register chat_template
-        if "chat" in model_spec.model_ability and isinstance(
-            model_spec.chat_template, str
-        ):
-            # note that the key is the model name,
-            # since there are multiple representations of the same prompt style name in json.
-            BUILTIN_LLM_PROMPT_STYLE[model_spec.model_name] = {
-                "chat_template": model_spec.chat_template,
-                "stop_token_ids": model_spec.stop_token_ids,
-                "stop": model_spec.stop,
-            }
-        # register model family
-        if "chat" in model_spec.model_ability:
-            BUILTIN_LLM_MODEL_CHAT_FAMILIES.add(model_spec.model_name)
-        else:
-            BUILTIN_LLM_MODEL_GENERATE_FAMILIES.add(model_spec.model_name)
-        if "tools" in model_spec.model_ability:
-            BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES.add(model_spec.model_name)
-
-    modelscope_json_path = os.path.join(
-        os.path.dirname(os.path.abspath(__file__)), "llm_family_modelscope.json"
+    load_model_family_from_json(
+        "llm_family_openmind_hub.json", BUILTIN_OPENMIND_HUB_LLM_FAMILIES
     )
-
-        model_spec = LLMFamilyV1.parse_obj(json_obj)
-        BUILTIN_MODELSCOPE_LLM_FAMILIES.append(model_spec)
-
-        # register prompt style, in case that we have something missed
-        # if duplicated with huggingface json, keep it as the huggingface style
-        if (
-            "chat" in model_spec.model_ability
-            and isinstance(model_spec.chat_template, str)
-            and model_spec.model_name not in BUILTIN_LLM_PROMPT_STYLE
-        ):
-            BUILTIN_LLM_PROMPT_STYLE[model_spec.model_name] = {
-                "chat_template": model_spec.chat_template,
-                "stop_token_ids": model_spec.stop_token_ids,
-                "stop": model_spec.stop,
-            }
-        # register model family
-        if "chat" in model_spec.model_ability:
-            BUILTIN_LLM_MODEL_CHAT_FAMILIES.add(model_spec.model_name)
-        else:
-            BUILTIN_LLM_MODEL_GENERATE_FAMILIES.add(model_spec.model_name)
-        if "tools" in model_spec.model_ability:
-            BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES.add(model_spec.model_name)
-
-    openmind_hub_json_path = os.path.join(
-        os.path.dirname(os.path.abspath(__file__)), "llm_family_openmind_hub.json"
-    )
-    for json_obj in json.load(
-        codecs.open(openmind_hub_json_path, "r", encoding="utf-8")
-    ):
-        model_spec = LLMFamilyV1.parse_obj(json_obj)
-        BUILTIN_OPENMIND_HUB_LLM_FAMILIES.append(model_spec)
-
-        # register prompt style, in case that we have something missed
-        # if duplicated with huggingface json, keep it as the huggingface style
-
-        if (
-            "chat" in model_spec.model_ability
-            and isinstance(model_spec.chat_template, str)
-            and model_spec.model_name not in BUILTIN_LLM_PROMPT_STYLE
-        ):
-            BUILTIN_LLM_PROMPT_STYLE[model_spec.model_name] = {
-                "chat_template": model_spec.chat_template,
-                "stop_token_ids": model_spec.stop_token_ids,
-                "stop": model_spec.stop,
-            }
-        # register model family
-        if "chat" in model_spec.model_ability:
-            BUILTIN_LLM_MODEL_CHAT_FAMILIES.add(model_spec.model_name)
-        else:
-            BUILTIN_LLM_MODEL_GENERATE_FAMILIES.add(model_spec.model_name)
-        if "tools" in model_spec.model_ability:
-            BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES.add(model_spec.model_name)
-
-    csghub_json_path = os.path.join(
-        os.path.dirname(os.path.abspath(__file__)), "llm_family_csghub.json"
-    )
-    for json_obj in json.load(codecs.open(csghub_json_path, "r", encoding="utf-8")):
-        model_spec = LLMFamilyV1.parse_obj(json_obj)
-        BUILTIN_CSGHUB_LLM_FAMILIES.append(model_spec)
-
-        # register prompt style, in case that we have something missed
-        # if duplicated with huggingface json, keep it as the huggingface style
-        if (
-            "chat" in model_spec.model_ability
-            and isinstance(model_spec.chat_template, str)
-            and model_spec.model_name not in BUILTIN_LLM_PROMPT_STYLE
-        ):
-            BUILTIN_LLM_PROMPT_STYLE[model_spec.model_name] = {
-                "chat_template": model_spec.chat_template,
-                "stop_token_ids": model_spec.stop_token_ids,
-                "stop": model_spec.stop,
-            }
-        # register model family
-        if "chat" in model_spec.model_ability:
-            BUILTIN_LLM_MODEL_CHAT_FAMILIES.add(model_spec.model_name)
-        else:
-            BUILTIN_LLM_MODEL_GENERATE_FAMILIES.add(model_spec.model_name)
-        if "tools" in model_spec.model_ability:
-            BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES.add(model_spec.model_name)
+    load_model_family_from_json("llm_family_csghub.json", BUILTIN_CSGHUB_LLM_FAMILIES)

     for llm_specs in [
         BUILTIN_LLM_FAMILIES,
xinference/model/llm/core.py
CHANGED
@@ -17,6 +17,7 @@ import inspect
 import logging
 import os
 import platform
+import warnings
 from abc import abstractmethod
 from collections import defaultdict
 from functools import lru_cache
@@ -134,13 +135,21 @@ class LLM(abc.ABC):
     ) -> bool:
         raise NotImplementedError

-    def prepare_parse_reasoning_content(
-
-
-
-
-
+    def prepare_parse_reasoning_content(
+        self, reasoning_content: bool, enable_thinking: bool = True
+    ):
+        if "hybrid" not in self.model_family.model_ability and not enable_thinking:
+            enable_thinking = True
+            warnings.warn(
+                "enable_thinking cannot be disabled for non hybrid model, will be ignored"
             )
+        # Initialize reasoning parser if model has reasoning ability
+        self.reasoning_parser = ReasoningParser(  # type: ignore
+            reasoning_content,
+            self.model_family.reasoning_start_tag,  # type: ignore
+            self.model_family.reasoning_end_tag,  # type: ignore
+            enable_thinking=enable_thinking,
+        )


 class LLMDescription(ModelDescription):
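The behavioural change in core.py above: prepare_parse_reasoning_content now accepts enable_thinking, but the flag can only be switched off for models whose model_ability contains "hybrid"; for any other model it is forced back to True with a warning. A minimal standalone sketch of that guard (resolve_enable_thinking is a hypothetical helper written for illustration, not part of xinference; model_ability stands in for self.model_family.model_ability):

import warnings

def resolve_enable_thinking(model_ability: list, enable_thinking: bool = True) -> bool:
    # Mirrors the guard added to LLM.prepare_parse_reasoning_content:
    # only "hybrid" models may switch thinking off; others ignore the request.
    if "hybrid" not in model_ability and not enable_thinking:
        warnings.warn(
            "enable_thinking cannot be disabled for non hybrid model, will be ignored"
        )
        return True
    return enable_thinking

# A plain chat model ignores the request and warns:
assert resolve_enable_thinking(["chat"], enable_thinking=False) is True
# A hybrid reasoning model honours it:
assert resolve_enable_thinking(["chat", "hybrid"], enable_thinking=False) is False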
xinference/model/llm/llama_cpp/core.py
CHANGED

@@ -16,29 +16,17 @@ import importlib.util
 import logging
 import os
 import queue
-import
-from typing import Dict, Iterator, List, Optional, Union
+from typing import Iterator, List, Optional, Union

 import orjson

-from ....types import
-    ChatCompletion,
-    ChatCompletionChunk,
-    Completion,
-    CompletionChunk,
-    CompletionUsage,
-    CreateCompletionLlamaCpp,
-    LlamaCppGenerateConfig,
-    LlamaCppModelConfig,
-)
+from ....types import ChatCompletion, ChatCompletionChunk, Completion, CompletionChunk
 from ..core import LLM
 from ..llm_family import LLMFamilyV1, LLMSpecV1
-from ..utils import
+from ..utils import ChatModelMixin

 logger = logging.getLogger(__name__)

-USE_XLLAMACPP = bool(int(os.environ.get("USE_XLLAMACPP", 1)))
-

 class _Done:
     pass
@@ -57,21 +45,16 @@ class XllamaCppModel(LLM, ChatModelMixin):
         model_spec: "LLMSpecV1",
         quantization: str,
         model_path: str,
-        llamacpp_model_config: Optional[
+        llamacpp_model_config: Optional[dict] = None,
     ):
         super().__init__(model_uid, model_family, model_spec, quantization, model_path)
-
-        self._llamacpp_model_config: LlamaCppModelConfig = self._sanitize_model_config(
-            llamacpp_model_config
-        )
+        self._llamacpp_model_config = self._sanitize_model_config(llamacpp_model_config)
         self._llm = None
         self._executor: Optional[concurrent.futures.ThreadPoolExecutor] = None

-    def _sanitize_model_config(
-        self, llamacpp_model_config: Optional[LlamaCppModelConfig]
-    ) -> LlamaCppModelConfig:
+    def _sanitize_model_config(self, llamacpp_model_config: Optional[dict]) -> dict:
         if llamacpp_model_config is None:
-            llamacpp_model_config =
+            llamacpp_model_config = {}

         if self.model_family.context_length:
             llamacpp_model_config.setdefault("n_ctx", self.model_family.context_length)
@@ -93,29 +76,6 @@ class XllamaCppModel(LLM, ChatModelMixin):

         return llamacpp_model_config

-    def _sanitize_generate_config(
-        self, generate_config: Optional[LlamaCppGenerateConfig]
-    ) -> LlamaCppGenerateConfig:
-        if generate_config is None:
-            generate_config = LlamaCppGenerateConfig(
-                **CreateCompletionLlamaCpp().dict()
-            )
-        else:
-            from llama_cpp import LlamaGrammar
-
-            grammar = generate_config.get("grammar")
-            if grammar is not None and not isinstance(grammar, LlamaGrammar):
-                generate_config["grammar"] = LlamaGrammar.from_string(
-                    generate_config["grammar"]
-                )
-            # Validate generate_config and fill default values to the generate config.
-            generate_config = LlamaCppGenerateConfig(
-                **CreateCompletionLlamaCpp(**generate_config).dict()
-            )
-        # Currently, llama.cpp does not support lora
-        generate_config.pop("lora_name", None)  # type: ignore
-        return generate_config
-
     @classmethod
     def check_lib(cls) -> bool:
         return importlib.util.find_spec("xllamacpp") is not None
@@ -143,7 +103,10 @@ class XllamaCppModel(LLM, ChatModelMixin):
             raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")

         reasoning_content = self._llamacpp_model_config.pop("reasoning_content")
-        self.
+        enable_thinking = self._llamacpp_model_config.pop("enable_thinking", True)
+        self.prepare_parse_reasoning_content(
+            reasoning_content, enable_thinking=enable_thinking
+        )

         if os.path.isfile(self.model_path):
             # mostly passed from --model_path
@@ -152,7 +115,7 @@ class XllamaCppModel(LLM, ChatModelMixin):
             # handle legacy cache.
             if (
                 self.model_spec.model_file_name_split_template
-                and self.model_spec.quantization_parts
+                and self.quantization in self.model_spec.quantization_parts
             ):
                 part = self.model_spec.quantization_parts[self.quantization]
                 model_path = os.path.join(
@@ -185,7 +148,14 @@ class XllamaCppModel(LLM, ChatModelMixin):
             params.n_parallel = os.cpu_count()
         for k, v in self._llamacpp_model_config.items():
             try:
-
+                if "." in k:
+                    parts = k.split(".")
+                    sub_param = params
+                    for p in parts[:-1]:
+                        sub_param = getattr(sub_param, p)
+                    setattr(sub_param, parts[-1], v)
+                else:
+                    setattr(params, k, v)
             except Exception as e:
                 logger.error("Failed to set the param %s = %s, error: %s", k, v, e)
         n_threads = self._llamacpp_model_config.get("n_threads", os.cpu_count())
@@ -203,14 +173,13 @@ class XllamaCppModel(LLM, ChatModelMixin):
             raise RuntimeError(f"Load model {self.model_family.model_name} failed")

     def generate(
-        self, prompt: str, generate_config: Optional[
+        self, prompt: str, generate_config: Optional[dict] = None
     ) -> Union[Completion, Iterator[CompletionChunk]]:
-        generate_config =
+        generate_config = generate_config or {}
         stream = generate_config.get("stream", False)
         q: queue.Queue = queue.Queue()

         def _handle_completion():
-            # TODO(fyrestone): Replace the LlamaCppGenerateConfig with OpenAI params.
             data = generate_config
             data.pop("stopping_criteria", None)
             data.pop("logits_processor", None)
@@ -265,16 +234,15 @@ class XllamaCppModel(LLM, ChatModelMixin):

     def chat(
         self,
-        messages: List[
-        generate_config: Optional[
+        messages: List[dict],
+        generate_config: Optional[dict] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-        generate_config =
+        generate_config = generate_config or {}
         stream = generate_config.get("stream", False)
         tools = generate_config.pop("tools", []) if generate_config else None
         q: queue.Queue = queue.Queue()

         def _handle_chat_completion():
-            # TODO(fyrestone): Replace the LlamaCppGenerateConfig with OpenAI params.
             data = generate_config
             data.pop("stopping_criteria", None)
             data.pop("logits_processor", None)
@@ -336,299 +304,3 @@ class XllamaCppModel(LLM, ChatModelMixin):
             if type(r) is _Error:
                 raise Exception("Got error in chat: %s", r.msg)
             return self._to_chat_completion(r, self.reasoning_parser)
-
-
-class LlamaCppModel(LLM):
-    def __init__(
-        self,
-        model_uid: str,
-        model_family: "LLMFamilyV1",
-        model_spec: "LLMSpecV1",
-        quantization: str,
-        model_path: str,
-        llamacpp_model_config: Optional[LlamaCppModelConfig] = None,
-    ):
-        super().__init__(model_uid, model_family, model_spec, quantization, model_path)
-
-        self._llamacpp_model_config: LlamaCppModelConfig = self._sanitize_model_config(
-            llamacpp_model_config
-        )
-        self._llm = None
-
-    def _can_apply_cublas(self):
-        # TODO: figure out the quantizations supported.
-        return True
-
-    def _sanitize_model_config(
-        self, llamacpp_model_config: Optional[LlamaCppModelConfig]
-    ) -> LlamaCppModelConfig:
-        if llamacpp_model_config is None:
-            llamacpp_model_config = LlamaCppModelConfig()
-
-        if self.model_family.context_length:
-            llamacpp_model_config.setdefault("n_ctx", self.model_family.context_length)
-        llamacpp_model_config.setdefault("use_mmap", False)
-        llamacpp_model_config.setdefault("use_mlock", True)
-
-        if (
-            "llama-2" in self.model_family.model_name
-            and self.model_spec.model_size_in_billions == 70
-        ):
-            llamacpp_model_config["use_mlock"] = False
-            llamacpp_model_config["n_gqa"] = 8
-
-        if self._is_darwin_and_apple_silicon():
-            llamacpp_model_config.setdefault("n_gpu_layers", -1)
-        elif self._is_linux() and self._can_apply_cublas():
-            llamacpp_model_config.setdefault("n_gpu_layers", -1)
-        llamacpp_model_config.setdefault("reasoning_content", False)
-
-        return llamacpp_model_config
-
-    def _sanitize_generate_config(
-        self, generate_config: Optional[LlamaCppGenerateConfig]
-    ) -> LlamaCppGenerateConfig:
-        if generate_config is None:
-            generate_config = LlamaCppGenerateConfig(
-                **CreateCompletionLlamaCpp().dict()
-            )
-        else:
-            from llama_cpp import LlamaGrammar
-
-            grammar = generate_config.get("grammar")
-            if grammar is not None and not isinstance(grammar, LlamaGrammar):
-                generate_config["grammar"] = LlamaGrammar.from_string(
-                    generate_config["grammar"]
-                )
-            # Validate generate_config and fill default values to the generate config.
-            generate_config = LlamaCppGenerateConfig(
-                **CreateCompletionLlamaCpp(**generate_config).dict()
-            )
-        # Currently, llama.cpp does not support lora
-        generate_config.pop("lora_name", None)  # type: ignore
-        return generate_config
-
-    def load(self):
-        try:
-            import llama_cpp
-            from llama_cpp import Llama
-
-            if llama_cpp.__version__ < "0.2.0":
-                raise ValueError(
-                    "The llama_cpp version must be greater than 0.2.0. "
-                    "Please upgrade your version via `pip install -U llama_cpp` or refer to "
-                    "https://github.com/abetlen/llama-cpp-python#installation-with-openblas--cublas--clblast--metal."
-                )
-        except ImportError:
-            error_message = "Failed to import module 'llama_cpp'"
-            installation_guide = [
-                "Please make sure 'llama_cpp' is installed. ",
-                "You can install it by visiting the installation section of the git repo:\n",
-                "https://github.com/abetlen/llama-cpp-python#installation-with-openblas--cublas--clblast--metal",
-            ]
-
-            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
-
-        reasoning_content = self._llamacpp_model_config.pop("reasoning_content")
-        self.prepare_parse_reasoning_content(reasoning_content)
-
-        if os.path.isfile(self.model_path):
-            # mostly passed from --model_path
-            model_path = self.model_path
-        else:
-            # handle legacy cache.
-            if (
-                self.model_spec.model_file_name_split_template
-                and self.model_spec.quantization_parts
-            ):
-                part = self.model_spec.quantization_parts[self.quantization]
-                model_path = os.path.join(
-                    self.model_path,
-                    self.model_spec.model_file_name_split_template.format(
-                        quantization=self.quantization, part=part[0]
-                    ),
-                )
-            else:
-                model_path = os.path.join(
-                    self.model_path,
-                    self.model_spec.model_file_name_template.format(
-                        quantization=self.quantization
-                    ),
-                )
-            legacy_model_file_path = os.path.join(self.model_path, "model.bin")
-            if os.path.exists(legacy_model_file_path):
-                model_path = legacy_model_file_path
-
-        try:
-            self._llm = Llama(
-                model_path=model_path,
-                verbose=True,
-                **self._llamacpp_model_config,
-            )
-        except AssertionError:
-            raise RuntimeError(f"Load model {self.model_family.model_name} failed")
-
-    @classmethod
-    def check_lib(cls) -> bool:
-        return importlib.util.find_spec("llama_cpp") is not None
-
-    @classmethod
-    def match_json(
-        cls, llm_family: LLMFamilyV1, llm_spec: LLMSpecV1, quantization: str
-    ) -> bool:
-        if llm_spec.model_format not in ["ggufv2"]:
-            return False
-        if "qwen" in llm_family.model_name:
-            return False
-        if "generate" not in llm_family.model_ability:
-            return False
-        return True
-
-    def generate(
-        self, prompt: str, generate_config: Optional[LlamaCppGenerateConfig] = None
-    ) -> Union[Completion, Iterator[CompletionChunk]]:
-        def generator_wrapper(
-            _prompt: str,
-            _generate_config: LlamaCppGenerateConfig,
-        ) -> Iterator[CompletionChunk]:
-            assert self._llm is not None
-            prompt_token_ids: List[int] = (
-                (
-                    self._llm.tokenize(prompt.encode("utf-8"), special=True)
-                    if prompt != ""
-                    else [self._llm.token_bos()]
-                )
-                if isinstance(prompt, str)
-                else prompt
-            )
-            prompt_tokens = len(prompt_token_ids)
-            completion_tokens, total_tokens = 0, 0
-            request_id = 0
-            for index, _completion_chunk in enumerate(
-                self._llm(prompt=_prompt, **_generate_config)
-            ):
-                _completion_chunk["model"] = self.model_uid
-                request_id = _completion_chunk["id"]
-                completion_tokens = index + 1
-                total_tokens = prompt_tokens + completion_tokens
-                _completion_chunk["usage"] = CompletionUsage(
-                    prompt_tokens=prompt_tokens,
-                    completion_tokens=completion_tokens,
-                    total_tokens=total_tokens,
-                )
-                yield _completion_chunk
-            if include_usage:
-                chunk = CompletionChunk(
-                    id=request_id,
-                    object="text_completion",
-                    created=int(time.time()),
-                    model=self.model_uid,
-                    choices=[],
-                )
-                chunk["usage"] = CompletionUsage(
-                    prompt_tokens=prompt_tokens,
-                    completion_tokens=completion_tokens,
-                    total_tokens=total_tokens,
-                )
-                yield chunk
-
-        logger.debug(
-            "Enter generate, prompt: %s, generate config: %s", prompt, generate_config
-        )
-
-        generate_config = self._sanitize_generate_config(generate_config)
-        stream = generate_config.get("stream", False)
-        stream_options = generate_config.pop("stream_options", None)
-        include_usage = (
-            stream_options["include_usage"]
-            if isinstance(stream_options, dict)
-            else False
-        )
-
-        if not stream:
-            assert self._llm is not None
-            completion = self._llm(prompt=prompt, **generate_config)
-
-            return completion
-        else:
-            return generator_wrapper(prompt, generate_config)
-
-
-class LlamaCppChatModel(LlamaCppModel, ChatModelMixin):
-    def __init__(
-        self,
-        model_uid: str,
-        model_family: "LLMFamilyV1",
-        model_spec: "LLMSpecV1",
-        quantization: str,
-        model_path: str,
-        llamacpp_model_config: Optional[LlamaCppModelConfig] = None,
-    ):
-        super().__init__(
-            model_uid,
-            model_family,
-            model_spec,
-            quantization,
-            model_path,
-            llamacpp_model_config,
-        )
-
-    @classmethod
-    def match_json(
-        cls, llm_family: LLMFamilyV1, llm_spec: LLMSpecV1, quantization: str
-    ) -> bool:
-        if llm_spec.model_format not in ["ggufv2"]:
-            return False
-        if "chat" not in llm_family.model_ability:
-            return False
-        return True
-
-    def _sanitize_generate_config(
-        self, generate_config: Optional[LlamaCppGenerateConfig]
-    ) -> LlamaCppGenerateConfig:
-        generate_config = super()._sanitize_generate_config(generate_config)
-        if self.model_family.stop and self.model_family.stop:
-            generate_config["stop"] = self.model_family.stop.copy()
-        return generate_config
-
-    def chat(
-        self,
-        messages: List[Dict],
-        generate_config: Optional[LlamaCppGenerateConfig] = None,
-    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-        model_family = self.model_family.model_family or self.model_family.model_name
-        tools = generate_config.pop("tools", []) if generate_config else None
-        full_context_kwargs = (
-            self._get_chat_template_kwargs_from_generate_config(generate_config) or {}  # type: ignore
-        )
-        if tools:
-            if (
-                model_family in QWEN_TOOL_CALL_FAMILY
-                or model_family in DEEPSEEK_TOOL_CALL_FAMILY
-            ):
-                full_context_kwargs["tools"] = tools
-        assert self.model_family.chat_template is not None
-        full_prompt = self.get_full_context(
-            messages, self.model_family.chat_template, **full_context_kwargs
-        )
-
-        generate_config = self._sanitize_generate_config(generate_config)
-
-        stream = generate_config.get("stream", False)
-        if stream:
-            it = self.generate(full_prompt, generate_config)
-            assert isinstance(it, Iterator)
-            return self._to_chat_completion_chunks(it, self.reasoning_parser)
-        else:
-            c = self.generate(full_prompt, generate_config)
-            assert not isinstance(c, Iterator)
-            if tools:
-                return self._post_process_completion(
-                    self.model_family, self.model_uid, c, self.reasoning_parser
-                )
-            return self._to_chat_completion(c, self.reasoning_parser)
-
-
-if USE_XLLAMACPP:
-    LlamaCppModel = XllamaCppModel  # type: ignore  # noqa: F811
-    LlamaCppChatModel = XllamaCppModel  # type: ignore  # noqa: F811