xinference 1.5.0.post2__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +107 -11
- xinference/client/restful/restful_client.py +51 -11
- xinference/constants.py +5 -1
- xinference/core/media_interface.py +758 -0
- xinference/core/model.py +49 -9
- xinference/core/supervisor.py +1 -1
- xinference/core/utils.py +1 -1
- xinference/core/worker.py +33 -39
- xinference/deploy/cmdline.py +17 -0
- xinference/deploy/utils.py +0 -3
- xinference/model/audio/__init__.py +16 -27
- xinference/model/audio/core.py +2 -1
- xinference/model/audio/cosyvoice.py +4 -2
- xinference/model/audio/model_spec.json +63 -46
- xinference/model/audio/model_spec_modelscope.json +31 -14
- xinference/model/embedding/__init__.py +16 -24
- xinference/model/image/__init__.py +15 -25
- xinference/model/llm/__init__.py +40 -115
- xinference/model/llm/core.py +29 -6
- xinference/model/llm/llama_cpp/core.py +30 -347
- xinference/model/llm/llm_family.json +1674 -2203
- xinference/model/llm/llm_family.py +71 -7
- xinference/model/llm/llm_family_csghub.json +0 -32
- xinference/model/llm/llm_family_modelscope.json +1838 -2016
- xinference/model/llm/llm_family_openmind_hub.json +19 -325
- xinference/model/llm/lmdeploy/core.py +7 -2
- xinference/model/llm/mlx/core.py +23 -7
- xinference/model/llm/reasoning_parser.py +281 -5
- xinference/model/llm/sglang/core.py +39 -11
- xinference/model/llm/transformers/chatglm.py +9 -2
- xinference/model/llm/transformers/cogagent.py +10 -12
- xinference/model/llm/transformers/cogvlm2.py +6 -3
- xinference/model/llm/transformers/cogvlm2_video.py +3 -6
- xinference/model/llm/transformers/core.py +58 -60
- xinference/model/llm/transformers/deepseek_v2.py +4 -2
- xinference/model/llm/transformers/deepseek_vl.py +10 -4
- xinference/model/llm/transformers/deepseek_vl2.py +9 -4
- xinference/model/llm/transformers/gemma3.py +4 -5
- xinference/model/llm/transformers/glm4v.py +3 -21
- xinference/model/llm/transformers/glm_edge_v.py +3 -20
- xinference/model/llm/transformers/intern_vl.py +3 -6
- xinference/model/llm/transformers/internlm2.py +1 -1
- xinference/model/llm/transformers/minicpmv25.py +4 -2
- xinference/model/llm/transformers/minicpmv26.py +5 -3
- xinference/model/llm/transformers/omnilmm.py +1 -1
- xinference/model/llm/transformers/opt.py +1 -1
- xinference/model/llm/transformers/ovis2.py +302 -0
- xinference/model/llm/transformers/qwen-omni.py +8 -1
- xinference/model/llm/transformers/qwen2_audio.py +3 -1
- xinference/model/llm/transformers/qwen2_vl.py +5 -1
- xinference/model/llm/transformers/qwen_vl.py +5 -2
- xinference/model/llm/utils.py +96 -45
- xinference/model/llm/vllm/core.py +108 -24
- xinference/model/llm/vllm/distributed_executor.py +8 -7
- xinference/model/llm/vllm/xavier/allocator.py +1 -1
- xinference/model/llm/vllm/xavier/block_manager.py +1 -1
- xinference/model/llm/vllm/xavier/block_tracker.py +3 -3
- xinference/model/llm/vllm/xavier/executor.py +1 -1
- xinference/model/llm/vllm/xavier/test/test_xavier.py +2 -11
- xinference/model/rerank/__init__.py +13 -24
- xinference/model/video/__init__.py +15 -25
- xinference/model/video/core.py +3 -3
- xinference/model/video/diffusers.py +157 -13
- xinference/model/video/model_spec.json +100 -0
- xinference/model/video/model_spec_modelscope.json +104 -0
- xinference/thirdparty/cosyvoice/bin/average_model.py +5 -4
- xinference/thirdparty/cosyvoice/bin/export_jit.py +50 -20
- xinference/thirdparty/cosyvoice/bin/export_onnx.py +136 -51
- xinference/thirdparty/cosyvoice/bin/inference.py +15 -5
- xinference/thirdparty/cosyvoice/bin/train.py +7 -2
- xinference/thirdparty/cosyvoice/cli/cosyvoice.py +72 -52
- xinference/thirdparty/cosyvoice/cli/frontend.py +58 -58
- xinference/thirdparty/cosyvoice/cli/model.py +140 -155
- xinference/thirdparty/cosyvoice/dataset/processor.py +9 -5
- xinference/thirdparty/cosyvoice/flow/decoder.py +656 -54
- xinference/thirdparty/cosyvoice/flow/flow.py +69 -11
- xinference/thirdparty/cosyvoice/flow/flow_matching.py +167 -63
- xinference/thirdparty/cosyvoice/flow/length_regulator.py +1 -0
- xinference/thirdparty/cosyvoice/hifigan/discriminator.py +91 -1
- xinference/thirdparty/cosyvoice/hifigan/f0_predictor.py +4 -1
- xinference/thirdparty/cosyvoice/hifigan/generator.py +4 -1
- xinference/thirdparty/cosyvoice/hifigan/hifigan.py +2 -2
- xinference/thirdparty/cosyvoice/llm/llm.py +198 -18
- xinference/thirdparty/cosyvoice/transformer/embedding.py +12 -4
- xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +124 -21
- xinference/thirdparty/cosyvoice/utils/class_utils.py +13 -0
- xinference/thirdparty/cosyvoice/utils/common.py +1 -1
- xinference/thirdparty/cosyvoice/utils/file_utils.py +40 -2
- xinference/thirdparty/cosyvoice/utils/frontend_utils.py +7 -0
- xinference/thirdparty/cosyvoice/utils/mask.py +4 -0
- xinference/thirdparty/cosyvoice/utils/train_utils.py +5 -1
- xinference/thirdparty/matcha/hifigan/xutils.py +3 -3
- xinference/types.py +2 -71
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/{main.0f6523be.css → main.337afe76.css} +2 -2
- xinference/web/ui/build/static/css/main.337afe76.css.map +1 -0
- xinference/web/ui/build/static/js/main.ae579a97.js +3 -0
- xinference/web/ui/build/static/js/main.ae579a97.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0196a4b09e3264614e54360d5f832c46b31d964ec58296765ebff191ace6adbf.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/12e02ee790dbf57ead09a241a93bb5f893393aa36628ca741d44390e836a103f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/18fa271456b31cded36c05c4c71c6b2b1cf4e4128c1e32f0e45d8b9f21764397.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5c08e2cd07809ed3e41486b16652253404cbb63a3ff8d0366ee50f57e2413cea.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/6798e126f3bc5f95a4c16a9c2ad52ffe77970c62406d83e20604dfda7ffd2247.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b617f7d21a95045fc57b26a9373551740f1978a826134cbf705c3a1bf8714a93.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/c1506cb142151366074975f30fa1ff9cd6e5e978b62a4b074dfc16fe08d70d75.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/c5c7c2cd1b863ce41adff2c4737bba06eef3a1acf28288cb83d992060f6b8923.json +1 -0
- xinference/web/ui/src/locales/en.json +7 -4
- xinference/web/ui/src/locales/zh.json +7 -4
- {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/METADATA +56 -36
- {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/RECORD +120 -121
- {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/WHEEL +1 -1
- xinference/core/image_interface.py +0 -377
- xinference/model/llm/transformers/compression.py +0 -258
- xinference/model/llm/transformers/yi_vl.py +0 -239
- xinference/thirdparty/cosyvoice/bin/export_trt.sh +0 -9
- xinference/web/ui/build/static/css/main.0f6523be.css.map +0 -1
- xinference/web/ui/build/static/js/main.4b67a723.js +0 -3
- xinference/web/ui/build/static/js/main.4b67a723.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0f0adb2283a8f469d097a7a0ebb754624fa52414c83b83696c41f2e6a737ceda.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/51709f5d3e53bcf19e613662ef9b91fb9174942c5518987a248348dd4e1e0e02.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8157db83995c671eb57abc316c337f867d1dc63fb83520bb4ff351fee57dcce2.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8f9af2979e45d4648f0cfae108363e58ee421c29a9d4e7329b6f06d9adfd4133.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/9c8b1a86e7c65b2b2599a205e30920652d6c2105f926508ef5bcf29a3ef4ce76.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/b8551e9775a01b28ae674125c688febe763732ea969ae344512e64ea01bf632e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e4ba658c6b3b0490910acdae0c535a892257efb61539a24adf8038fc653bd22f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/efe7cd132c27a8f9fd5352a394c491fd5fb0da0348cf9fcbd923164a32365eab.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f04f666b77b44d7be3e16034d6b0074de2ba9c254f1fae15222b3148608fa8b3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f199e8173f6409a5802ed44acb95f218388131136504b2e9132129e150c92f9a.json +0 -1
- /xinference/web/ui/build/static/js/{main.4b67a723.js.LICENSE.txt → main.ae579a97.js.LICENSE.txt} +0 -0
- {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/llama_cpp/core.py +30 -347

@@ -12,32 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import concurrent.futures
+import importlib.util
 import logging
 import os
 import queue
-import time
-from typing import Dict, Iterator, List, Optional, Union
+from typing import Iterator, List, Optional, Union

 import orjson

-from ....types import (
-    ChatCompletion,
-    ChatCompletionChunk,
-    Completion,
-    CompletionChunk,
-    CompletionUsage,
-    CreateCompletionLlamaCpp,
-    LlamaCppGenerateConfig,
-    LlamaCppModelConfig,
-)
+from ....types import ChatCompletion, ChatCompletionChunk, Completion, CompletionChunk
 from ..core import LLM
 from ..llm_family import LLMFamilyV1, LLMSpecV1
-from ..utils import DEEPSEEK_TOOL_CALL_FAMILY, QWEN_TOOL_CALL_FAMILY, ChatModelMixin
+from ..utils import ChatModelMixin

 logger = logging.getLogger(__name__)

-USE_XLLAMACPP = bool(int(os.environ.get("USE_XLLAMACPP", 1)))
-

 class _Done:
     pass
@@ -56,21 +45,16 @@ class XllamaCppModel(LLM, ChatModelMixin):
         model_spec: "LLMSpecV1",
         quantization: str,
         model_path: str,
-        llamacpp_model_config: Optional[LlamaCppModelConfig] = None,
+        llamacpp_model_config: Optional[dict] = None,
     ):
         super().__init__(model_uid, model_family, model_spec, quantization, model_path)
-
-        self._llamacpp_model_config: LlamaCppModelConfig = self._sanitize_model_config(
-            llamacpp_model_config
-        )
+        self._llamacpp_model_config = self._sanitize_model_config(llamacpp_model_config)
         self._llm = None
         self._executor: Optional[concurrent.futures.ThreadPoolExecutor] = None

-    def _sanitize_model_config(
-        self, llamacpp_model_config: Optional[LlamaCppModelConfig]
-    ) -> LlamaCppModelConfig:
+    def _sanitize_model_config(self, llamacpp_model_config: Optional[dict]) -> dict:
         if llamacpp_model_config is None:
-            llamacpp_model_config = LlamaCppModelConfig()
+            llamacpp_model_config = {}

         if self.model_family.context_length:
             llamacpp_model_config.setdefault("n_ctx", self.model_family.context_length)
@@ -92,31 +76,12 @@ class XllamaCppModel(LLM, ChatModelMixin):

         return llamacpp_model_config

-    def _sanitize_generate_config(
-        self, generate_config: Optional[LlamaCppGenerateConfig]
-    ) -> LlamaCppGenerateConfig:
-        if generate_config is None:
-            generate_config = LlamaCppGenerateConfig(
-                **CreateCompletionLlamaCpp().dict()
-            )
-        else:
-            from llama_cpp import LlamaGrammar
-
-            grammar = generate_config.get("grammar")
-            if grammar is not None and not isinstance(grammar, LlamaGrammar):
-                generate_config["grammar"] = LlamaGrammar.from_string(
-                    generate_config["grammar"]
-                )
-        # Validate generate_config and fill default values to the generate config.
-        generate_config = LlamaCppGenerateConfig(
-            **CreateCompletionLlamaCpp(**generate_config).dict()
-        )
-        # Currently, llama.cpp does not support lora
-        generate_config.pop("lora_name", None)  # type: ignore
-        return generate_config
+    @classmethod
+    def check_lib(cls) -> bool:
+        return importlib.util.find_spec("xllamacpp") is not None

     @classmethod
-    def match(
+    def match_json(
         cls, llm_family: LLMFamilyV1, llm_spec: LLMSpecV1, quantization: str
     ) -> bool:
         if llm_spec.model_format not in ["ggufv2"]:
@@ -138,7 +103,10 @@ class XllamaCppModel(LLM, ChatModelMixin):
             raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")

         reasoning_content = self._llamacpp_model_config.pop("reasoning_content")
-        self.prepare_parse_reasoning_content(reasoning_content)
+        enable_thinking = self._llamacpp_model_config.pop("enable_thinking", True)
+        self.prepare_parse_reasoning_content(
+            reasoning_content, enable_thinking=enable_thinking
+        )

         if os.path.isfile(self.model_path):
             # mostly passed from --model_path
@@ -147,7 +115,7 @@ class XllamaCppModel(LLM, ChatModelMixin):
             # handle legacy cache.
             if (
                 self.model_spec.model_file_name_split_template
-                and self.model_spec.quantization_parts
+                and self.quantization in self.model_spec.quantization_parts
             ):
                 part = self.model_spec.quantization_parts[self.quantization]
                 model_path = os.path.join(
@@ -180,7 +148,14 @@ class XllamaCppModel(LLM, ChatModelMixin):
         params.n_parallel = os.cpu_count()
         for k, v in self._llamacpp_model_config.items():
             try:
-                setattr(params, k, v)
+                if "." in k:
+                    parts = k.split(".")
+                    sub_param = params
+                    for p in parts[:-1]:
+                        sub_param = getattr(sub_param, p)
+                    setattr(sub_param, parts[-1], v)
+                else:
+                    setattr(params, k, v)
             except Exception as e:
                 logger.error("Failed to set the param %s = %s, error: %s", k, v, e)
         n_threads = self._llamacpp_model_config.get("n_threads", os.cpu_count())
@@ -198,14 +173,13 @@ class XllamaCppModel(LLM, ChatModelMixin):
             raise RuntimeError(f"Load model {self.model_family.model_name} failed")

     def generate(
-        self, prompt: str, generate_config: Optional[LlamaCppGenerateConfig] = None
+        self, prompt: str, generate_config: Optional[dict] = None
     ) -> Union[Completion, Iterator[CompletionChunk]]:
-        generate_config = self._sanitize_generate_config(generate_config)
+        generate_config = generate_config or {}
         stream = generate_config.get("stream", False)
         q: queue.Queue = queue.Queue()

         def _handle_completion():
-            # TODO(fyrestone): Replace the LlamaCppGenerateConfig with OpenAI params.
             data = generate_config
             data.pop("stopping_criteria", None)
             data.pop("logits_processor", None)
@@ -260,16 +234,15 @@ class XllamaCppModel(LLM, ChatModelMixin):

     def chat(
         self,
-        messages: List[Dict],
-        generate_config: Optional[LlamaCppGenerateConfig] = None,
+        messages: List[dict],
+        generate_config: Optional[dict] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-        generate_config = self._sanitize_generate_config(generate_config)
+        generate_config = generate_config or {}
         stream = generate_config.get("stream", False)
         tools = generate_config.pop("tools", []) if generate_config else None
         q: queue.Queue = queue.Queue()

         def _handle_chat_completion():
-            # TODO(fyrestone): Replace the LlamaCppGenerateConfig with OpenAI params.
             data = generate_config
             data.pop("stopping_criteria", None)
             data.pop("logits_processor", None)
@@ -331,293 +304,3 @@ class XllamaCppModel(LLM, ChatModelMixin):
         if type(r) is _Error:
             raise Exception("Got error in chat: %s", r.msg)
         return self._to_chat_completion(r, self.reasoning_parser)
-
-
-class LlamaCppModel(LLM):
-    def __init__(
-        self,
-        model_uid: str,
-        model_family: "LLMFamilyV1",
-        model_spec: "LLMSpecV1",
-        quantization: str,
-        model_path: str,
-        llamacpp_model_config: Optional[LlamaCppModelConfig] = None,
-    ):
-        super().__init__(model_uid, model_family, model_spec, quantization, model_path)
-
-        self._llamacpp_model_config: LlamaCppModelConfig = self._sanitize_model_config(
-            llamacpp_model_config
-        )
-        self._llm = None
-
-    def _can_apply_cublas(self):
-        # TODO: figure out the quantizations supported.
-        return True
-
-    def _sanitize_model_config(
-        self, llamacpp_model_config: Optional[LlamaCppModelConfig]
-    ) -> LlamaCppModelConfig:
-        if llamacpp_model_config is None:
-            llamacpp_model_config = LlamaCppModelConfig()
-
-        if self.model_family.context_length:
-            llamacpp_model_config.setdefault("n_ctx", self.model_family.context_length)
-        llamacpp_model_config.setdefault("use_mmap", False)
-        llamacpp_model_config.setdefault("use_mlock", True)
-
-        if (
-            "llama-2" in self.model_family.model_name
-            and self.model_spec.model_size_in_billions == 70
-        ):
-            llamacpp_model_config["use_mlock"] = False
-            llamacpp_model_config["n_gqa"] = 8
-
-        if self._is_darwin_and_apple_silicon():
-            llamacpp_model_config.setdefault("n_gpu_layers", -1)
-        elif self._is_linux() and self._can_apply_cublas():
-            llamacpp_model_config.setdefault("n_gpu_layers", -1)
-        llamacpp_model_config.setdefault("reasoning_content", False)
-
-        return llamacpp_model_config
-
-    def _sanitize_generate_config(
-        self, generate_config: Optional[LlamaCppGenerateConfig]
-    ) -> LlamaCppGenerateConfig:
-        if generate_config is None:
-            generate_config = LlamaCppGenerateConfig(
-                **CreateCompletionLlamaCpp().dict()
-            )
-        else:
-            from llama_cpp import LlamaGrammar
-
-            grammar = generate_config.get("grammar")
-            if grammar is not None and not isinstance(grammar, LlamaGrammar):
-                generate_config["grammar"] = LlamaGrammar.from_string(
-                    generate_config["grammar"]
-                )
-        # Validate generate_config and fill default values to the generate config.
-        generate_config = LlamaCppGenerateConfig(
-            **CreateCompletionLlamaCpp(**generate_config).dict()
-        )
-        # Currently, llama.cpp does not support lora
-        generate_config.pop("lora_name", None)  # type: ignore
-        return generate_config
-
-    def load(self):
-        try:
-            import llama_cpp
-            from llama_cpp import Llama
-
-            if llama_cpp.__version__ < "0.2.0":
-                raise ValueError(
-                    "The llama_cpp version must be greater than 0.2.0. "
-                    "Please upgrade your version via `pip install -U llama_cpp` or refer to "
-                    "https://github.com/abetlen/llama-cpp-python#installation-with-openblas--cublas--clblast--metal."
-                )
-        except ImportError:
-            error_message = "Failed to import module 'llama_cpp'"
-            installation_guide = [
-                "Please make sure 'llama_cpp' is installed. ",
-                "You can install it by visiting the installation section of the git repo:\n",
-                "https://github.com/abetlen/llama-cpp-python#installation-with-openblas--cublas--clblast--metal",
-            ]
-
-            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
-
-        reasoning_content = self._llamacpp_model_config.pop("reasoning_content")
-        self.prepare_parse_reasoning_content(reasoning_content)
-
-        if os.path.isfile(self.model_path):
-            # mostly passed from --model_path
-            model_path = self.model_path
-        else:
-            # handle legacy cache.
-            if (
-                self.model_spec.model_file_name_split_template
-                and self.model_spec.quantization_parts
-            ):
-                part = self.model_spec.quantization_parts[self.quantization]
-                model_path = os.path.join(
-                    self.model_path,
-                    self.model_spec.model_file_name_split_template.format(
-                        quantization=self.quantization, part=part[0]
-                    ),
-                )
-            else:
-                model_path = os.path.join(
-                    self.model_path,
-                    self.model_spec.model_file_name_template.format(
-                        quantization=self.quantization
-                    ),
-                )
-                legacy_model_file_path = os.path.join(self.model_path, "model.bin")
-                if os.path.exists(legacy_model_file_path):
-                    model_path = legacy_model_file_path
-
-        try:
-            self._llm = Llama(
-                model_path=model_path,
-                verbose=True,
-                **self._llamacpp_model_config,
-            )
-        except AssertionError:
-            raise RuntimeError(f"Load model {self.model_family.model_name} failed")
-
-    @classmethod
-    def match(
-        cls, llm_family: LLMFamilyV1, llm_spec: LLMSpecV1, quantization: str
-    ) -> bool:
-        if llm_spec.model_format not in ["ggufv2"]:
-            return False
-        if "qwen" in llm_family.model_name:
-            return False
-        if "generate" not in llm_family.model_ability:
-            return False
-        return True
-
-    def generate(
-        self, prompt: str, generate_config: Optional[LlamaCppGenerateConfig] = None
-    ) -> Union[Completion, Iterator[CompletionChunk]]:
-        def generator_wrapper(
-            _prompt: str,
-            _generate_config: LlamaCppGenerateConfig,
-        ) -> Iterator[CompletionChunk]:
-            assert self._llm is not None
-            prompt_token_ids: List[int] = (
-                (
-                    self._llm.tokenize(prompt.encode("utf-8"), special=True)
-                    if prompt != ""
-                    else [self._llm.token_bos()]
-                )
-                if isinstance(prompt, str)
-                else prompt
-            )
-            prompt_tokens = len(prompt_token_ids)
-            completion_tokens, total_tokens = 0, 0
-            request_id = 0
-            for index, _completion_chunk in enumerate(
-                self._llm(prompt=_prompt, **_generate_config)
-            ):
-                _completion_chunk["model"] = self.model_uid
-                request_id = _completion_chunk["id"]
-                completion_tokens = index + 1
-                total_tokens = prompt_tokens + completion_tokens
-                _completion_chunk["usage"] = CompletionUsage(
-                    prompt_tokens=prompt_tokens,
-                    completion_tokens=completion_tokens,
-                    total_tokens=total_tokens,
-                )
-                yield _completion_chunk
-            if include_usage:
-                chunk = CompletionChunk(
-                    id=request_id,
-                    object="text_completion",
-                    created=int(time.time()),
-                    model=self.model_uid,
-                    choices=[],
-                )
-                chunk["usage"] = CompletionUsage(
-                    prompt_tokens=prompt_tokens,
-                    completion_tokens=completion_tokens,
-                    total_tokens=total_tokens,
-                )
-                yield chunk
-
-        logger.debug(
-            "Enter generate, prompt: %s, generate config: %s", prompt, generate_config
-        )
-
-        generate_config = self._sanitize_generate_config(generate_config)
-        stream = generate_config.get("stream", False)
-        stream_options = generate_config.pop("stream_options", None)
-        include_usage = (
-            stream_options["include_usage"]
-            if isinstance(stream_options, dict)
-            else False
-        )
-
-        if not stream:
-            assert self._llm is not None
-            completion = self._llm(prompt=prompt, **generate_config)
-
-            return completion
-        else:
-            return generator_wrapper(prompt, generate_config)
-
-
-class LlamaCppChatModel(LlamaCppModel, ChatModelMixin):
-    def __init__(
-        self,
-        model_uid: str,
-        model_family: "LLMFamilyV1",
-        model_spec: "LLMSpecV1",
-        quantization: str,
-        model_path: str,
-        llamacpp_model_config: Optional[LlamaCppModelConfig] = None,
-    ):
-        super().__init__(
-            model_uid,
-            model_family,
-            model_spec,
-            quantization,
-            model_path,
-            llamacpp_model_config,
-        )
-
-    @classmethod
-    def match(
-        cls, llm_family: LLMFamilyV1, llm_spec: LLMSpecV1, quantization: str
-    ) -> bool:
-        if llm_spec.model_format not in ["ggufv2"]:
-            return False
-        if "chat" not in llm_family.model_ability:
-            return False
-        return True
-
-    def _sanitize_generate_config(
-        self, generate_config: Optional[LlamaCppGenerateConfig]
-    ) -> LlamaCppGenerateConfig:
-        generate_config = super()._sanitize_generate_config(generate_config)
-        if self.model_family.stop and self.model_family.stop:
-            generate_config["stop"] = self.model_family.stop.copy()
-        return generate_config
-
-    def chat(
-        self,
-        messages: List[Dict],
-        generate_config: Optional[LlamaCppGenerateConfig] = None,
-    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-        model_family = self.model_family.model_family or self.model_family.model_name
-        tools = generate_config.pop("tools", []) if generate_config else None
-        full_context_kwargs = {}
-        if tools:
-            if (
-                model_family in QWEN_TOOL_CALL_FAMILY
-                or model_family in DEEPSEEK_TOOL_CALL_FAMILY
-            ):
-                full_context_kwargs["tools"] = tools
-        assert self.model_family.chat_template is not None
-        full_prompt = self.get_full_context(
-            messages, self.model_family.chat_template, **full_context_kwargs
-        )
-
-        generate_config = self._sanitize_generate_config(generate_config)
-
-        stream = generate_config.get("stream", False)
-        if stream:
-            it = self.generate(full_prompt, generate_config)
-            assert isinstance(it, Iterator)
-            return self._to_chat_completion_chunks(it, self.reasoning_parser)
-        else:
-            c = self.generate(full_prompt, generate_config)
-            assert not isinstance(c, Iterator)
-            if tools:
-                return self._post_process_completion(
-                    self.model_family, self.model_uid, c, self.reasoning_parser
-                )
-            return self._to_chat_completion(c, self.reasoning_parser)
-
-
-if USE_XLLAMACPP:
-    LlamaCppModel = XllamaCppModel  # type: ignore  # noqa: F811
-    LlamaCppChatModel = XllamaCppModel  # type: ignore  # noqa: F811