xinference 0.14.4.post1__py3-none-any.whl → 0.15.1__py3-none-any.whl

This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.

Files changed (194)
  1. xinference/_compat.py +51 -0
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +209 -40
  4. xinference/client/restful/restful_client.py +7 -26
  5. xinference/conftest.py +1 -1
  6. xinference/constants.py +5 -0
  7. xinference/core/cache_tracker.py +1 -1
  8. xinference/core/chat_interface.py +8 -14
  9. xinference/core/event.py +1 -1
  10. xinference/core/image_interface.py +28 -0
  11. xinference/core/model.py +110 -31
  12. xinference/core/scheduler.py +37 -37
  13. xinference/core/status_guard.py +1 -1
  14. xinference/core/supervisor.py +17 -10
  15. xinference/core/utils.py +80 -22
  16. xinference/core/worker.py +17 -16
  17. xinference/deploy/cmdline.py +8 -16
  18. xinference/deploy/local.py +1 -1
  19. xinference/deploy/supervisor.py +1 -1
  20. xinference/deploy/utils.py +1 -1
  21. xinference/deploy/worker.py +1 -1
  22. xinference/model/audio/cosyvoice.py +86 -41
  23. xinference/model/audio/fish_speech.py +9 -9
  24. xinference/model/audio/model_spec.json +9 -9
  25. xinference/model/audio/whisper.py +4 -1
  26. xinference/model/embedding/core.py +52 -31
  27. xinference/model/image/core.py +2 -1
  28. xinference/model/image/model_spec.json +16 -4
  29. xinference/model/image/model_spec_modelscope.json +16 -4
  30. xinference/model/image/sdapi.py +136 -0
  31. xinference/model/image/stable_diffusion/core.py +164 -19
  32. xinference/model/llm/__init__.py +29 -11
  33. xinference/model/llm/llama_cpp/core.py +16 -33
  34. xinference/model/llm/llm_family.json +1011 -1296
  35. xinference/model/llm/llm_family.py +34 -53
  36. xinference/model/llm/llm_family_csghub.json +18 -35
  37. xinference/model/llm/llm_family_modelscope.json +981 -1122
  38. xinference/model/llm/lmdeploy/core.py +56 -88
  39. xinference/model/llm/mlx/core.py +46 -69
  40. xinference/model/llm/sglang/core.py +36 -18
  41. xinference/model/llm/transformers/chatglm.py +168 -306
  42. xinference/model/llm/transformers/cogvlm2.py +36 -63
  43. xinference/model/llm/transformers/cogvlm2_video.py +33 -223
  44. xinference/model/llm/transformers/core.py +55 -50
  45. xinference/model/llm/transformers/deepseek_v2.py +340 -0
  46. xinference/model/llm/transformers/deepseek_vl.py +53 -96
  47. xinference/model/llm/transformers/glm4v.py +55 -111
  48. xinference/model/llm/transformers/intern_vl.py +39 -70
  49. xinference/model/llm/transformers/internlm2.py +32 -54
  50. xinference/model/llm/transformers/minicpmv25.py +22 -55
  51. xinference/model/llm/transformers/minicpmv26.py +158 -68
  52. xinference/model/llm/transformers/omnilmm.py +5 -28
  53. xinference/model/llm/transformers/qwen2_audio.py +168 -0
  54. xinference/model/llm/transformers/qwen2_vl.py +234 -0
  55. xinference/model/llm/transformers/qwen_vl.py +34 -86
  56. xinference/model/llm/transformers/utils.py +32 -38
  57. xinference/model/llm/transformers/yi_vl.py +32 -72
  58. xinference/model/llm/utils.py +280 -554
  59. xinference/model/llm/vllm/core.py +161 -100
  60. xinference/model/rerank/core.py +41 -8
  61. xinference/model/rerank/model_spec.json +7 -0
  62. xinference/model/rerank/model_spec_modelscope.json +7 -1
  63. xinference/model/utils.py +1 -31
  64. xinference/thirdparty/cosyvoice/bin/export_jit.py +64 -0
  65. xinference/thirdparty/cosyvoice/bin/export_trt.py +8 -0
  66. xinference/thirdparty/cosyvoice/bin/inference.py +5 -2
  67. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +38 -22
  68. xinference/thirdparty/cosyvoice/cli/model.py +139 -26
  69. xinference/thirdparty/cosyvoice/flow/flow.py +15 -9
  70. xinference/thirdparty/cosyvoice/flow/length_regulator.py +20 -1
  71. xinference/thirdparty/cosyvoice/hifigan/generator.py +8 -4
  72. xinference/thirdparty/cosyvoice/llm/llm.py +14 -13
  73. xinference/thirdparty/cosyvoice/transformer/attention.py +7 -3
  74. xinference/thirdparty/cosyvoice/transformer/decoder.py +1 -1
  75. xinference/thirdparty/cosyvoice/transformer/embedding.py +4 -3
  76. xinference/thirdparty/cosyvoice/transformer/encoder.py +4 -2
  77. xinference/thirdparty/cosyvoice/utils/common.py +36 -0
  78. xinference/thirdparty/cosyvoice/utils/file_utils.py +16 -0
  79. xinference/thirdparty/deepseek_vl/serve/assets/Kelpy-Codos.js +100 -0
  80. xinference/thirdparty/deepseek_vl/serve/assets/avatar.png +0 -0
  81. xinference/thirdparty/deepseek_vl/serve/assets/custom.css +355 -0
  82. xinference/thirdparty/deepseek_vl/serve/assets/custom.js +22 -0
  83. xinference/thirdparty/deepseek_vl/serve/assets/favicon.ico +0 -0
  84. xinference/thirdparty/deepseek_vl/serve/examples/app.png +0 -0
  85. xinference/thirdparty/deepseek_vl/serve/examples/chart.png +0 -0
  86. xinference/thirdparty/deepseek_vl/serve/examples/mirror.png +0 -0
  87. xinference/thirdparty/deepseek_vl/serve/examples/pipeline.png +0 -0
  88. xinference/thirdparty/deepseek_vl/serve/examples/puzzle.png +0 -0
  89. xinference/thirdparty/deepseek_vl/serve/examples/rap.jpeg +0 -0
  90. xinference/thirdparty/fish_speech/fish_speech/configs/base.yaml +87 -0
  91. xinference/thirdparty/fish_speech/fish_speech/configs/firefly_gan_vq.yaml +33 -0
  92. xinference/thirdparty/fish_speech/fish_speech/configs/lora/r_8_alpha_16.yaml +4 -0
  93. xinference/thirdparty/fish_speech/fish_speech/configs/text2semantic_finetune.yaml +83 -0
  94. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text-data.proto +24 -0
  95. xinference/thirdparty/fish_speech/fish_speech/i18n/README.md +27 -0
  96. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +1 -1
  97. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +1 -1
  98. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +1 -1
  99. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/pt_BR.json +1 -1
  100. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +1 -1
  101. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +2 -2
  102. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/__init__.py +0 -3
  103. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +169 -198
  104. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +4 -27
  105. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/.gitignore +114 -0
  106. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/README.md +36 -0
  107. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +9 -47
  108. xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +2 -2
  109. xinference/thirdparty/fish_speech/fish_speech/train.py +2 -0
  110. xinference/thirdparty/fish_speech/fish_speech/webui/css/style.css +161 -0
  111. xinference/thirdparty/fish_speech/fish_speech/webui/html/footer.html +11 -0
  112. xinference/thirdparty/fish_speech/fish_speech/webui/js/animate.js +69 -0
  113. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +12 -10
  114. xinference/thirdparty/fish_speech/tools/api.py +79 -134
  115. xinference/thirdparty/fish_speech/tools/commons.py +35 -0
  116. xinference/thirdparty/fish_speech/tools/download_models.py +3 -3
  117. xinference/thirdparty/fish_speech/tools/file.py +17 -0
  118. xinference/thirdparty/fish_speech/tools/llama/build_dataset.py +1 -1
  119. xinference/thirdparty/fish_speech/tools/llama/generate.py +29 -24
  120. xinference/thirdparty/fish_speech/tools/llama/merge_lora.py +1 -1
  121. xinference/thirdparty/fish_speech/tools/llama/quantize.py +2 -2
  122. xinference/thirdparty/fish_speech/tools/msgpack_api.py +34 -0
  123. xinference/thirdparty/fish_speech/tools/post_api.py +85 -44
  124. xinference/thirdparty/fish_speech/tools/sensevoice/README.md +59 -0
  125. xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py +1 -1
  126. xinference/thirdparty/fish_speech/tools/smart_pad.py +16 -3
  127. xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +2 -2
  128. xinference/thirdparty/fish_speech/tools/vqgan/inference.py +4 -2
  129. xinference/thirdparty/fish_speech/tools/webui.py +12 -146
  130. xinference/thirdparty/matcha/VERSION +1 -0
  131. xinference/thirdparty/matcha/hifigan/LICENSE +21 -0
  132. xinference/thirdparty/matcha/hifigan/README.md +101 -0
  133. xinference/thirdparty/omnilmm/LICENSE +201 -0
  134. xinference/thirdparty/whisper/__init__.py +156 -0
  135. xinference/thirdparty/whisper/__main__.py +3 -0
  136. xinference/thirdparty/whisper/assets/gpt2.tiktoken +50256 -0
  137. xinference/thirdparty/whisper/assets/mel_filters.npz +0 -0
  138. xinference/thirdparty/whisper/assets/multilingual.tiktoken +50257 -0
  139. xinference/thirdparty/whisper/audio.py +157 -0
  140. xinference/thirdparty/whisper/decoding.py +826 -0
  141. xinference/thirdparty/whisper/model.py +314 -0
  142. xinference/thirdparty/whisper/normalizers/__init__.py +2 -0
  143. xinference/thirdparty/whisper/normalizers/basic.py +76 -0
  144. xinference/thirdparty/whisper/normalizers/english.json +1741 -0
  145. xinference/thirdparty/whisper/normalizers/english.py +550 -0
  146. xinference/thirdparty/whisper/timing.py +386 -0
  147. xinference/thirdparty/whisper/tokenizer.py +395 -0
  148. xinference/thirdparty/whisper/transcribe.py +605 -0
  149. xinference/thirdparty/whisper/triton_ops.py +109 -0
  150. xinference/thirdparty/whisper/utils.py +316 -0
  151. xinference/thirdparty/whisper/version.py +1 -0
  152. xinference/types.py +14 -53
  153. xinference/web/ui/build/asset-manifest.json +6 -6
  154. xinference/web/ui/build/index.html +1 -1
  155. xinference/web/ui/build/static/css/{main.4bafd904.css → main.5061c4c3.css} +2 -2
  156. xinference/web/ui/build/static/css/main.5061c4c3.css.map +1 -0
  157. xinference/web/ui/build/static/js/main.754740c0.js +3 -0
  158. xinference/web/ui/build/static/js/{main.eb13fe95.js.LICENSE.txt → main.754740c0.js.LICENSE.txt} +2 -0
  159. xinference/web/ui/build/static/js/main.754740c0.js.map +1 -0
  160. xinference/web/ui/node_modules/.cache/babel-loader/10c69dc7a296779fcffedeff9393d832dfcb0013c36824adf623d3c518b801ff.json +1 -0
  161. xinference/web/ui/node_modules/.cache/babel-loader/68bede6d95bb5ef0b35bbb3ec5b8c937eaf6862c6cdbddb5ef222a7776aaf336.json +1 -0
  162. xinference/web/ui/node_modules/.cache/babel-loader/77d50223f3e734d4485cca538cb098a8c3a7a0a1a9f01f58cdda3af42fe1adf5.json +1 -0
  163. xinference/web/ui/node_modules/.cache/babel-loader/a56d5a642409a84988891089c98ca28ad0546432dfbae8aaa51bc5a280e1cdd2.json +1 -0
  164. xinference/web/ui/node_modules/.cache/babel-loader/cd90b08d177025dfe84209596fc51878f8a86bcaa6a240848a3d2e5fd4c7ff24.json +1 -0
  165. xinference/web/ui/node_modules/.cache/babel-loader/d9ff696a3e3471f01b46c63d18af32e491eb5dc0e43cb30202c96871466df57f.json +1 -0
  166. xinference/web/ui/node_modules/.cache/babel-loader/e42b72d4cc1ea412ebecbb8d040dc6c6bfee462c33903c2f1f3facb602ad742e.json +1 -0
  167. xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +1 -0
  168. xinference/web/ui/node_modules/.package-lock.json +37 -0
  169. xinference/web/ui/node_modules/a-sync-waterfall/package.json +21 -0
  170. xinference/web/ui/node_modules/nunjucks/node_modules/commander/package.json +48 -0
  171. xinference/web/ui/node_modules/nunjucks/package.json +112 -0
  172. xinference/web/ui/package-lock.json +38 -0
  173. xinference/web/ui/package.json +1 -0
  174. {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/METADATA +16 -10
  175. {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/RECORD +179 -127
  176. xinference/model/llm/transformers/llama_2.py +0 -108
  177. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/lit_module.py +0 -442
  178. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/discriminator.py +0 -44
  179. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/reference.py +0 -115
  180. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/wavenet.py +0 -225
  181. xinference/thirdparty/fish_speech/tools/auto_rerank.py +0 -159
  182. xinference/thirdparty/fish_speech/tools/gen_ref.py +0 -36
  183. xinference/thirdparty/fish_speech/tools/merge_asr_files.py +0 -55
  184. xinference/web/ui/build/static/css/main.4bafd904.css.map +0 -1
  185. xinference/web/ui/build/static/js/main.eb13fe95.js +0 -3
  186. xinference/web/ui/build/static/js/main.eb13fe95.js.map +0 -1
  187. xinference/web/ui/node_modules/.cache/babel-loader/0b11a5339468c13b2d31ac085e7effe4303259b2071abd46a0a8eb8529233a5e.json +0 -1
  188. xinference/web/ui/node_modules/.cache/babel-loader/213b5913e164773c2b0567455377765715f5f07225fbac77ad8e1e9dc9648a47.json +0 -1
  189. xinference/web/ui/node_modules/.cache/babel-loader/5c26a23b5eacf5b752a08531577ae3840bb247745ef9a39583dc2d05ba93a82a.json +0 -1
  190. xinference/web/ui/node_modules/.cache/babel-loader/978b57d1a04a701bc3fcfebc511f5f274eed6ed7eade67f6fb76c27d5fd9ecc8.json +0 -1
  191. {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/LICENSE +0 -0
  192. {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/WHEEL +0 -0
  193. {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/entry_points.txt +0 -0
  194. {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/__init__.py
@@ -45,7 +45,6 @@ from .llm_family import (
     LLMFamilyV1,
     LLMSpecV1,
     MLXLLMSpecV1,
-    PromptStyleV1,
     PytorchLLMSpecV1,
     get_cache_status,
     get_user_defined_llm_families,
@@ -137,13 +136,18 @@ def _install():
     from .transformers.cogvlm2 import CogVLM2Model
     from .transformers.cogvlm2_video import CogVLM2VideoModel
     from .transformers.core import PytorchChatModel, PytorchModel
+    from .transformers.deepseek_v2 import (
+        DeepSeekV2PytorchChatModel,
+        DeepSeekV2PytorchModel,
+    )
     from .transformers.deepseek_vl import DeepSeekVLChatModel
     from .transformers.glm4v import Glm4VModel
     from .transformers.intern_vl import InternVLChatModel
     from .transformers.internlm2 import Internlm2PytorchChatModel
-    from .transformers.llama_2 import LlamaPytorchChatModel, LlamaPytorchModel
     from .transformers.minicpmv25 import MiniCPMV25Model
     from .transformers.minicpmv26 import MiniCPMV26Model
+    from .transformers.qwen2_audio import Qwen2AudioChatModel
+    from .transformers.qwen2_vl import Qwen2VLChatModel
     from .transformers.qwen_vl import QwenVLChatModel
     from .transformers.yi_vl import YiVLChatModel
     from .vllm.core import VLLMChatModel, VLLMModel, VLLMVisionModel
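With these classes wired into _install(), the newly added families (Qwen2-VL, Qwen2-Audio, DeepSeek-V2) become launchable like any other built-in model. A minimal usage sketch against a running server, assuming the default local endpoint and that "qwen2-vl-instruct" is the name registered in the new llm_family.json:

from xinference.client import Client

client = Client("http://127.0.0.1:9997")   # default local endpoint
model_uid = client.launch_model(
    model_name="qwen2-vl-instruct",        # assumed name from the new spec
    model_type="LLM",
)
model = client.get_model(model_uid)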
@@ -170,11 +174,11 @@ def _install():
     TRANSFORMERS_CLASSES.extend(
         [
             ChatglmPytorchChatModel,
-            LlamaPytorchModel,
-            LlamaPytorchChatModel,
             PytorchChatModel,
             Internlm2PytorchChatModel,
             QwenVLChatModel,
+            Qwen2VLChatModel,
+            Qwen2AudioChatModel,
             YiVLChatModel,
             DeepSeekVLChatModel,
             InternVLChatModel,
@@ -184,6 +188,8 @@ def _install():
             MiniCPMV25Model,
             MiniCPMV26Model,
             Glm4VModel,
+            DeepSeekV2PytorchModel,
+            DeepSeekV2PytorchChatModel,
         ]
     )
     if OmniLMMModel:  # type: ignore
@@ -204,13 +210,17 @@ def _install():
         model_spec = LLMFamilyV1.parse_obj(json_obj)
         BUILTIN_LLM_FAMILIES.append(model_spec)

-        # register prompt style
+        # register chat_template
         if "chat" in model_spec.model_ability and isinstance(
-            model_spec.prompt_style, PromptStyleV1
+            model_spec.chat_template, str
         ):
             # note that the key is the model name,
             # since there are multiple representations of the same prompt style name in json.
-            BUILTIN_LLM_PROMPT_STYLE[model_spec.model_name] = model_spec.prompt_style
+            BUILTIN_LLM_PROMPT_STYLE[model_spec.model_name] = {
+                "chat_template": model_spec.chat_template,
+                "stop_token_ids": model_spec.stop_token_ids,
+                "stop": model_spec.stop,
+            }
         # register model family
         if "chat" in model_spec.model_ability:
             BUILTIN_LLM_MODEL_CHAT_FAMILIES.add(model_spec.model_name)
@@ -230,10 +240,14 @@ def _install():
         # if duplicated with huggingface json, keep it as the huggingface style
         if (
             "chat" in model_spec.model_ability
-            and isinstance(model_spec.prompt_style, PromptStyleV1)
+            and isinstance(model_spec.chat_template, str)
             and model_spec.model_name not in BUILTIN_LLM_PROMPT_STYLE
         ):
-            BUILTIN_LLM_PROMPT_STYLE[model_spec.model_name] = model_spec.prompt_style
+            BUILTIN_LLM_PROMPT_STYLE[model_spec.model_name] = {
+                "chat_template": model_spec.chat_template,
+                "stop_token_ids": model_spec.stop_token_ids,
+                "stop": model_spec.stop,
+            }
         # register model family
         if "chat" in model_spec.model_ability:
             BUILTIN_LLM_MODEL_CHAT_FAMILIES.add(model_spec.model_name)
@@ -253,10 +267,14 @@ def _install():
         # if duplicated with huggingface json, keep it as the huggingface style
         if (
             "chat" in model_spec.model_ability
-            and isinstance(model_spec.prompt_style, PromptStyleV1)
+            and isinstance(model_spec.chat_template, str)
             and model_spec.model_name not in BUILTIN_LLM_PROMPT_STYLE
         ):
-            BUILTIN_LLM_PROMPT_STYLE[model_spec.model_name] = model_spec.prompt_style
+            BUILTIN_LLM_PROMPT_STYLE[model_spec.model_name] = {
+                "chat_template": model_spec.chat_template,
+                "stop_token_ids": model_spec.stop_token_ids,
+                "stop": model_spec.stop,
+            }
         # register model family
         if "chat" in model_spec.model_ability:
             BUILTIN_LLM_MODEL_CHAT_FAMILIES.add(model_spec.model_name)
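All three registries (huggingface, modelscope, csghub) now store a plain dict in BUILTIN_LLM_PROMPT_STYLE instead of a PromptStyleV1 object. A sketch of the resulting entry shape; the keys mirror the registration code above, while the model name and values are illustrative only:

# Hypothetical entry; real templates come from llm_family.json.
BUILTIN_LLM_PROMPT_STYLE["my-chat-model"] = {
    "chat_template": "{% for message in messages %}...{% endfor %}",  # Jinja2 template
    "stop_token_ids": [2],    # token ids that terminate generation
    "stop": ["</s>"],         # literal stop strings
}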
xinference/model/llm/llama_cpp/core.py
@@ -14,12 +14,11 @@
 import logging
 import os
 import time
-from typing import Iterable, Iterator, List, Optional, Union
+from typing import Dict, Iterator, List, Optional, Union

 from ....types import (
     ChatCompletion,
     ChatCompletionChunk,
-    ChatCompletionMessage,
     Completion,
     CompletionChunk,
     CompletionUsage,
@@ -181,13 +180,12 @@ class LlamaCppModel(LLM):
             for index, _completion_chunk in enumerate(
                 self._llm(prompt=_prompt, **_generate_config)
             ):
+                _completion_chunk["model"] = self.model_uid
                 request_id = _completion_chunk["id"]
-                choice = _completion_chunk["choices"][0]
-                if choice["finish_reason"] is not None:
-                    completion_tokens = index
+                completion_tokens = index + 1
                 total_tokens = prompt_tokens + completion_tokens
                 _completion_chunk["usage"] = CompletionUsage(
-                    prompt_tokens=total_tokens,
+                    prompt_tokens=prompt_tokens,
                     completion_tokens=completion_tokens,
                     total_tokens=total_tokens,
                 )
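This hunk also fixes the streamed usage accounting: previously completion_tokens was only set on a chunk that carried a finish_reason, and the running total was written into prompt_tokens. A small sketch of the corrected arithmetic, with an illustrative prompt length:

prompt_tokens = 12                    # illustrative prompt length
for index in range(3):                # three streamed chunks
    completion_tokens = index + 1     # 1, 2, 3: one new token per chunk
    total_tokens = prompt_tokens + completion_tokens   # 13, 14, 15
    # the old code reported prompt_tokens=total_tokens; the new code
    # reports prompt_tokens, completion_tokens, and total_tokens correctly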
@@ -262,39 +260,26 @@ class LlamaCppChatModel(LlamaCppModel, ChatModelMixin):
         self, generate_config: Optional[LlamaCppGenerateConfig]
     ) -> LlamaCppGenerateConfig:
         generate_config = super()._sanitize_generate_config(generate_config)
-        if self.model_family.prompt_style and self.model_family.prompt_style.stop:
-            generate_config["stop"] = self.model_family.prompt_style.stop
+        if self.model_family.stop and self.model_family.stop:
+            generate_config["stop"] = self.model_family.stop.copy()
         return generate_config

     def chat(
         self,
-        prompt: str,
-        system_prompt: Optional[str] = None,
-        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        messages: List[Dict],
         generate_config: Optional[LlamaCppGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-        assert self.model_family.prompt_style is not None
-        prompt_style = self.model_family.prompt_style.copy()
-        if system_prompt:
-            prompt_style.system_prompt = system_prompt
-
-        chat_history = chat_history or []
-        assert prompt_style is not None
+        model_family = self.model_family.model_family or self.model_family.model_name
         tools = generate_config.pop("tools", []) if generate_config else None
-        full_prompt = self.get_prompt(prompt, chat_history, prompt_style, tools=tools)
+        full_context_kwargs = {}
+        if tools and model_family in QWEN_TOOL_CALL_FAMILY:
+            full_context_kwargs["tools"] = tools
+        assert self.model_family.chat_template is not None
+        full_prompt = self.get_full_context(
+            messages, self.model_family.chat_template, **full_context_kwargs
+        )

         generate_config = self._sanitize_generate_config(generate_config)
-        # TODO(codingl2k1): qwen hacky to set stop for function call.
-        model_family = self.model_family.model_family or self.model_family.model_name
-        if tools and model_family in QWEN_TOOL_CALL_FAMILY:
-            stop = generate_config.get("stop")
-            if isinstance(stop, str):
-                generate_config["stop"] = [stop, "Observation:"]
-            elif isinstance(stop, Iterable):
-                assert not isinstance(stop, str)
-                generate_config["stop"] = stop + ["Observation:"]  # type: ignore
-            else:
-                generate_config["stop"] = "Observation:"

         stream = generate_config.get("stream", False)
         if stream:
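chat() now takes OpenAI-style message dicts and renders them through the family's Jinja2 chat_template via get_full_context, rather than assembling a prompt from (prompt, system_prompt, chat_history). A hedged sketch of the new calling convention, assuming model is an already-loaded LlamaCppChatModel instance:

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is the capital of France?"},
]
completion = model.chat(messages, generate_config={"max_tokens": 64})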
@@ -305,7 +290,5 @@ class LlamaCppChatModel(LlamaCppModel, ChatModelMixin):
             c = self.generate(full_prompt, generate_config)
             assert not isinstance(c, Iterator)
             if tools:
-                return self._tool_calls_completion(
-                    self.model_family, self.model_uid, c, tools
-                )
+                return self._tool_calls_completion(self.model_family, self.model_uid, c)
             return self._to_chat_completion(c)