xinference 0.14.1.post1__py3-none-any.whl → 0.14.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: the registry flags this version of xinference as potentially problematic; see the registry page for details.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +15 -34
- xinference/client/restful/restful_client.py +2 -2
- xinference/core/chat_interface.py +45 -10
- xinference/core/image_interface.py +9 -0
- xinference/core/model.py +8 -5
- xinference/core/scheduler.py +1 -2
- xinference/core/worker.py +49 -42
- xinference/deploy/cmdline.py +2 -2
- xinference/deploy/test/test_cmdline.py +7 -7
- xinference/model/audio/chattts.py +24 -9
- xinference/model/audio/core.py +8 -2
- xinference/model/audio/fish_speech.py +228 -0
- xinference/model/audio/model_spec.json +8 -0
- xinference/model/embedding/core.py +23 -1
- xinference/model/image/model_spec.json +2 -1
- xinference/model/image/model_spec_modelscope.json +2 -1
- xinference/model/image/stable_diffusion/core.py +49 -1
- xinference/model/llm/__init__.py +26 -27
- xinference/model/llm/{ggml/llamacpp.py → llama_cpp/core.py} +2 -35
- xinference/model/llm/llm_family.json +606 -1266
- xinference/model/llm/llm_family.py +16 -139
- xinference/model/llm/llm_family_modelscope.json +276 -313
- xinference/model/llm/lmdeploy/__init__.py +0 -0
- xinference/model/llm/lmdeploy/core.py +557 -0
- xinference/model/llm/memory.py +9 -9
- xinference/model/llm/sglang/core.py +2 -2
- xinference/model/llm/{pytorch → transformers}/chatglm.py +6 -13
- xinference/model/llm/{pytorch → transformers}/cogvlm2.py +4 -45
- xinference/model/llm/transformers/cogvlm2_video.py +524 -0
- xinference/model/llm/{pytorch → transformers}/core.py +3 -10
- xinference/model/llm/{pytorch → transformers}/glm4v.py +2 -23
- xinference/model/llm/transformers/intern_vl.py +540 -0
- xinference/model/llm/{pytorch → transformers}/internlm2.py +4 -8
- xinference/model/llm/{pytorch → transformers}/minicpmv25.py +2 -23
- xinference/model/llm/{pytorch → transformers}/minicpmv26.py +66 -41
- xinference/model/llm/{pytorch → transformers}/utils.py +1 -2
- xinference/model/llm/{pytorch → transformers}/yi_vl.py +2 -24
- xinference/model/llm/utils.py +85 -70
- xinference/model/llm/vllm/core.py +110 -11
- xinference/model/utils.py +1 -95
- xinference/thirdparty/fish_speech/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/callbacks/__init__.py +3 -0
- xinference/thirdparty/fish_speech/fish_speech/callbacks/grad_norm.py +113 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/lora/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/conversation.py +2 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/concat_repeat.py +53 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/protos/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text_data_pb2.py +33 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text_data_stream.py +36 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/semantic.py +496 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/vqgan.py +147 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/__init__.py +3 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/core.py +40 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +122 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +122 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +123 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/pt_BR.json +133 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +122 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/scan.py +122 -0
- xinference/thirdparty/fish_speech/fish_speech/models/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/lit_module.py +202 -0
- xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +779 -0
- xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/lora.py +92 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/__init__.py +3 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/lit_module.py +442 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/discriminator.py +44 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +625 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +139 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/reference.py +115 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/wavenet.py +225 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/utils.py +94 -0
- xinference/thirdparty/fish_speech/fish_speech/scheduler.py +40 -0
- xinference/thirdparty/fish_speech/fish_speech/text/__init__.py +4 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_class.py +172 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_constant.py +30 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_util.py +342 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/cardinal.py +32 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/date.py +75 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/digit.py +32 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/fraction.py +35 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/money.py +43 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/percentage.py +33 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/telephone.py +51 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/text.py +177 -0
- xinference/thirdparty/fish_speech/fish_speech/text/clean.py +69 -0
- xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +130 -0
- xinference/thirdparty/fish_speech/fish_speech/train.py +139 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/__init__.py +23 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/braceexpand.py +217 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/context.py +13 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/file.py +16 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/instantiators.py +50 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/logger.py +55 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/logging_utils.py +48 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/rich_utils.py +100 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/spectrogram.py +122 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/utils.py +114 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/launch_utils.py +120 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +1237 -0
- xinference/thirdparty/fish_speech/tools/__init__.py +0 -0
- xinference/thirdparty/fish_speech/tools/api.py +495 -0
- xinference/thirdparty/fish_speech/tools/auto_rerank.py +159 -0
- xinference/thirdparty/fish_speech/tools/download_models.py +55 -0
- xinference/thirdparty/fish_speech/tools/extract_model.py +21 -0
- xinference/thirdparty/fish_speech/tools/file.py +108 -0
- xinference/thirdparty/fish_speech/tools/gen_ref.py +36 -0
- xinference/thirdparty/fish_speech/tools/llama/__init__.py +0 -0
- xinference/thirdparty/fish_speech/tools/llama/build_dataset.py +169 -0
- xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +171 -0
- xinference/thirdparty/fish_speech/tools/llama/generate.py +698 -0
- xinference/thirdparty/fish_speech/tools/llama/merge_lora.py +95 -0
- xinference/thirdparty/fish_speech/tools/llama/quantize.py +497 -0
- xinference/thirdparty/fish_speech/tools/llama/rebuild_tokenizer.py +57 -0
- xinference/thirdparty/fish_speech/tools/merge_asr_files.py +55 -0
- xinference/thirdparty/fish_speech/tools/post_api.py +164 -0
- xinference/thirdparty/fish_speech/tools/sensevoice/__init__.py +0 -0
- xinference/thirdparty/fish_speech/tools/sensevoice/auto_model.py +573 -0
- xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py +332 -0
- xinference/thirdparty/fish_speech/tools/sensevoice/vad_utils.py +61 -0
- xinference/thirdparty/fish_speech/tools/smart_pad.py +47 -0
- xinference/thirdparty/fish_speech/tools/vqgan/__init__.py +0 -0
- xinference/thirdparty/fish_speech/tools/vqgan/create_train_split.py +83 -0
- xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +227 -0
- xinference/thirdparty/fish_speech/tools/vqgan/inference.py +120 -0
- xinference/thirdparty/fish_speech/tools/webui.py +619 -0
- xinference/thirdparty/fish_speech/tools/whisper_asr.py +176 -0
- xinference/thirdparty/internvl/__init__.py +0 -0
- xinference/thirdparty/internvl/conversation.py +393 -0
- xinference/thirdparty/omnilmm/model/utils.py +16 -1
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.661c7b0a.js +3 -0
- xinference/web/ui/build/static/js/{main.17ca0398.js.map → main.661c7b0a.js.map} +1 -1
- xinference/web/ui/node_modules/.cache/babel-loader/070d8c6b3b0f3485c6d3885f0b6bbfdf9643e088a468acbd5d596f2396071c16.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/213b5913e164773c2b0567455377765715f5f07225fbac77ad8e1e9dc9648a47.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5391543180fead1eeef5364300301498d58a7d91d62de3841a32768b67f4552f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5c26a23b5eacf5b752a08531577ae3840bb247745ef9a39583dc2d05ba93a82a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/714c37ce0ec5b5c591033f02be2f3f491fdd70da3ef568ee4a4f94689a3d5ca2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/978b57d1a04a701bc3fcfebc511f5f274eed6ed7eade67f6fb76c27d5fd9ecc8.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a797831de0dc74897f4b50b3426555d748f328b4c2cc391de709eadaf6a5f3e3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e91938976f229ce986b2907e51e1f00540b584ced0a315d498c172d13220739d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +1 -0
- {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/METADATA +22 -13
- {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/RECORD +170 -79
- xinference/locale/utils.py +0 -39
- xinference/locale/zh_CN.json +0 -26
- xinference/model/llm/ggml/tools/__init__.py +0 -15
- xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py +0 -498
- xinference/model/llm/ggml/tools/gguf.py +0 -884
- xinference/model/llm/pytorch/__init__.py +0 -13
- xinference/model/llm/pytorch/baichuan.py +0 -81
- xinference/model/llm/pytorch/falcon.py +0 -138
- xinference/model/llm/pytorch/intern_vl.py +0 -352
- xinference/model/llm/pytorch/vicuna.py +0 -69
- xinference/web/ui/build/static/js/main.17ca0398.js +0 -3
- xinference/web/ui/node_modules/.cache/babel-loader/1444c41a4d04494f1cbc2d8c1537df107b451cb569cb2c1fbf5159f3a4841a5f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/2f40209b32e7e46a2eab6b8c8a355eb42c3caa8bc3228dd929f32fd2b3940294.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/44774c783428f952d8e2e4ad0998a9c5bc16a57cd9c68b7c5ff18aaa5a41d65c.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5262556baf9207738bf6a8ba141ec6599d0a636345c245d61fdf88d3171998cb.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/6450605fac003812485f6251b9f0caafbf2e5bfc3bbe2f000050d9e2fdb8dcd3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/71684495d995c7e266eecc6a0ad8ea0284cc785f80abddf863789c57a6134969.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/80acd1edf31542ab1dcccfad02cb4b38f3325cff847a781fcce97500cfd6f878.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8a9742ddd8ba8546ef42dc14caca443f2b4524fabed7bf269e0eff3b7b64ee7d.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d06a96a3c9c32e42689094aa3aaad41c8125894e956b8f84a70fadce6e3f65b3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d93730e2b5d7e8c957b4d0965d2ed1dac9045a649adbd47c220d11f255d4b1e0.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e656dc00b4d8b387f0a81ba8fc558767df1601c66369e2eb86a5ef27cf080572.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f28b83886159d83b84f099b05d607a822dca4dd7f2d8aa6d56fe08bab0b5b086.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f3e02274cb1964e99b1fe69cbb6db233d3d8d7dd05d50ebcdb8e66d50b224b7b.json +0 -1
- /xinference/{locale → model/llm/llama_cpp}/__init__.py +0 -0
- /xinference/model/llm/{ggml → transformers}/__init__.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/compression.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/deepseek_vl.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/llama_2.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/omnilmm.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/qwen_vl.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/tensorizer_utils.py +0 -0
- /xinference/web/ui/build/static/js/{main.17ca0398.js.LICENSE.txt → main.661c7b0a.js.LICENSE.txt} +0 -0
- {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/LICENSE +0 -0
- {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/WHEEL +0 -0
- {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/entry_points.txt +0 -0
- {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/top_level.txt +0 -0
@@ -11,16 +11,12 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import base64
-import json
 import logging
 import time
 import uuid
 from concurrent.futures import ThreadPoolExecutor
-from io import BytesIO
 from typing import Dict, Iterator, List, Optional, Union
 
-import requests
 import torch
 from PIL import Image
 
@@ -35,6 +31,7 @@ from ....types import (
 )
 from ...utils import select_device
 from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..utils import _decode_image
 from .core import PytorchChatModel, PytorchGenerateConfig
 
 logger = logging.getLogger(__name__)
@@ -106,47 +103,60 @@ class MiniCPMV26Model(PytorchChatModel):
         self._save_tensorizer()
 
     def _message_content_to_chat(self, content):
-
+        MAX_NUM_FRAMES = 64
+
+        def encode_video(video_path):
+            from decord import VideoReader, cpu
+
+            def uniform_sample(l, n):
+                gap = len(l) / n
+                idxs = [int(i * gap + gap / 2) for i in range(n)]
+                return [l[i] for i in idxs]
+
+            vr = VideoReader(video_path, ctx=cpu(0))
+            sample_fps = round(vr.get_avg_fps() / 1)  # FPS
+            frame_idx = [i for i in range(0, len(vr), sample_fps)]
+            if len(frame_idx) > MAX_NUM_FRAMES:
+                frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
+            frames = vr.get_batch(frame_idx).asnumpy()
+            frames = [Image.fromarray(v.astype("uint8")) for v in frames]
+            print("num frames:", len(frames))
+            return frames
+
+        def _load_video(_url):
+            frames = None
             if _url.startswith("data:"):
-
-                # https://platform.openai.com/docs/guides/vision/uploading-base-64-encoded-images
-                # e.g. f"data:image/jpeg;base64,{base64_image}"
-                _type, data = _url.split(";")
-                _, ext = _type.split("/")
-                data = data[len("base64,") :]
-                data = base64.b64decode(data.encode("utf-8"))
-                return Image.open(BytesIO(data)).convert("RGB")
+                raise RuntimeError("Only video url format is supported")
             else:
-
-
-                except requests.exceptions.MissingSchema:
-                    return Image.open(_url).convert("RGB")
-                else:
-                    return Image.open(BytesIO(response.content)).convert("RGB")
+                frames = encode_video(_url)
+                return frames
 
         if not isinstance(content, str):
             texts = []
             image_urls = []
+            video_urls = []
             for c in content:
                 c_type = c.get("type")
                 if c_type == "text":
                     texts.append(c["text"])
                 elif c_type == "image_url":
                     image_urls.append(c["image_url"]["url"])
+                elif c_type == "video_url":
+                    video_urls.append(c["video_url"]["url"])
             image_futures = []
             with ThreadPoolExecutor() as executor:
                 for image_url in image_urls:
-                    fut = executor.submit(
+                    fut = executor.submit(_decode_image, image_url)
                     image_futures.append(fut)
             images = [fut.result() for fut in image_futures]
+            frames = []
+            if len(video_urls) > 1:
+                raise RuntimeError("Only one video per message is supported")
+            for v in video_urls:
+                frames = _load_video(v)
             text = " ".join(texts)
-
-
-            elif len(images) == 1:
-                return text, images
-            else:
-                raise RuntimeError("Only one image per message is supported")
-        return content, []
+            return text, images, frames
+        return content, [], []
 
     def chat(
         self,
@@ -156,36 +166,51 @@ class MiniCPMV26Model(PytorchChatModel):
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         stream = generate_config.get("stream", False) if generate_config else False
-
+        videoExisted = False
+
+        content, images_chat, video_frames = self._message_content_to_chat(prompt)
+        if len(video_frames) > 0:
+            videoExisted = True
+            images_chat = video_frames
 
         msgs = []
         query_to_response: List[Dict] = []
-        images_history = []
         for h in chat_history or []:
+            images_history = []
             role = h["role"]
-            content_h, images_tmp = self._message_content_to_chat(
+            content_h, images_tmp, video_frames_h = self._message_content_to_chat(
+                h["content"]
+            )
             if images_tmp != []:
                 images_history = images_tmp
+            if len(video_frames_h) > 0:
+                videoExisted = True
+                images_history = video_frames_h
             if len(query_to_response) == 0 and role == "user":
-                query_to_response.append(
+                query_to_response.append(
+                    {"role": "user", "content": images_history + [content_h]}
+                )
             if len(query_to_response) == 1 and role == "assistant":
-                query_to_response.append(
+                query_to_response.append(
+                    {"role": "assistant", "content": images_history + [content_h]}
+                )
             if len(query_to_response) == 2:
                 msgs.extend(query_to_response)
                 query_to_response = []
-
-
-
-
-
-
+        msgs.append({"role": "user", "content": images_chat + [content]})
+
+        # Set decode params for video
+        params = {}
+        if videoExisted:
+            params = {"use_image_id": False, "max_slice_nums": 1}
 
         chat = self._model.chat(
-            image=
-            msgs=
+            image=None,
+            msgs=msgs,
             tokenizer=self._tokenizer,
             sampling=True,
-            **generate_config
+            **generate_config,
+            **params,
         )
         if stream:
             it = self.chat_stream(chat)
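The hunks above extend MiniCPM-V 2.6 chat handling with a video_url content type: a single video per message is accepted, decoded with decord, and up to 64 uniformly sampled frames are fed to the model in place of images. As a rough illustration only (not taken from this diff), a client request could look like the sketch below; the base URL and model UID are placeholders for whatever your Xinference deployment uses.

    # Hypothetical client call against an OpenAI-compatible Xinference endpoint.
    # Base URL, API key and model UID are placeholders, not values from this diff.
    from openai import OpenAI

    client = OpenAI(base_url="http://127.0.0.1:9997/v1", api_key="not-used")

    response = client.chat.completions.create(
        model="MiniCPM-V-2.6",  # placeholder model UID
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Describe what happens in this clip."},
                    # New content type handled by _message_content_to_chat above;
                    # only one video_url part per message is allowed.
                    {"type": "video_url", "video_url": {"url": "/data/clip.mp4"}},
                ],
            }
        ],
    )
    print(response.choices[0].message.content)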
xinference/model/llm/{pytorch → transformers}/yi_vl.py
CHANGED

@@ -11,18 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import base64
 import logging
 import time
 import uuid
 from concurrent.futures import ThreadPoolExecutor
-from io import BytesIO
 from threading import Thread
 from typing import Dict, Iterator, List, Optional, Union
 
-import requests
 import torch
-from PIL import Image
 
 from ....model.utils import select_device
 from ....types import (
@@ -35,6 +31,7 @@ from ....types import (
     CompletionUsage,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..utils import _decode_image
 from .core import PytorchChatModel, PytorchGenerateConfig
 
 logger = logging.getLogger(__name__)
@@ -78,25 +75,6 @@ class YiVLChatModel(PytorchChatModel):
 
     @staticmethod
     def _message_content_to_yi(content) -> Union[str, tuple]:
-        def _load_image(_url):
-            if _url.startswith("data:"):
-                logging.info("Parse url by base64 decoder.")
-                # https://platform.openai.com/docs/guides/vision/uploading-base-64-encoded-images
-                # e.g. f"data:image/jpeg;base64,{base64_image}"
-                _type, data = _url.split(";")
-                _, ext = _type.split("/")
-                data = data[len("base64,") :]
-                data = base64.b64decode(data.encode("utf-8"))
-
-                return Image.open(BytesIO(data))
-            else:
-                try:
-                    response = requests.get(_url)
-                except requests.exceptions.MissingSchema:
-                    return Image.open(_url)
-                else:
-                    return Image.open(BytesIO(response.content))
-
         if not isinstance(content, str):
             from ....thirdparty.llava.model.constants import DEFAULT_IMAGE_TOKEN
 
@@ -111,7 +89,7 @@ class YiVLChatModel(PytorchChatModel):
             image_futures = []
             with ThreadPoolExecutor() as executor:
                 for image_url in image_urls:
-                    fut = executor.submit(
+                    fut = executor.submit(_decode_image, image_url)
                     image_futures.append(fut)
             images = [fut.result() for fut in image_futures]
             text = " ".join(texts)
xinference/model/llm/utils.py
CHANGED
@@ -11,14 +11,19 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import base64
 import functools
 import json
 import logging
 import os
 import time
 import uuid
+from io import BytesIO
 from typing import AsyncGenerator, Dict, Iterator, List, Optional, Tuple, cast
 
+import requests
+from PIL import Image
+
 from ...types import (
     SPECIAL_TOOL_PROMPT,
     ChatCompletion,
@@ -28,7 +33,7 @@ from ...types import (
     CompletionChunk,
 )
 from .llm_family import (
-
+    LlamaCppLLMSpecV1,
     LLMFamilyV1,
     LLMSpecV1,
     PromptStyleV1,
@@ -60,7 +65,7 @@ class ChatModelMixin:
         chat_history: List[ChatCompletionMessage],
         prompt_style: PromptStyleV1,
         tools: Optional[List[Dict]] = None,
-    )
+    ):
         """
         Inspired by FastChat. Format chat history into a prompt according to the prompty style of
        different models.
@@ -92,17 +97,6 @@ class ChatModelMixin:
                 else:
                     ret += role + ":"
             return ret
-        elif prompt_style.style_name == "ADD_COLON_TWO":
-            seps = [prompt_style.intra_message_sep, prompt_style.inter_message_sep]
-            ret = prompt_style.system_prompt + seps[0]
-            for i, message in enumerate(chat_history):
-                role = get_role(message["role"])
-                content = message["content"]
-                if content:
-                    ret += role + ": " + content + seps[i % 2]
-                else:
-                    ret += role + ":"
-            return ret
         elif prompt_style.style_name == "NO_COLON_TWO":
             seps = [prompt_style.intra_message_sep, prompt_style.inter_message_sep]
             ret = prompt_style.system_prompt
@@ -144,21 +138,6 @@ class ChatModelMixin:
             else:
                 ret += f"<|start_header_id|>{role}<|end_header_id|>{prompt_style.intra_message_sep}"
             return ret
-        elif prompt_style.style_name == "FALCON":
-            ret = prompt_style.system_prompt
-            for message in chat_history:
-                role = get_role(message["role"])
-                content = message["content"]
-                if content:
-                    ret += (
-                        role
-                        + ": "
-                        + content.replace("\r\n", "\n").replace("\n\n", "\n")
-                    )
-                    ret += "\n\n"
-                else:
-                    ret += role + ":"
-            return ret
         elif prompt_style.style_name == "MIXTRAL_V01":
             ret = ""
             for i, message in enumerate(chat_history):
@@ -168,22 +147,6 @@ class ChatModelMixin:
             else:  # assistant
                 ret += f"{content} </s>"
             return ret
-        elif prompt_style.style_name == "CHATGLM":
-            round_add_n = 1 if prompt_style.intra_message_sep == "\n\n" else 0
-            if prompt_style.system_prompt:
-                ret = prompt_style.system_prompt + prompt_style.intra_message_sep
-            else:
-                ret = ""
-            for i, message in enumerate(chat_history):
-                role = get_role(message["role"])
-                content = message["content"]
-                if i % 2 == 0:
-                    ret += f"[Round {i // 2 + round_add_n}]{prompt_style.intra_message_sep}"
-                if content:
-                    ret += role + ":" + content + prompt_style.intra_message_sep
-                else:
-                    ret += role + ":"
-            return ret
         elif prompt_style.style_name == "CHATGLM3":
             prompts = (
                 [f"<|system|>\n {prompt_style.system_prompt}"]
@@ -323,25 +286,6 @@ Begin!"""
             else:
                 ret += role + "\n"
             return ret
-        elif prompt_style.style_name == "INTERNLM":
-            seps = [prompt_style.intra_message_sep, prompt_style.inter_message_sep]
-            ret = ""
-            for i, message in enumerate(chat_history[:-2]):
-                if i % 2 == 0:
-                    ret += "<s>"
-                role = get_role(message["role"])
-                content = message["content"]
-                ret += role + ":" + str(content) + seps[i % 2]
-            if len(ret) == 0:
-                ret += "<s>"
-            ret += (
-                chat_history[-2]["role"]
-                + ":"
-                + str(chat_history[-2]["content"])
-                + seps[0]
-            )
-            ret += chat_history[-1]["role"] + ":"
-            return ret
         elif prompt_style.style_name == "INTERNLM2":
             ret = (
                 "<s>"
@@ -370,9 +314,6 @@ Begin!"""
             else:
                 ret += role + ": Let's think step by step."
             return ret
-        elif prompt_style.style_name == "INSTRUCTION":
-            message = chat_history[-2]
-            return prompt_style.system_prompt.format(message["content"])
         elif prompt_style.style_name == "DEEPSEEK_CHAT":
             seps = [prompt_style.intra_message_sep, prompt_style.inter_message_sep]
             ret = prompt_style.system_prompt
@@ -504,6 +445,61 @@ Begin!"""
             else:
                 ret += role
             return ret
+        elif prompt_style.style_name == "INTERNVL":
+            ret = (
+                "<s>"
+                if prompt_style.system_prompt == ""
+                else "<s><|im_start|>system\n"
+                + prompt_style.system_prompt
+                + prompt_style.intra_message_sep
+                + "\n"
+            )
+            images = []  # type: ignore
+            for message in chat_history:
+                role = get_role(message["role"])
+                content = message["content"]
+                if isinstance(content, str):
+                    if content:
+                        ret += (
+                            role
+                            + "\n"
+                            + content
+                            + prompt_style.intra_message_sep
+                            + "\n"
+                        )
+                    else:
+                        ret += role + "\n"
+                elif isinstance(content, list):
+                    text = ""
+                    image_urls = []
+                    for c in content:
+                        c_type = c.get("type")
+                        if c_type == "text":
+                            text = c["text"]
+                        elif c_type == "image_url":
+                            image_urls.append(c["image_url"]["url"])
+                    image_futures = []
+                    from concurrent.futures import ThreadPoolExecutor
+
+                    with ThreadPoolExecutor() as executor:
+                        for image_url in image_urls:
+                            fut = executor.submit(_decode_image, image_url)
+                            image_futures.append(fut)
+                    images = [fut.result() for fut in image_futures]
+                    if len(image_futures) == 0:
+                        ret += (
+                            role + "\n" + text + prompt_style.intra_message_sep + "\n"
+                        )
+                    else:
+                        ret += (
+                            role
+                            + "\n"
+                            + f"<image>\n{text}"
+                            + prompt_style.intra_message_sep
+                            + "\n"
+                        )
+
+            return (ret, images)
         else:
             raise ValueError(f"Invalid prompt style: {prompt_style.style_name}")
 
@@ -706,7 +702,7 @@ Begin!"""
         family = model_family.model_family or model_family.model_name
         if family in ["gorilla-openfunctions-v1", "gorilla-openfunctions-v2"]:
             content, func, args = cls._eval_gorilla_openfunctions_arguments(c, tools)
-        elif family in
+        elif family in GLM4_TOOL_CALL_FAMILY:
             content, func, args = cls._eval_glm_chat_arguments(c, tools)
         elif family in QWEN_TOOL_CALL_FAMILY:
             content, func, args = cls._eval_qwen_chat_arguments(c, tools)
@@ -870,10 +866,10 @@ def get_file_location(
     is_cached = cache_status
     assert isinstance(is_cached, bool)
 
-    if spec.model_format in ["pytorch", "gptq", "awq", "mlx"]:
+    if spec.model_format in ["pytorch", "gptq", "awq", "fp8", "mlx"]:
         return cache_dir, is_cached
-    elif spec.model_format in ["
-        assert isinstance(spec,
+    elif spec.model_format in ["ggufv2"]:
+        assert isinstance(spec, LlamaCppLLMSpecV1)
         filename = spec.model_file_name_template.format(quantization=quantization)
         model_path = os.path.join(cache_dir, filename)
         return model_path, is_cached
@@ -885,3 +881,22 @@ def get_model_version(
     llm_family: LLMFamilyV1, llm_spec: LLMSpecV1, quantization: str
 ) -> str:
     return f"{llm_family.model_name}--{llm_spec.model_size_in_billions}B--{llm_spec.model_format}--{quantization}"
+
+
+def _decode_image(_url):
+    if _url.startswith("data:"):
+        logging.info("Parse url by base64 decoder.")
+        # https://platform.openai.com/docs/guides/vision/uploading-base-64-encoded-images
+        # e.g. f"data:image/jpeg;base64,{base64_image}"
+        _type, data = _url.split(";")
+        _, ext = _type.split("/")
+        data = data[len("base64,") :]
+        data = base64.b64decode(data.encode("utf-8"))
+        return Image.open(BytesIO(data)).convert("RGB")
+    else:
+        try:
+            response = requests.get(_url)
+        except requests.exceptions.MissingSchema:
+            return Image.open(_url).convert("RGB")
+        else:
+            return Image.open(BytesIO(response.content)).convert("RGB")
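Since `_decode_image` is now the single shared image loader (used by minicpmv26.py, yi_vl.py and the new INTERNVL prompt style above), a quick check of its two input forms can be useful. A minimal sketch, assuming a local image file named cat.png (a placeholder):

    # Minimal sketch: exercise both branches of the new shared helper.
    import base64

    from xinference.model.llm.utils import _decode_image

    with open("cat.png", "rb") as f:  # placeholder local image
        b64 = base64.b64encode(f.read()).decode("utf-8")

    # Branch 1: OpenAI-style base64 data URL.
    img = _decode_image(f"data:image/png;base64,{b64}")
    print(img.size, img.mode)  # mode is always "RGB" after .convert("RGB")

    # Branch 2: plain file paths (and http(s) URLs) are opened or fetched directly.
    img2 = _decode_image("cat.png")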
xinference/model/llm/vllm/core.py
CHANGED

@@ -21,6 +21,7 @@ import time
 import uuid
 from typing import (
     TYPE_CHECKING,
+    Any,
     AsyncGenerator,
     Dict,
     Iterable,
@@ -88,11 +89,12 @@ try:
 except ImportError:
     VLLM_INSTALLED = False
 
+VLLM_SUPPORTED_VISION_MODEL_LIST: List[str] = [
+    "internvl2",
+]
 VLLM_SUPPORTED_MODELS = [
     "llama-2",
     "llama-3",
-    "baichuan",
-    "internlm-16k",
     "mistral-v0.1",
     "codestral-v0.1",
     "Yi",
@@ -105,13 +107,7 @@ VLLM_SUPPORTED_MODELS = [
 VLLM_SUPPORTED_CHAT_MODELS = [
     "llama-2-chat",
     "llama-3-instruct",
-    "vicuna-v1.3",
-    "vicuna-v1.5",
-    "baichuan-chat",
     "baichuan-2-chat",
-    "internlm-chat-7b",
-    "internlm-chat-8k",
-    "internlm-chat-20b",
     "internlm2-chat",
     "internlm2.5-chat",
     "internlm2.5-chat-1m",
@@ -338,7 +334,7 @@ class VLLMModel(LLM):
             return False
         if not cls._is_linux():
             return False
-        if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8"]:
             return False
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):
@@ -421,7 +417,7 @@ class VLLMModel(LLM):
 
     async def async_generate(
         self,
-        prompt: str,
+        prompt: Union[str, Dict[str, Any]],
         generate_config: Optional[Dict] = None,
         tools: object = False,
     ) -> Union[Completion, AsyncGenerator[CompletionChunk, None]]:
@@ -558,7 +554,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
     def match(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8"]:
             return False
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):
@@ -644,3 +640,106 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
                 self.model_family, self.model_uid, c, tools
             )
             return self._to_chat_completion(c)
+
+
+class VLLMVisionModel(VLLMModel, ChatModelMixin):
+    def load(self):
+        try:
+            import vllm
+            from vllm.engine.arg_utils import AsyncEngineArgs
+            from vllm.engine.async_llm_engine import AsyncLLMEngine
+        except ImportError:
+            error_message = "Failed to import module 'vllm'"
+            installation_guide = [
+                "Please make sure 'vllm' is installed. ",
+                "You can install it by `pip install vllm`\n",
+            ]
+            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+        if vllm.__version__ >= "0.3.1":
+            # from vllm v0.3.1, it uses cupy as NCCL backend
+            # in which cupy will fork a process
+            # only for xoscar >= 0.3.0, new process is allowed in subpool
+            # besides, xinference set start method as forkserver for unix
+            # we need to set it to fork to make cupy NCCL work
+            multiprocessing.set_start_method("fork", force=True)
+
+        self._model_config = self._sanitize_model_config(self._model_config)
+
+        logger.info(
+            f"Loading {self.model_uid} with following model config: {self._model_config}"
+        )
+
+        engine_args = AsyncEngineArgs(
+            model=self.model_path,
+            **self._model_config,
+        )
+        self._engine = AsyncLLMEngine.from_engine_args(engine_args)
+
+    @classmethod
+    def match(
+        cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        if llm_spec.model_format != "pytorch":
+            return False
+        if llm_spec.model_format == "pytorch":
+            if quantization != "none" and not (quantization is None):
+                return False
+        if isinstance(llm_family, CustomLLMFamilyV1):
+            if llm_family.model_family not in VLLM_SUPPORTED_VISION_MODEL_LIST:
+                return False
+        else:
+            if llm_family.model_name not in VLLM_SUPPORTED_VISION_MODEL_LIST:
+                return False
+        if "vision" not in llm_family.model_ability:
+            return False
+        return VLLM_INSTALLED
+
+    def _sanitize_chat_config(
+        self,
+        generate_config: Optional[Dict] = None,
+    ) -> Dict:
+        if not generate_config:
+            generate_config = {}
+        if self.model_family.prompt_style:
+            if self.model_family.prompt_style.stop_token_ids:
+                generate_config.setdefault(
+                    "stop_token_ids",
+                    self.model_family.prompt_style.stop_token_ids.copy(),
+                )
+        return generate_config
+
+    async def async_chat(
+        self,
+        prompt: str,
+        system_prompt: Optional[str] = None,
+        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        generate_config: Optional[Dict] = None,
+    ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]:
+        # only support single image, waiting vllm support multi images
+        assert self.model_family.prompt_style is not None
+        prompt_style = self.model_family.prompt_style.copy()
+        chat_history = chat_history or []
+        prompt, images = self.get_prompt(prompt, chat_history, prompt_style)
+
+        if len(images) == 0:
+            inputs = {
+                "prompt": prompt,
+            }
+        else:
+            inputs = {
+                "prompt": prompt,
+                "multi_modal_data": {"image": images[-1]},  # type: ignore
+            }
+        generate_config = self._sanitize_chat_config(generate_config)
+
+        stream = generate_config.get("stream", None)
+
+        if stream:
+            agen = await self.async_generate(inputs, generate_config)
+            assert isinstance(agen, AsyncGenerator)
+            return self._async_to_chat_completion_chunks(agen)
+        else:
+            c = await self.async_generate(inputs, generate_config)
+            assert not isinstance(c, AsyncGenerator)
+            return self._to_chat_completion(c)