xinference 0.14.2__py3-none-any.whl → 0.14.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xinference might be problematic.

Files changed (191)
  1. xinference/_version.py +3 -3
  2. xinference/core/chat_interface.py +1 -1
  3. xinference/core/image_interface.py +9 -0
  4. xinference/core/model.py +4 -1
  5. xinference/core/worker.py +60 -44
  6. xinference/model/audio/chattts.py +25 -9
  7. xinference/model/audio/core.py +8 -2
  8. xinference/model/audio/cosyvoice.py +4 -3
  9. xinference/model/audio/custom.py +4 -5
  10. xinference/model/audio/fish_speech.py +228 -0
  11. xinference/model/audio/model_spec.json +8 -0
  12. xinference/model/embedding/core.py +25 -1
  13. xinference/model/embedding/custom.py +4 -5
  14. xinference/model/flexible/core.py +5 -1
  15. xinference/model/image/custom.py +4 -5
  16. xinference/model/image/model_spec.json +2 -1
  17. xinference/model/image/model_spec_modelscope.json +2 -1
  18. xinference/model/image/stable_diffusion/core.py +66 -3
  19. xinference/model/llm/__init__.py +6 -0
  20. xinference/model/llm/llm_family.json +54 -9
  21. xinference/model/llm/llm_family.py +7 -6
  22. xinference/model/llm/llm_family_modelscope.json +56 -10
  23. xinference/model/llm/lmdeploy/__init__.py +0 -0
  24. xinference/model/llm/lmdeploy/core.py +557 -0
  25. xinference/model/llm/sglang/core.py +7 -1
  26. xinference/model/llm/transformers/cogvlm2.py +4 -45
  27. xinference/model/llm/transformers/cogvlm2_video.py +524 -0
  28. xinference/model/llm/transformers/core.py +3 -0
  29. xinference/model/llm/transformers/glm4v.py +2 -23
  30. xinference/model/llm/transformers/intern_vl.py +94 -11
  31. xinference/model/llm/transformers/minicpmv25.py +2 -23
  32. xinference/model/llm/transformers/minicpmv26.py +2 -22
  33. xinference/model/llm/transformers/yi_vl.py +2 -24
  34. xinference/model/llm/utils.py +13 -1
  35. xinference/model/llm/vllm/core.py +1 -34
  36. xinference/model/rerank/custom.py +4 -5
  37. xinference/model/utils.py +41 -1
  38. xinference/model/video/core.py +3 -1
  39. xinference/model/video/diffusers.py +41 -38
  40. xinference/model/video/model_spec.json +24 -1
  41. xinference/model/video/model_spec_modelscope.json +25 -1
  42. xinference/thirdparty/fish_speech/__init__.py +0 -0
  43. xinference/thirdparty/fish_speech/fish_speech/__init__.py +0 -0
  44. xinference/thirdparty/fish_speech/fish_speech/callbacks/__init__.py +3 -0
  45. xinference/thirdparty/fish_speech/fish_speech/callbacks/grad_norm.py +113 -0
  46. xinference/thirdparty/fish_speech/fish_speech/configs/__init__.py +0 -0
  47. xinference/thirdparty/fish_speech/fish_speech/configs/lora/__init__.py +0 -0
  48. xinference/thirdparty/fish_speech/fish_speech/conversation.py +2 -0
  49. xinference/thirdparty/fish_speech/fish_speech/datasets/__init__.py +0 -0
  50. xinference/thirdparty/fish_speech/fish_speech/datasets/concat_repeat.py +53 -0
  51. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/__init__.py +0 -0
  52. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text_data_pb2.py +33 -0
  53. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text_data_stream.py +36 -0
  54. xinference/thirdparty/fish_speech/fish_speech/datasets/semantic.py +496 -0
  55. xinference/thirdparty/fish_speech/fish_speech/datasets/vqgan.py +147 -0
  56. xinference/thirdparty/fish_speech/fish_speech/i18n/__init__.py +3 -0
  57. xinference/thirdparty/fish_speech/fish_speech/i18n/core.py +40 -0
  58. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/__init__.py +0 -0
  59. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +122 -0
  60. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +122 -0
  61. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +123 -0
  62. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/pt_BR.json +133 -0
  63. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +122 -0
  64. xinference/thirdparty/fish_speech/fish_speech/i18n/scan.py +122 -0
  65. xinference/thirdparty/fish_speech/fish_speech/models/__init__.py +0 -0
  66. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/__init__.py +0 -0
  67. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/lit_module.py +202 -0
  68. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +779 -0
  69. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/lora.py +92 -0
  70. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/__init__.py +3 -0
  71. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/lit_module.py +442 -0
  72. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/__init__.py +0 -0
  73. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/discriminator.py +44 -0
  74. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +625 -0
  75. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +139 -0
  76. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/reference.py +115 -0
  77. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/wavenet.py +225 -0
  78. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/utils.py +94 -0
  79. xinference/thirdparty/fish_speech/fish_speech/scheduler.py +40 -0
  80. xinference/thirdparty/fish_speech/fish_speech/text/__init__.py +4 -0
  81. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/__init__.py +0 -0
  82. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_class.py +172 -0
  83. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_constant.py +30 -0
  84. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_util.py +342 -0
  85. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/cardinal.py +32 -0
  86. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/date.py +75 -0
  87. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/digit.py +32 -0
  88. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/fraction.py +35 -0
  89. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/money.py +43 -0
  90. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/percentage.py +33 -0
  91. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/telephone.py +51 -0
  92. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/text.py +177 -0
  93. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +69 -0
  94. xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +130 -0
  95. xinference/thirdparty/fish_speech/fish_speech/train.py +139 -0
  96. xinference/thirdparty/fish_speech/fish_speech/utils/__init__.py +23 -0
  97. xinference/thirdparty/fish_speech/fish_speech/utils/braceexpand.py +217 -0
  98. xinference/thirdparty/fish_speech/fish_speech/utils/context.py +13 -0
  99. xinference/thirdparty/fish_speech/fish_speech/utils/file.py +16 -0
  100. xinference/thirdparty/fish_speech/fish_speech/utils/instantiators.py +50 -0
  101. xinference/thirdparty/fish_speech/fish_speech/utils/logger.py +55 -0
  102. xinference/thirdparty/fish_speech/fish_speech/utils/logging_utils.py +48 -0
  103. xinference/thirdparty/fish_speech/fish_speech/utils/rich_utils.py +100 -0
  104. xinference/thirdparty/fish_speech/fish_speech/utils/spectrogram.py +122 -0
  105. xinference/thirdparty/fish_speech/fish_speech/utils/utils.py +114 -0
  106. xinference/thirdparty/fish_speech/fish_speech/webui/__init__.py +0 -0
  107. xinference/thirdparty/fish_speech/fish_speech/webui/launch_utils.py +120 -0
  108. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +1237 -0
  109. xinference/thirdparty/fish_speech/tools/__init__.py +0 -0
  110. xinference/thirdparty/fish_speech/tools/api.py +495 -0
  111. xinference/thirdparty/fish_speech/tools/auto_rerank.py +159 -0
  112. xinference/thirdparty/fish_speech/tools/download_models.py +55 -0
  113. xinference/thirdparty/fish_speech/tools/extract_model.py +21 -0
  114. xinference/thirdparty/fish_speech/tools/file.py +108 -0
  115. xinference/thirdparty/fish_speech/tools/gen_ref.py +36 -0
  116. xinference/thirdparty/fish_speech/tools/llama/__init__.py +0 -0
  117. xinference/thirdparty/fish_speech/tools/llama/build_dataset.py +169 -0
  118. xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +171 -0
  119. xinference/thirdparty/fish_speech/tools/llama/generate.py +698 -0
  120. xinference/thirdparty/fish_speech/tools/llama/merge_lora.py +95 -0
  121. xinference/thirdparty/fish_speech/tools/llama/quantize.py +497 -0
  122. xinference/thirdparty/fish_speech/tools/llama/rebuild_tokenizer.py +57 -0
  123. xinference/thirdparty/fish_speech/tools/merge_asr_files.py +55 -0
  124. xinference/thirdparty/fish_speech/tools/post_api.py +164 -0
  125. xinference/thirdparty/fish_speech/tools/sensevoice/__init__.py +0 -0
  126. xinference/thirdparty/fish_speech/tools/sensevoice/auto_model.py +573 -0
  127. xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py +332 -0
  128. xinference/thirdparty/fish_speech/tools/sensevoice/vad_utils.py +61 -0
  129. xinference/thirdparty/fish_speech/tools/smart_pad.py +47 -0
  130. xinference/thirdparty/fish_speech/tools/vqgan/__init__.py +0 -0
  131. xinference/thirdparty/fish_speech/tools/vqgan/create_train_split.py +83 -0
  132. xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +227 -0
  133. xinference/thirdparty/fish_speech/tools/vqgan/inference.py +120 -0
  134. xinference/thirdparty/fish_speech/tools/webui.py +619 -0
  135. xinference/thirdparty/fish_speech/tools/whisper_asr.py +176 -0
  136. xinference/thirdparty/matcha/__init__.py +0 -0
  137. xinference/thirdparty/matcha/app.py +357 -0
  138. xinference/thirdparty/matcha/cli.py +419 -0
  139. xinference/thirdparty/matcha/data/__init__.py +0 -0
  140. xinference/thirdparty/matcha/data/components/__init__.py +0 -0
  141. xinference/thirdparty/matcha/data/text_mel_datamodule.py +274 -0
  142. xinference/thirdparty/matcha/hifigan/__init__.py +0 -0
  143. xinference/thirdparty/matcha/hifigan/config.py +28 -0
  144. xinference/thirdparty/matcha/hifigan/denoiser.py +64 -0
  145. xinference/thirdparty/matcha/hifigan/env.py +17 -0
  146. xinference/thirdparty/matcha/hifigan/meldataset.py +217 -0
  147. xinference/thirdparty/matcha/hifigan/models.py +368 -0
  148. xinference/thirdparty/matcha/hifigan/xutils.py +60 -0
  149. xinference/thirdparty/matcha/models/__init__.py +0 -0
  150. xinference/thirdparty/matcha/models/baselightningmodule.py +210 -0
  151. xinference/thirdparty/matcha/models/components/__init__.py +0 -0
  152. xinference/thirdparty/matcha/models/components/decoder.py +443 -0
  153. xinference/thirdparty/matcha/models/components/flow_matching.py +132 -0
  154. xinference/thirdparty/matcha/models/components/text_encoder.py +410 -0
  155. xinference/thirdparty/matcha/models/components/transformer.py +316 -0
  156. xinference/thirdparty/matcha/models/matcha_tts.py +244 -0
  157. xinference/thirdparty/matcha/onnx/__init__.py +0 -0
  158. xinference/thirdparty/matcha/onnx/export.py +181 -0
  159. xinference/thirdparty/matcha/onnx/infer.py +168 -0
  160. xinference/thirdparty/matcha/text/__init__.py +53 -0
  161. xinference/thirdparty/matcha/text/cleaners.py +121 -0
  162. xinference/thirdparty/matcha/text/numbers.py +71 -0
  163. xinference/thirdparty/matcha/text/symbols.py +17 -0
  164. xinference/thirdparty/matcha/train.py +122 -0
  165. xinference/thirdparty/matcha/utils/__init__.py +5 -0
  166. xinference/thirdparty/matcha/utils/audio.py +82 -0
  167. xinference/thirdparty/matcha/utils/generate_data_statistics.py +112 -0
  168. xinference/thirdparty/matcha/utils/get_durations_from_trained_model.py +195 -0
  169. xinference/thirdparty/matcha/utils/instantiators.py +56 -0
  170. xinference/thirdparty/matcha/utils/logging_utils.py +53 -0
  171. xinference/thirdparty/matcha/utils/model.py +90 -0
  172. xinference/thirdparty/matcha/utils/monotonic_align/__init__.py +22 -0
  173. xinference/thirdparty/matcha/utils/monotonic_align/core.pyx +47 -0
  174. xinference/thirdparty/matcha/utils/monotonic_align/setup.py +7 -0
  175. xinference/thirdparty/matcha/utils/pylogger.py +21 -0
  176. xinference/thirdparty/matcha/utils/rich_utils.py +101 -0
  177. xinference/thirdparty/matcha/utils/utils.py +259 -0
  178. xinference/web/ui/build/asset-manifest.json +3 -3
  179. xinference/web/ui/build/index.html +1 -1
  180. xinference/web/ui/build/static/js/{main.ffc26121.js → main.661c7b0a.js} +3 -3
  181. xinference/web/ui/build/static/js/main.661c7b0a.js.map +1 -0
  182. xinference/web/ui/node_modules/.cache/babel-loader/070d8c6b3b0f3485c6d3885f0b6bbfdf9643e088a468acbd5d596f2396071c16.json +1 -0
  183. {xinference-0.14.2.dist-info → xinference-0.14.4.dist-info}/METADATA +31 -11
  184. {xinference-0.14.2.dist-info → xinference-0.14.4.dist-info}/RECORD +189 -49
  185. xinference/web/ui/build/static/js/main.ffc26121.js.map +0 -1
  186. xinference/web/ui/node_modules/.cache/babel-loader/2f40209b32e7e46a2eab6b8c8a355eb42c3caa8bc3228dd929f32fd2b3940294.json +0 -1
  187. /xinference/web/ui/build/static/js/{main.ffc26121.js.LICENSE.txt → main.661c7b0a.js.LICENSE.txt} +0 -0
  188. {xinference-0.14.2.dist-info → xinference-0.14.4.dist-info}/LICENSE +0 -0
  189. {xinference-0.14.2.dist-info → xinference-0.14.4.dist-info}/WHEEL +0 -0
  190. {xinference-0.14.2.dist-info → xinference-0.14.4.dist-info}/entry_points.txt +0 -0
  191. {xinference-0.14.2.dist-info → xinference-0.14.4.dist-info}/top_level.txt +0 -0
xinference/model/llm/transformers/intern_vl.py CHANGED
@@ -42,27 +42,38 @@ def _message_content_to_intern(content, image_cnt):
     if not isinstance(content, str):
         texts = []
         image_urls = []
+        video_urls = []
         for c in content:
             c_type = c.get("type")
             if c_type == "text":
                 texts.append(c["text"])
             elif c_type == "image_url":
                 image_urls.append(c["image_url"]["url"])
+            elif c_type == "video_url":
+                video_urls.append(c["video_url"]["url"])
+        if len(video_urls) > 1:
+            raise RuntimeError("Only one video per message is supported")
         image_futures = []
         with ThreadPoolExecutor() as executor:
             for image_url in image_urls:
                 fut = executor.submit(_decode_image, image_url)
                 image_futures.append(fut)
         images = [fut.result() for fut in image_futures]
+        videos = []
+        for vid_url in video_urls:
+            videos.append(_load_video(vid_url, num_segments=8, max_num=1))
         prefix = ""
         for i, _ in enumerate(images):
             prefix += f"Image-{image_cnt + i + 1}: <image>\n\n"
+
+        if len(videos) > 0:
+            prefix = "".join(
+                [f"Frame{i+1}: <image>\n" for i in range(len(videos[0][1]))]
+            )
+
         text = prefix + " ".join(texts)
-        if len(images) == 0:
-            return text, []
-        else:
-            return text, images
-    return content, []
+        return text, images, videos
+    return content, [], []
 
 
 def _get_prompt_and_chat_history(
@@ -71,18 +82,21 @@ def _get_prompt_and_chat_history(
 ):
     # Convert openai history to intern vl history
     images = []
+    videos = []
     history = []
     image_cnt = 0
     for h1, h2 in zip(*[iter(chat_history or [])] * 2):
-        content1, img = _message_content_to_intern(h1["content"], image_cnt)
-        content2, _ = _message_content_to_intern(h2["content"], image_cnt)
+        content1, img, vid = _message_content_to_intern(h1["content"], image_cnt)
+        content2, _, _ = _message_content_to_intern(h2["content"], image_cnt)
         history.append([content1, content2])
         images.extend(img)
         image_cnt += len(img)
+        videos.extend(vid)
 
-    question, img = _message_content_to_intern(prompt, image_cnt)
+    question, img, vid = _message_content_to_intern(prompt, image_cnt)
     images.extend(img)
-    return question, history, images
+    videos.extend(vid)
+    return question, history, images, videos
 
 
 def _build_transform(input_size=448):
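Note: the new video_url branch follows the same OpenAI-style content-part layout that the image_url branch already uses. A message this parser would accept looks roughly like the sketch below; only the key names come from the code above, the URLs and text are illustrative:

message = {
    "role": "user",
    "content": [
        {"type": "text", "text": "Describe what happens in this clip."},
        {"type": "image_url", "image_url": {"url": "https://example.com/cover.jpg"}},
        # at most one video_url part per message, otherwise the RuntimeError above fires
        {"type": "video_url", "video_url": {"url": "https://example.com/clip.mp4"}},
    ],
}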
@@ -174,6 +188,53 @@ def _load_image(image_file, input_size=448, max_num=12):
     return pixel_values
 
 
+# video multi-round conversation
+def _get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
+    import numpy as np
+
+    if bound:
+        start, end = bound[0], bound[1]
+    else:
+        start, end = -100000, 100000
+    start_idx = max(first_idx, round(start * fps))
+    end_idx = min(round(end * fps), max_frame)
+    seg_size = float(end_idx - start_idx) / num_segments
+    frame_indices = np.array(
+        [
+            int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
+            for idx in range(num_segments)
+        ]
+    )
+    return frame_indices
+
+
+def _load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32):
+    from decord import VideoReader, cpu
+    from PIL import Image
+
+    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
+    max_frame = len(vr) - 1
+    fps = float(vr.get_avg_fps())
+
+    pixel_values_list, num_patches_list = [], []
+    transform = _build_transform(input_size=input_size)
+    frame_indices = _get_index(
+        bound, fps, max_frame, first_idx=0, num_segments=num_segments
+    )
+    for frame_index in frame_indices:
+        img = Image.fromarray(vr[frame_index].asnumpy()).convert("RGB")
+        img = _dynamic_preprocess(
+            img, image_size=input_size, use_thumbnail=True, max_num=max_num
+        )
+        pixel_values = [transform(tile) for tile in img]
+        pixel_values = torch.stack(pixel_values)
+        pixel_values = pixel_values.to(torch.bfloat16).cuda()
+        num_patches_list.append(pixel_values.shape[0])
+        pixel_values_list.append(pixel_values)
+    pixel_values = torch.cat(pixel_values_list)
+    return pixel_values, num_patches_list
+
+
 class InternVLChatModel(PytorchChatModel):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
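Note: _get_index samples one frame from the midpoint of each of num_segments equal time slices between start and end, which is what lets _load_video reduce an arbitrarily long clip to a fixed number of frames. A minimal, self-contained sketch of that sampling rule (NumPy only; it assumes start_idx is 0 and works from a frame count rather than fps bounds):

import numpy as np

def segment_midpoint_indices(num_frames: int, num_segments: int = 8) -> np.ndarray:
    # Split [0, num_frames) into num_segments equal slices and take the frame
    # closest to the middle of each slice, mirroring the arithmetic in _get_index.
    seg_size = float(num_frames) / num_segments
    return np.array(
        [int(seg_size / 2 + np.round(seg_size * i)) for i in range(num_segments)]
    )

# e.g. pick 8 evenly spaced frames from a 300-frame clip
print(segment_midpoint_indices(300, 8))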
@@ -305,7 +366,9 @@ class InternVLChatModel(PytorchChatModel):
             else False
         )
 
-        content, history, images = _get_prompt_and_chat_history(prompt, chat_history)
+        content, history, images, videos = _get_prompt_and_chat_history(
+            prompt, chat_history
+        )
 
         num_patches_list = []
         if len(images) == 1:
@@ -327,6 +390,10 @@ class InternVLChatModel(PytorchChatModel):
         else:
             pixel_values = None
 
+        if len(videos) > 0:
+            pixel_values = videos[0][0]
+            num_patches_list = videos[0][1]
+
         assert pixel_values is None or len(pixel_values) == sum(num_patches_list)
 
         img_context_token_id = self._tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
@@ -440,7 +507,23 @@ class InternVLChatModel(PytorchChatModel):
             )
             chunk["usage"] = completion_usage
             yield chunk
-
+        completion_choice = CompletionChoice(
+            text="", index=0, logprobs=None, finish_reason="stop"
+        )
+        chunk = CompletionChunk(
+            id=completion_id,
+            object="text_completion",
+            created=int(time.time()),
+            model=self.model_uid,
+            choices=[completion_choice],
+        )
+        completion_usage = CompletionUsage(
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+            total_tokens=total_tokens,
+        )
+        chunk["usage"] = completion_usage
+        yield chunk
         if include_usage:
             chunk = CompletionChunk(
                 id=completion_id,
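Note: the added block emits one final empty chunk whose choice carries finish_reason="stop" together with the token usage, so streaming clients get an explicit end-of-generation marker before the optional usage-only chunk. On the wire that terminating chunk looks roughly like the sketch below (all values are illustrative):

{
    "id": "cmpl-0123456789",
    "object": "text_completion",
    "created": 1720000000,
    "model": "my-internvl-model",
    "choices": [{"text": "", "index": 0, "logprobs": None, "finish_reason": "stop"}],
    "usage": {"prompt_tokens": 51, "completion_tokens": 32, "total_tokens": 83},
}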
xinference/model/llm/transformers/minicpmv25.py CHANGED
@@ -11,18 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import base64
 import json
 import logging
 import time
 import uuid
 from concurrent.futures import ThreadPoolExecutor
-from io import BytesIO
 from typing import Dict, Iterator, List, Optional, Union
 
-import requests
 import torch
-from PIL import Image
 
 from ....types import (
     ChatCompletion,
@@ -35,6 +31,7 @@ from ....types import (
 )
 from ...utils import select_device
 from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..utils import _decode_image
 from .core import PytorchChatModel, PytorchGenerateConfig
 
 logger = logging.getLogger(__name__)
@@ -102,24 +99,6 @@ class MiniCPMV25Model(PytorchChatModel):
             self._save_tensorizer()
 
     def _message_content_to_chat(self, content):
-        def _load_image(_url):
-            if _url.startswith("data:"):
-                logging.info("Parse url by base64 decoder.")
-                # https://platform.openai.com/docs/guides/vision/uploading-base-64-encoded-images
-                # e.g. f"data:image/jpeg;base64,{base64_image}"
-                _type, data = _url.split(";")
-                _, ext = _type.split("/")
-                data = data[len("base64,") :]
-                data = base64.b64decode(data.encode("utf-8"))
-                return Image.open(BytesIO(data)).convert("RGB")
-            else:
-                try:
-                    response = requests.get(_url)
-                except requests.exceptions.MissingSchema:
-                    return Image.open(_url).convert("RGB")
-                else:
-                    return Image.open(BytesIO(response.content)).convert("RGB")
-
         if not isinstance(content, str):
             texts = []
             image_urls = []
@@ -132,7 +111,7 @@ class MiniCPMV25Model(PytorchChatModel):
            image_futures = []
            with ThreadPoolExecutor() as executor:
                for image_url in image_urls:
-                    fut = executor.submit(_load_image, image_url)
+                    fut = executor.submit(_decode_image, image_url)
                    image_futures.append(fut)
            images = [fut.result() for fut in image_futures]
            text = " ".join(texts)
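Note: MiniCPM-V 2.5 above, and the MiniCPM-V 2.6 and Yi-VL diffs below, drop their copy-pasted inline _load_image helpers in favour of a shared _decode_image imported from ..utils. Judging from the removed code, the shared helper behaves roughly like this sketch (not the exact implementation); it accepts data: URLs, remote URLs, and local file paths:

import base64
from io import BytesIO

import requests
from PIL import Image


def _decode_image(url: str) -> Image.Image:
    # data: URL, e.g. "data:image/jpeg;base64,<payload>"
    if url.startswith("data:"):
        _, data = url.split(";", 1)
        raw = base64.b64decode(data[len("base64,"):].encode("utf-8"))
        return Image.open(BytesIO(raw)).convert("RGB")
    try:
        response = requests.get(url)
    except requests.exceptions.MissingSchema:
        # not a URL at all -- treat it as a local file path
        return Image.open(url).convert("RGB")
    return Image.open(BytesIO(response.content)).convert("RGB")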
xinference/model/llm/transformers/minicpmv26.py CHANGED
@@ -11,15 +11,12 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import base64
 import logging
 import time
 import uuid
 from concurrent.futures import ThreadPoolExecutor
-from io import BytesIO
 from typing import Dict, Iterator, List, Optional, Union
 
-import requests
 import torch
 from PIL import Image
 
@@ -34,6 +31,7 @@ from ....types import (
 )
 from ...utils import select_device
 from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..utils import _decode_image
 from .core import PytorchChatModel, PytorchGenerateConfig
 
 logger = logging.getLogger(__name__)
@@ -105,24 +103,6 @@ class MiniCPMV26Model(PytorchChatModel):
             self._save_tensorizer()
 
     def _message_content_to_chat(self, content):
-        def _load_image(_url):
-            if _url.startswith("data:"):
-                logging.info("Parse url by base64 decoder.")
-                # https://platform.openai.com/docs/guides/vision/uploading-base-64-encoded-images
-                # e.g. f"data:image/jpeg;base64,{base64_image}"
-                _type, data = _url.split(";")
-                _, ext = _type.split("/")
-                data = data[len("base64,") :]
-                data = base64.b64decode(data.encode("utf-8"))
-                return Image.open(BytesIO(data)).convert("RGB")
-            else:
-                try:
-                    response = requests.get(_url)
-                except requests.exceptions.MissingSchema:
-                    return Image.open(_url).convert("RGB")
-                else:
-                    return Image.open(BytesIO(response.content)).convert("RGB")
-
         MAX_NUM_FRAMES = 64
 
         def encode_video(video_path):
@@ -166,7 +146,7 @@ class MiniCPMV26Model(PytorchChatModel):
            image_futures = []
            with ThreadPoolExecutor() as executor:
                for image_url in image_urls:
-                    fut = executor.submit(_load_image, image_url)
+                    fut = executor.submit(_decode_image, image_url)
                    image_futures.append(fut)
            images = [fut.result() for fut in image_futures]
            frames = []
xinference/model/llm/transformers/yi_vl.py CHANGED
@@ -11,18 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import base64
 import logging
 import time
 import uuid
 from concurrent.futures import ThreadPoolExecutor
-from io import BytesIO
 from threading import Thread
 from typing import Dict, Iterator, List, Optional, Union
 
-import requests
 import torch
-from PIL import Image
 
 from ....model.utils import select_device
 from ....types import (
@@ -35,6 +31,7 @@ from ....types import (
     CompletionUsage,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..utils import _decode_image
 from .core import PytorchChatModel, PytorchGenerateConfig
 
 logger = logging.getLogger(__name__)
@@ -78,25 +75,6 @@ class YiVLChatModel(PytorchChatModel):
 
     @staticmethod
     def _message_content_to_yi(content) -> Union[str, tuple]:
-        def _load_image(_url):
-            if _url.startswith("data:"):
-                logging.info("Parse url by base64 decoder.")
-                # https://platform.openai.com/docs/guides/vision/uploading-base-64-encoded-images
-                # e.g. f"data:image/jpeg;base64,{base64_image}"
-                _type, data = _url.split(";")
-                _, ext = _type.split("/")
-                data = data[len("base64,") :]
-                data = base64.b64decode(data.encode("utf-8"))
-
-                return Image.open(BytesIO(data))
-            else:
-                try:
-                    response = requests.get(_url)
-                except requests.exceptions.MissingSchema:
-                    return Image.open(_url)
-                else:
-                    return Image.open(BytesIO(response.content))
-
         if not isinstance(content, str):
             from ....thirdparty.llava.model.constants import DEFAULT_IMAGE_TOKEN
 
@@ -111,7 +89,7 @@ class YiVLChatModel(PytorchChatModel):
            image_futures = []
            with ThreadPoolExecutor() as executor:
                for image_url in image_urls:
-                    fut = executor.submit(_load_image, image_url)
+                    fut = executor.submit(_decode_image, image_url)
                    image_futures.append(fut)
            images = [fut.result() for fut in image_futures]
            text = " ".join(texts)
xinference/model/llm/utils.py CHANGED
@@ -32,6 +32,7 @@ from ...types import (
     Completion,
     CompletionChunk,
 )
+from ..utils import ensure_cache_cleared
 from .llm_family import (
     LlamaCppLLMSpecV1,
     LLMFamilyV1,
@@ -459,7 +460,16 @@ Begin!"""
             role = get_role(message["role"])
             content = message["content"]
             if isinstance(content, str):
-                ret += role + "\n" + content + prompt_style.intra_message_sep + "\n"
+                if content:
+                    ret += (
+                        role
+                        + "\n"
+                        + content
+                        + prompt_style.intra_message_sep
+                        + "\n"
+                    )
+                else:
+                    ret += role + "\n"
             elif isinstance(content, list):
                 text = ""
                 image_urls = []
@@ -567,6 +577,7 @@ Begin!"""
         return cast(ChatCompletionChunk, chat_chunk)
 
     @classmethod
+    @ensure_cache_cleared
     def _to_chat_completion_chunks(
         cls,
         chunks: Iterator[CompletionChunk],
@@ -599,6 +610,7 @@ Begin!"""
             i += 1
 
     @staticmethod
+    @ensure_cache_cleared
     def _to_chat_completion(completion: Completion) -> ChatCompletion:
         return {
             "id": "chat" + completion["id"],
xinference/model/llm/vllm/core.py CHANGED
@@ -643,39 +643,6 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
 
 
 class VLLMVisionModel(VLLMModel, ChatModelMixin):
-    def load(self):
-        try:
-            import vllm
-            from vllm.engine.arg_utils import AsyncEngineArgs
-            from vllm.engine.async_llm_engine import AsyncLLMEngine
-        except ImportError:
-            error_message = "Failed to import module 'vllm'"
-            installation_guide = [
-                "Please make sure 'vllm' is installed. ",
-                "You can install it by `pip install vllm`\n",
-            ]
-            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
-
-        if vllm.__version__ >= "0.3.1":
-            # from vllm v0.3.1, it uses cupy as NCCL backend
-            # in which cupy will fork a process
-            # only for xoscar >= 0.3.0, new process is allowed in subpool
-            # besides, xinference set start method as forkserver for unix
-            # we need to set it to fork to make cupy NCCL work
-            multiprocessing.set_start_method("fork", force=True)
-
-        self._model_config = self._sanitize_model_config(self._model_config)
-
-        logger.info(
-            f"Loading {self.model_uid} with following model config: {self._model_config}"
-        )
-
-        engine_args = AsyncEngineArgs(
-            model=self.model_path,
-            **self._model_config,
-        )
-        self._engine = AsyncLLMEngine.from_engine_args(engine_args)
-
     @classmethod
     def match(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
@@ -721,7 +688,7 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
         prompt_style = self.model_family.prompt_style.copy()
         chat_history = chat_history or []
         prompt, images = self.get_prompt(prompt, chat_history, prompt_style)
-        logger.info(f"messages:{prompt}")
+
         if len(images) == 0:
             inputs = {
                 "prompt": prompt,
xinference/model/rerank/custom.py CHANGED
@@ -48,6 +48,10 @@ def register_rerank(model_spec: CustomRerankModelSpec, persist: bool):
     if not is_valid_model_name(model_spec.model_name):
         raise ValueError(f"Invalid model name {model_spec.model_name}.")
 
+    model_uri = model_spec.model_uri
+    if model_uri and not is_valid_model_uri(model_uri):
+        raise ValueError(f"Invalid model URI {model_uri}.")
+
     with UD_RERANK_LOCK:
         for model_name in (
             list(BUILTIN_RERANK_MODELS.keys())
@@ -62,11 +66,6 @@ def register_rerank(model_spec: CustomRerankModelSpec, persist: bool):
         UD_RERANKS.append(model_spec)
 
     if persist:
-        # We only validate model URL when persist is True.
-        model_uri = model_spec.model_uri
-        if model_uri and not is_valid_model_uri(model_uri):
-            raise ValueError(f"Invalid model URI {model_uri}.")
-
         persist_path = os.path.join(
             XINFERENCE_MODEL_DIR, "rerank", f"{model_spec.model_name}.json"
         )
xinference/model/utils.py CHANGED
@@ -11,17 +11,24 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+import functools
+import gc
+import inspect
 import json
 import logging
 import os
+import random
 from json import JSONDecodeError
 from pathlib import Path
 from typing import Any, Callable, Dict, Optional, Tuple, Union
 
 import huggingface_hub
+import numpy as np
+import torch
 
 from ..constants import XINFERENCE_CACHE_DIR, XINFERENCE_ENV_MODEL_SRC
-from ..device_utils import get_available_device, is_device_available
+from ..device_utils import empty_cache, get_available_device, is_device_available
 from .core import CacheableModelSpec
 
 logger = logging.getLogger(__name__)
@@ -348,3 +355,36 @@ def convert_float_to_int_or_str(model_size: float) -> Union[int, str]:
         return int(model_size)
     else:
         return str(model_size)
+
+
+def ensure_cache_cleared(func: Callable):
+    assert not inspect.iscoroutinefunction(func) and not inspect.isasyncgenfunction(
+        func
+    )
+    if inspect.isgeneratorfunction(func):
+
+        @functools.wraps(func)
+        def inner(*args, **kwargs):
+            for obj in func(*args, **kwargs):
+                yield obj
+            gc.collect()
+            empty_cache()
+
+    else:
+
+        @functools.wraps(func)
+        def inner(*args, **kwargs):
+            try:
+                return func(*args, **kwargs)
+            finally:
+                gc.collect()
+                empty_cache()
+
+    return inner
+
+
+def set_all_random_seed(seed: int):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
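Note: ensure_cache_cleared wraps both plain functions and generator functions so that gc.collect() and a device cache flush run once the call returns (or once the generator is exhausted); it is the decorator applied to the chat-completion conversion helpers in llm/utils.py above. A minimal usage sketch (the decorated generator here is hypothetical):

from xinference.model.utils import ensure_cache_cleared


@ensure_cache_cleared
def stream_tokens(n: int):
    # hypothetical generator: caches are cleared after the last item is consumed
    for i in range(n):
        yield f"token-{i}"


for token in stream_tokens(3):
    print(token)
# by this point gc.collect() and empty_cache() have run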
xinference/model/video/core.py CHANGED
@@ -14,7 +14,7 @@
 import logging
 import os
 from collections import defaultdict
-from typing import Dict, List, Literal, Optional, Tuple
+from typing import Any, Dict, List, Literal, Optional, Tuple
 
 from ...constants import XINFERENCE_CACHE_DIR
 from ..core import CacheableModelSpec, ModelDescription
@@ -44,6 +44,8 @@ class VideoModelFamilyV1(CacheableModelSpec):
     model_revision: str
     model_hub: str = "huggingface"
     model_ability: Optional[List[str]]
+    default_model_config: Optional[Dict[str, Any]]
+    default_generate_config: Optional[Dict[str, Any]]
 
 
 class VideoModelDescription(ModelDescription):
xinference/model/video/diffusers.py CHANGED
@@ -15,7 +15,6 @@
 import base64
 import logging
 import os
-import sys
 import time
 import uuid
 from concurrent.futures import ThreadPoolExecutor
@@ -24,10 +23,9 @@ from typing import TYPE_CHECKING, List, Union
 
 import numpy as np
 import PIL.Image
-import torch
 
 from ...constants import XINFERENCE_VIDEO_DIR
-from ...device_utils import move_model_to_available_device
+from ...device_utils import gpu_count, move_model_to_available_device
 from ...types import Video, VideoList
 
 if TYPE_CHECKING:
@@ -76,41 +74,58 @@ class DiffUsersVideoModel:
     def load(self):
         import torch
 
-        torch_dtype = self._kwargs.get("torch_dtype")
-        if sys.platform != "darwin" and torch_dtype is None:
-            # The following params crashes on Mac M2
-            self._kwargs["torch_dtype"] = torch.float16
-            self._kwargs["variant"] = "fp16"
-            self._kwargs["use_safetensors"] = True
+        kwargs = self._model_spec.default_model_config.copy()
+        kwargs.update(self._kwargs)
+
+        scheduler_cls_name = kwargs.pop("scheduler", None)
+
+        torch_dtype = kwargs.get("torch_dtype")
         if isinstance(torch_dtype, str):
-            self._kwargs["torch_dtype"] = getattr(torch, torch_dtype)
+            kwargs["torch_dtype"] = getattr(torch, torch_dtype)
+        logger.debug("Loading video model with kwargs: %s", kwargs)
 
         if self._model_spec.model_family == "CogVideoX":
+            import diffusers
             from diffusers import CogVideoXPipeline
 
-            self._model = CogVideoXPipeline.from_pretrained(
-                self._model_path, **self._kwargs
+            pipeline = self._model = CogVideoXPipeline.from_pretrained(
+                self._model_path, **kwargs
             )
         else:
             raise Exception(
                 f"Unsupported model family: {self._model_spec.model_family}"
             )
 
-        if self._kwargs.get("cpu_offload", False):
+        if scheduler_cls_name:
+            logger.debug("Using scheduler: %s", scheduler_cls_name)
+            pipeline.scheduler = getattr(diffusers, scheduler_cls_name).from_config(
+                pipeline.scheduler.config, timestep_spacing="trailing"
+            )
+        if kwargs.get("compile_graph", False):
+            pipeline.transformer = torch.compile(
+                pipeline.transformer, mode="max-autotune", fullgraph=True
+            )
+        if kwargs.get("cpu_offload", False):
             logger.debug("CPU offloading model")
-            self._model.enable_model_cpu_offload()
-        elif not self._kwargs.get("device_map"):
+            pipeline.enable_model_cpu_offload()
+            if kwargs.get("sequential_cpu_offload", True):
+                pipeline.enable_sequential_cpu_offload()
+                pipeline.vae.enable_slicing()
+                pipeline.vae.enable_tiling()
+        elif not kwargs.get("device_map"):
             logger.debug("Loading model to available device")
-            self._model = move_model_to_available_device(self._model)
+            if gpu_count() > 1:
+                kwargs["device_map"] = "balanced"
+            else:
+                pipeline = move_model_to_available_device(self._model)
         # Recommended if your computer has < 64 GB of RAM
-        self._model.enable_attention_slicing()
+        pipeline.enable_attention_slicing()
 
     def text_to_video(
         self,
         prompt: str,
         n: int = 1,
         num_inference_steps: int = 50,
-        guidance_scale: int = 6,
         response_format: str = "b64_json",
         **kwargs,
     ) -> VideoList:
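Note: load() now merges caller kwargs on top of self._model_spec.default_model_config and acts on a handful of spec-driven keys (scheduler, torch_dtype, compile_graph, cpu_offload, sequential_cpu_offload, device_map); the text_to_video hunk that follows does the same with default_generate_config. The key names in the sketch below are the ones read in this diff, but the values are only illustrative guesses at what a CogVideoX entry in model_spec.json might carry:

# Illustrative only: key names come from the code above, values are guesses.
default_model_config = {
    "torch_dtype": "bfloat16",              # resolved via getattr(torch, ...) in load()
    "scheduler": "CogVideoXDDIMScheduler",  # swapped in with timestep_spacing="trailing"
    "cpu_offload": True,                    # enable_model_cpu_offload()
    "sequential_cpu_offload": True,         # plus VAE slicing/tiling while offloading
    "compile_graph": False,                 # torch.compile the transformer when True
}
default_generate_config = {
    "guidance_scale": 6,                    # replaces the old hard-coded parameter
    "num_frames": 49,                       # illustrative; passed through to the pipeline
}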
@@ -121,31 +136,19 @@ class DiffUsersVideoModel:
         # from diffusers.utils import export_to_video
         from ...device_utils import empty_cache
 
+        assert self._model is not None
+        assert callable(self._model)
+        generate_kwargs = self._model_spec.default_generate_config.copy()
+        generate_kwargs.update(kwargs)
+        generate_kwargs["num_videos_per_prompt"] = n
         logger.debug(
             "diffusers text_to_video args: %s",
-            kwargs,
+            generate_kwargs,
         )
-        assert self._model is not None
-        if self._kwargs.get("cpu_offload"):
-            # if enabled cpu offload,
-            # the model.device would be CPU
-            device = "cuda"
-        else:
-            device = self._model.device
-        prompt_embeds, _ = self._model.encode_prompt(
-            prompt=prompt,
-            do_classifier_free_guidance=True,
-            num_videos_per_prompt=n,
-            max_sequence_length=226,
-            device=device,
-            dtype=torch.float16,
-        )
-        assert callable(self._model)
         output = self._model(
+            prompt=prompt,
             num_inference_steps=num_inference_steps,
-            guidance_scale=guidance_scale,
-            prompt_embeds=prompt_embeds,
-            **kwargs,
+            **generate_kwargs,
         )
 
         # clean cache