xinference 0.11.0__py3-none-any.whl → 0.11.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (56)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +30 -0
  3. xinference/client/restful/restful_client.py +29 -0
  4. xinference/core/cache_tracker.py +12 -1
  5. xinference/core/chat_interface.py +10 -4
  6. xinference/core/model.py +2 -2
  7. xinference/core/supervisor.py +30 -2
  8. xinference/core/utils.py +12 -0
  9. xinference/core/worker.py +4 -1
  10. xinference/deploy/cmdline.py +126 -0
  11. xinference/deploy/test/test_cmdline.py +24 -0
  12. xinference/fields.py +3 -1
  13. xinference/model/llm/__init__.py +2 -0
  14. xinference/model/llm/ggml/chatglm.py +98 -13
  15. xinference/model/llm/ggml/llamacpp.py +49 -2
  16. xinference/model/llm/llm_family.json +633 -9
  17. xinference/model/llm/llm_family.py +84 -10
  18. xinference/model/llm/llm_family_modelscope.json +337 -10
  19. xinference/model/llm/memory.py +332 -0
  20. xinference/model/llm/pytorch/chatglm.py +48 -0
  21. xinference/model/llm/pytorch/core.py +25 -6
  22. xinference/model/llm/pytorch/deepseek_vl.py +35 -9
  23. xinference/model/llm/pytorch/intern_vl.py +387 -0
  24. xinference/model/llm/pytorch/internlm2.py +32 -1
  25. xinference/model/llm/pytorch/qwen_vl.py +38 -11
  26. xinference/model/llm/pytorch/utils.py +38 -1
  27. xinference/model/llm/pytorch/yi_vl.py +42 -14
  28. xinference/model/llm/sglang/core.py +31 -9
  29. xinference/model/llm/utils.py +38 -5
  30. xinference/model/llm/vllm/core.py +87 -5
  31. xinference/model/rerank/core.py +23 -1
  32. xinference/model/utils.py +17 -7
  33. xinference/thirdparty/deepseek_vl/models/processing_vlm.py +1 -1
  34. xinference/thirdparty/deepseek_vl/models/siglip_vit.py +2 -2
  35. xinference/thirdparty/llava/mm_utils.py +3 -2
  36. xinference/thirdparty/llava/model/llava_arch.py +1 -1
  37. xinference/thirdparty/omnilmm/chat.py +6 -5
  38. xinference/types.py +10 -1
  39. xinference/web/ui/build/asset-manifest.json +3 -3
  40. xinference/web/ui/build/index.html +1 -1
  41. xinference/web/ui/build/static/js/{main.8e44da4b.js → main.551aa479.js} +3 -3
  42. xinference/web/ui/build/static/js/main.551aa479.js.map +1 -0
  43. xinference/web/ui/node_modules/.cache/babel-loader/1fa824d82b2af519de7700c594e50bde4bbca60d13bd3fabff576802e4070304.json +1 -0
  44. xinference/web/ui/node_modules/.cache/babel-loader/23caf6f1e52c43e983ca3bfd4189f41dbd645fa78f2dfdcd7f6b69bc41678665.json +1 -0
  45. xinference/web/ui/node_modules/.cache/babel-loader/a6da6bc3d0d2191adebee87fb58ecebe82d071087bd2f7f3a9c7fdd2ada130f2.json +1 -0
  46. {xinference-0.11.0.dist-info → xinference-0.11.2.dist-info}/METADATA +10 -8
  47. {xinference-0.11.0.dist-info → xinference-0.11.2.dist-info}/RECORD +52 -50
  48. xinference/web/ui/build/static/js/main.8e44da4b.js.map +0 -1
  49. xinference/web/ui/node_modules/.cache/babel-loader/1870cd6f7054d04e049e363c0a85526584fe25519378609d2838e28d7492bbf1.json +0 -1
  50. xinference/web/ui/node_modules/.cache/babel-loader/5393569d846332075b93b55656716a34f50e0a8c970be789502d7e6c49755fd7.json +0 -1
  51. xinference/web/ui/node_modules/.cache/babel-loader/ddaec68b88e5eff792df1e39a4b4b8b737bfc832293c015660c3c69334e3cf5c.json +0 -1
  52. /xinference/web/ui/build/static/js/{main.8e44da4b.js.LICENSE.txt → main.551aa479.js.LICENSE.txt} +0 -0
  53. {xinference-0.11.0.dist-info → xinference-0.11.2.dist-info}/LICENSE +0 -0
  54. {xinference-0.11.0.dist-info → xinference-0.11.2.dist-info}/WHEEL +0 -0
  55. {xinference-0.11.0.dist-info → xinference-0.11.2.dist-info}/entry_points.txt +0 -0
  56. {xinference-0.11.0.dist-info → xinference-0.11.2.dist-info}/top_level.txt +0 -0
xinference/model/llm/pytorch/yi_vl.py CHANGED
@@ -139,6 +139,12 @@ class YiVLChatModel(PytorchChatModel):
             generate_config = {}

         stream = generate_config.get("stream", False)
+        stream_options = generate_config.pop("stream_options", None)
+        include_usage = (
+            stream_options["include_usage"]
+            if isinstance(stream_options, dict)
+            else False
+        )

         from ....thirdparty.llava.conversation import conv_templates
         from ....thirdparty.llava.mm_utils import (
@@ -166,11 +172,11 @@ class YiVLChatModel(PytorchChatModel):
         )

         images = state.get_images(return_pil=True)
-        image = images[0]
-
-        image_tensor = self._image_processor.preprocess(image, return_tensors="pt")[
-            "pixel_values"
-        ][0]
+        if images:
+            image = images[0]
+            image_tensor = self._image_processor.preprocess(image, return_tensors="pt")[
+                "pixel_values"
+            ][0]

         stop_str = state.sep
         keywords = [stop_str]
@@ -187,7 +193,9 @@ class YiVLChatModel(PytorchChatModel):
             "input_ids": input_ids,
             "images": image_tensor.unsqueeze(0)
             .to(dtype=torch.bfloat16)
-            .to(self._model.device),
+            .to(self._model.device)
+            if images
+            else None,
             "streamer": streamer,
             "do_sample": True,
             "top_p": float(top_p),
@@ -200,7 +208,7 @@ class YiVLChatModel(PytorchChatModel):
         t.start()

         if stream:
-            it = self._generate_stream(streamer, stop_str)
+            it = self._generate_stream(streamer, stop_str, input_ids, include_usage)
             return self._to_chat_completion_chunks(it)
         else:
             c = self._generate(streamer, stop_str)
@@ -229,8 +237,12 @@ class YiVLChatModel(PytorchChatModel):
         )
         return c

-    def _generate_stream(self, streamer, stop_str) -> Iterator[CompletionChunk]:
+    def _generate_stream(
+        self, streamer, stop_str, input_ids, include_usage
+    ) -> Iterator[CompletionChunk]:
         completion_id = str(uuid.uuid1())
+        prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
+        prompt_tokens = len(input_ids[0])
         for i, new_text in enumerate(streamer):
             if not new_text.endswith(stop_str):
                 completion_choice = CompletionChoice(
@@ -243,10 +255,12 @@ class YiVLChatModel(PytorchChatModel):
                     model=self.model_uid,
                     choices=[completion_choice],
                 )
+                completion_tokens = i
+                total_tokens = prompt_tokens + completion_tokens
                 completion_usage = CompletionUsage(
-                    prompt_tokens=-1,
-                    completion_tokens=-1,
-                    total_tokens=-1,
+                    prompt_tokens=prompt_tokens,
+                    completion_tokens=completion_tokens,
+                    total_tokens=total_tokens,
                 )
                 chunk["usage"] = completion_usage
                 yield chunk
@@ -262,9 +276,23 @@ class YiVLChatModel(PytorchChatModel):
             choices=[completion_choice],
         )
         completion_usage = CompletionUsage(
-            prompt_tokens=-1,
-            completion_tokens=-1,
-            total_tokens=-1,
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+            total_tokens=total_tokens,
        )
         chunk["usage"] = completion_usage
         yield chunk
+        if include_usage:
+            chunk = CompletionChunk(
+                id=completion_id,
+                object="text_completion",
+                created=int(time.time()),
+                model=self.model_uid,
+                choices=[],
+            )
+            chunk["usage"] = CompletionUsage(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                total_tokens=total_tokens,
+            )
+            yield chunk
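
Note: the stream_options handling added above (and mirrored in the sglang and vLLM backends below) follows OpenAI's include_usage convention, where the final streamed chunk carries usage statistics and an empty choices list. A minimal client-side sketch, assuming xinference's OpenAI-compatible endpoint, the openai Python package (>=1.26), and placeholder model name and address:

# Illustrative only; "my-chat-model" and the endpoint URL are placeholders.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:9997/v1", api_key="not-used")

stream = client.chat.completions.create(
    model="my-chat-model",
    messages=[{"role": "user", "content": "Hello"}],
    stream=True,
    # Requests the trailing usage-only chunk produced by the code above.
    stream_options={"include_usage": True},
)
for chunk in stream:
    if chunk.choices:
        print(chunk.choices[0].delta.content or "", end="")
    elif chunk.usage is not None:
        print("\n", chunk.usage)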
xinference/model/llm/sglang/core.py CHANGED
@@ -53,6 +53,7 @@ class SGLANGGenerateConfig(TypedDict, total=False):
     stop: Optional[Union[str, List[str]]]
     ignore_eos: bool
     stream: bool
+    stream_options: Optional[Union[dict, None]]


 try:
@@ -157,6 +158,8 @@ class SGLANGModel(LLM):
         )
         generate_config.setdefault("stop", [])
         generate_config.setdefault("stream", False)
+        stream_options = generate_config.get("stream_options")
+        generate_config.setdefault("stream_options", stream_options)
         generate_config.setdefault("ignore_eos", False)

         return generate_config
@@ -192,7 +195,7 @@ class SGLANGModel(LLM):

     @staticmethod
     def _convert_state_to_completion_chunk(
-        request_id: str, model: str, output_text: str, meta_info: Dict
+        request_id: str, model: str, output_text: str
     ) -> CompletionChunk:
         choices: List[CompletionChoice] = [
             CompletionChoice(
@@ -209,13 +212,6 @@ class SGLANGModel(LLM):
             model=model,
             choices=choices,
         )
-        prompt_tokens = meta_info["prompt_tokens"]
-        completion_tokens = meta_info["completion_tokens"]
-        chunk["usage"] = CompletionUsage(
-            prompt_tokens=prompt_tokens,
-            completion_tokens=completion_tokens,
-            total_tokens=prompt_tokens + completion_tokens,
-        )
         return chunk

     @staticmethod
@@ -272,6 +268,9 @@ class SGLANGModel(LLM):
             "Enter generate, prompt: %s, generate config: %s", prompt, generate_config
         )
         stream = sanitized_generate_config.pop("stream")
+        stream_options = sanitized_generate_config.pop("stream_options")
+        if isinstance(stream_options, dict):
+            include_usage = stream_options.pop("include_usage", False)
         request_id = str(uuid.uuid1())
         state = pipeline.run(
             question=prompt,
@@ -289,11 +288,34 @@ class SGLANGModel(LLM):
         else:

             async def stream_results() -> AsyncGenerator[CompletionChunk, None]:
+                prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
                 async for out, meta_info in state.text_async_iter(
                     var_name="answer", return_meta_data=True
                 ):
                     chunk = self._convert_state_to_completion_chunk(
-                        request_id, self.model_uid, output_text=out, meta_info=meta_info
+                        request_id, self.model_uid, output_text=out
+                    )
+                    prompt_tokens = meta_info["prompt_tokens"]
+                    completion_tokens = meta_info["completion_tokens"]
+                    total_tokens = prompt_tokens + completion_tokens
+                    chunk["usage"] = CompletionUsage(
+                        prompt_tokens=prompt_tokens,
+                        completion_tokens=completion_tokens,
+                        total_tokens=total_tokens,
+                    )
+                    yield chunk
+                if include_usage:
+                    chunk = CompletionChunk(
+                        id=request_id,
+                        object="text_completion",
+                        created=int(time.time()),
+                        model=self.model_uid,
+                        choices=[],
+                    )
+                    chunk["usage"] = CompletionUsage(
+                        prompt_tokens=prompt_tokens,
+                        completion_tokens=completion_tokens,
+                        total_tokens=total_tokens,
                     )
                     yield chunk

xinference/model/llm/utils.py CHANGED
@@ -456,6 +456,19 @@ Begin!"""
                 ret += f"<|{role}|>{prompt_style.intra_message_sep}"
             ret += "<|assistant|>\n"
             return ret
+        elif prompt_style.style_name == "c4ai-command-r":
+            ret = (
+                f"<BOS_TOKEN><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>"
+                f"{prompt_style.system_prompt}{prompt_style.inter_message_sep}"
+            )
+            for i, message in enumerate(chat_history):
+                role = get_role(message["role"])
+                content = message["content"]
+                if content:
+                    ret += f"{role}{content}{prompt_style.inter_message_sep}"
+                else:
+                    ret += role
+            return ret
         else:
             raise ValueError(f"Invalid prompt style: {prompt_style.style_name}")

@@ -482,9 +495,6 @@ Begin!"""
                 for i, choice in enumerate(chunk["choices"])
             ],
         }
-        usage = chunk.get("usage")
-        if usage is not None:
-            chat_chunk["usage"] = usage
         return cast(ChatCompletionChunk, chat_chunk)

     @classmethod
@@ -508,6 +518,19 @@ Begin!"""
                 for i, choice in enumerate(chunk["choices"])
             ],
         }
+        return cast(ChatCompletionChunk, chat_chunk)
+
+    @classmethod
+    def _get_final_chat_completion_chunk(
+        cls, chunk: CompletionChunk
+    ) -> ChatCompletionChunk:
+        chat_chunk = {
+            "id": "chat" + chunk["id"],
+            "model": chunk["model"],
+            "created": chunk["created"],
+            "object": "chat.completion.chunk",
+            "choices": [],
+        }
         usage = chunk.get("usage")
         if usage is not None:
             chat_chunk["usage"] = usage
@@ -521,7 +544,12 @@ Begin!"""
         for i, chunk in enumerate(chunks):
             if i == 0:
                 yield cls._get_first_chat_completion_chunk(chunk)
-            yield cls._to_chat_completion_chunk(chunk)
+            # usage
+            choices = chunk.get("choices")
+            if not choices:
+                yield cls._get_final_chat_completion_chunk(chunk)
+            else:
+                yield cls._to_chat_completion_chunk(chunk)

     @classmethod
     async def _async_to_chat_completion_chunks(
@@ -532,7 +560,12 @@ Begin!"""
         async for chunk in chunks:
             if i == 0:
                 yield cls._get_first_chat_completion_chunk(chunk)
-            yield cls._to_chat_completion_chunk(chunk)
+            # usage
+            choices = chunk.get("choices")
+            if not choices:
+                yield cls._get_final_chat_completion_chunk(chunk)
+            else:
+                yield cls._to_chat_completion_chunk(chunk)
             i += 1

     @staticmethod
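
For orientation, a standalone, hypothetical rendering of the new c4ai-command-r branch; the role tokens and separator below are placeholders standing in for whatever get_role and prompt_style.inter_message_sep resolve to in the model's prompt style configuration, which this diff does not show:

# Hypothetical values; the real ones come from the model family's prompt style.
system_prompt = "You are a helpful assistant."
inter_message_sep = "<|END_OF_TURN_TOKEN|>"
chat_history = [
    {"role": "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>", "content": "Hi there"},
    {"role": "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", "content": ""},
]

ret = (
    f"<BOS_TOKEN><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>"
    f"{system_prompt}{inter_message_sep}"
)
for message in chat_history:
    role = message["role"]  # stands in for get_role(message["role"])
    content = message["content"]
    if content:
        ret += f"{role}{content}{inter_message_sep}"
    else:
        ret += role  # an empty trailing message leaves the turn open for generation
print(ret)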
xinference/model/llm/vllm/core.py CHANGED
@@ -37,6 +37,7 @@ from ....types import (
     CompletionChoice,
     CompletionChunk,
     CompletionUsage,
+    LoRA,
     ToolCallFunction,
     ToolCalls,
 )
@@ -64,16 +65,19 @@ class VLLMModelConfig(TypedDict, total=False):


 class VLLMGenerateConfig(TypedDict, total=False):
+    lora_name: Optional[str]
     n: int
     best_of: Optional[int]
     presence_penalty: float
     frequency_penalty: float
     temperature: float
     top_p: float
+    top_k: int
     max_tokens: int
     stop_token_ids: Optional[List[int]]
     stop: Optional[Union[str, List[str]]]
     stream: bool  # non-sampling param, should not be passed to the engine.
+    stream_options: Optional[Union[dict, None]]


 try:
@@ -90,8 +94,11 @@ VLLM_SUPPORTED_MODELS = [
     "internlm-16k",
     "mistral-v0.1",
     "Yi",
+    "Yi-1.5",
     "code-llama",
     "code-llama-python",
+    "deepseek",
+    "deepseek-coder",
 ]
 VLLM_SUPPORTED_CHAT_MODELS = [
     "llama-2-chat",
@@ -106,6 +113,7 @@ VLLM_SUPPORTED_CHAT_MODELS = [
     "internlm2-chat",
     "qwen-chat",
     "Yi-chat",
+    "Yi-1.5-chat",
     "code-llama-instruct",
     "mistral-instruct-v0.1",
     "mistral-instruct-v0.2",
@@ -119,6 +127,7 @@ VLLM_SUPPORTED_CHAT_MODELS = [
 ]
 if VLLM_INSTALLED and vllm.__version__ >= "0.3.0":
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen1.5-chat")
+    VLLM_SUPPORTED_MODELS.append("codeqwen1.5")
     VLLM_SUPPORTED_CHAT_MODELS.append("codeqwen1.5-chat")

 if VLLM_INSTALLED and vllm.__version__ >= "0.3.2":
@@ -130,8 +139,8 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.3.3":

 if VLLM_INSTALLED and vllm.__version__ >= "0.4.0":
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen1.5-moe-chat")
-    VLLM_SUPPORTED_MODELS.append("c4ai-command-r-v01")
-    VLLM_SUPPORTED_MODELS.append("c4ai-command-r-v01-4bit")
+    VLLM_SUPPORTED_CHAT_MODELS.append("c4ai-command-r-v01")
+    VLLM_SUPPORTED_CHAT_MODELS.append("c4ai-command-r-v01-4bit")


 class VLLMModel(LLM):
@@ -143,16 +152,30 @@ class VLLMModel(LLM):
         quantization: str,
         model_path: str,
         model_config: Optional[VLLMModelConfig],
+        peft_model: Optional[List[LoRA]] = None,
     ):
+        try:
+            from vllm.lora.request import LoRARequest
+        except ImportError:
+            error_message = "Failed to import module 'vllm'"
+            installation_guide = [
+                "Please make sure 'vllm' is installed. ",
+                "You can install it by `pip install vllm`\n",
+            ]
+
+            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
         super().__init__(model_uid, model_family, model_spec, quantization, model_path)
         self._model_config = model_config
         self._engine = None
+        self.lora_modules = peft_model
+        self.lora_requests: List[LoRARequest] = []

     def load(self):
         try:
             import vllm
             from vllm.engine.arg_utils import AsyncEngineArgs
             from vllm.engine.async_llm_engine import AsyncLLMEngine
+            from vllm.lora.request import LoRARequest
         except ImportError:
             error_message = "Failed to import module 'vllm'"
             installation_guide = [
@@ -171,11 +194,33 @@ class VLLMModel(LLM):
             multiprocessing.set_start_method("fork", force=True)

         self._model_config = self._sanitize_model_config(self._model_config)
+
+        if self.lora_modules is None:
+            self.lora_requests = []
+        else:
+            self.lora_requests = [
+                LoRARequest(
+                    lora_name=lora.lora_name,
+                    lora_int_id=i,
+                    lora_local_path=lora.local_path,
+                )
+                for i, lora in enumerate(self.lora_modules, start=1)
+            ]
+
+        enable_lora = len(self.lora_requests) > 0
+        max_loras = len(self.lora_requests)
+
         logger.info(
             f"Loading {self.model_uid} with following model config: {self._model_config}"
+            f"Enable lora: {enable_lora}. Lora count: {max_loras}."
         )

-        engine_args = AsyncEngineArgs(model=self.model_path, **self._model_config)
+        engine_args = AsyncEngineArgs(
+            model=self.model_path,
+            enable_lora=enable_lora,
+            max_loras=max_loras,
+            **self._model_config,
+        )
         self._engine = AsyncLLMEngine.from_engine_args(engine_args)

     def _sanitize_model_config(
@@ -206,6 +251,7 @@ class VLLMModel(LLM):
             generate_config = {}

         sanitized = VLLMGenerateConfig()
+        sanitized.setdefault("lora_name", generate_config.get("lora_name", None))
         sanitized.setdefault("n", generate_config.get("n", 1))
         sanitized.setdefault("best_of", generate_config.get("best_of", None))
         sanitized.setdefault(
@@ -216,12 +262,16 @@ class VLLMModel(LLM):
         )
         sanitized.setdefault("temperature", generate_config.get("temperature", 1.0))
         sanitized.setdefault("top_p", generate_config.get("top_p", 1.0))
+        sanitized.setdefault("top_k", generate_config.get("top_k", -1))
         sanitized.setdefault("max_tokens", generate_config.get("max_tokens", 1024))
         sanitized.setdefault("stop", generate_config.get("stop", None))
         sanitized.setdefault(
             "stop_token_ids", generate_config.get("stop_token_ids", None)
         )
-        sanitized.setdefault("stream", generate_config.get("stream", None))
+        sanitized.setdefault("stream", generate_config.get("stream", False))
+        sanitized.setdefault(
+            "stream_options", generate_config.get("stream_options", None)
+        )

         return sanitized

@@ -338,16 +388,34 @@ class VLLMModel(LLM):
             "Enter generate, prompt: %s, generate config: %s", prompt, generate_config
         )

+        lora_model = sanitized_generate_config.pop("lora_name")
+
+        lora_request = None
+        if lora_model is not None:
+            for lora in self.lora_requests:
+                if lora_model == lora.lora_name:
+                    lora_request = lora
+                    break
+
         stream = sanitized_generate_config.pop("stream")
+        stream_options = sanitized_generate_config.pop("stream_options", None)
+        include_usage = (
+            stream_options["include_usage"]
+            if isinstance(stream_options, dict)
+            else False
+        )
         sampling_params = SamplingParams(**sanitized_generate_config)
         request_id = str(uuid.uuid1())

         assert self._engine is not None
-        results_generator = self._engine.generate(prompt, sampling_params, request_id)
+        results_generator = self._engine.generate(
+            prompt, sampling_params, request_id, lora_request=lora_request
+        )

         async def stream_results() -> AsyncGenerator[CompletionChunk, None]:
             previous_texts = [""] * sanitized_generate_config["n"]
             tools_token_filter = ChatModelMixin._tools_token_filter(self.model_family)
+            prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
             async for _request_output in results_generator:
                 chunk = self._convert_request_output_to_completion_chunk(
                     request_id=request_id,
@@ -398,6 +466,20 @@ class VLLMModel(LLM):
                     total_tokens=total_tokens,
                 )
                 yield chunk
+            if include_usage:
+                chunk = CompletionChunk(
+                    id=request_id,
+                    object="text_completion",
+                    created=int(time.time()),
+                    model=self.model_uid,
+                    choices=[],
+                )
+                chunk["usage"] = CompletionUsage(
+                    prompt_tokens=prompt_tokens,
+                    completion_tokens=completion_tokens,
+                    total_tokens=total_tokens,
+                )
+                yield chunk

         if stream:
             return stream_results()
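
The LoRA plumbing above implies that, once a vLLM-backed model is launched with one or more adapters, a request can select an adapter by name through generate_config. A hedged sketch using xinference's Python client; the server address, model uid, and adapter name are placeholders, and it assumes the model was launched with a LoRA adapter registered as "my-adapter":

# Sketch only; "my-model-uid" and "my-adapter" are placeholders.
from xinference.client import Client

client = Client("http://localhost:9997")
model = client.get_model("my-model-uid")
completion = model.generate(
    "Write a haiku about the sea.",
    generate_config={
        "lora_name": "my-adapter",  # matched against LoRARequest.lora_name in load()
        "top_k": 40,                # top_k is newly accepted by VLLMGenerateConfig above
        "max_tokens": 64,
    },
)
print(completion["choices"][0]["text"])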
xinference/model/rerank/core.py CHANGED
@@ -46,7 +46,7 @@ def get_rerank_model_descriptions():
 class RerankModelSpec(CacheableModelSpec):
     model_name: str
     language: List[str]
-    type: Optional[str] = "normal"
+    type: Optional[str] = "unknown"
     model_id: str
     model_revision: Optional[str]
     model_hub: str = "huggingface"
@@ -118,6 +118,28 @@ class RerankModel:
         self._use_fp16 = use_fp16
         self._model = None
         self._counter = 0
+        if model_spec.type == "unknown":
+            model_spec.type = self._auto_detect_type(model_path)
+
+    @staticmethod
+    def _auto_detect_type(model_path):
+        """This method may not be stable due to the fact that the tokenizer name may be changed.
+        Therefore, we only use this method for unknown model types."""
+        from transformers import AutoTokenizer
+
+        type_mapper = {
+            "LlamaTokenizerFast": "LLM-based layerwise",
+            "GemmaTokenizerFast": "LLM-based",
+            "XLMRobertaTokenizerFast": "normal",
+        }
+
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        rerank_type = type_mapper.get(type(tokenizer).__name__)
+        if rerank_type is None:
+            raise Exception(
+                f"Can't determine the rerank type based on the tokenizer {tokenizer}"
+            )
+        return rerank_type

     def load(self):
         if self._model_spec.type == "normal":
xinference/model/utils.py CHANGED
@@ -19,6 +19,7 @@ from json import JSONDecodeError
 from pathlib import Path
 from typing import Any, Callable, Dict, Optional, Tuple, Union

+import huggingface_hub
 from fsspec import AbstractFileSystem

 from ..constants import XINFERENCE_CACHE_DIR, XINFERENCE_ENV_MODEL_SRC
@@ -27,6 +28,7 @@ from .core import CacheableModelSpec

 logger = logging.getLogger(__name__)
 MAX_ATTEMPTS = 3
+IS_NEW_HUGGINGFACE_HUB: bool = huggingface_hub.__version__ >= "0.23.0"


 def is_locale_chinese_simplified() -> bool:
@@ -76,6 +78,13 @@ def symlink_local_file(path: str, local_dir: str, relpath: str) -> str:
     return local_dir_filepath


+def create_symlink(download_dir: str, cache_dir: str):
+    for subdir, dirs, files in os.walk(download_dir):
+        for file in files:
+            relpath = os.path.relpath(os.path.join(subdir, file), download_dir)
+            symlink_local_file(os.path.join(subdir, file), cache_dir, relpath)
+
+
 def retry_download(
     download_func: Callable,
     model_name: str,
@@ -306,22 +315,23 @@ def cache(model_spec: CacheableModelSpec, model_description_type: type):
             model_spec.model_id,
             revision=model_spec.model_revision,
         )
-        for subdir, dirs, files in os.walk(download_dir):
-            for file in files:
-                relpath = os.path.relpath(os.path.join(subdir, file), download_dir)
-                symlink_local_file(os.path.join(subdir, file), cache_dir, relpath)
+        create_symlink(download_dir, cache_dir)
     else:
         from huggingface_hub import snapshot_download as hf_download

-        retry_download(
+        use_symlinks = {}
+        if not IS_NEW_HUGGINGFACE_HUB:
+            use_symlinks = {"local_dir_use_symlinks": True, "local_dir": cache_dir}
+        download_dir = retry_download(
             hf_download,
             model_spec.model_name,
             None,
             model_spec.model_id,
             revision=model_spec.model_revision,
-            local_dir=cache_dir,
-            local_dir_use_symlinks=True,
+            **use_symlinks,
         )
+        if IS_NEW_HUGGINGFACE_HUB:
+            create_symlink(download_dir, cache_dir)
     with open(meta_path, "w") as f:
         import json

xinference/thirdparty/deepseek_vl/models/processing_vlm.py CHANGED
@@ -25,8 +25,8 @@ from PIL.Image import Image
 from transformers import LlamaTokenizerFast
 from transformers.processing_utils import ProcessorMixin

-from .image_processing_vlm import VLMImageProcessor
 from ..utils.conversation import get_conv_template
+from .image_processing_vlm import VLMImageProcessor


 class DictOutput(object):
xinference/thirdparty/deepseek_vl/models/siglip_vit.py CHANGED
@@ -92,7 +92,7 @@ def _no_grad_trunc_normal_(tensor, mean, std, a, b):
 def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0):
     # type: (torch.Tensor, float, float, float, float) -> torch.Tensor
     r"""The original timm.models.layers.weight_init.trunc_normal_ can not handle bfloat16 yet, here we first
-    convert the tensor to float32, apply the trunc_normal_() in float32, and then convert it back to its orignal dtype.
+    convert the tensor to float32, apply the trunc_normal_() in float32, and then convert it back to its original dtype.
     Fills the input Tensor with values drawn from a truncated normal distribution. The values are effectively drawn
     from the normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
     with values outside :math:`[a, b]` redrawn until they are within
@@ -305,7 +305,7 @@ class VisionTransformer(nn.Module):
         img_size: Input image size.
         patch_size: Patch size.
         in_chans: Number of image input channels.
-        num_classes: Mumber of classes for classification head.
+        num_classes: Number of classes for classification head.
         global_pool: Type of global pooling for final sequence (default: 'token').
         embed_dim: Transformer embedding dimension.
         depth: Depth of transformer.
xinference/thirdparty/llava/mm_utils.py CHANGED
@@ -2,11 +2,12 @@ import base64
 from io import BytesIO

 import torch
-from .model import LlavaLlamaForCausalLM
-from .model.constants import IMAGE_TOKEN_INDEX
 from PIL import Image
 from transformers import AutoTokenizer, StoppingCriteria

+from .model import LlavaLlamaForCausalLM
+from .model.constants import IMAGE_TOKEN_INDEX
+

 def load_image_from_base64(image):
     return Image.open(BytesIO(base64.b64decode(image)))
xinference/thirdparty/llava/model/llava_arch.py CHANGED
@@ -17,9 +17,9 @@ import os
 from abc import ABC, abstractmethod

 import torch
-from .constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, key_info

 from .clip_encoder.builder import build_vision_tower
+from .constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, key_info
 from .multimodal_projector.builder import build_vision_projector


xinference/thirdparty/omnilmm/chat.py CHANGED
@@ -7,11 +7,6 @@ import torch
 from PIL import Image
 from transformers import AutoModel, AutoTokenizer

-from .model.omnilmm import OmniLMMForCausalLM
-from .model.utils import build_transform
-from .train.train_utils import omni_preprocess
-from .utils import disable_torch_init
-
 DEFAULT_IMAGE_TOKEN = "<image>"
 DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
 DEFAULT_IM_START_TOKEN = "<im_start>"
@@ -21,6 +16,10 @@ DEFAULT_IM_END_TOKEN = "<im_end>"
 def init_omni_lmm(model_path, device_map):
     from accelerate import init_empty_weights, load_checkpoint_and_dispatch

+    from .model.omnilmm import OmniLMMForCausalLM
+    from .model.utils import build_transform
+    from .utils import disable_torch_init
+
     torch.backends.cuda.matmul.allow_tf32 = True
     disable_torch_init()
     model_name = os.path.expanduser(model_path)
@@ -98,6 +97,8 @@ def expand_question_into_multimodal(


 def wrap_question_for_omni_lmm(question, image_token_len, tokenizer):
+    from .train.train_utils import omni_preprocess
+
     question = expand_question_into_multimodal(
         question,
         image_token_len,