xinference 0.14.4.post1__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xinference might be problematic.
- xinference/_compat.py +51 -0
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +5 -39
- xinference/client/restful/restful_client.py +3 -24
- xinference/conftest.py +1 -1
- xinference/constants.py +5 -0
- xinference/core/cache_tracker.py +1 -1
- xinference/core/chat_interface.py +8 -14
- xinference/core/event.py +1 -1
- xinference/core/model.py +82 -31
- xinference/core/scheduler.py +37 -37
- xinference/core/status_guard.py +1 -1
- xinference/core/supervisor.py +11 -10
- xinference/core/utils.py +80 -22
- xinference/core/worker.py +17 -16
- xinference/deploy/cmdline.py +8 -16
- xinference/deploy/local.py +1 -1
- xinference/deploy/supervisor.py +1 -1
- xinference/deploy/utils.py +1 -1
- xinference/deploy/worker.py +1 -1
- xinference/model/audio/cosyvoice.py +86 -41
- xinference/model/embedding/core.py +52 -31
- xinference/model/image/stable_diffusion/core.py +18 -1
- xinference/model/llm/__init__.py +21 -11
- xinference/model/llm/llama_cpp/core.py +16 -33
- xinference/model/llm/llm_family.json +619 -1297
- xinference/model/llm/llm_family.py +31 -52
- xinference/model/llm/llm_family_csghub.json +18 -35
- xinference/model/llm/llm_family_modelscope.json +573 -1119
- xinference/model/llm/lmdeploy/core.py +56 -88
- xinference/model/llm/mlx/core.py +46 -69
- xinference/model/llm/sglang/core.py +33 -18
- xinference/model/llm/transformers/chatglm.py +167 -305
- xinference/model/llm/transformers/cogvlm2.py +36 -63
- xinference/model/llm/transformers/cogvlm2_video.py +33 -223
- xinference/model/llm/transformers/core.py +49 -50
- xinference/model/llm/transformers/deepseek_vl.py +53 -96
- xinference/model/llm/transformers/glm4v.py +55 -111
- xinference/model/llm/transformers/intern_vl.py +39 -70
- xinference/model/llm/transformers/internlm2.py +32 -54
- xinference/model/llm/transformers/minicpmv25.py +22 -55
- xinference/model/llm/transformers/minicpmv26.py +158 -68
- xinference/model/llm/transformers/omnilmm.py +5 -28
- xinference/model/llm/transformers/qwen2_vl.py +208 -0
- xinference/model/llm/transformers/qwen_vl.py +34 -86
- xinference/model/llm/transformers/utils.py +32 -38
- xinference/model/llm/transformers/yi_vl.py +32 -72
- xinference/model/llm/utils.py +195 -489
- xinference/model/llm/vllm/core.py +153 -100
- xinference/model/rerank/core.py +41 -8
- xinference/model/rerank/model_spec.json +7 -0
- xinference/model/rerank/model_spec_modelscope.json +7 -1
- xinference/model/utils.py +1 -31
- xinference/thirdparty/cosyvoice/bin/export_jit.py +64 -0
- xinference/thirdparty/cosyvoice/bin/export_trt.py +8 -0
- xinference/thirdparty/cosyvoice/bin/inference.py +5 -2
- xinference/thirdparty/cosyvoice/cli/cosyvoice.py +38 -22
- xinference/thirdparty/cosyvoice/cli/model.py +139 -26
- xinference/thirdparty/cosyvoice/flow/flow.py +15 -9
- xinference/thirdparty/cosyvoice/flow/length_regulator.py +20 -1
- xinference/thirdparty/cosyvoice/hifigan/generator.py +8 -4
- xinference/thirdparty/cosyvoice/llm/llm.py +14 -13
- xinference/thirdparty/cosyvoice/transformer/attention.py +7 -3
- xinference/thirdparty/cosyvoice/transformer/decoder.py +1 -1
- xinference/thirdparty/cosyvoice/transformer/embedding.py +4 -3
- xinference/thirdparty/cosyvoice/transformer/encoder.py +4 -2
- xinference/thirdparty/cosyvoice/utils/common.py +36 -0
- xinference/thirdparty/cosyvoice/utils/file_utils.py +16 -0
- xinference/thirdparty/deepseek_vl/serve/assets/Kelpy-Codos.js +100 -0
- xinference/thirdparty/deepseek_vl/serve/assets/avatar.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/assets/custom.css +355 -0
- xinference/thirdparty/deepseek_vl/serve/assets/custom.js +22 -0
- xinference/thirdparty/deepseek_vl/serve/assets/favicon.ico +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/app.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/chart.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/mirror.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/pipeline.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/puzzle.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/rap.jpeg +0 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/base.yaml +87 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/firefly_gan_vq.yaml +34 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/lora/r_8_alpha_16.yaml +4 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/text2semantic_finetune.yaml +83 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text-data.proto +24 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/README.md +27 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/.gitignore +114 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/README.md +36 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/css/style.css +161 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/html/footer.html +11 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/js/animate.js +69 -0
- xinference/thirdparty/fish_speech/tools/sensevoice/README.md +59 -0
- xinference/thirdparty/matcha/VERSION +1 -0
- xinference/thirdparty/matcha/hifigan/LICENSE +21 -0
- xinference/thirdparty/matcha/hifigan/README.md +101 -0
- xinference/thirdparty/omnilmm/LICENSE +201 -0
- xinference/thirdparty/whisper/__init__.py +156 -0
- xinference/thirdparty/whisper/__main__.py +3 -0
- xinference/thirdparty/whisper/assets/gpt2.tiktoken +50256 -0
- xinference/thirdparty/whisper/assets/mel_filters.npz +0 -0
- xinference/thirdparty/whisper/assets/multilingual.tiktoken +50257 -0
- xinference/thirdparty/whisper/audio.py +157 -0
- xinference/thirdparty/whisper/decoding.py +826 -0
- xinference/thirdparty/whisper/model.py +314 -0
- xinference/thirdparty/whisper/normalizers/__init__.py +2 -0
- xinference/thirdparty/whisper/normalizers/basic.py +76 -0
- xinference/thirdparty/whisper/normalizers/english.json +1741 -0
- xinference/thirdparty/whisper/normalizers/english.py +550 -0
- xinference/thirdparty/whisper/timing.py +386 -0
- xinference/thirdparty/whisper/tokenizer.py +395 -0
- xinference/thirdparty/whisper/transcribe.py +605 -0
- xinference/thirdparty/whisper/triton_ops.py +109 -0
- xinference/thirdparty/whisper/utils.py +316 -0
- xinference/thirdparty/whisper/version.py +1 -0
- xinference/types.py +7 -49
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/{main.4bafd904.css → main.632e9148.css} +2 -2
- xinference/web/ui/build/static/css/main.632e9148.css.map +1 -0
- xinference/web/ui/build/static/js/main.9cfafbd6.js +3 -0
- xinference/web/ui/build/static/js/{main.eb13fe95.js.LICENSE.txt → main.9cfafbd6.js.LICENSE.txt} +2 -0
- xinference/web/ui/build/static/js/main.9cfafbd6.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/01d6d198156bacbd436c51435edbd4b2cacd47a79db929105eba30f74b67d48d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/10c69dc7a296779fcffedeff9393d832dfcb0013c36824adf623d3c518b801ff.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/59eb25f514afcc4fefd1b309d192b2455f1e0aec68a9de598ca4b2333fe2c774.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/68bede6d95bb5ef0b35bbb3ec5b8c937eaf6862c6cdbddb5ef222a7776aaf336.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/77d50223f3e734d4485cca538cb098a8c3a7a0a1a9f01f58cdda3af42fe1adf5.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a56d5a642409a84988891089c98ca28ad0546432dfbae8aaa51bc5a280e1cdd2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d9ff696a3e3471f01b46c63d18af32e491eb5dc0e43cb30202c96871466df57f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +1 -0
- xinference/web/ui/node_modules/.package-lock.json +37 -0
- xinference/web/ui/node_modules/a-sync-waterfall/package.json +21 -0
- xinference/web/ui/node_modules/nunjucks/node_modules/commander/package.json +48 -0
- xinference/web/ui/node_modules/nunjucks/package.json +112 -0
- xinference/web/ui/package-lock.json +38 -0
- xinference/web/ui/package.json +1 -0
- {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/METADATA +8 -8
- {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/RECORD +141 -87
- xinference/model/llm/transformers/llama_2.py +0 -108
- xinference/web/ui/build/static/css/main.4bafd904.css.map +0 -1
- xinference/web/ui/build/static/js/main.eb13fe95.js +0 -3
- xinference/web/ui/build/static/js/main.eb13fe95.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0b11a5339468c13b2d31ac085e7effe4303259b2071abd46a0a8eb8529233a5e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/213b5913e164773c2b0567455377765715f5f07225fbac77ad8e1e9dc9648a47.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5c26a23b5eacf5b752a08531577ae3840bb247745ef9a39583dc2d05ba93a82a.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/978b57d1a04a701bc3fcfebc511f5f274eed6ed7eade67f6fb76c27d5fd9ecc8.json +0 -1
- {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/LICENSE +0 -0
- {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/WHEEL +0 -0
- {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/entry_points.txt +0 -0
- {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/top_level.txt +0 -0
xinference/model/audio/cosyvoice.py
CHANGED

@@ -53,7 +53,82 @@ class CosyVoiceModel:
 
         from cosyvoice.cli.cosyvoice import CosyVoice
 
-        self._model = CosyVoice(
+        self._model = CosyVoice(
+            self._model_path, load_jit=self._kwargs.get("load_jit", False)
+        )
+
+    def _speech_handle(
+        self,
+        stream,
+        input,
+        instruct_text,
+        prompt_speech,
+        prompt_text,
+        voice,
+        response_format,
+    ):
+        if prompt_speech:
+            from cosyvoice.utils.file_utils import load_wav
+
+            with io.BytesIO(prompt_speech) as prompt_speech_io:
+                prompt_speech_16k = load_wav(prompt_speech_io, 16000)
+
+                if prompt_text:
+                    logger.info("CosyVoice inference_zero_shot")
+                    output = self._model.inference_zero_shot(
+                        input, prompt_text, prompt_speech_16k, stream=stream
+                    )
+                else:
+                    logger.info("CosyVoice inference_cross_lingual")
+                    output = self._model.inference_cross_lingual(
+                        input, prompt_speech_16k, stream=stream
+                    )
+        else:
+            available_speakers = self._model.list_avaliable_spks()
+            if not voice:
+                voice = available_speakers[0]
+            else:
+                assert (
+                    voice in available_speakers
+                ), f"Invalid voice {voice}, CosyVoice available speakers: {available_speakers}"
+            if instruct_text:
+                logger.info("CosyVoice inference_instruct")
+                output = self._model.inference_instruct(
+                    input, voice, instruct_text=instruct_text, stream=stream
+                )
+            else:
+                logger.info("CosyVoice inference_sft")
+                output = self._model.inference_sft(input, voice, stream=stream)
+
+        import torch
+        import torchaudio
+
+        def _generator_stream():
+            with BytesIO() as out:
+                writer = torchaudio.io.StreamWriter(out, format=response_format)
+                writer.add_audio_stream(sample_rate=22050, num_channels=1)
+                i = 0
+                last_pos = 0
+                with writer.open():
+                    for chunk in output:
+                        chunk = chunk["tts_speech"]
+                        trans_chunk = torch.transpose(chunk, 0, 1)
+                        writer.write_audio_chunk(i, trans_chunk)
+                        new_last_pos = out.tell()
+                        if new_last_pos != last_pos:
+                            out.seek(last_pos)
+                            encoded_bytes = out.read()
+                            yield encoded_bytes
+                            last_pos = new_last_pos
+
+        def _generator_block():
+            chunk = next(output)
+            assert isinstance(chunk, dict), "Expected data to be of type dict"
+            with BytesIO() as out:
+                torchaudio.save(out, chunk["tts_speech"], 22050, format=response_format)
+                return out.getvalue()
+
+        return _generator_stream() if stream else _generator_block()
 
     def speech(
         self,
@@ -64,12 +139,6 @@ class CosyVoiceModel:
         stream: bool = False,
         **kwargs,
     ):
-        if stream:
-            raise Exception("CosyVoiceModel does not support stream.")
-
-        import torchaudio
-        from cosyvoice.utils.file_utils import load_wav
-
         prompt_speech: Optional[bytes] = kwargs.pop("prompt_speech", None)
         prompt_text: Optional[str] = kwargs.pop("prompt_text", None)
         instruct_text: Optional[str] = kwargs.pop("instruct_text", None)
@@ -103,39 +172,15 @@ class CosyVoiceModel:
         ), "CosyVoice model does not support instruct_text"
 
         assert self._model is not None
+
         set_all_random_seed(seed)
-        if prompt_speech:
-            assert not voice, "voice can't be set with prompt speech."
-            with io.BytesIO(prompt_speech) as prompt_speech_io:
-                prompt_speech_16k = load_wav(prompt_speech_io, 16000)
-                if prompt_text:
-                    logger.info("CosyVoice inference_zero_shot")
-                    output = self._model.inference_zero_shot(
-                        input, prompt_text, prompt_speech_16k
-                    )
-                else:
-                    logger.info("CosyVoice inference_cross_lingual")
-                    output = self._model.inference_cross_lingual(
-                        input, prompt_speech_16k
-                    )
-        else:
-            available_speakers = self._model.list_avaliable_spks()
-            if not voice:
-                voice = available_speakers[0]
-            else:
-                assert (
-                    voice in available_speakers
-                ), f"Invalid voice {voice}, CosyVoice available speakers: {available_speakers}"
-            if instruct_text:
-                logger.info("CosyVoice inference_instruct")
-                output = self._model.inference_instruct(
-                    input, voice, instruct_text=instruct_text
-                )
-            else:
-                logger.info("CosyVoice inference_sft")
-                output = self._model.inference_sft(input, voice)
 
-
-
-
-
+        return self._speech_handle(
+            stream,
+            input,
+            instruct_text,
+            prompt_speech,
+            prompt_text,
+            voice,
+            response_format,
+        )
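For reference, a hedged usage sketch (not part of the diff): with this change, stream=True on CosyVoice speech no longer raises but yields encoded audio chunks via the new _generator_stream() path. Everything other than the parameter names visible in the diff (the model instance, the voice value, the return type of the streaming call) is an assumption.

# Hedged sketch of the new call path, not the library's documented API.
audio_iter = cosyvoice_model.speech(          # cosyvoice_model: a loaded CosyVoiceModel (assumption)
    input="你好，世界",
    voice="中文女",                            # must be one of list_avaliable_spks(); value is an assumption
    response_format="mp3",
    stream=True,                               # now supported: returns a generator of bytes chunks
)
with open("hello.mp3", "wb") as f:
    for chunk in audio_iter:                   # each chunk is a slice of the encoded stream
        f.write(chunk)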
xinference/model/embedding/core.py
CHANGED

@@ -19,6 +19,7 @@ from collections import defaultdict
 from typing import Dict, List, Literal, Optional, Tuple, Union, no_type_check
 
 import numpy as np
+import torch
 
 from ...device_utils import empty_cache
 from ...types import Embedding, EmbeddingData, EmbeddingUsage
@@ -34,7 +35,11 @@ EMBEDDING_MODEL_DESCRIPTIONS: Dict[str, List[Dict]] = defaultdict(list)
 EMBEDDING_EMPTY_CACHE_COUNT = int(
     os.getenv("XINFERENCE_EMBEDDING_EMPTY_CACHE_COUNT", "10")
 )
+EMBEDDING_EMPTY_CACHE_TOKENS = int(
+    os.getenv("XINFERENCE_EMBEDDING_EMPTY_CACHE_TOKENS", "8192")
+)
 assert EMBEDDING_EMPTY_CACHE_COUNT > 0
+assert EMBEDDING_EMPTY_CACHE_TOKENS > 0
 
 
 def get_embedding_model_descriptions():
@@ -149,6 +154,25 @@ class EmbeddingModel:
             def to(self, *args, **kwargs):
                 pass
 
+        torch_dtype = None
+        if torch_dtype_str := self._kwargs.get("torch_dtype"):
+            try:
+                torch_dtype = getattr(torch, torch_dtype_str)
+                if torch_dtype not in [
+                    torch.float16,
+                    torch.float32,
+                    torch.bfloat16,
+                ]:
+                    logger.warning(
+                        f"Load embedding model with unsupported torch dtype : {torch_dtype_str}. Using default torch dtype: fp32."
+                    )
+                    torch_dtype = torch.float32
+            except AttributeError:
+                logger.warning(
+                    f"Load embedding model with unknown torch dtype '{torch_dtype_str}'. Using default torch dtype: fp32."
+                )
+                torch_dtype = torch.float32
+
         from ..utils import patch_trust_remote_code
 
         patch_trust_remote_code()
@@ -156,42 +180,21 @@ class EmbeddingModel:
             "gte" in self._model_spec.model_name.lower()
             and "qwen2" in self._model_spec.model_name.lower()
         ):
-
-
-
-            if torch_dtype_str is not None:
-                try:
-                    torch_dtype = getattr(torch, torch_dtype_str)
-                    if torch_dtype not in [
-                        torch.float16,
-                        torch.float32,
-                        torch.bfloat16,
-                    ]:
-                        logger.warning(
-                            f"Load embedding model with unsupported torch dtype : {torch_dtype_str}. Using default torch dtype: fp32."
-                        )
-                        torch_dtype = torch.float32
-                except AttributeError:
-                    logger.warning(
-                        f"Load embedding model with unknown torch dtype '{torch_dtype_str}'. Using default torch dtype: fp32."
-                    )
-                    torch_dtype = torch.float32
-            else:
-                torch_dtype = "auto"
+            model_kwargs = {"device_map": "auto"}
+            if torch_dtype:
+                model_kwargs["torch_dtype"] = torch_dtype
             self._model = XSentenceTransformer(
                 self._model_path,
                 device=self._device,
-                model_kwargs=
+                model_kwargs=model_kwargs,
             )
         else:
-
+            model_kwargs = {"torch_dtype": torch_dtype} if torch_dtype else None
+            self._model = SentenceTransformer(
+                self._model_path, device=self._device, model_kwargs=model_kwargs
+            )
 
     def create_embedding(self, sentences: Union[str, List[str]], **kwargs):
-        self._counter += 1
-        if self._counter % EMBEDDING_EMPTY_CACHE_COUNT == 0:
-            logger.debug("Empty embedding cache.")
-            gc.collect()
-            empty_cache()
         from sentence_transformers import SentenceTransformer
 
         kwargs.setdefault("normalize_embeddings", True)
@@ -309,7 +312,9 @@ class EmbeddingModel:
                 features = model.tokenize(sentences_batch)
                 features = batch_to_device(features, device)
                 features.update(extra_features)
-
+                # when batching, the attention mask 1 means there is a token
+                # thus we just sum up it to get the total number of tokens
+                all_token_nums += features["attention_mask"].sum().item()
 
                 with torch.no_grad():
                     out_features = model.forward(features)
@@ -393,13 +398,29 @@ class EmbeddingModel:
         usage = EmbeddingUsage(
             prompt_tokens=all_token_nums, total_tokens=all_token_nums
         )
-
+        result = Embedding(
             object="list",
             model=self._model_uid,
            data=embedding_list,
            usage=usage,
        )
 
+        # clean cache if possible
+        self._counter += 1
+        if (
+            self._counter % EMBEDDING_EMPTY_CACHE_COUNT == 0
+            or all_token_nums >= EMBEDDING_EMPTY_CACHE_TOKENS
+        ):
+            logger.debug(
+                "Empty embedding cache, calling count %s, all_token_nums %s",
+                self._counter,
+                all_token_nums,
+            )
+            gc.collect()
+            empty_cache()
+
+        return result
+
 
 def match_embedding(
     model_name: str,
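For reference, a hedged sketch of the knobs this hunk introduces: a torch_dtype kwarg resolved with getattr(torch, ...) and restricted to float16/float32/bfloat16, plus a token-based cache-emptying threshold alongside the existing call-count one. The launch call and model name below are illustrative assumptions, not taken from the diff.

# Hedged sketch; env vars must be set in the process that hosts the model.
import os

# Cache is now emptied every N create_embedding calls OR whenever a single
# request reaches the token threshold (defaults: 10 calls, 8192 tokens).
os.environ["XINFERENCE_EMBEDDING_EMPTY_CACHE_COUNT"] = "5"
os.environ["XINFERENCE_EMBEDDING_EMPTY_CACHE_TOKENS"] = "4096"

from xinference.client import Client

client = Client("http://127.0.0.1:9997")
model_uid = client.launch_model(
    model_name="gte-Qwen2",     # assumption: any built-in embedding model name
    model_type="embedding",
    torch_dtype="float16",      # forwarded to EmbeddingModel via kwargs
)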
xinference/model/image/stable_diffusion/core.py
CHANGED

@@ -172,10 +172,21 @@ class DiffusionModel:
             "stable diffusion args: %s",
             kwargs,
         )
+        is_padded = kwargs.pop("is_padded", None)
+        origin_size = kwargs.pop("origin_size", None)
+
         model = model if model is not None else self._model
         assert callable(model)
         images = model(**kwargs).images
 
+        # revert padding if padded
+        if is_padded and origin_size:
+            new_images = []
+            x, y = origin_size
+            for img in images:
+                new_images.append(img.crop((0, 0, x, y)))
+            images = new_images
+
         # clean cache
         gc.collect()
         empty_cache()
@@ -198,7 +209,7 @@ class DiffusionModel:
 
         with ThreadPoolExecutor() as executor:
             results = list(map(partial(executor.submit, _gen_base64_image), images))  # type: ignore
-            image_list = [Image(url=None, b64_json=s.result()) for s in results]
+            image_list = [Image(url=None, b64_json=s.result()) for s in results]  # type: ignore
             return ImageList(created=int(time.time()), data=image_list)
         else:
             raise ValueError(f"Unsupported response format: {response_format}")
@@ -265,6 +276,9 @@ class DiffusionModel:
         if padding_image_to_multiple := kwargs.pop("padding_image_to_multiple", None):
             # Model like SD3 image to image requires image's height and width is times of 16
             # padding the image if specified
+            origin_x, origin_y = image.size
+            kwargs["origin_size"] = (origin_x, origin_y)
+            kwargs["is_padded"] = True
             image = self.pad_to_multiple(image, multiple=int(padding_image_to_multiple))
 
         if size:
@@ -318,6 +332,9 @@ class DiffusionModel:
         if padding_image_to_multiple := kwargs.pop("padding_image_to_multiple", None):
             # Model like SD3 inpainting requires image's height and width is times of 16
             # padding the image if specified
+            origin_x, origin_y = image.size
+            kwargs["origin_size"] = (origin_x, origin_y)
+            kwargs["is_padded"] = True
             image = self.pad_to_multiple(image, multiple=int(padding_image_to_multiple))
             mask_image = self.pad_to_multiple(
                 mask_image, multiple=int(padding_image_to_multiple)
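For reference, a minimal standalone sketch of the pad-then-crop round trip these hunks implement for SD3-style image-to-image and inpainting: record the original size, pad the input to a multiple of 16, run the pipeline, then crop the output back. The padding helper below is an illustrative assumption (DiffusionModel.pad_to_multiple may pad differently); the pipeline call itself is elided.

# Hedged sketch using PIL only.
from PIL import Image

def pad_to_multiple(img: Image.Image, multiple: int = 16) -> Image.Image:
    # Round each dimension up to the nearest multiple and paste the image
    # into the top-left corner of a new canvas.
    w, h = img.size
    new_w = (w + multiple - 1) // multiple * multiple
    new_h = (h + multiple - 1) // multiple * multiple
    padded = Image.new(img.mode, (new_w, new_h))
    padded.paste(img, (0, 0))
    return padded

original = Image.open("input.png")
origin_size = original.size                    # recorded as kwargs["origin_size"] above
padded = pad_to_multiple(original, 16)
# ... run the diffusion pipeline on `padded` ...
result = padded                                # placeholder for the pipeline output
restored = result.crop((0, 0, *origin_size))   # same crop as in the diff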
xinference/model/llm/__init__.py
CHANGED

@@ -45,7 +45,6 @@ from .llm_family import (
     LLMFamilyV1,
     LLMSpecV1,
     MLXLLMSpecV1,
-    PromptStyleV1,
     PytorchLLMSpecV1,
     get_cache_status,
     get_user_defined_llm_families,
@@ -141,9 +140,9 @@ def _install():
     from .transformers.glm4v import Glm4VModel
     from .transformers.intern_vl import InternVLChatModel
     from .transformers.internlm2 import Internlm2PytorchChatModel
-    from .transformers.llama_2 import LlamaPytorchChatModel, LlamaPytorchModel
     from .transformers.minicpmv25 import MiniCPMV25Model
     from .transformers.minicpmv26 import MiniCPMV26Model
+    from .transformers.qwen2_vl import Qwen2VLChatModel
     from .transformers.qwen_vl import QwenVLChatModel
     from .transformers.yi_vl import YiVLChatModel
     from .vllm.core import VLLMChatModel, VLLMModel, VLLMVisionModel
@@ -170,11 +169,10 @@ def _install():
     TRANSFORMERS_CLASSES.extend(
         [
             ChatglmPytorchChatModel,
-            LlamaPytorchModel,
-            LlamaPytorchChatModel,
             PytorchChatModel,
             Internlm2PytorchChatModel,
             QwenVLChatModel,
+            Qwen2VLChatModel,
             YiVLChatModel,
             DeepSeekVLChatModel,
             InternVLChatModel,
@@ -204,13 +202,17 @@ def _install():
         model_spec = LLMFamilyV1.parse_obj(json_obj)
         BUILTIN_LLM_FAMILIES.append(model_spec)
 
-        # register
+        # register chat_template
         if "chat" in model_spec.model_ability and isinstance(
-            model_spec.
+            model_spec.chat_template, str
         ):
             # note that the key is the model name,
             # since there are multiple representations of the same prompt style name in json.
-            BUILTIN_LLM_PROMPT_STYLE[model_spec.model_name] =
+            BUILTIN_LLM_PROMPT_STYLE[model_spec.model_name] = {
+                "chat_template": model_spec.chat_template,
+                "stop_token_ids": model_spec.stop_token_ids,
+                "stop": model_spec.stop,
+            }
         # register model family
         if "chat" in model_spec.model_ability:
             BUILTIN_LLM_MODEL_CHAT_FAMILIES.add(model_spec.model_name)
@@ -230,10 +232,14 @@ def _install():
         # if duplicated with huggingface json, keep it as the huggingface style
         if (
             "chat" in model_spec.model_ability
-            and isinstance(model_spec.
+            and isinstance(model_spec.chat_template, str)
             and model_spec.model_name not in BUILTIN_LLM_PROMPT_STYLE
         ):
-            BUILTIN_LLM_PROMPT_STYLE[model_spec.model_name] =
+            BUILTIN_LLM_PROMPT_STYLE[model_spec.model_name] = {
+                "chat_template": model_spec.chat_template,
+                "stop_token_ids": model_spec.stop_token_ids,
+                "stop": model_spec.stop,
+            }
         # register model family
         if "chat" in model_spec.model_ability:
             BUILTIN_LLM_MODEL_CHAT_FAMILIES.add(model_spec.model_name)
@@ -253,10 +259,14 @@ def _install():
         # if duplicated with huggingface json, keep it as the huggingface style
         if (
             "chat" in model_spec.model_ability
-            and isinstance(model_spec.
+            and isinstance(model_spec.chat_template, str)
             and model_spec.model_name not in BUILTIN_LLM_PROMPT_STYLE
         ):
-            BUILTIN_LLM_PROMPT_STYLE[model_spec.model_name] =
+            BUILTIN_LLM_PROMPT_STYLE[model_spec.model_name] = {
+                "chat_template": model_spec.chat_template,
+                "stop_token_ids": model_spec.stop_token_ids,
+                "stop": model_spec.stop,
+            }
         # register model family
         if "chat" in model_spec.model_ability:
             BUILTIN_LLM_MODEL_CHAT_FAMILIES.add(model_spec.model_name)
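For reference, a hedged illustration of the registry shape after this change: each chat-capable built-in family now maps to a plain dict of chat_template, stop_token_ids, and stop, rather than a PromptStyleV1 object. The model name and field values below are made up for the example.

# Illustrative only; not real values from llm_family.json.
BUILTIN_LLM_PROMPT_STYLE["example-chat-model"] = {
    "chat_template": "{% for message in messages %}...{% endfor %}",  # Jinja2 template string
    "stop_token_ids": [2],
    "stop": ["</s>"],
}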
xinference/model/llm/llama_cpp/core.py
CHANGED

@@ -14,12 +14,11 @@
 import logging
 import os
 import time
-from typing import
+from typing import Dict, Iterator, List, Optional, Union
 
 from ....types import (
     ChatCompletion,
     ChatCompletionChunk,
-    ChatCompletionMessage,
     Completion,
     CompletionChunk,
     CompletionUsage,
@@ -181,13 +180,12 @@ class LlamaCppModel(LLM):
            for index, _completion_chunk in enumerate(
                self._llm(prompt=_prompt, **_generate_config)
            ):
+                _completion_chunk["model"] = self.model_uid
                request_id = _completion_chunk["id"]
-
-                if choice["finish_reason"] is not None:
-                    completion_tokens = index
+                completion_tokens = index + 1
                total_tokens = prompt_tokens + completion_tokens
                _completion_chunk["usage"] = CompletionUsage(
-                    prompt_tokens=
+                    prompt_tokens=prompt_tokens,
                    completion_tokens=completion_tokens,
                    total_tokens=total_tokens,
                )
@@ -262,39 +260,26 @@ class LlamaCppChatModel(LlamaCppModel, ChatModelMixin):
         self, generate_config: Optional[LlamaCppGenerateConfig]
     ) -> LlamaCppGenerateConfig:
         generate_config = super()._sanitize_generate_config(generate_config)
-        if self.model_family.
-            generate_config["stop"] = self.model_family.
+        if self.model_family.stop and self.model_family.stop:
+            generate_config["stop"] = self.model_family.stop.copy()
         return generate_config
 
     def chat(
         self,
-
-        system_prompt: Optional[str] = None,
-        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        messages: List[Dict],
         generate_config: Optional[LlamaCppGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-
-        prompt_style = self.model_family.prompt_style.copy()
-        if system_prompt:
-            prompt_style.system_prompt = system_prompt
-
-        chat_history = chat_history or []
-        assert prompt_style is not None
+        model_family = self.model_family.model_family or self.model_family.model_name
         tools = generate_config.pop("tools", []) if generate_config else None
-
+        full_context_kwargs = {}
+        if tools and model_family in QWEN_TOOL_CALL_FAMILY:
+            full_context_kwargs["tools"] = tools
+        assert self.model_family.chat_template is not None
+        full_prompt = self.get_full_context(
+            messages, self.model_family.chat_template, **full_context_kwargs
+        )
 
         generate_config = self._sanitize_generate_config(generate_config)
-        # TODO(codingl2k1): qwen hacky to set stop for function call.
-        model_family = self.model_family.model_family or self.model_family.model_name
-        if tools and model_family in QWEN_TOOL_CALL_FAMILY:
-            stop = generate_config.get("stop")
-            if isinstance(stop, str):
-                generate_config["stop"] = [stop, "Observation:"]
-            elif isinstance(stop, Iterable):
-                assert not isinstance(stop, str)
-                generate_config["stop"] = stop + ["Observation:"]  # type: ignore
-            else:
-                generate_config["stop"] = "Observation:"
 
         stream = generate_config.get("stream", False)
         if stream:
@@ -305,7 +290,5 @@ class LlamaCppChatModel(LlamaCppModel, ChatModelMixin):
             c = self.generate(full_prompt, generate_config)
             assert not isinstance(c, Iterator)
             if tools:
-                return self._tool_calls_completion(
-                    self.model_family, self.model_uid, c, tools
-                )
+                return self._tool_calls_completion(self.model_family, self.model_uid, c)
             return self._to_chat_completion(c)