xinference 0.15.3__py3-none-any.whl → 0.16.0__py3-none-any.whl
- xinference/__init__.py +0 -4
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +29 -2
- xinference/client/restful/restful_client.py +10 -0
- xinference/constants.py +7 -3
- xinference/core/image_interface.py +76 -23
- xinference/core/model.py +158 -46
- xinference/core/progress_tracker.py +187 -0
- xinference/core/scheduler.py +10 -7
- xinference/core/supervisor.py +11 -0
- xinference/core/utils.py +9 -0
- xinference/core/worker.py +1 -0
- xinference/deploy/supervisor.py +4 -0
- xinference/model/__init__.py +4 -0
- xinference/model/audio/chattts.py +2 -1
- xinference/model/audio/core.py +0 -2
- xinference/model/audio/model_spec.json +8 -0
- xinference/model/audio/model_spec_modelscope.json +9 -0
- xinference/model/image/core.py +6 -7
- xinference/model/image/scheduler/__init__.py +13 -0
- xinference/model/image/scheduler/flux.py +533 -0
- xinference/model/image/sdapi.py +35 -4
- xinference/model/image/stable_diffusion/core.py +215 -110
- xinference/model/image/utils.py +39 -3
- xinference/model/llm/__init__.py +2 -0
- xinference/model/llm/llm_family.json +185 -17
- xinference/model/llm/llm_family_modelscope.json +124 -12
- xinference/model/llm/transformers/chatglm.py +104 -0
- xinference/model/llm/transformers/cogvlm2.py +2 -1
- xinference/model/llm/transformers/cogvlm2_video.py +2 -0
- xinference/model/llm/transformers/core.py +43 -113
- xinference/model/llm/transformers/deepseek_v2.py +0 -226
- xinference/model/llm/transformers/deepseek_vl.py +2 -0
- xinference/model/llm/transformers/glm4v.py +2 -1
- xinference/model/llm/transformers/intern_vl.py +2 -0
- xinference/model/llm/transformers/internlm2.py +3 -95
- xinference/model/llm/transformers/minicpmv25.py +2 -0
- xinference/model/llm/transformers/minicpmv26.py +2 -0
- xinference/model/llm/transformers/omnilmm.py +2 -0
- xinference/model/llm/transformers/opt.py +68 -0
- xinference/model/llm/transformers/qwen2_audio.py +11 -4
- xinference/model/llm/transformers/qwen2_vl.py +2 -28
- xinference/model/llm/transformers/qwen_vl.py +2 -1
- xinference/model/llm/transformers/utils.py +36 -283
- xinference/model/llm/transformers/yi_vl.py +2 -0
- xinference/model/llm/utils.py +60 -16
- xinference/model/llm/vllm/core.py +68 -9
- xinference/model/llm/vllm/utils.py +0 -1
- xinference/model/utils.py +7 -4
- xinference/model/video/core.py +0 -2
- xinference/utils.py +2 -3
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/{main.e51a356d.js → main.f7da0140.js} +3 -3
- xinference/web/ui/build/static/js/main.f7da0140.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/331312668fa8bd3d7401818f4a25fa98135d7f61371cd6bfff78b18cf4fbdd92.json +1 -0
- {xinference-0.15.3.dist-info → xinference-0.16.0.dist-info}/METADATA +38 -6
- {xinference-0.15.3.dist-info → xinference-0.16.0.dist-info}/RECORD +63 -59
- xinference/web/ui/build/static/js/main.e51a356d.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/4385c1095eefbff0a8ec3b2964ba6e5a66a05ab31be721483ca2f43e2a91f6ff.json +0 -1
- /xinference/web/ui/build/static/js/{main.e51a356d.js.LICENSE.txt → main.f7da0140.js.LICENSE.txt} +0 -0
- {xinference-0.15.3.dist-info → xinference-0.16.0.dist-info}/LICENSE +0 -0
- {xinference-0.15.3.dist-info → xinference-0.16.0.dist-info}/WHEEL +0 -0
- {xinference-0.15.3.dist-info → xinference-0.16.0.dist-info}/entry_points.txt +0 -0
- {xinference-0.15.3.dist-info → xinference-0.16.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/transformers/opt.py (new file)

@@ -0,0 +1,68 @@
+# Copyright 2022-2024 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from builtins import classmethod
+from typing import List, Optional
+
+from ....core.scheduler import InferenceRequest
+from ....types import LoRA
+from ..llm_family import LLMFamilyV1, LLMSpecV1
+from .core import PytorchModel, PytorchModelConfig
+
+
+class OptPytorchModel(PytorchModel):
+    def __init__(
+        self,
+        model_uid: str,
+        model_family: "LLMFamilyV1",
+        model_spec: "LLMSpecV1",
+        quantization: str,
+        model_path: str,
+        pytorch_model_config: Optional[PytorchModelConfig] = None,
+        peft_model: Optional[List[LoRA]] = None,
+    ):
+        super().__init__(
+            model_uid,
+            model_family,
+            model_spec,
+            quantization,
+            model_path,
+            pytorch_model_config=pytorch_model_config,
+            peft_model=peft_model,
+        )
+
+    @classmethod
+    def match(
+        cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        if llm_spec.model_format != "pytorch":
+            return False
+        model_family = llm_family.model_family or llm_family.model_name
+        if model_family != "opt":
+            return False
+        return True
+
+    def build_prefill_position_ids(
+        self, batch_size: int, seq_length: int, reqs: List[InferenceRequest]
+    ):
+        """
+        Mainly for UT.
+        Transformers code in `main` branch supports `position_ids` parameter (https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py#L1076),
+        while in release branch, it doesn't (https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/opt/modeling_opt.py#L886).
+        """
+        return None
+
+    def build_decode_position_ids(
+        self, batch_size: int, seq_length: int, reqs: List[InferenceRequest]
+    ):
+        return None
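The docstring on `build_prefill_position_ids` explains why both hooks return `None`: released transformers builds of OPT do not accept a `position_ids` argument, while the `main` branch does. The sketch below is illustrative only — the `run_prefill` helper and its arguments are not part of xinference — and shows how a batching caller can consume such hooks safely, forwarding `position_ids` only when a subclass actually provides one:

```python
import torch


def run_prefill(pytorch_model, hf_model, input_ids, attention_mask, reqs):
    """Hypothetical prefill step: forward position_ids only when provided.

    OptPytorchModel returns None from build_prefill_position_ids, so the
    kwarg is omitted and transformers releases whose OPTForCausalLM does not
    accept position_ids keep working.
    """
    kwargs = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "use_cache": True,
    }
    position_ids = pytorch_model.build_prefill_position_ids(
        input_ids.shape[0], input_ids.shape[1], reqs
    )
    if position_ids is not None:
        kwargs["position_ids"] = torch.as_tensor(position_ids, device=input_ids.device)
    return hf_model(**kwargs)
```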
xinference/model/llm/transformers/qwen2_audio.py

@@ -14,16 +14,22 @@
 import logging
 import uuid
 from io import BytesIO
-from typing import
+from typing import Iterator, List, Optional, Union
 from urllib.request import urlopen

 import numpy as np

 from ....model.utils import select_device
-from ....types import
+from ....types import (
+    ChatCompletion,
+    ChatCompletionChunk,
+    ChatCompletionMessage,
+    CompletionChunk,
+)
 from ..llm_family import LLMFamilyV1, LLMSpecV1
 from ..utils import generate_chat_completion, generate_completion_chunk
 from .core import PytorchChatModel, PytorchGenerateConfig
+from .utils import cache_clean

 logger = logging.getLogger(__name__)

@@ -68,7 +74,7 @@ class Qwen2AudioChatModel(PytorchChatModel):

     def _transform_messages(
         self,
-        messages: List[
+        messages: List[ChatCompletionMessage],
     ):
         import librosa

@@ -89,9 +95,10 @@ class Qwen2AudioChatModel(PytorchChatModel):

         return text, audios

+    @cache_clean
     def chat(
         self,
-        messages: List[
+        messages: List[ChatCompletionMessage],
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         text, audios = self._transform_messages(messages)
xinference/model/llm/transformers/qwen2_vl.py

@@ -27,6 +27,7 @@ from ....types import (
 from ..llm_family import LLMFamilyV1, LLMSpecV1
 from ..utils import generate_chat_completion, generate_completion_chunk
 from .core import PytorchChatModel, PytorchGenerateConfig
+from .utils import cache_clean

 logger = logging.getLogger(__name__)

@@ -75,34 +76,7 @@ class Qwen2VLChatModel(PytorchChatModel):
             self.model_path, device_map=device, trust_remote_code=True
         ).eval()

-    def _transform_messages(
-        self,
-        messages: List[ChatCompletionMessage],
-    ):
-        transformed_messages = []
-        for msg in messages:
-            new_content = []
-            role = msg["role"]
-            content = msg["content"]
-            if isinstance(content, str):
-                new_content.append({"type": "text", "text": content})
-            elif isinstance(content, List):
-                for item in content:  # type: ignore
-                    if "text" in item:
-                        new_content.append({"type": "text", "text": item["text"]})
-                    elif "image_url" in item:
-                        new_content.append(
-                            {"type": "image", "image": item["image_url"]["url"]}
-                        )
-                    elif "video_url" in item:
-                        new_content.append(
-                            {"type": "video", "video": item["video_url"]["url"]}
-                        )
-            new_message = {"role": role, "content": new_content}
-            transformed_messages.append(new_message)
-
-        return transformed_messages
-
+    @cache_clean
     def chat(
         self,
         messages: List[ChatCompletionMessage],  # type: ignore
xinference/model/llm/transformers/qwen_vl.py

@@ -28,7 +28,7 @@ from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk
 from ..llm_family import LLMFamilyV1, LLMSpecV1
 from ..utils import generate_chat_completion, generate_completion_chunk
 from .core import PytorchChatModel, PytorchGenerateConfig
-from .utils import pad_prefill_tokens
+from .utils import cache_clean, pad_prefill_tokens

 logger = logging.getLogger(__name__)

@@ -137,6 +137,7 @@ class QwenVLChatModel(PytorchChatModel):
         prompt = self._message_content_to_qwen(messages[-1]["content"])
         return prompt, qwen_history

+    @cache_clean
     def chat(
         self,
         messages: List[Dict],
xinference/model/llm/transformers/utils.py

@@ -12,12 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import
+import asyncio
+import functools
 import logging
 import os
 import time
-import
-from typing import TYPE_CHECKING, Dict, Iterable, Iterator, List, Optional, Tuple
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple

 import torch
 from transformers.cache_utils import DynamicCache
@@ -45,20 +45,6 @@ if TYPE_CHECKING:
 logger = logging.getLogger(__name__)


-def is_sentence_complete(output: str):
-    """Check whether the output is a complete sentence."""
-    end_symbols = (".", "?", "!", "...", "。", "?", "!", "…", '"', "'", "”")
-    return output.endswith(end_symbols)
-
-
-def is_partial_stop(output: str, stop_str: str):
-    """Check whether the output contains a partial stop str."""
-    for i in range(0, min(len(output), len(stop_str))):
-        if stop_str.startswith(output[-i:]):
-            return True
-    return False
-
-
 def get_context_length(config) -> int:
     """Get the context length of a model from a huggingface model config."""
     if (
@@ -98,272 +84,6 @@ def prepare_logits_processor(
     return processor_list


-@torch.inference_mode()
-def generate_stream(
-    model_uid,
-    model,
-    tokenizer,
-    prompt,
-    device,
-    generate_config,
-    judge_sent_end=False,
-) -> Iterator[Tuple[CompletionChunk, CompletionUsage]]:
-    context_len = get_context_length(model.config)
-    stream_interval = generate_config.get("stream_interval", 2)
-    stream = generate_config.get("stream", False)
-    stream_options = generate_config.pop("stream_options", None)
-    include_usage = (
-        stream_options["include_usage"] if isinstance(stream_options, dict) else False
-    )
-
-    len_prompt = len(prompt)
-
-    temperature = float(generate_config.get("temperature", 1.0))
-    repetition_penalty = float(generate_config.get("repetition_penalty", 1.0))
-    top_p = float(generate_config.get("top_p", 1.0))
-    top_k = int(generate_config.get("top_k", -1))  # -1 means disable
-    max_new_tokens = int(generate_config.get("max_tokens", max_tokens_field.default))
-    echo = bool(generate_config.get("echo", False))
-    stop_str = generate_config.get("stop", None)
-    stop_token_ids = generate_config.get("stop_token_ids", None) or []
-    stop_token_ids.append(tokenizer.eos_token_id)
-    chunk_id = str(uuid.uuid4())
-
-    logits_processor = prepare_logits_processor(
-        temperature, repetition_penalty, top_p, top_k
-    )
-
-    if ".modeling_qwen." in str(type(model)).lower():
-        # TODO: hacky
-        input_ids = tokenizer(prompt, allowed_special="all").input_ids
-    else:
-        input_ids = tokenizer(prompt).input_ids
-    output_ids = list(input_ids)
-
-    if model.config.is_encoder_decoder:
-        max_src_len = context_len
-    else:
-        max_src_len = context_len - max_new_tokens - 8
-        if max_src_len < 0:
-            raise ValueError("Max tokens exceeds model's max length")
-
-    input_ids = input_ids[-max_src_len:]
-    input_echo_len = len(input_ids)
-
-    if model.config.is_encoder_decoder:
-        encoder_output = model.encoder(
-            input_ids=torch.as_tensor([input_ids], device=device)
-        )[0]
-        start_ids = torch.as_tensor(
-            [[model.generation_config.decoder_start_token_id]],
-            dtype=torch.int64,
-            device=device,
-        )
-
-    start = time.time()
-    past_key_values = out = None
-    sent_interrupt = False
-    token = None
-    last_output_length = 0
-    for i in range(max_new_tokens):
-        if i == 0:
-            if model.config.is_encoder_decoder:
-                out = model.decoder(
-                    input_ids=start_ids,
-                    encoder_hidden_states=encoder_output,
-                    use_cache=True,
-                )
-                logits = model.lm_head(out[0])
-            else:
-                out = model(torch.as_tensor([input_ids], device=device), use_cache=True)
-                logits = out.logits
-            past_key_values = out.past_key_values
-        else:
-            if model.config.is_encoder_decoder:
-                out = model.decoder(
-                    input_ids=torch.as_tensor(
-                        [[token] if not sent_interrupt else output_ids], device=device
-                    ),
-                    encoder_hidden_states=encoder_output,
-                    use_cache=True,
-                    past_key_values=past_key_values if not sent_interrupt else None,
-                )
-                sent_interrupt = False
-
-                logits = model.lm_head(out[0])
-            else:
-                out = model(
-                    input_ids=torch.as_tensor(
-                        [[token] if not sent_interrupt else output_ids], device=device
-                    ),
-                    use_cache=True,
-                    past_key_values=past_key_values if not sent_interrupt else None,
-                )
-                sent_interrupt = False
-                logits = out.logits
-            past_key_values = out.past_key_values
-
-        if logits_processor:
-            if repetition_penalty > 1.0:
-                tmp_output_ids = torch.as_tensor([output_ids], device=logits.device)
-            else:
-                tmp_output_ids = None
-            last_token_logits = logits_processor(tmp_output_ids, logits[:, -1, :])[0]
-        else:
-            last_token_logits = logits[0, -1, :]
-
-        if device == "mps":
-            # Switch to CPU by avoiding some bugs in mps backend.
-            last_token_logits = last_token_logits.float().to("cpu")
-
-        if temperature < 1e-5 or top_p < 1e-8:  # greedy
-            _, indices = torch.topk(last_token_logits, 2)
-            tokens = [int(index) for index in indices.tolist()]
-        else:
-            probs = torch.softmax(last_token_logits, dim=-1)
-            indices = torch.multinomial(probs, num_samples=2)
-            tokens = [int(token) for token in indices.tolist()]
-        token = tokens[0]
-        output_ids.append(token)
-
-        if token in stop_token_ids:
-            stopped = True
-        else:
-            stopped = False
-
-        if i % stream_interval == 0 or i == max_new_tokens - 1 or stopped:
-            if echo:
-                tmp_output_ids = output_ids
-                rfind_start = len_prompt
-            else:
-                tmp_output_ids = output_ids[input_echo_len:]
-                rfind_start = 0
-
-            output = tokenizer.decode(
-                tmp_output_ids,
-                skip_special_tokens=True,
-                spaces_between_special_tokens=False,
-                clean_up_tokenization_spaces=True,
-            )
-
-            # TODO: For the issue of incomplete sentences interrupting output, apply a patch and others can also modify it to a more elegant way
-            if judge_sent_end and stopped and not is_sentence_complete(output):
-                if len(tokens) > 1:
-                    token = tokens[1]
-                    output_ids[-1] = token
-                else:
-                    output_ids.pop()
-                stopped = False
-                sent_interrupt = True
-
-            partially_stopped = False
-            if stop_str:
-                if isinstance(stop_str, str):
-                    pos = output.rfind(stop_str, rfind_start)
-                    if pos != -1:
-                        output = output[:pos]
-                        stopped = True
-                    else:
-                        partially_stopped = is_partial_stop(output, stop_str)
-                elif isinstance(stop_str, Iterable):
-                    for each_stop in stop_str:
-                        pos = output.rfind(each_stop, rfind_start)
-                        if pos != -1:
-                            output = output[:pos]
-                            stopped = True
-                            break
-                        else:
-                            partially_stopped = is_partial_stop(output, each_stop)
-                            if partially_stopped:
-                                break
-                else:
-                    raise ValueError("Invalid stop field type.")
-
-            if stream:
-                output = output.strip("�")
-                tmp_output_length = len(output)
-                output = output[last_output_length:]
-                last_output_length = tmp_output_length
-
-            # prevent yielding partial stop sequence
-            if not partially_stopped:
-                completion_choice = CompletionChoice(
-                    text=output, index=0, logprobs=None, finish_reason=None
-                )
-                completion_chunk = CompletionChunk(
-                    id=chunk_id,
-                    object="text_completion",
-                    created=int(time.time()),
-                    model=model_uid,
-                    choices=[completion_choice],
-                )
-                completion_usage = CompletionUsage(
-                    prompt_tokens=input_echo_len,
-                    completion_tokens=i,
-                    total_tokens=(input_echo_len + i),
-                )
-
-                yield completion_chunk, completion_usage
-
-        if stopped:
-            break
-
-    elapsed_time = time.time() - start
-    logger.info(f"Average generation speed: {i / elapsed_time:.2f} tokens/s.")
-
-    # finish stream event, which contains finish reason
-    if stopped:
-        finish_reason = "stop"
-    elif i == max_new_tokens - 1:
-        finish_reason = "length"
-    else:
-        finish_reason = None
-
-    if stream:
-        completion_choice = CompletionChoice(
-            text=output, index=0, logprobs=None, finish_reason=finish_reason
-        )
-    else:
-        completion_choice = CompletionChoice(
-            text=output, index=0, logprobs=None, finish_reason=finish_reason
-        )
-
-    completion_chunk = CompletionChunk(
-        id=chunk_id,
-        object="text_completion",
-        created=int(time.time()),
-        model=model_uid,
-        choices=[completion_choice],
-    )
-    completion_usage = CompletionUsage(
-        prompt_tokens=input_echo_len,
-        completion_tokens=i,
-        total_tokens=(input_echo_len + i),
-    )
-
-    yield completion_chunk, completion_usage
-
-    if include_usage:
-        completion_chunk = CompletionChunk(
-            id=chunk_id,
-            object="text_completion",
-            created=int(time.time()),
-            model=model_uid,
-            choices=[],
-        )
-        completion_usage = CompletionUsage(
-            prompt_tokens=input_echo_len,
-            completion_tokens=i,
-            total_tokens=(input_echo_len + i),
-        )
-        yield completion_chunk, completion_usage
-
-    # clean
-    del past_key_values, out
-    gc.collect()
-    empty_cache()
-
-
 def _get_token_from_logits(
     req: InferenceRequest, i: int, logits, temperature, repetition_penalty, top_p, top_k
 ):
@@ -678,6 +398,7 @@ def _batch_inference_one_step_internal(
                 output = output.strip("�")
                 output = output[r.last_output_length :]
                 r.last_output_length += len(output)
+                r.outputs.append(output)

                 completion_chunk = generate_completion_chunk(
                     chunk_text=output,
@@ -702,6 +423,7 @@ def _batch_inference_one_step_internal(
                 )
                 r.completion.append(completion_chunk)
                 r.completion.append(eos_flag)
+                r.outputs.append(eos_flag)

             # last round, handle stream result
             # append usage information when enable `include_usage` for OPENAI API compatibility
@@ -776,3 +498,34 @@ def batch_inference_one_step(
         for r in req_list:
             r.stopped = True
             r.error_msg = str(e)
+
+
+def cache_clean(fn):
+    @functools.wraps(fn)
+    async def _async_wrapper(self, *args, **kwargs):
+        import gc
+
+        from ....device_utils import empty_cache
+
+        result = await fn(self, *args, **kwargs)
+
+        gc.collect()
+        empty_cache()
+        return result
+
+    @functools.wraps(fn)
+    def _wrapper(self, *args, **kwargs):
+        import gc
+
+        from ....device_utils import empty_cache
+
+        result = fn(self, *args, **kwargs)
+
+        gc.collect()
+        empty_cache()
+        return result
+
+    if asyncio.iscoroutinefunction(fn):
+        return _async_wrapper
+    else:
+        return _wrapper
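`cache_clean` wraps a model method, runs it, then triggers `gc.collect()` plus a device cache flush; `asyncio.iscoroutinefunction` decides whether the async or sync wrapper is returned, which is why the same decorator works on both coroutine and plain `chat` implementations elsewhere in this diff. Below is a minimal, self-contained sketch of that behavior, with a no-op stand-in for `xinference.device_utils.empty_cache` and an illustrative `DemoChatModel` (neither is part of xinference):

```python
import asyncio
import functools
import gc


def empty_cache():
    # Stand-in for xinference.device_utils.empty_cache, which releases cached
    # accelerator memory (e.g. torch.cuda.empty_cache()) on the active device.
    pass


def cache_clean(fn):
    # Same shape as the decorator in the diff: pick the wrapper that matches
    # the wrapped callable, run it, then collect garbage and empty the cache.
    @functools.wraps(fn)
    async def _async_wrapper(self, *args, **kwargs):
        result = await fn(self, *args, **kwargs)
        gc.collect()
        empty_cache()
        return result

    @functools.wraps(fn)
    def _wrapper(self, *args, **kwargs):
        result = fn(self, *args, **kwargs)
        gc.collect()
        empty_cache()
        return result

    return _async_wrapper if asyncio.iscoroutinefunction(fn) else _wrapper


class DemoChatModel:
    @cache_clean
    def chat(self, messages):
        return {"role": "assistant", "content": f"echo: {messages[-1]['content']}"}


print(DemoChatModel().chat([{"role": "user", "content": "hi"}]))
# -> {'role': 'assistant', 'content': 'echo: hi'}; gc and cache flush ran afterwards
```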
xinference/model/llm/transformers/yi_vl.py

@@ -29,6 +29,7 @@ from ..utils import (
     parse_messages,
 )
 from .core import PytorchChatModel, PytorchGenerateConfig
+from .utils import cache_clean

 logger = logging.getLogger(__name__)

@@ -99,6 +100,7 @@ class YiVLChatModel(PytorchChatModel):
             raise RuntimeError("Only one image per message is supported by Yi VL.")
         return content

+    @cache_clean
     def chat(
         self,
         messages: List[Dict],