xinference 0.15.4__py3-none-any.whl → 0.16.1__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.

Potentially problematic release.


This version of xinference might be problematic.

Files changed (67)
  1. xinference/__init__.py +0 -4
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +48 -0
  4. xinference/client/restful/restful_client.py +19 -0
  5. xinference/constants.py +4 -4
  6. xinference/core/chat_interface.py +5 -1
  7. xinference/core/image_interface.py +5 -1
  8. xinference/core/model.py +195 -34
  9. xinference/core/scheduler.py +10 -7
  10. xinference/core/utils.py +9 -0
  11. xinference/model/__init__.py +4 -0
  12. xinference/model/audio/chattts.py +25 -14
  13. xinference/model/audio/model_spec.json +1 -1
  14. xinference/model/audio/model_spec_modelscope.json +1 -1
  15. xinference/model/embedding/model_spec.json +1 -1
  16. xinference/model/image/core.py +59 -4
  17. xinference/model/image/model_spec.json +24 -3
  18. xinference/model/image/model_spec_modelscope.json +25 -3
  19. xinference/model/image/ocr/__init__.py +13 -0
  20. xinference/model/image/ocr/got_ocr2.py +76 -0
  21. xinference/model/image/scheduler/__init__.py +13 -0
  22. xinference/model/image/scheduler/flux.py +533 -0
  23. xinference/model/image/stable_diffusion/core.py +8 -34
  24. xinference/model/image/stable_diffusion/mlx.py +221 -0
  25. xinference/model/image/utils.py +39 -3
  26. xinference/model/llm/__init__.py +2 -0
  27. xinference/model/llm/llm_family.json +178 -1
  28. xinference/model/llm/llm_family_modelscope.json +119 -0
  29. xinference/model/llm/transformers/chatglm.py +104 -0
  30. xinference/model/llm/transformers/core.py +37 -111
  31. xinference/model/llm/transformers/deepseek_v2.py +0 -226
  32. xinference/model/llm/transformers/internlm2.py +3 -95
  33. xinference/model/llm/transformers/opt.py +68 -0
  34. xinference/model/llm/transformers/utils.py +4 -284
  35. xinference/model/llm/utils.py +2 -2
  36. xinference/model/llm/vllm/core.py +16 -1
  37. xinference/thirdparty/mlx/__init__.py +13 -0
  38. xinference/thirdparty/mlx/flux/__init__.py +15 -0
  39. xinference/thirdparty/mlx/flux/autoencoder.py +357 -0
  40. xinference/thirdparty/mlx/flux/clip.py +154 -0
  41. xinference/thirdparty/mlx/flux/datasets.py +75 -0
  42. xinference/thirdparty/mlx/flux/flux.py +247 -0
  43. xinference/thirdparty/mlx/flux/layers.py +302 -0
  44. xinference/thirdparty/mlx/flux/lora.py +76 -0
  45. xinference/thirdparty/mlx/flux/model.py +134 -0
  46. xinference/thirdparty/mlx/flux/sampler.py +56 -0
  47. xinference/thirdparty/mlx/flux/t5.py +244 -0
  48. xinference/thirdparty/mlx/flux/tokenizers.py +185 -0
  49. xinference/thirdparty/mlx/flux/trainer.py +98 -0
  50. xinference/thirdparty/mlx/flux/utils.py +179 -0
  51. xinference/utils.py +2 -3
  52. xinference/web/ui/build/asset-manifest.json +3 -3
  53. xinference/web/ui/build/index.html +1 -1
  54. xinference/web/ui/build/static/js/{main.e51a356d.js → main.b76aeeb7.js} +3 -3
  55. xinference/web/ui/build/static/js/main.b76aeeb7.js.map +1 -0
  56. xinference/web/ui/node_modules/.cache/babel-loader/32ea2c04cf0bba2761b4883d2c40cc259952c94d2d6bb774e510963ca37aac0a.json +1 -0
  57. xinference/web/ui/node_modules/.cache/babel-loader/331312668fa8bd3d7401818f4a25fa98135d7f61371cd6bfff78b18cf4fbdd92.json +1 -0
  58. {xinference-0.15.4.dist-info → xinference-0.16.1.dist-info}/METADATA +49 -10
  59. {xinference-0.15.4.dist-info → xinference-0.16.1.dist-info}/RECORD +64 -44
  60. xinference/web/ui/build/static/js/main.e51a356d.js.map +0 -1
  61. xinference/web/ui/node_modules/.cache/babel-loader/070d8c6b3b0f3485c6d3885f0b6bbfdf9643e088a468acbd5d596f2396071c16.json +0 -1
  62. xinference/web/ui/node_modules/.cache/babel-loader/4385c1095eefbff0a8ec3b2964ba6e5a66a05ab31be721483ca2f43e2a91f6ff.json +0 -1
  63. /xinference/web/ui/build/static/js/{main.e51a356d.js.LICENSE.txt → main.b76aeeb7.js.LICENSE.txt} +0 -0
  64. {xinference-0.15.4.dist-info → xinference-0.16.1.dist-info}/LICENSE +0 -0
  65. {xinference-0.15.4.dist-info → xinference-0.16.1.dist-info}/WHEEL +0 -0
  66. {xinference-0.15.4.dist-info → xinference-0.16.1.dist-info}/entry_points.txt +0 -0
  67. {xinference-0.15.4.dist-info → xinference-0.16.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/transformers/opt.py (new file)
@@ -0,0 +1,68 @@
+ # Copyright 2022-2024 XProbe Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from builtins import classmethod
+ from typing import List, Optional
+
+ from ....core.scheduler import InferenceRequest
+ from ....types import LoRA
+ from ..llm_family import LLMFamilyV1, LLMSpecV1
+ from .core import PytorchModel, PytorchModelConfig
+
+
+ class OptPytorchModel(PytorchModel):
+     def __init__(
+         self,
+         model_uid: str,
+         model_family: "LLMFamilyV1",
+         model_spec: "LLMSpecV1",
+         quantization: str,
+         model_path: str,
+         pytorch_model_config: Optional[PytorchModelConfig] = None,
+         peft_model: Optional[List[LoRA]] = None,
+     ):
+         super().__init__(
+             model_uid,
+             model_family,
+             model_spec,
+             quantization,
+             model_path,
+             pytorch_model_config=pytorch_model_config,
+             peft_model=peft_model,
+         )
+
+     @classmethod
+     def match(
+         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
+     ) -> bool:
+         if llm_spec.model_format != "pytorch":
+             return False
+         model_family = llm_family.model_family or llm_family.model_name
+         if model_family != "opt":
+             return False
+         return True
+
+     def build_prefill_position_ids(
+         self, batch_size: int, seq_length: int, reqs: List[InferenceRequest]
+     ):
+         """
+         Mainly for UT.
+         Transformers code in `main` branch supports `position_ids` parameter (https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py#L1076),
+         while in release branch, it doesn't (https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/opt/modeling_opt.py#L886).
+         """
+         return None
+
+     def build_decode_position_ids(
+         self, batch_size: int, seq_length: int, reqs: List[InferenceRequest]
+     ):
+         return None
xinference/model/llm/transformers/utils.py
@@ -11,14 +11,13 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
+
  import asyncio
  import functools
- import gc
  import logging
  import os
  import time
- import uuid
- from typing import TYPE_CHECKING, Dict, Iterable, Iterator, List, Optional, Tuple
+ from typing import TYPE_CHECKING, Dict, List, Optional, Tuple

  import torch
  from transformers.cache_utils import DynamicCache
@@ -46,20 +45,6 @@ if TYPE_CHECKING:
  logger = logging.getLogger(__name__)


- def is_sentence_complete(output: str):
-     """Check whether the output is a complete sentence."""
-     end_symbols = (".", "?", "!", "...", "。", "?", "!", "…", '"', "'", "”")
-     return output.endswith(end_symbols)
-
-
- def is_partial_stop(output: str, stop_str: str):
-     """Check whether the output contains a partial stop str."""
-     for i in range(0, min(len(output), len(stop_str))):
-         if stop_str.startswith(output[-i:]):
-             return True
-     return False
-
-
  def get_context_length(config) -> int:
      """Get the context length of a model from a huggingface model config."""
      if (
@@ -99,273 +84,6 @@ def prepare_logits_processor(
      return processor_list


- @torch.inference_mode()
- def generate_stream(
-     model_uid,
-     model,
-     tokenizer,
-     prompt,
-     device,
-     generate_config,
-     judge_sent_end=False,
- ) -> Iterator[Tuple[CompletionChunk, CompletionUsage]]:
-     context_len = get_context_length(model.config)
-     stream_interval = generate_config.get("stream_interval", 2)
-     stream = generate_config.get("stream", False)
-     stream_options = generate_config.pop("stream_options", None)
-     include_usage = (
-         stream_options["include_usage"] if isinstance(stream_options, dict) else False
-     )
-
-     len_prompt = len(prompt)
-
-     temperature = float(generate_config.get("temperature", 1.0))
-     repetition_penalty = float(generate_config.get("repetition_penalty", 1.0))
-     top_p = float(generate_config.get("top_p", 1.0))
-     top_k = int(generate_config.get("top_k", -1))  # -1 means disable
-     max_new_tokens = int(generate_config.get("max_tokens", max_tokens_field.default))
-     echo = bool(generate_config.get("echo", False))
-     stop_str = generate_config.get("stop", None)
-     stop_token_ids = generate_config.get("stop_token_ids", None) or []
-     if tokenizer.eos_token_id not in stop_token_ids:
-         stop_token_ids.append(tokenizer.eos_token_id)
-     chunk_id = str(uuid.uuid4())
-
-     logits_processor = prepare_logits_processor(
-         temperature, repetition_penalty, top_p, top_k
-     )
-
-     if ".modeling_qwen." in str(type(model)).lower():
-         # TODO: hacky
-         input_ids = tokenizer(prompt, allowed_special="all").input_ids
-     else:
-         input_ids = tokenizer(prompt).input_ids
-     output_ids = list(input_ids)
-
-     if model.config.is_encoder_decoder:
-         max_src_len = context_len
-     else:
-         max_src_len = context_len - max_new_tokens - 8
-         if max_src_len < 0:
-             raise ValueError("Max tokens exceeds model's max length")
-
-     input_ids = input_ids[-max_src_len:]
-     input_echo_len = len(input_ids)
-
-     if model.config.is_encoder_decoder:
-         encoder_output = model.encoder(
-             input_ids=torch.as_tensor([input_ids], device=device)
-         )[0]
-         start_ids = torch.as_tensor(
-             [[model.generation_config.decoder_start_token_id]],
-             dtype=torch.int64,
-             device=device,
-         )
-
-     start = time.time()
-     past_key_values = out = None
-     sent_interrupt = False
-     token = None
-     last_output_length = 0
-     for i in range(max_new_tokens):
-         if i == 0:
-             if model.config.is_encoder_decoder:
-                 out = model.decoder(
-                     input_ids=start_ids,
-                     encoder_hidden_states=encoder_output,
-                     use_cache=True,
-                 )
-                 logits = model.lm_head(out[0])
-             else:
-                 out = model(torch.as_tensor([input_ids], device=device), use_cache=True)
-                 logits = out.logits
-             past_key_values = out.past_key_values
-         else:
-             if model.config.is_encoder_decoder:
-                 out = model.decoder(
-                     input_ids=torch.as_tensor(
-                         [[token] if not sent_interrupt else output_ids], device=device
-                     ),
-                     encoder_hidden_states=encoder_output,
-                     use_cache=True,
-                     past_key_values=past_key_values if not sent_interrupt else None,
-                 )
-                 sent_interrupt = False
-
-                 logits = model.lm_head(out[0])
-             else:
-                 out = model(
-                     input_ids=torch.as_tensor(
-                         [[token] if not sent_interrupt else output_ids], device=device
-                     ),
-                     use_cache=True,
-                     past_key_values=past_key_values if not sent_interrupt else None,
-                 )
-                 sent_interrupt = False
-                 logits = out.logits
-             past_key_values = out.past_key_values
-
-         if logits_processor:
-             if repetition_penalty > 1.0:
-                 tmp_output_ids = torch.as_tensor([output_ids], device=logits.device)
-             else:
-                 tmp_output_ids = None
-             last_token_logits = logits_processor(tmp_output_ids, logits[:, -1, :])[0]
-         else:
-             last_token_logits = logits[0, -1, :]
-
-         if device == "mps":
-             # Switch to CPU by avoiding some bugs in mps backend.
-             last_token_logits = last_token_logits.float().to("cpu")
-
-         if temperature < 1e-5 or top_p < 1e-8:  # greedy
-             _, indices = torch.topk(last_token_logits, 2)
-             tokens = [int(index) for index in indices.tolist()]
-         else:
-             probs = torch.softmax(last_token_logits, dim=-1)
-             indices = torch.multinomial(probs, num_samples=2)
-             tokens = [int(token) for token in indices.tolist()]
-         token = tokens[0]
-         output_ids.append(token)
-
-         if token in stop_token_ids:
-             stopped = True
-         else:
-             stopped = False
-
-         if i % stream_interval == 0 or i == max_new_tokens - 1 or stopped:
-             if echo:
-                 tmp_output_ids = output_ids
-                 rfind_start = len_prompt
-             else:
-                 tmp_output_ids = output_ids[input_echo_len:]
-                 rfind_start = 0
-
-             output = tokenizer.decode(
-                 tmp_output_ids,
-                 skip_special_tokens=True,
-                 spaces_between_special_tokens=False,
-                 clean_up_tokenization_spaces=True,
-             )
-
-             # TODO: For the issue of incomplete sentences interrupting output, apply a patch and others can also modify it to a more elegant way
-             if judge_sent_end and stopped and not is_sentence_complete(output):
-                 if len(tokens) > 1:
-                     token = tokens[1]
-                     output_ids[-1] = token
-                 else:
-                     output_ids.pop()
-                 stopped = False
-                 sent_interrupt = True
-
-             partially_stopped = False
-             if stop_str:
-                 if isinstance(stop_str, str):
-                     pos = output.rfind(stop_str, rfind_start)
-                     if pos != -1:
-                         output = output[:pos]
-                         stopped = True
-                     else:
-                         partially_stopped = is_partial_stop(output, stop_str)
-                 elif isinstance(stop_str, Iterable):
-                     for each_stop in stop_str:
-                         pos = output.rfind(each_stop, rfind_start)
-                         if pos != -1:
-                             output = output[:pos]
-                             stopped = True
-                             break
-                         else:
-                             partially_stopped = is_partial_stop(output, each_stop)
-                             if partially_stopped:
-                                 break
-                 else:
-                     raise ValueError("Invalid stop field type.")
-
-             if stream:
-                 output = output.strip("�")
-                 tmp_output_length = len(output)
-                 output = output[last_output_length:]
-                 last_output_length = tmp_output_length
-
-             # prevent yielding partial stop sequence
-             if not partially_stopped:
-                 completion_choice = CompletionChoice(
-                     text=output, index=0, logprobs=None, finish_reason=None
-                 )
-                 completion_chunk = CompletionChunk(
-                     id=chunk_id,
-                     object="text_completion",
-                     created=int(time.time()),
-                     model=model_uid,
-                     choices=[completion_choice],
-                 )
-                 completion_usage = CompletionUsage(
-                     prompt_tokens=input_echo_len,
-                     completion_tokens=i,
-                     total_tokens=(input_echo_len + i),
-                 )
-
-                 yield completion_chunk, completion_usage
-
-         if stopped:
-             break
-
-     elapsed_time = time.time() - start
-     logger.info(f"Average generation speed: {i / elapsed_time:.2f} tokens/s.")
-
-     # finish stream event, which contains finish reason
-     if stopped:
-         finish_reason = "stop"
-     elif i == max_new_tokens - 1:
-         finish_reason = "length"
-     else:
-         finish_reason = None
-
-     if stream:
-         completion_choice = CompletionChoice(
-             text=output, index=0, logprobs=None, finish_reason=finish_reason
-         )
-     else:
-         completion_choice = CompletionChoice(
-             text=output, index=0, logprobs=None, finish_reason=finish_reason
-         )
-
-     completion_chunk = CompletionChunk(
-         id=chunk_id,
-         object="text_completion",
-         created=int(time.time()),
-         model=model_uid,
-         choices=[completion_choice],
-     )
-     completion_usage = CompletionUsage(
-         prompt_tokens=input_echo_len,
-         completion_tokens=i,
-         total_tokens=(input_echo_len + i),
-     )
-
-     yield completion_chunk, completion_usage
-
-     if include_usage:
-         completion_chunk = CompletionChunk(
-             id=chunk_id,
-             object="text_completion",
-             created=int(time.time()),
-             model=model_uid,
-             choices=[],
-         )
-         completion_usage = CompletionUsage(
-             prompt_tokens=input_echo_len,
-             completion_tokens=i,
-             total_tokens=(input_echo_len + i),
-         )
-         yield completion_chunk, completion_usage
-
-     # clean
-     del past_key_values, out
-     gc.collect()
-     empty_cache()
-
-
  def _get_token_from_logits(
      req: InferenceRequest, i: int, logits, temperature, repetition_penalty, top_p, top_k
  ):
@@ -680,6 +398,7 @@ def _batch_inference_one_step_internal(
  output = output.strip("�")
  output = output[r.last_output_length :]
  r.last_output_length += len(output)
+ r.outputs.append(output)

  completion_chunk = generate_completion_chunk(
      chunk_text=output,
@@ -704,6 +423,7 @@
  )
  r.completion.append(completion_chunk)
  r.completion.append(eos_flag)
+ r.outputs.append(eos_flag)

  # last round, handle stream result
  # append usage information when enable `include_usage` for OPENAI API compatibility
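The two hunks above make the batched inference path record what it streams: each newly decoded text delta and a final EOS marker are appended to the request's `outputs` list in addition to `completion`. The sketch below is a minimal, self-contained illustration of that bookkeeping pattern only; `FakeRequest`, `emit`, and `EOS_FLAG` are hypothetical stand-ins, not xinference's real `InferenceRequest` or sentinel.

```python
from dataclasses import dataclass, field
from typing import List

EOS_FLAG = "<EOS>"  # hypothetical stand-in for the eos_flag sentinel


@dataclass
class FakeRequest:
    completion: List[str] = field(default_factory=list)  # what gets streamed out
    outputs: List[str] = field(default_factory=list)      # kept for later inspection
    last_output_length: int = 0


def emit(r: FakeRequest, decoded_so_far: str) -> None:
    # Only the newly decoded suffix is emitted, mirroring the slicing in the hunk.
    delta = decoded_so_far[r.last_output_length:]
    r.last_output_length += len(delta)
    r.completion.append(delta)
    r.outputs.append(delta)


r = FakeRequest()
emit(r, "Hello")
emit(r, "Hello, world")
r.outputs.append(EOS_FLAG)
assert r.outputs == ["Hello", ", world", EOS_FLAG]
```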
xinference/model/llm/utils.py
@@ -386,8 +386,8 @@ class ChatModelMixin:
          return result

      @classmethod
-     def _tool_calls_completion_chunk(cls, model_family, model_uid, c):
-         _id = str(uuid.uuid4())
+     def _tool_calls_completion_chunk(cls, model_family, model_uid, c, chunk_id=None):
+         _id = chunk_id if chunk_id is not None else str(uuid.uuid4())
          tool_result = cls._eval_tool_arguments(model_family, c)
          tool_calls = []
          failed_contents = []
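The change above threads an optional `chunk_id` through `_tool_calls_completion_chunk`, so every chunk of one streamed tool-call response can reuse the same id instead of minting a new UUID per chunk. A minimal sketch of that pattern, assuming nothing about xinference's internals (`make_chunk` and `stream_chunks` are illustrative names, not the project's API):

```python
import uuid
from typing import Iterator, List, Optional


def make_chunk(payload: str, chunk_id: Optional[str] = None) -> dict:
    # Reuse the caller-supplied id when present; otherwise mint a fresh one.
    _id = chunk_id if chunk_id is not None else str(uuid.uuid4())
    return {"id": _id, "object": "chat.completion.chunk", "content": payload}


def stream_chunks(pieces: List[str]) -> Iterator[dict]:
    stable_id = str(uuid.uuid4())  # one id for the whole streamed answer
    for piece in pieces:
        yield make_chunk(piece, chunk_id=stable_id)


ids = {c["id"] for c in stream_chunks(["fn(", "x=1", ")"])}
assert len(ids) == 1  # all chunks share one id, as OpenAI-style clients expect
```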
xinference/model/llm/vllm/core.py
@@ -717,11 +717,26 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
      def match(
          cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
      ) -> bool:
-         if llm_spec.model_format != "pytorch":
+         if not cls._has_cuda_device():
+             return False
+         if not cls._is_linux():
+             return False
+         if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8"]:
              return False
          if llm_spec.model_format == "pytorch":
              if quantization != "none" and not (quantization is None):
                  return False
+         if llm_spec.model_format == "awq":
+             # Currently, only 4-bit weight quantization is supported for AWQ, but got 8 bits.
+             if "4" not in quantization:
+                 return False
+         if llm_spec.model_format == "gptq":
+             if VLLM_INSTALLED and vllm.__version__ >= "0.3.3":
+                 if not any(q in quantization for q in ("3", "4", "8")):
+                     return False
+             else:
+                 if "4" not in quantization:
+                     return False
          if isinstance(llm_family, CustomLLMFamilyV1):
              if llm_family.model_family not in VLLM_SUPPORTED_VISION_MODEL_LIST:
                  return False
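For context, the widened `match` gate above only admits vLLM vision models on Linux hosts with CUDA, and it restricts quantized formats: AWQ must be 4-bit, while GPTQ accepts 3/4/8-bit only when vLLM is at least 0.3.3 (4-bit otherwise). The standalone sketch below restates just the format/quantization part of that check; `vision_format_supported` is a hypothetical helper written for illustration, not xinference's or vLLM's API.

```python
from typing import Optional


def vision_format_supported(
    model_format: str, quantization: str, vllm_version: Optional[str] = None
) -> bool:
    """Illustrative restatement of the quantization gate in the hunk above."""
    if model_format not in ("pytorch", "gptq", "awq", "fp8"):
        return False
    if model_format == "pytorch" and quantization not in ("none",):
        return False
    if model_format == "awq" and "4" not in quantization:
        return False  # only 4-bit AWQ weights pass
    if model_format == "gptq":
        # String comparison mirrors the hunk's `vllm.__version__ >= "0.3.3"` check.
        if vllm_version is not None and vllm_version >= "0.3.3":
            return any(q in quantization for q in ("3", "4", "8"))
        return "4" in quantization
    return True  # fp8 has no extra bit-width check here


assert vision_format_supported("awq", "Int4")
assert vision_format_supported("gptq", "8-bit", vllm_version="0.6.0")
assert not vision_format_supported("gptq", "8-bit", vllm_version="0.3.0")
```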
xinference/thirdparty/mlx/__init__.py (new file)
@@ -0,0 +1,13 @@
+ # Copyright 2022-2023 XProbe Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
xinference/thirdparty/mlx/flux/__init__.py (new file)
@@ -0,0 +1,15 @@
+ # Copyright © 2024 Apple Inc.
+
+ from .datasets import Dataset, load_dataset
+ from .flux import FluxPipeline
+ from .lora import LoRALinear
+ from .sampler import FluxSampler
+ from .trainer import Trainer
+ from .utils import (
+     load_ae,
+     load_clip,
+     load_clip_tokenizer,
+     load_flow_model,
+     load_t5,
+     load_t5_tokenizer,
+ )