xinference 0.15.4__py3-none-any.whl → 0.16.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of xinference might be problematic.

Files changed (38)
  1. xinference/__init__.py +0 -4
  2. xinference/_version.py +3 -3
  3. xinference/constants.py +4 -4
  4. xinference/core/model.py +89 -18
  5. xinference/core/scheduler.py +10 -7
  6. xinference/core/utils.py +9 -0
  7. xinference/deploy/supervisor.py +4 -0
  8. xinference/model/__init__.py +4 -0
  9. xinference/model/image/scheduler/__init__.py +13 -0
  10. xinference/model/image/scheduler/flux.py +533 -0
  11. xinference/model/image/stable_diffusion/core.py +6 -31
  12. xinference/model/image/utils.py +39 -3
  13. xinference/model/llm/__init__.py +2 -0
  14. xinference/model/llm/llm_family.json +169 -1
  15. xinference/model/llm/llm_family_modelscope.json +108 -0
  16. xinference/model/llm/transformers/chatglm.py +104 -0
  17. xinference/model/llm/transformers/core.py +37 -111
  18. xinference/model/llm/transformers/deepseek_v2.py +0 -226
  19. xinference/model/llm/transformers/internlm2.py +3 -95
  20. xinference/model/llm/transformers/opt.py +68 -0
  21. xinference/model/llm/transformers/utils.py +4 -284
  22. xinference/model/llm/utils.py +2 -2
  23. xinference/model/llm/vllm/core.py +16 -1
  24. xinference/utils.py +2 -3
  25. xinference/web/ui/build/asset-manifest.json +3 -3
  26. xinference/web/ui/build/index.html +1 -1
  27. xinference/web/ui/build/static/js/{main.e51a356d.js → main.f7da0140.js} +3 -3
  28. xinference/web/ui/build/static/js/main.f7da0140.js.map +1 -0
  29. xinference/web/ui/node_modules/.cache/babel-loader/331312668fa8bd3d7401818f4a25fa98135d7f61371cd6bfff78b18cf4fbdd92.json +1 -0
  30. {xinference-0.15.4.dist-info → xinference-0.16.0.dist-info}/METADATA +36 -4
  31. {xinference-0.15.4.dist-info → xinference-0.16.0.dist-info}/RECORD +36 -33
  32. xinference/web/ui/build/static/js/main.e51a356d.js.map +0 -1
  33. xinference/web/ui/node_modules/.cache/babel-loader/4385c1095eefbff0a8ec3b2964ba6e5a66a05ab31be721483ca2f43e2a91f6ff.json +0 -1
  34. /xinference/web/ui/build/static/js/{main.e51a356d.js.LICENSE.txt → main.f7da0140.js.LICENSE.txt} +0 -0
  35. {xinference-0.15.4.dist-info → xinference-0.16.0.dist-info}/LICENSE +0 -0
  36. {xinference-0.15.4.dist-info → xinference-0.16.0.dist-info}/WHEEL +0 -0
  37. {xinference-0.15.4.dist-info → xinference-0.16.0.dist-info}/entry_points.txt +0 -0
  38. {xinference-0.15.4.dist-info → xinference-0.16.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/transformers/utils.py CHANGED
@@ -11,14 +11,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 import asyncio
 import functools
-import gc
 import logging
 import os
 import time
-import uuid
-from typing import TYPE_CHECKING, Dict, Iterable, Iterator, List, Optional, Tuple
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
 
 import torch
 from transformers.cache_utils import DynamicCache
@@ -46,20 +45,6 @@ if TYPE_CHECKING:
 logger = logging.getLogger(__name__)
 
 
-def is_sentence_complete(output: str):
-    """Check whether the output is a complete sentence."""
-    end_symbols = (".", "?", "!", "...", "。", "?", "!", "…", '"', "'", "”")
-    return output.endswith(end_symbols)
-
-
-def is_partial_stop(output: str, stop_str: str):
-    """Check whether the output contains a partial stop str."""
-    for i in range(0, min(len(output), len(stop_str))):
-        if stop_str.startswith(output[-i:]):
-            return True
-    return False
-
-
 def get_context_length(config) -> int:
     """Get the context length of a model from a huggingface model config."""
     if (
@@ -99,273 +84,6 @@ def prepare_logits_processor(
     return processor_list
 
 
-@torch.inference_mode()
-def generate_stream(
-    model_uid,
-    model,
-    tokenizer,
-    prompt,
-    device,
-    generate_config,
-    judge_sent_end=False,
-) -> Iterator[Tuple[CompletionChunk, CompletionUsage]]:
-    context_len = get_context_length(model.config)
-    stream_interval = generate_config.get("stream_interval", 2)
-    stream = generate_config.get("stream", False)
-    stream_options = generate_config.pop("stream_options", None)
-    include_usage = (
-        stream_options["include_usage"] if isinstance(stream_options, dict) else False
-    )
-
-    len_prompt = len(prompt)
-
-    temperature = float(generate_config.get("temperature", 1.0))
-    repetition_penalty = float(generate_config.get("repetition_penalty", 1.0))
-    top_p = float(generate_config.get("top_p", 1.0))
-    top_k = int(generate_config.get("top_k", -1))  # -1 means disable
-    max_new_tokens = int(generate_config.get("max_tokens", max_tokens_field.default))
-    echo = bool(generate_config.get("echo", False))
-    stop_str = generate_config.get("stop", None)
-    stop_token_ids = generate_config.get("stop_token_ids", None) or []
-    if tokenizer.eos_token_id not in stop_token_ids:
-        stop_token_ids.append(tokenizer.eos_token_id)
-    chunk_id = str(uuid.uuid4())
-
-    logits_processor = prepare_logits_processor(
-        temperature, repetition_penalty, top_p, top_k
-    )
-
-    if ".modeling_qwen." in str(type(model)).lower():
-        # TODO: hacky
-        input_ids = tokenizer(prompt, allowed_special="all").input_ids
-    else:
-        input_ids = tokenizer(prompt).input_ids
-    output_ids = list(input_ids)
-
-    if model.config.is_encoder_decoder:
-        max_src_len = context_len
-    else:
-        max_src_len = context_len - max_new_tokens - 8
-        if max_src_len < 0:
-            raise ValueError("Max tokens exceeds model's max length")
-
-    input_ids = input_ids[-max_src_len:]
-    input_echo_len = len(input_ids)
-
-    if model.config.is_encoder_decoder:
-        encoder_output = model.encoder(
-            input_ids=torch.as_tensor([input_ids], device=device)
-        )[0]
-        start_ids = torch.as_tensor(
-            [[model.generation_config.decoder_start_token_id]],
-            dtype=torch.int64,
-            device=device,
-        )
-
-    start = time.time()
-    past_key_values = out = None
-    sent_interrupt = False
-    token = None
-    last_output_length = 0
-    for i in range(max_new_tokens):
-        if i == 0:
-            if model.config.is_encoder_decoder:
-                out = model.decoder(
-                    input_ids=start_ids,
-                    encoder_hidden_states=encoder_output,
-                    use_cache=True,
-                )
-                logits = model.lm_head(out[0])
-            else:
-                out = model(torch.as_tensor([input_ids], device=device), use_cache=True)
-                logits = out.logits
-            past_key_values = out.past_key_values
-        else:
-            if model.config.is_encoder_decoder:
-                out = model.decoder(
-                    input_ids=torch.as_tensor(
-                        [[token] if not sent_interrupt else output_ids], device=device
-                    ),
-                    encoder_hidden_states=encoder_output,
-                    use_cache=True,
-                    past_key_values=past_key_values if not sent_interrupt else None,
-                )
-                sent_interrupt = False
-
-                logits = model.lm_head(out[0])
-            else:
-                out = model(
-                    input_ids=torch.as_tensor(
-                        [[token] if not sent_interrupt else output_ids], device=device
-                    ),
-                    use_cache=True,
-                    past_key_values=past_key_values if not sent_interrupt else None,
-                )
-                sent_interrupt = False
-                logits = out.logits
-            past_key_values = out.past_key_values
-
-        if logits_processor:
-            if repetition_penalty > 1.0:
-                tmp_output_ids = torch.as_tensor([output_ids], device=logits.device)
-            else:
-                tmp_output_ids = None
-            last_token_logits = logits_processor(tmp_output_ids, logits[:, -1, :])[0]
-        else:
-            last_token_logits = logits[0, -1, :]
-
-        if device == "mps":
-            # Switch to CPU by avoiding some bugs in mps backend.
-            last_token_logits = last_token_logits.float().to("cpu")
-
-        if temperature < 1e-5 or top_p < 1e-8:  # greedy
-            _, indices = torch.topk(last_token_logits, 2)
-            tokens = [int(index) for index in indices.tolist()]
-        else:
-            probs = torch.softmax(last_token_logits, dim=-1)
-            indices = torch.multinomial(probs, num_samples=2)
-            tokens = [int(token) for token in indices.tolist()]
-        token = tokens[0]
-        output_ids.append(token)
-
-        if token in stop_token_ids:
-            stopped = True
-        else:
-            stopped = False
-
-        if i % stream_interval == 0 or i == max_new_tokens - 1 or stopped:
-            if echo:
-                tmp_output_ids = output_ids
-                rfind_start = len_prompt
-            else:
-                tmp_output_ids = output_ids[input_echo_len:]
-                rfind_start = 0
-
-            output = tokenizer.decode(
-                tmp_output_ids,
-                skip_special_tokens=True,
-                spaces_between_special_tokens=False,
-                clean_up_tokenization_spaces=True,
-            )
-
-            # TODO: For the issue of incomplete sentences interrupting output, apply a patch and others can also modify it to a more elegant way
-            if judge_sent_end and stopped and not is_sentence_complete(output):
-                if len(tokens) > 1:
-                    token = tokens[1]
-                    output_ids[-1] = token
-                else:
-                    output_ids.pop()
-                stopped = False
-                sent_interrupt = True
-
-            partially_stopped = False
-            if stop_str:
-                if isinstance(stop_str, str):
-                    pos = output.rfind(stop_str, rfind_start)
-                    if pos != -1:
-                        output = output[:pos]
-                        stopped = True
-                    else:
-                        partially_stopped = is_partial_stop(output, stop_str)
-                elif isinstance(stop_str, Iterable):
-                    for each_stop in stop_str:
-                        pos = output.rfind(each_stop, rfind_start)
-                        if pos != -1:
-                            output = output[:pos]
-                            stopped = True
-                            break
-                        else:
-                            partially_stopped = is_partial_stop(output, each_stop)
-                            if partially_stopped:
-                                break
-                else:
-                    raise ValueError("Invalid stop field type.")
-
-            if stream:
-                output = output.strip("�")
-                tmp_output_length = len(output)
-                output = output[last_output_length:]
-                last_output_length = tmp_output_length
-
-            # prevent yielding partial stop sequence
-            if not partially_stopped:
-                completion_choice = CompletionChoice(
-                    text=output, index=0, logprobs=None, finish_reason=None
-                )
-                completion_chunk = CompletionChunk(
-                    id=chunk_id,
-                    object="text_completion",
-                    created=int(time.time()),
-                    model=model_uid,
-                    choices=[completion_choice],
-                )
-                completion_usage = CompletionUsage(
-                    prompt_tokens=input_echo_len,
-                    completion_tokens=i,
-                    total_tokens=(input_echo_len + i),
-                )
-
-                yield completion_chunk, completion_usage
-
-        if stopped:
-            break
-
-    elapsed_time = time.time() - start
-    logger.info(f"Average generation speed: {i / elapsed_time:.2f} tokens/s.")
-
-    # finish stream event, which contains finish reason
-    if stopped:
-        finish_reason = "stop"
-    elif i == max_new_tokens - 1:
-        finish_reason = "length"
-    else:
-        finish_reason = None
-
-    if stream:
-        completion_choice = CompletionChoice(
-            text=output, index=0, logprobs=None, finish_reason=finish_reason
-        )
-    else:
-        completion_choice = CompletionChoice(
-            text=output, index=0, logprobs=None, finish_reason=finish_reason
-        )
-
-    completion_chunk = CompletionChunk(
-        id=chunk_id,
-        object="text_completion",
-        created=int(time.time()),
-        model=model_uid,
-        choices=[completion_choice],
-    )
-    completion_usage = CompletionUsage(
-        prompt_tokens=input_echo_len,
-        completion_tokens=i,
-        total_tokens=(input_echo_len + i),
-    )
-
-    yield completion_chunk, completion_usage
-
-    if include_usage:
-        completion_chunk = CompletionChunk(
-            id=chunk_id,
-            object="text_completion",
-            created=int(time.time()),
-            model=model_uid,
-            choices=[],
-        )
-        completion_usage = CompletionUsage(
-            prompt_tokens=input_echo_len,
-            completion_tokens=i,
-            total_tokens=(input_echo_len + i),
-        )
-        yield completion_chunk, completion_usage
-
-    # clean
-    del past_key_values, out
-    gc.collect()
-    empty_cache()
-
-
 def _get_token_from_logits(
     req: InferenceRequest, i: int, logits, temperature, repetition_penalty, top_p, top_k
 ):
@@ -680,6 +398,7 @@ def _batch_inference_one_step_internal(
             output = output.strip("�")
            output = output[r.last_output_length :]
            r.last_output_length += len(output)
+            r.outputs.append(output)
 
            completion_chunk = generate_completion_chunk(
                chunk_text=output,
@@ -704,6 +423,7 @@
            )
            r.completion.append(completion_chunk)
            r.completion.append(eos_flag)
+            r.outputs.append(eos_flag)
 
    # last round, handle stream result
    # append usage information when enable `include_usage` for OPENAI API compatibility
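
The two r.outputs.append(...) additions above mean that each batched request now keeps its own record of the streamed text deltas (and the end-of-stream flag), in addition to pushing chunks onto its completion queue. Below is a minimal sketch of the incremental-decode pattern these hunks build on; FakeRequest and emit_delta are illustrative names only, not xinference APIs.

    # Sketch only: each step re-decodes the generated ids, emits just the new
    # suffix, and also records that delta on the request object.
    class FakeRequest:
        def __init__(self):
            self.last_output_length = 0  # length of text already emitted
            self.completion = []         # chunks handed to the streaming consumer
            self.outputs = []            # per-request log of emitted deltas

    def emit_delta(r: FakeRequest, decoded_so_far: str) -> str:
        delta = decoded_so_far[r.last_output_length:]
        r.last_output_length += len(delta)
        r.outputs.append(delta)
        r.completion.append(delta)
        return delta

    r = FakeRequest()
    assert emit_delta(r, "Hello") == "Hello"
    assert emit_delta(r, "Hello, world") == ", world"
    assert r.outputs == ["Hello", ", world"]
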
xinference/model/llm/utils.py CHANGED
@@ -386,8 +386,8 @@ class ChatModelMixin:
         return result
 
     @classmethod
-    def _tool_calls_completion_chunk(cls, model_family, model_uid, c):
-        _id = str(uuid.uuid4())
+    def _tool_calls_completion_chunk(cls, model_family, model_uid, c, chunk_id=None):
+        _id = chunk_id if chunk_id is not None else str(uuid.uuid4())
         tool_result = cls._eval_tool_arguments(model_family, c)
         tool_calls = []
         failed_contents = []
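
_tool_calls_completion_chunk now accepts an optional chunk_id, which lets a caller that emits several chunks for one streamed response keep the OpenAI-style id stable instead of minting a new UUID per chunk. A hedged sketch of that idea follows; make_chunk is an illustrative helper, not an xinference function.

    import uuid
    from typing import Optional

    def make_chunk(model_uid: str, text: str, chunk_id: Optional[str] = None) -> dict:
        # Reuse the caller-supplied id when given; otherwise fall back to a fresh
        # UUID, mirroring the `_id = chunk_id if ... else str(uuid.uuid4())` line above.
        _id = chunk_id if chunk_id is not None else str(uuid.uuid4())
        return {
            "id": _id,
            "object": "chat.completion.chunk",
            "model": model_uid,
            "choices": [{"index": 0, "delta": {"content": text}}],
        }

    stream_id = str(uuid.uuid4())
    chunks = [make_chunk("my-model", piece, chunk_id=stream_id) for piece in ("Hel", "lo")]
    assert len({c["id"] for c in chunks}) == 1  # every chunk carries the same id
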
@@ -717,11 +717,26 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
717
717
  def match(
718
718
  cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
719
719
  ) -> bool:
720
- if llm_spec.model_format != "pytorch":
720
+ if not cls._has_cuda_device():
721
+ return False
722
+ if not cls._is_linux():
723
+ return False
724
+ if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8"]:
721
725
  return False
722
726
  if llm_spec.model_format == "pytorch":
723
727
  if quantization != "none" and not (quantization is None):
724
728
  return False
729
+ if llm_spec.model_format == "awq":
730
+ # Currently, only 4-bit weight quantization is supported for AWQ, but got 8 bits.
731
+ if "4" not in quantization:
732
+ return False
733
+ if llm_spec.model_format == "gptq":
734
+ if VLLM_INSTALLED and vllm.__version__ >= "0.3.3":
735
+ if not any(q in quantization for q in ("3", "4", "8")):
736
+ return False
737
+ else:
738
+ if "4" not in quantization:
739
+ return False
725
740
  if isinstance(llm_family, CustomLLMFamilyV1):
726
741
  if llm_family.model_family not in VLLM_SUPPORTED_VISION_MODEL_LIST:
727
742
  return False
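
For reference, the quantization gating added to VLLMVisionModel.match can be read as the standalone check below (an illustrative rewrite, not code from the package): pytorch weights must be unquantized, AWQ is accepted only as 4-bit, and GPTQ allows 3/4/8-bit on vLLM 0.3.3 or newer but only 4-bit on older versions.

    from typing import Optional

    def vision_quant_supported(model_format: str, quantization: Optional[str],
                               vllm_version: str = "0.3.3") -> bool:
        q = quantization or ""
        if model_format not in ("pytorch", "gptq", "awq", "fp8"):
            return False
        if model_format == "pytorch":
            return quantization in (None, "none")
        if model_format == "awq":
            return "4" in q                      # only 4-bit AWQ passes
        if model_format == "gptq":
            if vllm_version >= "0.3.3":          # string compare, as in the hunk above
                return any(bits in q for bits in ("3", "4", "8"))
            return "4" in q
        return True                              # fp8: no extra bit-width check in this hunk
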
xinference/utils.py CHANGED
@@ -13,9 +13,8 @@
 # limitations under the License.
 
 
-import torch
-
-
 def cuda_count():
+    import torch
+
     # even if install torch cpu, this interface would return 0.
     return torch.cuda.device_count()
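
The change above defers the torch import into cuda_count(), so importing xinference.utils no longer pulls in torch at module import time; torch is loaded (and required) only when the function is actually called. A minimal sketch of the pattern:

    def cuda_count() -> int:
        import torch  # deferred: loaded on first call, not at module import

        # A CPU-only torch build simply reports zero CUDA devices here.
        return torch.cuda.device_count()

    print(cuda_count())  # 0 on machines without CUDA-capable GPUs
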
xinference/web/ui/build/asset-manifest.json CHANGED
@@ -1,14 +1,14 @@
 {
   "files": {
     "main.css": "./static/css/main.5061c4c3.css",
-    "main.js": "./static/js/main.e51a356d.js",
+    "main.js": "./static/js/main.f7da0140.js",
     "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
     "index.html": "./index.html",
     "main.5061c4c3.css.map": "./static/css/main.5061c4c3.css.map",
-    "main.e51a356d.js.map": "./static/js/main.e51a356d.js.map"
+    "main.f7da0140.js.map": "./static/js/main.f7da0140.js.map"
   },
   "entrypoints": [
     "static/css/main.5061c4c3.css",
-    "static/js/main.e51a356d.js"
+    "static/js/main.f7da0140.js"
   ]
 }
xinference/web/ui/build/index.html CHANGED
@@ -1 +1 @@
-<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.e51a356d.js"></script><link href="./static/css/main.5061c4c3.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
+<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.f7da0140.js"></script><link href="./static/css/main.5061c4c3.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>