xinference 0.11.3__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +69 -0
- xinference/client/restful/restful_client.py +70 -0
- xinference/constants.py +4 -0
- xinference/core/model.py +141 -12
- xinference/core/scheduler.py +428 -0
- xinference/core/supervisor.py +26 -0
- xinference/isolation.py +9 -2
- xinference/model/audio/chattts.py +84 -0
- xinference/model/audio/core.py +10 -3
- xinference/model/audio/model_spec.json +20 -0
- xinference/model/llm/__init__.py +4 -0
- xinference/model/llm/llm_family.json +507 -1
- xinference/model/llm/llm_family_modelscope.json +409 -2
- xinference/model/llm/pytorch/chatglm.py +2 -1
- xinference/model/llm/pytorch/cogvlm2.py +76 -17
- xinference/model/llm/pytorch/core.py +91 -6
- xinference/model/llm/pytorch/glm4v.py +258 -0
- xinference/model/llm/pytorch/minicpmv25.py +232 -0
- xinference/model/llm/pytorch/utils.py +386 -2
- xinference/model/llm/vllm/core.py +6 -0
- xinference/thirdparty/ChatTTS/__init__.py +1 -0
- xinference/thirdparty/ChatTTS/core.py +200 -0
- xinference/types.py +3 -0
- {xinference-0.11.3.dist-info → xinference-0.12.0.dist-info}/METADATA +26 -9
- {xinference-0.11.3.dist-info → xinference-0.12.0.dist-info}/RECORD +30 -24
- {xinference-0.11.3.dist-info → xinference-0.12.0.dist-info}/LICENSE +0 -0
- {xinference-0.11.3.dist-info → xinference-0.12.0.dist-info}/WHEEL +0 -0
- {xinference-0.11.3.dist-info → xinference-0.12.0.dist-info}/entry_points.txt +0 -0
- {xinference-0.11.3.dist-info → xinference-0.12.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/pytorch/utils.py CHANGED

@@ -14,13 +14,15 @@
 
 import gc
 import logging
+import os
 import time
 import uuid
 from threading import Thread
-from typing import Iterable, Iterator, Tuple
+from typing import Dict, Iterable, Iterator, List, Optional, Tuple
 
 import torch
 from transformers import GenerationConfig, TextIteratorStreamer
+from transformers.cache_utils import DynamicCache
 from transformers.generation.logits_process import (
     LogitsProcessorList,
     RepetitionPenaltyLogitsProcessor,
@@ -29,8 +31,10 @@ from transformers.generation.logits_process import (
     TopPLogitsWarper,
 )
 
+from ....core.scheduler import InferenceRequest
 from ....device_utils import empty_cache
 from ....types import (
+    Completion,
     CompletionChoice,
     CompletionChunk,
     CompletionUsage,
@@ -54,7 +58,7 @@ def is_partial_stop(output: str, stop_str: str):
     return False
 
 
-def get_context_length(config):
+def get_context_length(config) -> int:
     """Get the context length of a model from a huggingface model config."""
     if (
         hasattr(config, "max_sequence_length")
@@ -528,3 +532,383 @@ def generate_stream_falcon(
     # clean
     gc.collect()
     empty_cache()
+
+
+def _get_token_from_logits(
+    req: InferenceRequest, i: int, logits, temperature, repetition_penalty, top_p, top_k
+):
+    logits_processor = prepare_logits_processor(
+        temperature, repetition_penalty, top_p, top_k
+    )
+
+    if logits_processor:
+        if repetition_penalty > 1.0:
+            tmp_output_ids = torch.as_tensor(
+                [req.prompt_tokens + req.new_tokens], device=logits.device
+            )
+        else:
+            tmp_output_ids = None
+        last_token_logits = logits_processor(tmp_output_ids, logits[i : i + 1, -1, :])[
+            0
+        ]
+    else:
+        last_token_logits = logits[i : i + 1, -1, :]
+
+    if temperature < 1e-5 or top_p < 1e-8:  # greedy
+        _, indices = torch.topk(last_token_logits, 2)
+    else:
+        probs = torch.softmax(last_token_logits, dim=-1)
+        indices = torch.multinomial(probs, num_samples=2)
+    token = indices[0].int().item()
+    return token
+
+
+def _pad_to_max_length(x: List[int], max_len: int, pad: int) -> List[int]:
+    assert len(x) <= max_len
+    return [pad] * (max_len - len(x)) + x
+
+
+def _pad_seqs_inplace(seqs: List[List[int]], pad: int):
+    max_len = max(len(seq) for seq in seqs)
+    n = len(seqs)
+    i = 0
+    while i < n:
+        seqs[i] = _pad_to_max_length(seqs[i], max_len, pad)
+        i += 1
+
+
+def get_max_src_len(context_len: int, r: InferenceRequest) -> int:
+    max_new_tokens = int(
+        r.sanitized_generate_config.get("max_tokens", max_tokens_field.default)
+    )
+    return context_len - max_new_tokens - 8
+
+
+def _get_completion_chunk(
+    output: str,
+    finish_reason: Optional[str],
+    model_uid: str,
+    r: InferenceRequest,
+    just_usage: bool,
+):
+    completion_choice = (
+        [
+            CompletionChoice(
+                text=output, index=0, logprobs=None, finish_reason=finish_reason
+            )
+        ]
+        if not just_usage
+        else []
+    )
+    completion_chunk = CompletionChunk(
+        id=str(uuid.uuid1()),
+        object="text_completion",
+        created=int(time.time()),
+        model=model_uid,
+        choices=completion_choice,
+    )
+    completion_usage = CompletionUsage(
+        prompt_tokens=len(r.prompt_tokens),
+        completion_tokens=len(r.new_tokens),
+        total_tokens=len(r.prompt_tokens) + len(r.new_tokens),
+    )
+    completion_chunk["usage"] = completion_usage
+    return completion_chunk
+
+
+def _get_completion(
+    output: str, finish_reason: Optional[str], model_uid: str, r: InferenceRequest
+):
+    completion_choice = CompletionChoice(
+        text=output, index=0, logprobs=None, finish_reason=finish_reason
+    )
+
+    completion_chunk = CompletionChunk(
+        id=str(uuid.uuid1()),
+        object="text_completion",
+        created=int(time.time()),
+        model=model_uid,
+        choices=[completion_choice],
+    )
+    completion_usage = CompletionUsage(
+        prompt_tokens=len(r.prompt_tokens),
+        completion_tokens=len(r.new_tokens),
+        total_tokens=len(r.prompt_tokens) + len(r.new_tokens),
+    )
+    completion = Completion(
+        id=completion_chunk["id"],
+        object=completion_chunk["object"],
+        created=completion_chunk["created"],
+        model=completion_chunk["model"],
+        choices=completion_chunk["choices"],
+        usage=completion_usage,
+    )
+    return completion
+
+
+def _merge_kv_cache(
+    past_kv: Tuple[Tuple[torch.Tensor]], new_kv: Tuple[Tuple[torch.Tensor]]
+):
+    from torch.nn.functional import pad
+
+    past_cache = DynamicCache.from_legacy_cache(past_kv)
+    new_cache = DynamicCache.from_legacy_cache(new_kv)
+    past_seq_len = past_cache.get_seq_length()
+    new_seq_len = new_cache.get_seq_length()
+    if past_seq_len != new_seq_len:
+        padding_target = new_cache if past_seq_len > new_seq_len else past_cache
+        padding_len = abs(past_seq_len - new_seq_len)
+        for idx in range(len(padding_target)):
+            k = padding_target.key_cache[idx]
+            v = padding_target.value_cache[idx]
+            _k = pad(k, (0, 0, padding_len, 0))
+            _v = pad(v, (0, 0, padding_len, 0))
+            padding_target.key_cache[idx] = _k
+            padding_target.value_cache[idx] = _v
+
+    ret_kv = DynamicCache()
+    for idx in range(len(past_cache)):
+        k1, k2 = new_cache.key_cache[idx], past_cache.key_cache[idx]
+        v1, v2 = new_cache.value_cache[idx], past_cache.value_cache[idx]
+        ret_kv.update(torch.cat((k1, k2), 0), torch.cat((v1, v2), 0), idx)
+    return ret_kv.to_legacy_cache()
+
+
+@torch.inference_mode()
+def _batch_inference_one_step_internal(
+    req_list: List[InferenceRequest],
+    model_uid,
+    model,
+    tokenizer,
+    device,
+    context_len: int,
+    decode_round: int = 16,
+    bos_flag: str = "<bos_stream>",
+    eos_flag: str = "<eos_stream>",
+):
+    # need to judge stopped here,
+    # since some requests state may change to stopped due to invalid parameters, e.g. max_src_len
+    valid_req_list = [r for r in req_list if not r.stopped]
+    if not valid_req_list:
+        return
+    generate_config_mapping: Dict[InferenceRequest, Tuple] = {
+        r: r.get_generate_configs(tokenizer.eos_token_id) for r in valid_req_list
+    }
+    s_time = time.time()
+
+    prefill_reqs = []
+    prompts = []
+    decode_reqs = []
+    for r in valid_req_list:
+        if r.is_prefill:
+            prompts.append(r.full_prompt)
+            prefill_reqs.append(r)
+        else:
+            decode_reqs.append(r)
+
+    if prompts:  # prefill first
+        input_ids: List[List[int]] = tokenizer(prompts, padding=False).input_ids
+        prompt_tokens = []
+        for i, input_id in enumerate(input_ids):
+            req = valid_req_list[i]
+            max_src_len = get_max_src_len(context_len, req)
+            req.prompt_tokens = input_id[-max_src_len:]
+            prompt_tokens.append(req.prompt_tokens)
+        _pad_seqs_inplace(prompt_tokens, 0)
+        out = model(torch.as_tensor(prompt_tokens, device=device), use_cache=True)
+
+        logits = out.logits
+        past_key_values = out.past_key_values
+
+        for i, r in enumerate(prefill_reqs):
+            (
+                max_new_tokens,
+                stream_interval,
+                include_usage,
+                stop_str,
+                stop_token_ids,
+                temperature,
+                repetition_penalty,
+                top_p,
+                top_k,
+            ) = generate_config_mapping[r]
+
+            token = _get_token_from_logits(
+                r, i, logits, temperature, repetition_penalty, top_p, top_k
+            )
+            r.is_prefill = False
+            r.append_new_token(token)
+
+        if decode_reqs:
+            decode_kv = decode_reqs[0].kv_cache
+            # prefill and decode kv cache need to be merged at `batch_size` and `seq_len` dimensions.
+            merged_kv_cache = _merge_kv_cache(decode_kv, past_key_values)
+            for r in valid_req_list:
+                r.kv_cache = merged_kv_cache
+            empty_cache()
+        else:
+            for r in valid_req_list:
+                r.kv_cache = past_key_values
+
+    past_key_values = valid_req_list[0].kv_cache
+    stop_token_mapping: Dict[InferenceRequest, int] = {}
+    output_mapping: Dict[InferenceRequest, str] = {}
+    # here, only decode phase, just run some rounds
+    for _i in range(decode_round):
+        decode_tokens: List[List[int]] = [[r.new_tokens[-1]] for r in valid_req_list]
+        out = model(
+            input_ids=torch.as_tensor(decode_tokens, device=device),
+            use_cache=True,
+            past_key_values=past_key_values,
+        )
+        logits = out.logits
+        past_key_values = out.past_key_values
+
+        for i, r in enumerate(valid_req_list):
+            (
+                max_new_tokens,
+                stream_interval,
+                include_usage,
+                stop_str,
+                stop_token_ids,
+                temperature,
+                repetition_penalty,
+                top_p,
+                top_k,
+            ) = generate_config_mapping[r]
+
+            token = _get_token_from_logits(
+                r, i, logits, temperature, repetition_penalty, top_p, top_k
+            )
+            r.kv_cache = past_key_values
+            r.append_new_token(token)
+
+            output = None
+            if not r.stopped:
+                stopped = token in stop_token_ids
+
+                if stopped:
+                    finish_reason = "stop"
+                elif len(r.new_tokens) == max_new_tokens:
+                    finish_reason = "length"
+                    stopped = True
+                else:
+                    finish_reason = None
+
+                # handle stop str
+                if stop_str and r not in output_mapping:
+                    output = tokenizer.decode(
+                        r.new_tokens,
+                        skip_special_tokens=True,
+                        spaces_between_special_tokens=False,
+                        clean_up_tokenization_spaces=True,
+                    )
+                    if isinstance(stop_str, str):
+                        stop_str = [stop_str]
+                    for stop in stop_str:
+                        pos = output.rfind(stop)
+                        if pos != -1:
+                            output = output[:pos]
+                            output_mapping[r] = output
+                            stopped = True
+                            finish_reason = "stop"
+                            break
+
+                r.stopped = stopped
+                r.finish_reason = finish_reason
+
+            if r.stopped and r not in stop_token_mapping and r not in output_mapping:
+                stop_token_mapping[r] = _i + 1
+
+            if r.stream:
+                """
+                Note that you can't just decode based on the newest r.new_tokens here,
+                which may destroy the integrity of the parsed characters,
+                and at the same time is not good at handling some special characters.
+                So the implementation here is to decode all the tokens that have been generated each time,
+                and then take the slice.
+                """
+                if r.stopped or len(r.new_tokens) % stream_interval == 0:
+                    if output is None:
+                        output = tokenizer.decode(
+                            r.new_tokens,
+                            skip_special_tokens=True,
+                            spaces_between_special_tokens=False,
+                            clean_up_tokenization_spaces=True,
+                        )
+
+                    if r.last_output_length == 0:
+                        r.completion.append(bos_flag)
+
+                    # this special character is mainly for qwen
+                    output = output.strip("�")
+                    output = output[r.last_output_length :]
+                    r.last_output_length += len(output)
+
+                    completion_chunk = _get_completion_chunk(
+                        output, r.finish_reason, model_uid, r, False
+                    )
+                    r.completion.append(completion_chunk)
+                    if r.stopped:
+                        r.completion.append(eos_flag)
+
+                # last round, handle stream result
+                # append usage information when enable `include_usage` for OPENAI API compatibility
+                # The reason for counting the usage in the last round of the iteration is that,
+                # these tokens are real generated and should be counted.
+                if r.stopped and _i == decode_round - 1 and include_usage:
+                    r.completion.append(
+                        _get_completion_chunk(
+                            "", r.finish_reason, model_uid, r, True
+                        )
+                    )
+            else:
+                # last round, handle non-stream result
+                if r.stopped and _i == decode_round - 1:
+                    invalid_token_num = decode_round - stop_token_mapping[r]
+                    outputs = (
+                        tokenizer.decode(
+                            r.new_tokens[: -(invalid_token_num + 1)]
+                            if r.finish_reason == "stop"
+                            else r.new_tokens[:-invalid_token_num],
+                            skip_special_tokens=True,
+                            spaces_between_special_tokens=False,
+                            clean_up_tokenization_spaces=True,
+                        )
+                        if r not in output_mapping
+                        else output_mapping[r]
+                    )
+                    completion = _get_completion(outputs, r.finish_reason, model_uid, r)
+                    r.completion = [completion]
+
+    e_time = time.time()
+    logger.debug(
+        f"Average throughput for a step: {(len(valid_req_list) * decode_round + len(prompts)) / (e_time - s_time)} token/s."
+    )
+
+
+def batch_inference_one_step(
+    req_list: List[InferenceRequest],
+    model_uid,
+    model,
+    tokenizer,
+    device,
+    context_len: int,
+):
+    from ....core.model import OutOfMemoryError
+
+    try:
+        _batch_inference_one_step_internal(
+            req_list, model_uid, model, tokenizer, device, context_len
+        )
+    except OutOfMemoryError:
+        logger.exception(
+            f"Batch inference out of memory. "
+            f"Xinference will restart the model: {model_uid}. "
+            f"Please be patient for a few moments."
+        )
+        # Just kill the process and let xinference auto-recover the model
+        os._exit(1)
+    except Exception as e:
+        logger.exception(f"Internal error for batch inference: {e}.")
+        # TODO: handle this
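The prefill path above batches prompts of unequal length by left-padding the token id lists to a common length before building the batched input tensor, so the most recent tokens stay right-aligned for decoding. A minimal standalone sketch of that padding step, mirroring `_pad_to_max_length` and `_pad_seqs_inplace` from the hunk; the pad id `0` and the example token ids are illustrative only:

```python
from typing import List


def pad_to_max_length(x: List[int], max_len: int, pad: int) -> List[int]:
    # Left-pad so the newest tokens stay right-aligned, matching the diff's helper.
    assert len(x) <= max_len
    return [pad] * (max_len - len(x)) + x


def pad_seqs_inplace(seqs: List[List[int]], pad: int) -> None:
    # Pad every sequence in the batch to the length of the longest one.
    max_len = max(len(seq) for seq in seqs)
    for i in range(len(seqs)):
        seqs[i] = pad_to_max_length(seqs[i], max_len, pad)


# Illustrative token ids only; real ids come from the tokenizer.
batch = [[101, 7592, 2088, 102], [101, 102]]
pad_seqs_inplace(batch, pad=0)
print(batch)  # [[101, 7592, 2088, 102], [0, 0, 101, 102]]
```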
xinference/model/llm/vllm/core.py CHANGED

@@ -93,6 +93,7 @@ VLLM_SUPPORTED_MODELS = [
     "baichuan",
     "internlm-16k",
     "mistral-v0.1",
+    "codestral-v0.1",
     "Yi",
     "Yi-1.5",
     "code-llama",
@@ -118,11 +119,14 @@ VLLM_SUPPORTED_CHAT_MODELS = [
     "code-llama-instruct",
     "mistral-instruct-v0.1",
     "mistral-instruct-v0.2",
+    "mistral-instruct-v0.3",
     "mixtral-instruct-v0.1",
     "mixtral-8x22B-instruct-v0.1",
     "chatglm3",
     "chatglm3-32k",
     "chatglm3-128k",
+    "glm4-chat",
+    "glm4-chat-1m",
     "deepseek-chat",
     "deepseek-coder-instruct",
 ]
@@ -130,6 +134,7 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.3.0":
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen1.5-chat")
     VLLM_SUPPORTED_MODELS.append("codeqwen1.5")
     VLLM_SUPPORTED_CHAT_MODELS.append("codeqwen1.5-chat")
+    VLLM_SUPPORTED_CHAT_MODELS.append("qwen2-instruct")
 
 if VLLM_INSTALLED and vllm.__version__ >= "0.3.2":
     VLLM_SUPPORTED_CHAT_MODELS.append("gemma-it")
@@ -140,6 +145,7 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.3.3":
 
 if VLLM_INSTALLED and vllm.__version__ >= "0.4.0":
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen1.5-moe-chat")
+    VLLM_SUPPORTED_CHAT_MODELS.append("qwen2-moe-instruct")
     VLLM_SUPPORTED_CHAT_MODELS.append("c4ai-command-r-v01")
 
 
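Because the additions above are gated on the installed vllm version, whether a given model name actually lands in the registry is a runtime property. A small sketch for checking it; the import path is taken from the diff'd module (xinference/model/llm/vllm/core.py) and is not a documented public API:

```python
# Hypothetical check script; assumes xinference 0.12.0 is installed.
from xinference.model.llm.vllm.core import (
    VLLM_INSTALLED,
    VLLM_SUPPORTED_CHAT_MODELS,
    VLLM_SUPPORTED_MODELS,
)

print("vLLM installed:", VLLM_INSTALLED)
for name in ("codestral-v0.1", "glm4-chat", "glm4-chat-1m", "qwen2-instruct", "qwen2-moe-instruct"):
    supported = name in VLLM_SUPPORTED_MODELS or name in VLLM_SUPPORTED_CHAT_MODELS
    # Names missing here are typically gated behind a newer vllm release.
    print(f"{name}: {'registered for the vLLM backend' if supported else 'not registered'}")
```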
xinference/thirdparty/ChatTTS/__init__.py ADDED

@@ -0,0 +1 @@
+from .core import Chat
xinference/thirdparty/ChatTTS/core.py ADDED

@@ -0,0 +1,200 @@
+
+import os
+import logging
+from functools import partial
+from omegaconf import OmegaConf
+
+import torch
+from vocos import Vocos
+from .model.dvae import DVAE
+from .model.gpt import GPT_warpper
+from .utils.gpu_utils import select_device
+from .utils.infer_utils import count_invalid_characters, detect_language, apply_character_map, apply_half2full_map
+from .utils.io_utils import get_latest_modified_file
+from .infer.api import refine_text, infer_code
+
+from huggingface_hub import snapshot_download
+
+logging.basicConfig(level = logging.INFO)
+
+
+class Chat:
+    def __init__(self, ):
+        self.pretrain_models = {}
+        self.normalizer = {}
+        self.logger = logging.getLogger(__name__)
+
+    def check_model(self, level = logging.INFO, use_decoder = False):
+        not_finish = False
+        check_list = ['vocos', 'gpt', 'tokenizer']
+
+        if use_decoder:
+            check_list.append('decoder')
+        else:
+            check_list.append('dvae')
+
+        for module in check_list:
+            if module not in self.pretrain_models:
+                self.logger.log(logging.WARNING, f'{module} not initialized.')
+                not_finish = True
+
+        if not not_finish:
+            self.logger.log(level, f'All initialized.')
+
+        return not not_finish
+
+    def load_models(self, source='huggingface', force_redownload=False, local_path='<LOCAL_PATH>', **kwargs):
+        if source == 'huggingface':
+            hf_home = os.getenv('HF_HOME', os.path.expanduser("~/.cache/huggingface"))
+            try:
+                download_path = get_latest_modified_file(os.path.join(hf_home, 'hub/models--2Noise--ChatTTS/snapshots'))
+            except:
+                download_path = None
+            if download_path is None or force_redownload:
+                self.logger.log(logging.INFO, f'Download from HF: https://huggingface.co/2Noise/ChatTTS')
+                download_path = snapshot_download(repo_id="2Noise/ChatTTS", allow_patterns=["*.pt", "*.yaml"])
+            else:
+                self.logger.log(logging.INFO, f'Load from cache: {download_path}')
+        elif source == 'local':
+            self.logger.log(logging.INFO, f'Load from local: {local_path}')
+            download_path = local_path
+
+        self._load(**{k: os.path.join(download_path, v) for k, v in OmegaConf.load(os.path.join(download_path, 'config', 'path.yaml')).items()}, **kwargs)
+
+    def _load(
+        self,
+        vocos_config_path: str = None,
+        vocos_ckpt_path: str = None,
+        dvae_config_path: str = None,
+        dvae_ckpt_path: str = None,
+        gpt_config_path: str = None,
+        gpt_ckpt_path: str = None,
+        decoder_config_path: str = None,
+        decoder_ckpt_path: str = None,
+        tokenizer_path: str = None,
+        device: str = None,
+        compile: bool = True,
+    ):
+        if not device:
+            device = select_device(4096)
+            self.logger.log(logging.INFO, f'use {device}')
+
+        if vocos_config_path:
+            vocos = Vocos.from_hparams(vocos_config_path).to(device).eval()
+            assert vocos_ckpt_path, 'vocos_ckpt_path should not be None'
+            vocos.load_state_dict(torch.load(vocos_ckpt_path))
+            self.pretrain_models['vocos'] = vocos
+            self.logger.log(logging.INFO, 'vocos loaded.')
+
+        if dvae_config_path:
+            cfg = OmegaConf.load(dvae_config_path)
+            dvae = DVAE(**cfg).to(device).eval()
+            assert dvae_ckpt_path, 'dvae_ckpt_path should not be None'
+            dvae.load_state_dict(torch.load(dvae_ckpt_path, map_location='cpu'))
+            self.pretrain_models['dvae'] = dvae
+            self.logger.log(logging.INFO, 'dvae loaded.')
+
+        if gpt_config_path:
+            cfg = OmegaConf.load(gpt_config_path)
+            gpt = GPT_warpper(**cfg).to(device).eval()
+            assert gpt_ckpt_path, 'gpt_ckpt_path should not be None'
+            gpt.load_state_dict(torch.load(gpt_ckpt_path, map_location='cpu'))
+            if compile and 'cuda' in str(device):
+                gpt.gpt.forward = torch.compile(gpt.gpt.forward, backend='inductor', dynamic=True)
+            self.pretrain_models['gpt'] = gpt
+            spk_stat_path = os.path.join(os.path.dirname(gpt_ckpt_path), 'spk_stat.pt')
+            assert os.path.exists(spk_stat_path), f'Missing spk_stat.pt: {spk_stat_path}'
+            self.pretrain_models['spk_stat'] = torch.load(spk_stat_path).to(device)
+            self.logger.log(logging.INFO, 'gpt loaded.')
+
+        if decoder_config_path:
+            cfg = OmegaConf.load(decoder_config_path)
+            decoder = DVAE(**cfg).to(device).eval()
+            assert decoder_ckpt_path, 'decoder_ckpt_path should not be None'
+            decoder.load_state_dict(torch.load(decoder_ckpt_path, map_location='cpu'))
+            self.pretrain_models['decoder'] = decoder
+            self.logger.log(logging.INFO, 'decoder loaded.')
+
+        if tokenizer_path:
+            tokenizer = torch.load(tokenizer_path, map_location='cpu')
+            tokenizer.padding_side = 'left'
+            self.pretrain_models['tokenizer'] = tokenizer
+            self.logger.log(logging.INFO, 'tokenizer loaded.')
+
+        self.check_model()
+
+    def infer(
+        self,
+        text,
+        skip_refine_text=False,
+        refine_text_only=False,
+        params_refine_text={},
+        params_infer_code={'prompt':'[speed_5]'},
+        use_decoder=True,
+        do_text_normalization=True,
+        lang=None,
+    ):
+
+        assert self.check_model(use_decoder=use_decoder)
+
+        if not isinstance(text, list):
+            text = [text]
+
+        if do_text_normalization:
+            for i, t in enumerate(text):
+                _lang = detect_language(t) if lang is None else lang
+                self.init_normalizer(_lang)
+                text[i] = self.normalizer[_lang](t)
+                if _lang == 'zh':
+                    text[i] = apply_half2full_map(text[i])
+
+        for i, t in enumerate(text):
+            invalid_characters = count_invalid_characters(t)
+            if len(invalid_characters):
+                self.logger.log(logging.WARNING, f'Invalid characters found! : {invalid_characters}')
+                text[i] = apply_character_map(t)
+
+        if not skip_refine_text:
+            text_tokens = refine_text(self.pretrain_models, text, **params_refine_text)['ids']
+            text_tokens = [i[i < self.pretrain_models['tokenizer'].convert_tokens_to_ids('[break_0]')] for i in text_tokens]
+            text = self.pretrain_models['tokenizer'].batch_decode(text_tokens)
+            if refine_text_only:
+                return text
+
+        text = [params_infer_code.get('prompt', '') + i for i in text]
+        params_infer_code.pop('prompt', '')
+        result = infer_code(self.pretrain_models, text, **params_infer_code, return_hidden=use_decoder)
+
+        if use_decoder:
+            mel_spec = [self.pretrain_models['decoder'](i[None].permute(0,2,1)) for i in result['hiddens']]
+        else:
+            mel_spec = [self.pretrain_models['dvae'](i[None].permute(0,2,1)) for i in result['ids']]
+
+        wav = [self.pretrain_models['vocos'].decode(i).cpu().numpy() for i in mel_spec]
+
+        return wav
+
+    def sample_random_speaker(self, ):
+
+        dim = self.pretrain_models['gpt'].gpt.layers[0].mlp.gate_proj.in_features
+        std, mean = self.pretrain_models['spk_stat'].chunk(2)
+        return torch.randn(dim, device=std.device) * std + mean
+
+    def init_normalizer(self, lang):
+
+        if lang not in self.normalizer:
+            if lang == 'zh':
+                try:
+                    from tn.chinese.normalizer import Normalizer
+                except:
+                    self.logger.log(logging.WARNING, f'Package WeTextProcessing not found! \
+                        Run: conda install -c conda-forge pynini=2.1.5 && pip install WeTextProcessing')
+                self.normalizer[lang] = Normalizer().normalize
+            else:
+                try:
+                    from nemo_text_processing.text_normalization.normalize import Normalizer
+                except:
+                    self.logger.log(logging.WARNING, f'Package nemo_text_processing not found! \
+                        Run: conda install -c conda-forge pynini=2.1.5 && pip install nemo_text_processing')
+                self.normalizer[lang] = partial(Normalizer(input_case='cased', lang=lang).normalize, verbose=False, punct_post_process=True)
+
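For reference, a minimal usage sketch of the vendored `Chat` wrapper above; xinference itself drives it through the new audio model in xinference/model/audio/chattts.py. The soundfile dependency, the 24 kHz sample rate, and the (1, num_samples) output shape are assumptions based on upstream ChatTTS, not on this diff:

```python
import soundfile as sf  # assumed available for writing the waveform

from xinference.thirdparty.ChatTTS import Chat

chat = Chat()
chat.load_models()  # fetches the 2Noise/ChatTTS checkpoints from Hugging Face on first use

# Skip the optional text-normalization dependencies (WeTextProcessing / nemo_text_processing).
wavs = chat.infer(
    ["Hello, this is a quick ChatTTS smoke test."],
    do_text_normalization=False,
)

# Each element is a numpy array; assumed shape (1, num_samples) at 24 kHz.
sf.write("chattts_demo.wav", wavs[0][0], 24000)
```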
xinference/types.py CHANGED

@@ -284,6 +284,7 @@ class PytorchGenerateConfig(TypedDict, total=False):
     tools: Optional[List[Dict]]
     lora_name: Optional[str]
     stream_options: Optional[Union[dict, None]]
+    request_id: Optional[str]
 
 
 class PytorchModelConfig(TypedDict, total=False):
@@ -297,6 +298,7 @@ class PytorchModelConfig(TypedDict, total=False):
     gptq_groupsize: int
     gptq_act_order: bool
     trust_remote_code: bool
+    max_num_seqs: int
 
 
 def get_pydantic_model_from_method(
@@ -361,6 +363,7 @@ class CreateCompletionTorch(BaseModel):
     top_p: float = top_p_field
     top_k: int = top_k_field
     lora_name: Optional[str]
+    request_id: Optional[str]
 
 
 CreateCompletionLlamaCpp: BaseModel
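These new fields line up with the batching and request-tracking work elsewhere in this release (core/scheduler.py and the REST additions). A sketch of how they appear in the typed configs; treating `max_num_seqs` as a cap on concurrently batched requests and `request_id` as a caller-supplied identifier is an assumption about their semantics:

```python
from xinference.types import PytorchGenerateConfig, PytorchModelConfig

# Both TypedDicts are declared with total=False, so partial dicts are valid.
model_config: PytorchModelConfig = {
    "trust_remote_code": True,
    "max_num_seqs": 16,  # new in 0.12.0; assumed: upper bound on batched requests
}

generate_config: PytorchGenerateConfig = {
    "max_tokens": 256,
    "temperature": 0.7,
    "stream": True,
    "request_id": "req-123e4567",  # new in 0.12.0; placeholder id chosen by the caller
}
```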
|