xinference 0.12.0__py3-none-any.whl → 0.12.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +74 -6
- xinference/client/restful/restful_client.py +74 -5
- xinference/constants.py +1 -0
- xinference/core/cache_tracker.py +48 -28
- xinference/core/model.py +54 -42
- xinference/core/scheduler.py +34 -16
- xinference/core/supervisor.py +73 -24
- xinference/core/worker.py +68 -2
- xinference/deploy/cmdline.py +86 -2
- xinference/deploy/test/test_cmdline.py +19 -10
- xinference/model/audio/__init__.py +14 -1
- xinference/model/audio/core.py +12 -1
- xinference/model/audio/custom.py +6 -4
- xinference/model/audio/model_spec_modelscope.json +20 -0
- xinference/model/llm/__init__.py +34 -2
- xinference/model/llm/llm_family.json +2 -0
- xinference/model/llm/llm_family.py +86 -1
- xinference/model/llm/llm_family_csghub.json +66 -0
- xinference/model/llm/llm_family_modelscope.json +2 -0
- xinference/model/llm/pytorch/chatglm.py +18 -12
- xinference/model/llm/pytorch/core.py +92 -42
- xinference/model/llm/pytorch/glm4v.py +13 -3
- xinference/model/llm/pytorch/qwen_vl.py +1 -1
- xinference/model/llm/pytorch/utils.py +27 -14
- xinference/model/llm/utils.py +14 -13
- xinference/model/llm/vllm/core.py +10 -4
- xinference/model/utils.py +8 -2
- xinference/thirdparty/ChatTTS/experimental/__init__.py +0 -0
- xinference/thirdparty/ChatTTS/experimental/llm.py +40 -0
- xinference/thirdparty/ChatTTS/infer/__init__.py +0 -0
- xinference/thirdparty/ChatTTS/infer/api.py +125 -0
- xinference/thirdparty/ChatTTS/model/__init__.py +0 -0
- xinference/thirdparty/ChatTTS/model/dvae.py +155 -0
- xinference/thirdparty/ChatTTS/model/gpt.py +265 -0
- xinference/thirdparty/ChatTTS/utils/__init__.py +0 -0
- xinference/thirdparty/ChatTTS/utils/gpu_utils.py +23 -0
- xinference/thirdparty/ChatTTS/utils/infer_utils.py +141 -0
- xinference/thirdparty/ChatTTS/utils/io_utils.py +14 -0
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/main.074e2b31.css +2 -0
- xinference/web/ui/build/static/css/main.074e2b31.css.map +1 -0
- xinference/web/ui/build/static/js/main.a58ff436.js +3 -0
- xinference/web/ui/build/static/js/main.a58ff436.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/10262a281dec3bc2b185f4385ceb6846626f52d41cb4d46c7c649e719f979d4d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/762a75a62daf3bec2cfc97ec8612798493fb34ef87087dcad6aad64ab7f14345.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/7f3bdb3a48fa00c046c8b185acd4da6f2e2940a20dbd77f9373d60de3fd6633e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f2f73bfdc13b12b02c8cbc4769b0b8e6367e9b6d8331c322d94318491a0b3653.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f51bf63ddaa7afd125ef2254a105789333eecc1c94fdf5157a9b88ef7ad0a5bd.json +1 -0
- {xinference-0.12.0.dist-info → xinference-0.12.1.dist-info}/METADATA +1 -1
- {xinference-0.12.0.dist-info → xinference-0.12.1.dist-info}/RECORD +57 -45
- xinference/web/ui/build/static/css/main.54bca460.css +0 -2
- xinference/web/ui/build/static/css/main.54bca460.css.map +0 -1
- xinference/web/ui/build/static/js/main.551aa479.js +0 -3
- xinference/web/ui/build/static/js/main.551aa479.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/1e86938a0cdf706d21e99b21f5d868fa247c0c88b26807047e26dcdc4d9a9db3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/1fa824d82b2af519de7700c594e50bde4bbca60d13bd3fabff576802e4070304.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3e737bcdbcbc407ccd65b90e199ef0c3214b261e8e41dbf14d921384a717d9ee.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/46b6dd1f6d1109cd0e2455a0ea0be3e9bda1097cd4ebec9c4040070372671cfc.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/59ce49eae0f486af4c5034d4d2f9ca77c3ec3a32ecc560085caf5ef482b5f4c9.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/a6da6bc3d0d2191adebee87fb58ecebe82d071087bd2f7f3a9c7fdd2ada130f2.json +0 -1
- /xinference/web/ui/build/static/js/{main.551aa479.js.LICENSE.txt → main.a58ff436.js.LICENSE.txt} +0 -0
- {xinference-0.12.0.dist-info → xinference-0.12.1.dist-info}/LICENSE +0 -0
- {xinference-0.12.0.dist-info → xinference-0.12.1.dist-info}/WHEEL +0 -0
- {xinference-0.12.0.dist-info → xinference-0.12.1.dist-info}/entry_points.txt +0 -0
- {xinference-0.12.0.dist-info → xinference-0.12.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/pytorch/core.py
CHANGED

@@ -15,6 +15,7 @@
 import json
 import logging
 import os
+from functools import lru_cache
 from typing import Iterable, Iterator, List, Optional, Union

 from ....core.scheduler import InferenceRequest
@@ -28,6 +29,7 @@ from ....types import (
     ChatCompletionChunk,
     ChatCompletionMessage,
     Completion,
+    CompletionChoice,
     CompletionChunk,
     CreateCompletionTorch,
     Embedding,
@@ -366,6 +368,90 @@ class PytorchModel(LLM):
         else:
             return generator_wrapper(prompt, generate_config)

+    @lru_cache
+    def get_context_len(self):
+        return get_context_length(self._model.config)
+
+    def get_max_num_seqs(self) -> int:
+        return self._pytorch_model_config.get("max_num_seqs")  # type: ignore
+
+    def prepare_batch_inference(self, req_list: List[InferenceRequest]):
+        # check some parameters
+        for r in req_list:
+            if r.sanitized_generate_config is None:
+                r.sanitized_generate_config = self._sanitize_generate_config(
+                    r.generate_config
+                )
+            if r.is_prefill:
+                # check some generate params
+                max_src_len = get_max_src_len(self.get_context_len(), r)  # type: ignore
+                if max_src_len < 0:
+                    r.stopped = True
+                    r.error_msg = "Max tokens exceeds model's max length"
+                    continue
+                if r.stream_interval <= 0:
+                    r.stopped = True
+                    r.error_msg = "`stream_interval` must be greater than 0"
+                    continue
+                stop_str = r.sanitized_generate_config.get("stop", None)
+                if stop_str and (
+                    not (isinstance(stop_str, str) or isinstance(stop_str, Iterable))
+                ):
+                    r.stopped = True
+                    r.error_msg = "Invalid `stop` field type"
+                    continue
+
+    def handle_batch_inference_results(self, req_list: List[InferenceRequest]):
+        for req in req_list:
+            if req.error_msg is None:
+                # nothing need handle for non-stream case
+                if req.stream:
+                    results = []
+                    for i, c in enumerate(req.completion):
+                        if c == "<bos_stream>":
+                            chunk = req.completion[i + 1]
+                            results.append(
+                                CompletionChunk(
+                                    id=chunk["id"],
+                                    object=chunk["object"],
+                                    created=chunk["created"],
+                                    model=chunk["model"],
+                                    choices=[
+                                        CompletionChoice(
+                                            text="",
+                                            index=0,
+                                            logprobs=None,
+                                            finish_reason=None,
+                                        )
+                                    ],
+                                )
+                            )
+                            continue
+                        elif c == "<eos_stream>":
+                            break
+                        else:
+                            results.append(c)
+
+                    if req.stopped and req.include_usage:
+                        results.append(req.completion[-1])
+                    req.completion = results
+
+    def batch_inference(self, req_list: List[InferenceRequest]):
+        from .utils import batch_inference_one_step
+
+        self.prepare_batch_inference(req_list)
+        context_len = self.get_context_len()
+        assert isinstance(context_len, int)
+        batch_inference_one_step(
+            req_list,
+            self.model_uid,
+            self._model,
+            self._tokenizer,
+            self._device,
+            context_len,
+        )
+        self.handle_batch_inference_results(req_list)
+
     def create_embedding(self, input: Union[str, List[str]]) -> Embedding:
         try:
             import torch
@@ -464,7 +550,6 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
             pytorch_model_config,
             peft_model,
         )
-        self._context_len = None

     def _sanitize_generate_config(
         self,
@@ -540,7 +625,6 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):

     def load(self):
         super().load()
-        self._context_len = get_context_length(self._model.config)

     def _get_full_prompt(self, prompt, system_prompt, chat_history, tools):
         assert self.model_family.prompt_style is not None
@@ -553,48 +637,14 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
         )
         return full_prompt

-    def
-
-
-    def batch_inference(self, req_list: List[InferenceRequest]):
-        from .utils import batch_inference_one_step
-
+    def prepare_batch_inference(self, req_list: List[InferenceRequest]):
+        super().prepare_batch_inference(req_list)
         for r in req_list:
-
-            r.
-
-            )
-            if r.is_prefill:
-                # check some generate params
-                max_src_len = get_max_src_len(self._context_len, r)  # type: ignore
-                if max_src_len < 0:
-                    r.stopped = True
-                    r.error_msg = "Max tokens exceeds model's max length"
-                    continue
-                if r.stream_interval <= 0:
-                    r.stopped = True
-                    r.error_msg = "`stream_interval` must be greater than 0"
-                    continue
-                stop_str = r.sanitized_generate_config.get("stop", None)
-                if stop_str and (
-                    not (isinstance(stop_str, str) or isinstance(stop_str, Iterable))
-                ):
-                    r.stopped = True
-                    r.error_msg = "Invalid `stop` field type"
-                    continue
-                r.full_prompt = self._get_full_prompt(
-                    r.prompt, r.system_prompt, r.chat_history, None
-                )
+            r.full_prompt = self._get_full_prompt(
+                r.prompt, r.system_prompt, r.chat_history, None
+            )

-
-        batch_inference_one_step(
-            req_list,
-            self.model_uid,
-            self._model,
-            self._tokenizer,
-            self._device,
-            self._context_len,
-        )
+    def handle_batch_inference_results(self, req_list: List[InferenceRequest]):
         for req in req_list:
             if req.stream and req.error_msg is None:
                 if req.completion:
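A side note on the new get_context_len above: decorating a zero-argument method with functools.lru_cache caches the result per instance, because self becomes the cache key. A minimal standalone sketch of that behavior (toy class, not the real PytorchModel):

    from functools import lru_cache

    class ToyModel:
        def __init__(self, context_length: int):
            self._context_length = context_length

        @lru_cache  # `self` is the cache key, so the value is cached per instance
        def get_context_len(self) -> int:
            print("computed once")
            return self._context_length

    m = ToyModel(4096)
    m.get_context_len()  # prints "computed once"
    m.get_context_len()  # served from the cache, nothing printed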
xinference/model/llm/pytorch/glm4v.py
CHANGED

@@ -56,19 +56,29 @@ class Glm4VModel(PytorchChatModel):
             return True
         return False

-    def load(self
+    def load(self):
         from transformers import AutoModelForCausalLM, AutoTokenizer

         device = self._pytorch_model_config.get("device", "auto")
         self._device = select_device(device)
-
+
+        kwargs = {"device_map": self._device}
+        quantization = self.quantization
+        if quantization != "none":
+            if self._device == "cuda" and self._is_linux():
+                kwargs["device_map"] = "auto"
+                self._device = "auto"
+            if quantization == "4-bit":
+                kwargs["load_in_4bit"] = True
+            elif quantization == "8-bit":
+                kwargs["load_in_8bit"] = True

         model = AutoModelForCausalLM.from_pretrained(
             self.model_path,
             low_cpu_mem_usage=True,
             trust_remote_code=True,
             torch_dtype=torch.float16,
-
+            **kwargs,
         )
         self._model = model.eval()

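The new branch in Glm4VModel.load above builds the from_pretrained keyword arguments from the requested quantization. A minimal sketch of that selection logic as a standalone function (the helper name and the plain is_linux flag are illustrative, not part of the diff):

    def build_load_kwargs(device: str, quantization: str, is_linux: bool) -> dict:
        # Mirrors the added branch: 4-bit/8-bit quantization switches to
        # device_map="auto" on Linux + CUDA and sets the bitsandbytes load flags.
        kwargs = {"device_map": device}
        if quantization != "none":
            if device == "cuda" and is_linux:
                kwargs["device_map"] = "auto"
            if quantization == "4-bit":
                kwargs["load_in_4bit"] = True
            elif quantization == "8-bit":
                kwargs["load_in_8bit"] = True
        return kwargs

    assert build_load_kwargs("cuda", "4-bit", True) == {"device_map": "auto", "load_in_4bit": True}
    assert build_load_kwargs("cpu", "none", False) == {"device_map": "cpu"}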
xinference/model/llm/pytorch/qwen_vl.py
CHANGED

@@ -45,7 +45,7 @@ class QwenVLChatModel(PytorchChatModel):
     def match(
         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if "qwen" in model_family.model_name:
+        if "qwen" in model_family.model_name and "vision" in model_family.model_ability:
             return True
         return False

xinference/model/llm/pytorch/utils.py
CHANGED

@@ -126,6 +126,7 @@ def generate_stream(
     stop_str = generate_config.get("stop", None)
     stop_token_ids = generate_config.get("stop_token_ids", None) or []
     stop_token_ids.append(tokenizer.eos_token_id)
+    chunk_id = str(uuid.uuid4())

     logits_processor = prepare_logits_processor(
         temperature, repetition_penalty, top_p, top_k
@@ -289,7 +290,7 @@
                 text=output, index=0, logprobs=None, finish_reason=None
             )
             completion_chunk = CompletionChunk(
-                id=
+                id=chunk_id,
                 object="text_completion",
                 created=int(time.time()),
                 model=model_uid,
@@ -327,7 +328,7 @@
     )

     completion_chunk = CompletionChunk(
-        id=
+        id=chunk_id,
         object="text_completion",
         created=int(time.time()),
         model=model_uid,
@@ -343,7 +344,7 @@

     if include_usage:
         completion_chunk = CompletionChunk(
-            id=
+            id=chunk_id,
             object="text_completion",
             created=int(time.time()),
             model=model_uid,
@@ -390,6 +391,7 @@ def generate_stream_falcon(
     stop_str = generate_config.get("stop", None)
     stop_token_ids = generate_config.get("stop_token_ids", None) or []
     stop_token_ids.append(tokenizer.eos_token_id)
+    chunk_id = str(uuid.uuid4())

     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
     input_ids = inputs["input_ids"]
@@ -473,7 +475,7 @@
                 text=output, index=0, logprobs=None, finish_reason=None
             )
             completion_chunk = CompletionChunk(
-                id=
+                id=chunk_id,
                 object="text_completion",
                 created=int(time.time()),
                 model=model_uid,
@@ -500,7 +502,7 @@
         text=output, index=0, logprobs=None, finish_reason=finish_reason
     )
     completion_chunk = CompletionChunk(
-        id=
+        id=chunk_id,
         object="text_completion",
         created=int(time.time()),
         model=model_uid,
@@ -516,7 +518,7 @@

     if include_usage:
         completion_chunk = CompletionChunk(
-            id=
+            id=chunk_id,
             object="text_completion",
             created=int(time.time()),
             model=model_uid,
@@ -586,6 +588,7 @@ def get_max_src_len(context_len: int, r: InferenceRequest) -> int:

 def _get_completion_chunk(
     output: str,
+    chunk_id: str,
     finish_reason: Optional[str],
     model_uid: str,
     r: InferenceRequest,
@@ -601,7 +604,7 @@ def _get_completion_chunk(
         else []
     )
     completion_chunk = CompletionChunk(
-        id=
+        id=chunk_id,
         object="text_completion",
         created=int(time.time()),
         model=model_uid,
@@ -617,14 +620,18 @@


 def _get_completion(
-    output: str,
+    output: str,
+    chunk_id: str,
+    finish_reason: Optional[str],
+    model_uid: str,
+    r: InferenceRequest,
 ):
     completion_choice = CompletionChoice(
         text=output, index=0, logprobs=None, finish_reason=finish_reason
     )

     completion_chunk = CompletionChunk(
-        id=
+        id=chunk_id,
         object="text_completion",
         created=int(time.time()),
         model=model_uid,
@@ -701,7 +708,7 @@ def _batch_inference_one_step_internal(
     decode_reqs = []
     for r in valid_req_list:
         if r.is_prefill:
-            prompts.append(r.full_prompt)
+            prompts.append(r.full_prompt if r.full_prompt is not None else r.prompt)
             prefill_reqs.append(r)
         else:
             decode_reqs.append(r)
@@ -846,7 +853,7 @@
                     r.last_output_length += len(output)

                     completion_chunk = _get_completion_chunk(
-                        output, r.finish_reason, model_uid, r, False
+                        output, r.chunk_id, r.finish_reason, model_uid, r, False
                     )
                     r.completion.append(completion_chunk)
                     if r.stopped:
@@ -859,7 +866,7 @@
                 if r.stopped and _i == decode_round - 1 and include_usage:
                     r.completion.append(
                         _get_completion_chunk(
-                            "", r.finish_reason, model_uid, r, True
+                            "", r.chunk_id, r.finish_reason, model_uid, r, True
                         )
                     )
             else:
@@ -878,7 +885,9 @@
                     if r not in output_mapping
                     else output_mapping[r]
                 )
-                completion = _get_completion(
+                completion = _get_completion(
+                    outputs, r.chunk_id, r.finish_reason, model_uid, r
+                )
                 r.completion = [completion]

         e_time = time.time()
@@ -911,4 +920,8 @@ def batch_inference_one_step(
         os._exit(1)
     except Exception as e:
         logger.exception(f"Internal error for batch inference: {e}.")
-        #
+        # If internal error happens, just skip all the requests in this batch.
+        # If not handle here, the client will hang.
+        for r in req_list:
+            r.stopped = True
+            r.error_msg = str(e)
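The recurring change in this file is that every chunk of a single stream now carries one chunk_id, generated once with uuid.uuid4(), instead of a fresh id per chunk. A minimal standalone sketch of that contract (plain dicts instead of the real CompletionChunk type):

    import time
    import uuid
    from typing import Dict, Iterator, List

    def toy_stream(model_uid: str, pieces: List[str]) -> Iterator[Dict]:
        chunk_id = str(uuid.uuid4())  # generated once, reused for every chunk
        for text in pieces:
            yield {
                "id": chunk_id,
                "object": "text_completion",
                "created": int(time.time()),
                "model": model_uid,
                "choices": [{"text": text, "index": 0, "logprobs": None, "finish_reason": None}],
            }

    ids = {chunk["id"] for chunk in toy_stream("my-model", ["Hel", "lo"])}
    assert len(ids) == 1  # all chunks of one stream share the same id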
xinference/model/llm/utils.py
CHANGED

@@ -607,7 +607,7 @@ Begin!"""
         return arguments, None, None

     @staticmethod
-    def
+    def _eval_glm_chat_arguments(c, tools):
         if isinstance(c[0], str):
             return c[0], None, None
         return None, c[0]["name"], c[0]["parameters"]
@@ -659,9 +659,9 @@
         family = model_family.model_family or model_family.model_name
         if family in ["gorilla-openfunctions-v1", "gorilla-openfunctions-v2"]:
             content, func, args = cls._eval_gorilla_openfunctions_arguments(c, tools)
-        elif "chatglm3"
-            content, func, args = cls.
-        elif family in ["qwen-chat", "qwen1.5-chat"]:
+        elif family in ["chatglm3", "glm4-chat"]:
+            content, func, args = cls._eval_glm_chat_arguments(c, tools)
+        elif family in ["qwen-chat", "qwen1.5-chat", "qwen2-instruct"]:
             content, func, args = cls._eval_qwen_chat_arguments(c, tools)
         else:
             raise Exception(
@@ -676,28 +676,29 @@
         Generates a filter function for Qwen series models to retain outputs after "\nFinal Answer:".

         Returns:
-            A function that takes tokens (string output by the model so far) as input
-            returns
+            A function that takes tokens (string output by the model so far) and delta (new tokens added) as input,
+            returns the part after "\nFinal Answer:" if found, else returns delta.
         """
         family = model_family.model_family or model_family.model_name
         if family in ["qwen-chat", "qwen1.5-chat"]:
             # Encapsulating function to reset 'found' after each call
             found = False

-            def
+            def process_tokens(tokens: str, delta: str):
                 nonlocal found
                 # Once "Final Answer:" is found, future tokens are allowed.
                 if found:
-                    return
+                    return delta
                 # Check if the token ends with "\nFinal Answer:" and update `found`.
-
+                final_answer_idx = tokens.lower().rfind("\nfinal answer:")
+                if final_answer_idx != -1:
                     found = True
-
+                    return tokens[final_answer_idx + len("\nfinal answer:") :]
+                return ""

-            return
+            return process_tokens
         else:
-
-            return lambda tokens: True
+            return lambda tokens, delta: delta

     @classmethod
     def _tool_calls_completion(cls, model_family, model_uid, c, tools):
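The process_tokens closure above now takes (tokens, delta) and suppresses Qwen's ReAct scaffolding until "\nFinal Answer:" appears in the accumulated text. A standalone re-implementation of that closure, for illustration only, with a few example calls:

    def make_final_answer_filter():
        found = False

        def process_tokens(tokens: str, delta: str) -> str:
            nonlocal found
            if found:
                return delta  # after the marker, pass deltas through unchanged
            idx = tokens.lower().rfind("\nfinal answer:")
            if idx != -1:
                found = True
                return tokens[idx + len("\nfinal answer:"):]
            return ""  # still inside the thought/action scaffolding

        return process_tokens

    f = make_final_answer_filter()
    assert f("Thought: I need a tool", " tool") == ""
    assert f("Thought: ...\nFinal Answer: 42", " 42") == " 42"
    assert f("Thought: ...\nFinal Answer: 42 is", " is") == " is"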
xinference/model/llm/vllm/core.py
CHANGED

@@ -444,7 +444,9 @@ class VLLMModel(LLM):
                 _content, func, args = ChatModelMixin._eval_tool_arguments(
                     self.model_family, chunk, tools
                 )
-                choice["text"] =
+                choice["text"] = tools_token_filter(
+                    tokens=previous_texts[0], delta=choice_delta
+                )
                 if func is not None:
                     choice["text"] = None
                     choice["finish_reason"] = "tool_calls"
@@ -458,9 +460,13 @@
                         ),
                     )
                 ]
-
-
-
+            else:
+                # use a filter function to skip Qwen's react thought process
+                choice["text"] = tools_token_filter(
+                    tokens=previous_texts[0], delta=choice["text"]
+                )
+                if not choice["text"]:
+                    continue
             prompt_tokens = len(_request_output.prompt_token_ids)
             completion_tokens = sum(
                 len(output.token_ids) for output in _request_output.outputs
xinference/model/utils.py
CHANGED

@@ -42,14 +42,20 @@ def is_locale_chinese_simplified() -> bool:


 def download_from_modelscope() -> bool:
-    if os.environ.get(XINFERENCE_ENV_MODEL_SRC)
-        return
+    if os.environ.get(XINFERENCE_ENV_MODEL_SRC):
+        return os.environ.get(XINFERENCE_ENV_MODEL_SRC) == "modelscope"
     elif is_locale_chinese_simplified():
         return True
     else:
         return False


+def download_from_csghub() -> bool:
+    if os.environ.get(XINFERENCE_ENV_MODEL_SRC) == "csghub":
+        return True
+    return False
+
+
 def symlink_local_file(path: str, local_dir: str, relpath: str) -> str:
     from huggingface_hub.file_download import _create_symlink

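With download_from_csghub added above, the XINFERENCE_ENV_MODEL_SRC environment variable now selects the model download source explicitly instead of relying on the locale-based default. A small sketch of the expected behavior, assuming the constant is exposed from xinference.constants:

    import os
    from xinference.constants import XINFERENCE_ENV_MODEL_SRC
    from xinference.model.utils import download_from_csghub, download_from_modelscope

    os.environ[XINFERENCE_ENV_MODEL_SRC] = "csghub"
    assert download_from_csghub() is True
    assert download_from_modelscope() is False

    os.environ[XINFERENCE_ENV_MODEL_SRC] = "modelscope"
    assert download_from_csghub() is False
    assert download_from_modelscope() is True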
xinference/thirdparty/ChatTTS/experimental/__init__.py
File without changes
xinference/thirdparty/ChatTTS/experimental/llm.py
ADDED

@@ -0,0 +1,40 @@
+
+from openai import OpenAI
+
+prompt_dict = {
+    'kimi': [ {"role": "system", "content": "你是 Kimi,由 Moonshot AI 提供的人工智能助手,你更擅长中文和英文的对话。"},
+              {"role": "user", "content": "你好,请注意你现在生成的文字要按照人日常生活的口吻,你的回复将会后续用TTS模型转为语音,并且请把回答控制在100字以内。并且标点符号仅包含逗号和句号,将数字等转为文字回答。"},
+              {"role": "assistant", "content": "好的,我现在生成的文字将按照人日常生活的口吻, 并且我会把回答控制在一百字以内, 标点符号仅包含逗号和句号,将阿拉伯数字等转为中文文字回答。下面请开始对话。"},],
+    'deepseek': [
+        {"role": "system", "content": "You are a helpful assistant"},
+        {"role": "user", "content": "你好,请注意你现在生成的文字要按照人日常生活的口吻,你的回复将会后续用TTS模型转为语音,并且请把回答控制在100字以内。并且标点符号仅包含逗号和句号,将数字等转为文字回答。"},
+        {"role": "assistant", "content": "好的,我现在生成的文字将按照人日常生活的口吻, 并且我会把回答控制在一百字以内, 标点符号仅包含逗号和句号,将阿拉伯数字等转为中文文字回答。下面请开始对话。"},],
+    'deepseek_TN': [
+        {"role": "system", "content": "You are a helpful assistant"},
+        {"role": "user", "content": "你好,现在我们在处理TTS的文本输入,下面将会给你输入一段文本,请你将其中的阿拉伯数字等等转为文字表达,并且输出的文本里仅包含逗号和句号这两个标点符号"},
+        {"role": "assistant", "content": "好的,我现在对TTS的文本输入进行处理。这一般叫做text normalization。下面请输入"},
+        {"role": "user", "content": "We paid $123 for this desk."},
+        {"role": "assistant", "content": "We paid one hundred and twenty three dollars for this desk."},
+        {"role": "user", "content": "详询请拨打010-724654"},
+        {"role": "assistant", "content": "详询请拨打零幺零,七二四六五四"},
+        {"role": "user", "content": "罗森宣布将于7月24日退市,在华门店超6000家!"},
+        {"role": "assistant", "content": "罗森宣布将于七月二十四日退市,在华门店超过六千家。"},
+    ],
+}
+
+class llm_api:
+    def __init__(self, api_key, base_url, model):
+        self.client = OpenAI(
+            api_key = api_key,
+            base_url = base_url,
+        )
+        self.model = model
+    def call(self, user_question, temperature = 0.3, prompt_version='kimi', **kwargs):
+
+        completion = self.client.chat.completions.create(
+            model = self.model,
+            messages = prompt_dict[prompt_version]+[{"role": "user", "content": user_question},],
+            temperature = temperature,
+            **kwargs
+        )
+        return completion.choices[0].message.content
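A hypothetical usage sketch of the vendored llm_api helper above; the key, URL and model name are placeholders, and the endpoint only needs to be OpenAI-compatible:

    api = llm_api(
        api_key="sk-...",                   # placeholder key
        base_url="https://example.com/v1",  # any OpenAI-compatible endpoint
        model="some-chat-model",            # placeholder model name
    )
    reply = api.call("Please introduce yourself in one sentence.", prompt_version="deepseek")
    print(reply)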
xinference/thirdparty/ChatTTS/infer/__init__.py
File without changes
xinference/thirdparty/ChatTTS/infer/api.py
ADDED

@@ -0,0 +1,125 @@
+
+import torch
+import torch.nn.functional as F
+from transformers.generation import TopKLogitsWarper, TopPLogitsWarper
+from ..utils.infer_utils import CustomRepetitionPenaltyLogitsProcessorRepeat
+
+def infer_code(
+    models,
+    text,
+    spk_emb = None,
+    top_P = 0.7,
+    top_K = 20,
+    temperature = 0.3,
+    repetition_penalty = 1.05,
+    max_new_token = 2048,
+    **kwargs
+):
+
+    device = next(models['gpt'].parameters()).device
+
+    if not isinstance(text, list):
+        text = [text]
+
+    if not isinstance(temperature, list):
+        temperature = [temperature] * models['gpt'].num_vq
+
+    if spk_emb is not None:
+        text = [f'[Stts][spk_emb]{i}[Ptts]' for i in text]
+    else:
+        text = [f'[Stts][empty_spk]{i}[Ptts]' for i in text]
+
+    text_token = models['tokenizer'](text, return_tensors='pt', add_special_tokens=False, padding=True).to(device)
+    input_ids = text_token['input_ids'][...,None].expand(-1, -1, models['gpt'].num_vq)
+    text_mask = torch.ones(text_token['input_ids'].shape, dtype=bool, device=device)
+
+    inputs = {
+        'input_ids': input_ids,
+        'text_mask': text_mask,
+        'attention_mask': text_token['attention_mask'],
+    }
+
+    emb = models['gpt'].get_emb(**inputs)
+    if spk_emb is not None:
+        emb[inputs['input_ids'][..., 0] == models['tokenizer'].convert_tokens_to_ids('[spk_emb]')] = \
+            F.normalize(spk_emb.to(device).to(emb.dtype)[None].expand(len(text), -1), p=2.0, dim=1, eps=1e-12)
+
+    num_code = models['gpt'].emb_code[0].num_embeddings - 1
+
+    LogitsWarpers = []
+    if top_P is not None:
+        LogitsWarpers.append(TopPLogitsWarper(top_P, min_tokens_to_keep=3))
+    if top_K is not None:
+        LogitsWarpers.append(TopKLogitsWarper(top_K, min_tokens_to_keep=3))
+
+    LogitsProcessors = []
+    if repetition_penalty is not None and repetition_penalty != 1:
+        LogitsProcessors.append(CustomRepetitionPenaltyLogitsProcessorRepeat(\
+            repetition_penalty, num_code, 16))
+
+    result = models['gpt'].generate(
+        emb, inputs['input_ids'],
+        temperature = torch.tensor(temperature, device=device),
+        attention_mask = inputs['attention_mask'],
+        LogitsWarpers = LogitsWarpers,
+        LogitsProcessors = LogitsProcessors,
+        eos_token = num_code,
+        max_new_token = max_new_token,
+        infer_text = False,
+        **kwargs
+    )
+
+    return result
+
+
+def refine_text(
+    models,
+    text,
+    top_P = 0.7,
+    top_K = 20,
+    temperature = 0.7,
+    repetition_penalty = 1.0,
+    max_new_token = 384,
+    prompt = '',
+    **kwargs
+):
+
+    device = next(models['gpt'].parameters()).device
+
+    if not isinstance(text, list):
+        text = [text]
+
+    assert len(text), 'text should not be empty'
+
+    text = [f"[Sbreak]{i}[Pbreak]{prompt}" for i in text]
+    text_token = models['tokenizer'](text, return_tensors='pt', add_special_tokens=False, padding=True).to(device)
+    text_mask = torch.ones(text_token['input_ids'].shape, dtype=bool, device=device)
+
+    inputs = {
+        'input_ids': text_token['input_ids'][...,None].expand(-1, -1, models['gpt'].num_vq),
+        'text_mask': text_mask,
+        'attention_mask': text_token['attention_mask'],
+    }
+
+    LogitsWarpers = []
+    if top_P is not None:
+        LogitsWarpers.append(TopPLogitsWarper(top_P, min_tokens_to_keep=3))
+    if top_K is not None:
+        LogitsWarpers.append(TopKLogitsWarper(top_K, min_tokens_to_keep=3))
+
+    LogitsProcessors = []
+    if repetition_penalty is not None and repetition_penalty != 1:
+        LogitsProcessors.append(CustomRepetitionPenaltyLogitsProcessorRepeat(repetition_penalty, len(models['tokenizer']), 16))
+
+    result = models['gpt'].generate(
+        models['gpt'].get_emb(**inputs), inputs['input_ids'],
+        temperature = torch.tensor([temperature,], device=device),
+        attention_mask = inputs['attention_mask'],
+        LogitsWarpers = LogitsWarpers,
+        LogitsProcessors = LogitsProcessors,
+        eos_token = torch.tensor(models['tokenizer'].convert_tokens_to_ids('[Ebreak]'), device=device)[None],
+        max_new_token = max_new_token,
+        infer_text = True,
+        **kwargs
+    )
+    return result
xinference/thirdparty/ChatTTS/model/__init__.py
File without changes