xinference 0.13.2__py3-none-any.whl → 0.13.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference might be problematic.
- xinference/__init__.py +0 -1
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +26 -4
- xinference/client/restful/restful_client.py +16 -1
- xinference/core/chat_interface.py +2 -2
- xinference/core/model.py +8 -3
- xinference/core/scheduler.py +4 -4
- xinference/model/audio/core.py +5 -2
- xinference/model/audio/cosyvoice.py +136 -0
- xinference/model/audio/model_spec.json +24 -0
- xinference/model/audio/model_spec_modelscope.json +27 -0
- xinference/model/flexible/launchers/__init__.py +1 -0
- xinference/model/flexible/launchers/image_process_launcher.py +70 -0
- xinference/model/image/model_spec.json +7 -0
- xinference/model/image/stable_diffusion/core.py +6 -1
- xinference/model/llm/llm_family.json +802 -82
- xinference/model/llm/llm_family_csghub.json +39 -0
- xinference/model/llm/llm_family_modelscope.json +295 -47
- xinference/model/llm/pytorch/chatglm.py +243 -5
- xinference/model/llm/pytorch/cogvlm2.py +1 -1
- xinference/model/llm/utils.py +78 -1
- xinference/model/llm/vllm/core.py +8 -0
- xinference/thirdparty/cosyvoice/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/bin/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/bin/inference.py +114 -0
- xinference/thirdparty/cosyvoice/bin/train.py +136 -0
- xinference/thirdparty/cosyvoice/cli/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/cli/cosyvoice.py +83 -0
- xinference/thirdparty/cosyvoice/cli/frontend.py +168 -0
- xinference/thirdparty/cosyvoice/cli/model.py +60 -0
- xinference/thirdparty/cosyvoice/dataset/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/dataset/dataset.py +160 -0
- xinference/thirdparty/cosyvoice/dataset/processor.py +369 -0
- xinference/thirdparty/cosyvoice/flow/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/flow/decoder.py +222 -0
- xinference/thirdparty/cosyvoice/flow/flow.py +135 -0
- xinference/thirdparty/cosyvoice/flow/flow_matching.py +138 -0
- xinference/thirdparty/cosyvoice/flow/length_regulator.py +49 -0
- xinference/thirdparty/cosyvoice/hifigan/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/hifigan/f0_predictor.py +55 -0
- xinference/thirdparty/cosyvoice/hifigan/generator.py +391 -0
- xinference/thirdparty/cosyvoice/llm/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/llm/llm.py +206 -0
- xinference/thirdparty/cosyvoice/transformer/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/transformer/activation.py +84 -0
- xinference/thirdparty/cosyvoice/transformer/attention.py +326 -0
- xinference/thirdparty/cosyvoice/transformer/convolution.py +145 -0
- xinference/thirdparty/cosyvoice/transformer/decoder.py +396 -0
- xinference/thirdparty/cosyvoice/transformer/decoder_layer.py +132 -0
- xinference/thirdparty/cosyvoice/transformer/embedding.py +293 -0
- xinference/thirdparty/cosyvoice/transformer/encoder.py +472 -0
- xinference/thirdparty/cosyvoice/transformer/encoder_layer.py +236 -0
- xinference/thirdparty/cosyvoice/transformer/label_smoothing_loss.py +96 -0
- xinference/thirdparty/cosyvoice/transformer/positionwise_feed_forward.py +115 -0
- xinference/thirdparty/cosyvoice/transformer/subsampling.py +383 -0
- xinference/thirdparty/cosyvoice/utils/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/utils/class_utils.py +70 -0
- xinference/thirdparty/cosyvoice/utils/common.py +103 -0
- xinference/thirdparty/cosyvoice/utils/executor.py +110 -0
- xinference/thirdparty/cosyvoice/utils/file_utils.py +41 -0
- xinference/thirdparty/cosyvoice/utils/frontend_utils.py +125 -0
- xinference/thirdparty/cosyvoice/utils/mask.py +227 -0
- xinference/thirdparty/cosyvoice/utils/scheduler.py +739 -0
- xinference/thirdparty/cosyvoice/utils/train_utils.py +289 -0
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/{main.95c1d652.js → main.2ef0cfaf.js} +3 -3
- xinference/web/ui/build/static/js/main.2ef0cfaf.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b6807ecc0c231fea699533518a0eb2a2bf68a081ce00d452be40600dbffa17a7.json +1 -0
- {xinference-0.13.2.dist-info → xinference-0.13.3.dist-info}/METADATA +16 -8
- {xinference-0.13.2.dist-info → xinference-0.13.3.dist-info}/RECORD +76 -32
- xinference/web/ui/build/static/js/main.95c1d652.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/709711edada3f1596b309d571285fd31f1c364d66f4425bc28723d0088cc351a.json +0 -1
- /xinference/web/ui/build/static/js/{main.95c1d652.js.LICENSE.txt → main.2ef0cfaf.js.LICENSE.txt} +0 -0
- {xinference-0.13.2.dist-info → xinference-0.13.3.dist-info}/LICENSE +0 -0
- {xinference-0.13.2.dist-info → xinference-0.13.3.dist-info}/WHEEL +0 -0
- {xinference-0.13.2.dist-info → xinference-0.13.3.dist-info}/entry_points.txt +0 -0
- {xinference-0.13.2.dist-info → xinference-0.13.3.dist-info}/top_level.txt +0 -0
xinference/model/llm/pytorch/chatglm.py
CHANGED
@@ -11,10 +11,17 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import copy
+import json
+import threading
 import time
 import uuid
 from typing import Any, Dict, Iterator, List, Optional, Union
 
+import torch
+from transformers.generation.logits_process import LogitsProcessor
+from transformers.generation.utils import LogitsProcessorList
+
 from ....core.scheduler import InferenceRequest
 from ....types import (
     SPECIAL_TOOL_PROMPT,
@@ -33,6 +40,16 @@ from ..utils import GLM4_TOOL_CALL_FAMILY
 from .core import PytorchChatModel, PytorchModelConfig
 
 
+class InvalidScoreLogitsProcessor(LogitsProcessor):
+    def __call__(
+        self, input_ids: torch.LongTensor, scores: torch.FloatTensor
+    ) -> torch.FloatTensor:
+        if torch.isnan(scores).any() or torch.isinf(scores).any():
+            scores.zero_()
+            scores[..., 198] = 5e4
+        return scores
+
+
 class ChatglmPytorchChatModel(PytorchChatModel):
     def __init__(
         self,
@@ -103,9 +120,11 @@ class ChatglmPytorchChatModel(PytorchChatModel):
         tools = generate_config.pop("tools", None)
         if tools is None:
             return False
+        # Convert a iterable to a list
+        tools = list(tools)
         tool_choice = generate_config.pop("tool_choice", "none")
         if self.model_family.model_name in GLM4_TOOL_CALL_FAMILY:
-            chat_history[:] = self.
+            chat_history[:] = self._process_messages(
                 chat_history, tools=tools, tool_choice=tool_choice
             )
             return True
@@ -124,7 +143,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
         return True
 
     @staticmethod
-    def
+    def _process_messages(messages, tools=None, tool_choice="none"):
         # This method is adapted from https://github.com/THUDM/GLM-4/blob/main/basic_demo/openai_api_server.py
         _messages = messages
         processed_messages = []
@@ -210,6 +229,209 @@ class ChatglmPytorchChatModel(PytorchChatModel):
                     break
         return processed_messages
 
+    @staticmethod
+    def _process_response(output, history, tools, end=False):
+        # Copy from https://huggingface.co/THUDM/glm-4-9b-chat/blob/main/modeling_chatglm.py
+        content = ""
+        history = copy.deepcopy(history)
+        if not tools and end:
+            return None, None
+        for response in output.split("<|assistant|>"):
+            if "\n" in response:
+                metadata, content = response.split("\n", maxsplit=1)
+            else:
+                metadata, content = "", response
+            if not metadata.strip():
+                if tools and any(t.startswith(response) for t in tools) and not end:
+                    # Waiting for tool call complete.
+                    return None, None
+                content = content.strip()
+                history.append(
+                    {"role": "assistant", "metadata": metadata, "content": content}
+                )
+                content = content.replace("[[训练时间]]", "2023年")
+            else:
+                if tools and metadata in tools and not end:
+                    return None, None
+                history.append(
+                    {"role": "assistant", "metadata": metadata, "content": content}
+                )
+                metadata = metadata.strip()
+                if tools and metadata in tools and end:
+                    try:
+                        parameters = json.loads(content)
+                        content = {"name": metadata.strip(), "parameters": parameters}
+                    except json.JSONDecodeError:
+                        content = {"name": metadata.strip(), "content": content}
+                else:
+                    content = {"name": metadata.strip(), "content": content}
+        return content, history
+
+    def _get_generate_args(
+        self,
+        tokenizer,
+        query: str,
+        history: Optional[List[Dict]] = None,
+        role: str = "user",
+        past_key_values=None,
+        max_length: int = 8192,
+        do_sample=True,
+        top_p=0.8,
+        temperature=0.8,
+        logits_processor=None,
+        **kwargs,
+    ):
+        # Copy from https://huggingface.co/THUDM/glm-4-9b-chat/blob/main/modeling_chatglm.py
+        if history is None:
+            history = []
+        if logits_processor is None:
+            logits_processor = LogitsProcessorList()
+        logits_processor.append(InvalidScoreLogitsProcessor())
+        eos_token_id = [
+            tokenizer.eos_token_id,
+            tokenizer.convert_tokens_to_ids("<|user|>"),
+            tokenizer.convert_tokens_to_ids("<|observation|>"),
+        ]
+        gen_kwargs = {
+            "max_length": max_length,
+            "do_sample": do_sample,
+            "top_p": top_p,
+            "temperature": temperature,
+            "logits_processor": logits_processor,
+            **kwargs,
+        }
+        if past_key_values is None:
+            inputs = tokenizer.apply_chat_template(
+                history + [{"role": role, "content": query}],
+                add_generation_prompt=True,
+                tokenize=True,
+                return_tensors="pt",
+                return_dict=True,
+            )
+        else:
+            inputs = tokenizer.apply_chat_template(
+                [{"role": role, "content": query}],
+                add_special_tokens=False,
+                add_generation_prompt=True,
+                tokenize=True,
+                return_tensors="pt",
+                return_dict=True,
+            )
+        inputs = inputs.to(self._model.device)
+        if past_key_values is not None:
+            past_length = past_key_values[0][0].shape[2]
+            inputs.position_ids += past_length
+            attention_mask = inputs.attention_mask
+            attention_mask = torch.cat(
+                (attention_mask.new_ones(1, past_length), attention_mask), dim=1
+            )
+            inputs["attention_mask"] = attention_mask
+        history.append({"role": role, "content": query})
+        tools = history[0]["role"] == "system" and history[0].get("tools")
+        tools = (
+            [
+                t.get("function", {}).get("name", "")
+                for t in tools
+                if isinstance(t, dict)
+            ]
+            if tools
+            else []
+        )
+        kwargs = dict(inputs)
+        kwargs["past_key_values"] = past_key_values
+        kwargs["eos_token_id"] = eos_token_id
+        kwargs.update(gen_kwargs)
+        return kwargs, tools
+
+    @torch.inference_mode()
+    def stream_chat(
+        self,
+        tokenizer,
+        query: str,
+        history: Optional[List[Dict]] = None,
+        role: str = "user",
+        past_key_values=None,
+        max_length: int = 8192,
+        do_sample=True,
+        top_p=0.8,
+        temperature=0.8,
+        logits_processor=None,
+        **kwargs,
+    ):
+        from transformers import TextIteratorStreamer
+
+        kwargs, tools = self._get_generate_args(
+            tokenizer=tokenizer,
+            query=query,
+            history=history,
+            role=role,
+            past_key_values=past_key_values,
+            max_length=max_length,
+            do_sample=do_sample,
+            top_p=top_p,
+            temperature=temperature,
+            logits_processor=logits_processor,
+            **kwargs,
+        )
+
+        streamer = TextIteratorStreamer(
+            tokenizer, skip_prompt=True, skip_special_tokens=True
+        )
+        kwargs["streamer"] = streamer
+        thread = threading.Thread(target=self._model.generate, kwargs=kwargs)
+        thread.start()
+
+        response = ""
+        for token in streamer:
+            response += token
+            if response and response[-1] != "�":
+                new_response, new_history = self._process_response(
+                    response, history, tools, end=False
+                )
+                if new_response is None:
+                    continue
+                yield new_response, new_history
+        if tools:
+            new_response, new_history = self._process_response(
+                response, history, tools, end=True
+            )
+            if new_response:
+                yield new_response, new_history
+
+    @torch.inference_mode()
+    def non_stream_chat(
+        self,
+        tokenizer,
+        query: str,
+        history: Optional[List[Dict]] = None,
+        role: str = "user",
+        past_key_values=None,
+        max_length: int = 8192,
+        do_sample=True,
+        top_p=0.8,
+        temperature=0.8,
+        logits_processor=None,
+        **kwargs,
+    ):
+        kwargs, tools = self._get_generate_args(
+            tokenizer=tokenizer,
+            query=query,
+            history=history,
+            role=role,
+            past_key_values=past_key_values,
+            max_length=max_length,
+            do_sample=do_sample,
+            top_p=top_p,
+            temperature=temperature,
+            logits_processor=logits_processor,
+            **kwargs,
+        )
+
+        outputs = self._model.generate(**kwargs)
+        outputs = outputs[:, kwargs["input_ids"].shape[1] :]
+        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        return self._process_response(response, history, tools, end=True)
+
     def chat(
         self,
         prompt: str,
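The two helpers added above mirror the GLM-4 reference implementation: stream_chat runs generation on a background thread and yields (response, history) pairs, where the response is plain text or, once a tool call is complete, a dict carrying the tool name and arguments, while non_stream_chat returns the final pair. A minimal consumption sketch follows; it is illustrative only, and `model` and `tokenizer` are assumed stand-ins for an already loaded ChatglmPytorchChatModel and its tokenizer.

# Illustrative sketch only; `model` and `tokenizer` are assumed to be an
# already loaded ChatglmPytorchChatModel instance and its tokenizer.
history = [{"role": "system", "content": "You are a helpful assistant."}]
for response, new_history in model.stream_chat(tokenizer, "What is 2 + 2?", history):
    if isinstance(response, dict):
        # A finished tool call: {"name": ..., "parameters": ...} or {"name": ..., "content": ...}
        print("tool call:", response)
    else:
        print("partial text:", response)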
@@ -247,7 +469,13 @@ class ChatglmPytorchChatModel(PytorchChatModel):
             if isinstance(stream_options, dict)
             else False
         )
-        if stream and
+        if stream and (
+            not tools or self.model_family.model_name in GLM4_TOOL_CALL_FAMILY
+        ):
+            if self.model_family.model_name in GLM4_TOOL_CALL_FAMILY:
+                stream_chat = self.stream_chat
+            else:
+                stream_chat = self._model.stream_chat
 
             def _stream_generator():
                 last_chunk_text_length = 0
@@ -256,9 +484,14 @@ class ChatglmPytorchChatModel(PytorchChatModel):
                 inputs = self._tokenizer([prompt], return_tensors="pt")
                 inputs = inputs.to(self._model.device)
                 prompt_tokens = len(inputs["input_ids"][0])
-                for chunk_text, _ in
+                for chunk_text, _ in stream_chat(
                     self._tokenizer, prompt, chat_history, **kwargs
                 ):
+                    if tools and isinstance(chunk_text, dict):
+                        yield self._tool_calls_completion_chunk(
+                            self.model_family, self.model_uid, [chunk_text, _], tools
+                        )
+                        return
                     completion_tokens = completion_tokens + 1
                     total_tokens = prompt_tokens + completion_tokens
                     chunk_text = chunk_text[last_chunk_text_length:]
@@ -312,7 +545,12 @@ class ChatglmPytorchChatModel(PytorchChatModel):
 
             return self._to_chat_completion_chunks(_stream_generator())
         else:
-
+            if self.model_family.model_name in GLM4_TOOL_CALL_FAMILY:
+                chat = self.non_stream_chat
+            else:
+                chat = self._model.chat
+
+            response = chat(self._tokenizer, prompt, chat_history, **kwargs)
             if tools:
                 return self._tool_calls_completion(
                     self.model_family, self.model_uid, response, tools
xinference/model/llm/pytorch/cogvlm2.py
CHANGED
@@ -387,7 +387,7 @@ class CogVLM2Model(PytorchChatModel):
             prompt, system_prompt=system_prompt, chat_history=chat_history
         )
 
-        input_by_model: dict = self._model.build_conversation_input_ids(
+        input_by_model: dict = self._model.build_conversation_input_ids(  # type: ignore
             self._tokenizer,
             query=query,
             history=history,
xinference/model/llm/utils.py
CHANGED
@@ -483,11 +483,40 @@ Begin!"""
                 else:
                     ret += role
             return ret
+        elif prompt_style.style_name == "mistral-nemo":
+            seps = [prompt_style.intra_message_sep, prompt_style.inter_message_sep]
+            ret = "<s>"
+            for i, message in enumerate(chat_history):
+                role = get_role(message["role"])
+                content = message["content"]
+                if content:
+                    if i == len(chat_history) - 2 and prompt_style.system_prompt:
+                        ret += (
+                            role
+                            + " "
+                            + prompt_style.system_prompt
+                            + "\n\n"
+                            + content
+                            + seps[i % 2]
+                        )
+                    else:
+                        ret += role + " " + content + seps[i % 2]
+                else:
+                    ret += role
+            return ret
         else:
             raise ValueError(f"Invalid prompt style: {prompt_style.style_name}")
 
     @classmethod
     def _to_chat_completion_chunk(cls, chunk: CompletionChunk) -> ChatCompletionChunk:
+        choices = chunk.get("choices")
+        if (
+            chunk.get("object") == "chat.completion.chunk"
+            and choices
+            and "delta" in choices[0]
+        ):
+            # Already a ChatCompletionChunk, we don't need to convert chunk.
+            return cast(ChatCompletionChunk, chunk)
         chat_chunk = {
             "id": "chat" + chunk["id"],
             "model": chunk["model"],
@@ -497,7 +526,7 @@ Begin!"""
             {
                 "index": i,
                 "delta": {
-                    "content": choice
+                    "content": choice.get("text"),
                     **(
                         {"tool_calls": choice["tool_calls"]}
                         if "tool_calls" in choice
@@ -718,6 +747,54 @@ Begin!"""
         else:
             return lambda tokens, delta: delta
 
+    @classmethod
+    def _tool_calls_completion_chunk(cls, model_family, model_uid, c, tools):
+        _id = str(uuid.uuid4())
+        content, func, args = cls._eval_tool_arguments(model_family, c, tools)
+        if func:
+            d = {
+                "role": "assistant",
+                "content": content,
+                "tool_calls": [
+                    {
+                        "id": f"call_{_id}",
+                        "type": "function",
+                        "function": {
+                            "name": func,
+                            "arguments": json.dumps(args),
+                        },
+                    }
+                ],
+            }
+            finish_reason = "tool_calls"
+        else:
+            d = {"role": "assistant", "content": content, "tool_calls": []}
+            finish_reason = "stop"
+        try:
+            usage = c.get("usage")
+            assert "prompt_tokens" in usage
+        except Exception:
+            usage = {
+                "prompt_tokens": -1,
+                "completion_tokens": -1,
+                "total_tokens": -1,
+            }
+        return {
+            "id": "chat" + f"cmpl-{_id}",
+            "model": model_uid,
+            "object": "chat.completion.chunk",
+            "created": int(time.time()),
+            "choices": [
+                {
+                    "index": 0,
+                    "delta": d,
+                    "logprobs": None,
+                    "finish_reason": finish_reason,
+                }
+            ],
+            "usage": usage,
+        }
+
     @classmethod
     def _tool_calls_completion(cls, model_family, model_uid, c, tools):
         _id = str(uuid.uuid4())
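The dict built above follows the OpenAI chat.completion.chunk layout, so a streaming consumer reads the tool call back out of choices[0]["delta"]. A small sketch of that read path, illustrative only and not part of the diff:

import json

def extract_tool_call(chunk: dict):
    # Illustrative only: pull the function name and arguments out of a chunk
    # shaped like the return value of _tool_calls_completion_chunk above.
    choice = chunk["choices"][0]
    if choice["finish_reason"] != "tool_calls":
        return None
    call = choice["delta"]["tool_calls"][0]["function"]
    return call["name"], json.loads(call["arguments"])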
xinference/model/llm/vllm/core.py
CHANGED
@@ -151,6 +151,14 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.4.0":
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen2-moe-instruct")
     VLLM_SUPPORTED_CHAT_MODELS.append("c4ai-command-r-v01")
 
+if VLLM_INSTALLED and vllm.__version__ >= "0.5.3":
+    VLLM_SUPPORTED_CHAT_MODELS.append("mistral-nemo-instruct")
+    VLLM_SUPPORTED_CHAT_MODELS.append("mistral-large-instruct")
+
+if VLLM_INSTALLED and vllm.__version__ > "0.5.3":
+    VLLM_SUPPORTED_MODELS.append("llama-3.1")
+    VLLM_SUPPORTED_CHAT_MODELS.append("llama-3.1-instruct")
+
 
 class VLLMModel(LLM):
     def __init__(
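The new registrations are gated on the installed vLLM version, following the same pattern as the existing ">= 0.4.0" check. A quick local check of which newly added families would be enabled, illustrative only:

# Illustrative only: mirrors the version gates added above.
import vllm

if vllm.__version__ >= "0.5.3":
    print("mistral-nemo-instruct and mistral-large-instruct enabled")
if vllm.__version__ > "0.5.3":
    print("llama-3.1 and llama-3.1-instruct enabled")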
xinference/thirdparty/cosyvoice/__init__.py
File without changes
xinference/thirdparty/cosyvoice/bin/__init__.py
File without changes
xinference/thirdparty/cosyvoice/bin/inference.py
ADDED
@@ -0,0 +1,114 @@
+# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import argparse
+import logging
+logging.getLogger('matplotlib').setLevel(logging.WARNING)
+import os
+
+import torch
+from torch.utils.data import DataLoader
+import torchaudio
+from hyperpyyaml import load_hyperpyyaml
+from tqdm import tqdm
+from cosyvoice.cli.model import CosyVoiceModel
+
+from cosyvoice.dataset.dataset import Dataset
+
+
+def get_args():
+    parser = argparse.ArgumentParser(description='inference with your model')
+    parser.add_argument('--config', required=True, help='config file')
+    parser.add_argument('--prompt_data', required=True, help='prompt data file')
+    parser.add_argument('--prompt_utt2data', required=True, help='prompt data file')
+    parser.add_argument('--tts_text', required=True, help='tts input file')
+    parser.add_argument('--llm_model', required=True, help='llm model file')
+    parser.add_argument('--flow_model', required=True, help='flow model file')
+    parser.add_argument('--hifigan_model', required=True, help='hifigan model file')
+    parser.add_argument('--gpu',
+                        type=int,
+                        default=-1,
+                        help='gpu id for this rank, -1 for cpu')
+    parser.add_argument('--mode',
+                        default='sft',
+                        choices=['sft', 'zero_shot'],
+                        help='inference mode')
+    parser.add_argument('--result_dir', required=True, help='asr result file')
+    args = parser.parse_args()
+    print(args)
+    return args
+
+
+def main():
+    args = get_args()
+    logging.basicConfig(level=logging.DEBUG,
+                        format='%(asctime)s %(levelname)s %(message)s')
+    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
+
+    # Init cosyvoice models from configs
+    use_cuda = args.gpu >= 0 and torch.cuda.is_available()
+    device = torch.device('cuda' if use_cuda else 'cpu')
+    with open(args.config, 'r') as f:
+        configs = load_hyperpyyaml(f)
+
+    model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'])
+    model.load(args.llm_model, args.flow_model, args.hifigan_model)
+
+    test_dataset = Dataset(args.prompt_data, data_pipeline=configs['data_pipeline'], mode='inference', shuffle=False, partition=False, tts_file=args.tts_text, prompt_utt2data=args.prompt_utt2data)
+    test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0)
+
+    del configs
+    os.makedirs(args.result_dir, exist_ok=True)
+    fn = os.path.join(args.result_dir, 'wav.scp')
+    f = open(fn, 'w')
+    with torch.no_grad():
+        for batch_idx, batch in tqdm(enumerate(test_data_loader)):
+            utts = batch["utts"]
+            assert len(utts) == 1, "inference mode only support batchsize 1"
+            text = batch["text"]
+            text_token = batch["text_token"].to(device)
+            text_token_len = batch["text_token_len"].to(device)
+            tts_text = batch["tts_text"]
+            tts_index = batch["tts_index"]
+            tts_text_token = batch["tts_text_token"].to(device)
+            tts_text_token_len = batch["tts_text_token_len"].to(device)
+            speech_token = batch["speech_token"].to(device)
+            speech_token_len = batch["speech_token_len"].to(device)
+            speech_feat = batch["speech_feat"].to(device)
+            speech_feat_len = batch["speech_feat_len"].to(device)
+            utt_embedding = batch["utt_embedding"].to(device)
+            spk_embedding = batch["spk_embedding"].to(device)
+            if args.mode == 'sft':
+                model_input = {'text': tts_text_token, 'text_len': tts_text_token_len,
+                               'llm_embedding': spk_embedding, 'flow_embedding': spk_embedding}
+            else:
+                model_input = {'text': tts_text_token, 'text_len': tts_text_token_len,
+                               'prompt_text': text_token, 'prompt_text_len': text_token_len,
+                               'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len,
+                               'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
+                               'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
+                               'llm_embedding': utt_embedding, 'flow_embedding': utt_embedding}
+            model_output = model.inference(**model_input)
+            tts_key = '{}_{}'.format(utts[0], tts_index[0])
+            tts_fn = os.path.join(args.result_dir, '{}.wav'.format(tts_key))
+            torchaudio.save(tts_fn, model_output['tts_speech'], sample_rate=22050)
+            f.write('{} {}\n'.format(tts_key, tts_fn))
+            f.flush()
+    f.close()
+    logging.info('Result wav.scp saved in {}'.format(fn))
+
+
+if __name__ == '__main__':
+    main()