xinference 1.1.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xinference/_compat.py +2 -0
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +72 -66
- xinference/core/model.py +78 -25
- xinference/core/supervisor.py +81 -10
- xinference/core/utils.py +12 -8
- xinference/core/worker.py +32 -0
- xinference/model/audio/core.py +5 -0
- xinference/model/audio/cosyvoice.py +25 -3
- xinference/model/audio/f5tts.py +15 -10
- xinference/model/audio/f5tts_mlx.py +260 -0
- xinference/model/audio/fish_speech.py +35 -111
- xinference/model/audio/model_spec.json +19 -3
- xinference/model/audio/model_spec_modelscope.json +9 -0
- xinference/model/audio/utils.py +32 -0
- xinference/model/image/core.py +69 -1
- xinference/model/image/model_spec.json +145 -4
- xinference/model/image/model_spec_modelscope.json +150 -4
- xinference/model/image/stable_diffusion/core.py +45 -13
- xinference/model/llm/__init__.py +2 -0
- xinference/model/llm/llm_family.json +143 -0
- xinference/model/llm/llm_family.py +15 -36
- xinference/model/llm/llm_family_modelscope.json +148 -0
- xinference/model/llm/mlx/core.py +37 -32
- xinference/model/llm/transformers/cogagent.py +272 -0
- xinference/model/llm/transformers/core.py +2 -0
- xinference/model/llm/transformers/qwen2_vl.py +12 -1
- xinference/model/llm/utils.py +28 -3
- xinference/model/llm/vllm/core.py +48 -9
- xinference/model/llm/vllm/xavier/__init__.py +13 -0
- xinference/model/llm/vllm/xavier/allocator.py +74 -0
- xinference/model/llm/vllm/xavier/block.py +112 -0
- xinference/model/llm/vllm/xavier/block_manager.py +71 -0
- xinference/model/llm/vllm/xavier/block_tracker.py +116 -0
- xinference/model/llm/vllm/xavier/engine.py +247 -0
- xinference/model/llm/vllm/xavier/executor.py +132 -0
- xinference/model/llm/vllm/xavier/scheduler.py +422 -0
- xinference/model/llm/vllm/xavier/test/__init__.py +13 -0
- xinference/model/llm/vllm/xavier/test/test_xavier.py +122 -0
- xinference/model/llm/vllm/xavier/transfer.py +298 -0
- xinference/model/video/diffusers.py +14 -0
- xinference/model/video/model_spec.json +15 -0
- xinference/model/video/model_spec_modelscope.json +16 -0
- xinference/thirdparty/cosyvoice/bin/average_model.py +92 -0
- xinference/thirdparty/cosyvoice/bin/export_jit.py +12 -2
- xinference/thirdparty/cosyvoice/bin/export_onnx.py +112 -0
- xinference/thirdparty/cosyvoice/bin/export_trt.sh +9 -0
- xinference/thirdparty/cosyvoice/bin/inference.py +5 -7
- xinference/thirdparty/cosyvoice/bin/train.py +42 -8
- xinference/thirdparty/cosyvoice/cli/cosyvoice.py +96 -25
- xinference/thirdparty/cosyvoice/cli/frontend.py +77 -30
- xinference/thirdparty/cosyvoice/cli/model.py +330 -80
- xinference/thirdparty/cosyvoice/dataset/dataset.py +6 -2
- xinference/thirdparty/cosyvoice/dataset/processor.py +76 -14
- xinference/thirdparty/cosyvoice/flow/decoder.py +92 -13
- xinference/thirdparty/cosyvoice/flow/flow.py +99 -9
- xinference/thirdparty/cosyvoice/flow/flow_matching.py +110 -13
- xinference/thirdparty/cosyvoice/flow/length_regulator.py +5 -4
- xinference/thirdparty/cosyvoice/hifigan/discriminator.py +140 -0
- xinference/thirdparty/cosyvoice/hifigan/generator.py +58 -42
- xinference/thirdparty/cosyvoice/hifigan/hifigan.py +67 -0
- xinference/thirdparty/cosyvoice/llm/llm.py +139 -6
- xinference/thirdparty/cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +58836 -0
- xinference/thirdparty/cosyvoice/tokenizer/tokenizer.py +279 -0
- xinference/thirdparty/cosyvoice/transformer/embedding.py +2 -2
- xinference/thirdparty/cosyvoice/transformer/encoder_layer.py +7 -7
- xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +318 -0
- xinference/thirdparty/cosyvoice/utils/common.py +28 -1
- xinference/thirdparty/cosyvoice/utils/executor.py +69 -7
- xinference/thirdparty/cosyvoice/utils/file_utils.py +2 -12
- xinference/thirdparty/cosyvoice/utils/frontend_utils.py +9 -5
- xinference/thirdparty/cosyvoice/utils/losses.py +20 -0
- xinference/thirdparty/cosyvoice/utils/scheduler.py +1 -2
- xinference/thirdparty/cosyvoice/utils/train_utils.py +101 -45
- xinference/thirdparty/fish_speech/fish_speech/conversation.py +94 -83
- xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +63 -20
- xinference/thirdparty/fish_speech/fish_speech/text/clean.py +1 -26
- xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +1 -1
- xinference/thirdparty/fish_speech/fish_speech/tokenizer.py +152 -0
- xinference/thirdparty/fish_speech/fish_speech/train.py +2 -2
- xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +1 -1
- xinference/thirdparty/fish_speech/tools/{post_api.py → api_client.py} +7 -13
- xinference/thirdparty/fish_speech/tools/api_server.py +98 -0
- xinference/thirdparty/fish_speech/tools/download_models.py +5 -5
- xinference/thirdparty/fish_speech/tools/fish_e2e.py +2 -2
- xinference/thirdparty/fish_speech/tools/inference_engine/__init__.py +192 -0
- xinference/thirdparty/fish_speech/tools/inference_engine/reference_loader.py +125 -0
- xinference/thirdparty/fish_speech/tools/inference_engine/utils.py +39 -0
- xinference/thirdparty/fish_speech/tools/inference_engine/vq_manager.py +57 -0
- xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +2 -2
- xinference/thirdparty/fish_speech/tools/llama/generate.py +117 -89
- xinference/thirdparty/fish_speech/tools/run_webui.py +104 -0
- xinference/thirdparty/fish_speech/tools/schema.py +11 -28
- xinference/thirdparty/fish_speech/tools/server/agent/__init__.py +57 -0
- xinference/thirdparty/fish_speech/tools/server/agent/generate.py +119 -0
- xinference/thirdparty/fish_speech/tools/server/agent/generation_utils.py +122 -0
- xinference/thirdparty/fish_speech/tools/server/agent/pre_generation_utils.py +72 -0
- xinference/thirdparty/fish_speech/tools/server/api_utils.py +75 -0
- xinference/thirdparty/fish_speech/tools/server/exception_handler.py +27 -0
- xinference/thirdparty/fish_speech/tools/server/inference.py +45 -0
- xinference/thirdparty/fish_speech/tools/server/model_manager.py +122 -0
- xinference/thirdparty/fish_speech/tools/server/model_utils.py +129 -0
- xinference/thirdparty/fish_speech/tools/server/views.py +246 -0
- xinference/thirdparty/fish_speech/tools/webui/__init__.py +173 -0
- xinference/thirdparty/fish_speech/tools/webui/inference.py +91 -0
- xinference/thirdparty/fish_speech/tools/webui/variables.py +14 -0
- xinference/thirdparty/matcha/utils/utils.py +2 -2
- xinference/types.py +13 -0
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/main.51a587ff.css +2 -0
- xinference/web/ui/build/static/css/main.51a587ff.css.map +1 -0
- xinference/web/ui/build/static/js/main.1eb206d1.js +3 -0
- xinference/web/ui/build/static/js/main.1eb206d1.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/03c4052f1b91f6ba0c5389bdcf49c43319b4076c08e4b8585dab312538ae290a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/1786b83003b8e9605a0f5f855a185d4d16e38fc893dfb326a2a9cca206b4240a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/17cbc181dd674b9150b80c73ed6a82656de0082d857f6e5f66d9716129ac0b38.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/185ceb8872d562e032b47e79df6a45670e06345b8ed70aad1a131e0476783c5c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2213d49de260e1f67c888081b18f120f5225462b829ae57c9e05a05cec83689d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/26b8c9f34b0bed789b3a833767672e39302d1e0c09b4276f4d58d1df7b6bd93b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2b484da66c724d0d56a40849c109327408796a668b1381511b6e9e03baa48658.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2cbbbce9b84df73330d4c42b82436ed881b3847628f2fbc346aa62e2859fd88c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2ec9b14431ed33ce6901bf9f27007be4e6e472709c99d6e22b50ce528e4b78ee.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3b966db018f96be4a055d6ca205f0990d4d0b370e2980c17d8bca2c9a021819c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3eefb411b24c2b3ce053570ef50daccf154022f0e168be5ed0fec21394baf9f4.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/522b229e3cac219123f0d69673f5570e191c2d2a505dc65b312d336eae2279c0.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/52e45f17ba300580ea3fcc9f9228ccba194bb092b76f25e9255af311f8b05aab.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5a0bc4631f936459afc1a3b1d3ec2420118b1f00e11f60ccac3e08088f3f27a8.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/6329bc76c406fe5eb305412383fbde5950f847bb5e43261f73f37622c365acb4.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/63c8e07687ea53a4f8a910ee5e42e0eb26cd1acbfbe820f3e3248a786ee51401.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/69b2d5001684174ec9da57e07914eed3eac4960018bceb6cbfa801d861301d7c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/710c1acda69e561e30a933b98c6a56d50197868b15c21e2aad55ab6d46649eb6.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/720deca1fce5a1dc5056048fa8258fd138a82ea855f350b6613f104a73fb761f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/76a23b92d26a499c57e61eea2b895fbc9771bd0849a72e66f8e633192017978b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/858063f23b34dfe600254eb5afd85518b0002ec4b30b7386616c45600826e3b2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/920b82c1c89124cf217109eeedbfcd3aae3b917be50c9dfb6bbb4ce26bdfd2e7.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/94d8b7aeb0076f2ce07db598cea0e87b13bc8d5614eb530b8d6e696c2daf6f88.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9e917fe7022d01b2ccbe5cc0ce73d70bb72bee584ff293bad71bdff6695dee28.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9f28fdb8399f1d0474f0aca86f1658dc94f5bf0c90f6146352de150692de8862.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a0dfafa06b2bb7cba8cad41c482503f61944f759f4318139362602ef5cc47ccb.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/afb8084f539534cd594755ea2205ecd5bd1f62dddcfdf75a2eace59a28131278.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b57b1438b77294c1f3f6cfce12ac487d8106c6f016975ba0aec94d98997e2e1e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b9917b0bf8e4d55ccbac1c334aa04d6ff3c5b6ed9e5d38b9ea2c687fa7d3f5a9.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/bbcc94b0149963d1d6f267ee1f4f03d3925b758392ce2f516c3fe8af0e0169fc.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/bdee44abeadc4abc17d41c52eb49c6e19a4b1a267b6e16876ce91bdeeebfc52d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/beb112b70f4a56db95920a9e20efb6c97c37b68450716730217a9ee1a9ae92be.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/c88db97be0cdf440193b3995996e83510a04cb00048135485fc0e26d197e80b5.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d49e5314d34310a62d01a03067ce1bec5da00abce84c5196aa9c6842fa79a430.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d7664d18c4ddbad9c3a6a31b91f7c00fb0dde804608674a9860ee50f33e54708.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d9072c318b819b7c90a0f7e9cc0b6413b4dbeb8e9859898e53d75ea882fcde99.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/db16a983bc08a05f0439cc61ca0840e49e1d8400eef678909f16c032a418a3d6.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e242c583c2dbc2784f0fcf513523975f7d5df447e106c1c17e49e8578a6fc3ed.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/eac5f1296513e69e4b96f750ddccd4d0264e2bae4e4c449144e83274a48698d9.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/ed57202cb79649bb716400436590245547df241988fc7c8e1d85d132299542d2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f125bf72e773a14cdaebd0c343e80adb909d12e317ee5c00cd4a57442fbe2c62.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +1 -0
- xinference/web/ui/node_modules/.package-lock.json +67 -3
- xinference/web/ui/node_modules/@babel/runtime/package.json +592 -538
- xinference/web/ui/node_modules/html-parse-stringify/package.json +50 -0
- xinference/web/ui/node_modules/i18next/dist/esm/package.json +1 -0
- xinference/web/ui/node_modules/i18next/package.json +129 -0
- xinference/web/ui/node_modules/react-i18next/.eslintrc.json +74 -0
- xinference/web/ui/node_modules/react-i18next/dist/es/package.json +1 -0
- xinference/web/ui/node_modules/react-i18next/package.json +162 -0
- xinference/web/ui/node_modules/void-elements/package.json +34 -0
- xinference/web/ui/package-lock.json +69 -3
- xinference/web/ui/package.json +2 -0
- xinference/web/ui/src/locales/en.json +186 -0
- xinference/web/ui/src/locales/zh.json +186 -0
- {xinference-1.1.0.dist-info → xinference-1.2.0.dist-info}/METADATA +19 -11
- {xinference-1.1.0.dist-info → xinference-1.2.0.dist-info}/RECORD +178 -111
- xinference/thirdparty/cosyvoice/bin/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/bin/export_trt.py +0 -8
- xinference/thirdparty/cosyvoice/flow/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/hifigan/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/llm/__init__.py +0 -0
- xinference/thirdparty/fish_speech/tools/__init__.py +0 -0
- xinference/thirdparty/fish_speech/tools/api.py +0 -943
- xinference/thirdparty/fish_speech/tools/msgpack_api.py +0 -95
- xinference/thirdparty/fish_speech/tools/webui.py +0 -548
- xinference/web/ui/build/static/css/main.5061c4c3.css +0 -2
- xinference/web/ui/build/static/css/main.5061c4c3.css.map +0 -1
- xinference/web/ui/build/static/js/main.4eb4ee80.js +0 -3
- xinference/web/ui/build/static/js/main.4eb4ee80.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/07ce9e632e6aff24d7aa3ad8e48224433bbfeb0d633fca723453f1fcae0c9f1c.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/1130403f9e46f5738a23b45ac59b57de8f360c908c713e2c0670c2cce9bd367a.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/131091b25d26b17cdca187d7542a21475c211138d900cf667682260e76ef9463.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/1f269fb2a368363c1cb2237825f1dba093b6bdd8c44cc05954fd19ec2c1fff03.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/331312668fa8bd3d7401818f4a25fa98135d7f61371cd6bfff78b18cf4fbdd92.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/40f17338fc75ae095de7d2b4d8eae0d5ca0193a7e2bcece4ee745b22a7a2f4b7.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8c5eeb02f772d02cbe8b89c05428d0dd41a97866f75f7dc1c2164a67f5a1cf98.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8d33354bd2100c8602afc3341f131a88cc36aaeecd5a4b365ed038514708e350.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/9375a35b05d56989b2755bf72161fa707c92f28569d33765a75f91a568fda6e9.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/a158a9ffa0c9b169aee53dd4a0c44501a596755b4e4f6ede7746d65a72e2a71f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/c7bf40bab396765f67d0fed627ed3665890608b2d0edaa3e8cb7cfc96310db45.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d6c643278a0b28320e6f33a60f5fb64c053997cbdc39a60e53ccc574688ade9e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e42b72d4cc1ea412ebecbb8d040dc6c6bfee462c33903c2f1f3facb602ad742e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/feabb04b4aa507102da0a64398a40818e878fd1df9b75dda8461b3e1e7ff3f11.json +0 -1
- /xinference/web/ui/build/static/js/{main.4eb4ee80.js.LICENSE.txt → main.1eb206d1.js.LICENSE.txt} +0 -0
- {xinference-1.1.0.dist-info → xinference-1.2.0.dist-info}/LICENSE +0 -0
- {xinference-1.1.0.dist-info → xinference-1.2.0.dist-info}/WHEEL +0 -0
- {xinference-1.1.0.dist-info → xinference-1.2.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.1.0.dist-info → xinference-1.2.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/vllm/xavier/transfer.py
@@ -0,0 +1,298 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import asyncio
+import logging
+from functools import lru_cache
+from queue import Queue
+from typing import Dict, List, Optional, no_type_check
+
+import torch
+import xoscar as xo
+from vllm.core.scheduler import Scheduler
+from vllm.utils import TORCH_DTYPE_TO_NUMPY_DTYPE, Device
+from vllm.worker.cache_engine import CacheEngine
+
+logger = logging.getLogger(__name__)
+
+
+class BufferTransferMixin:
+    def __init__(self):
+        self.num_buffer: int = 0
+        self.buffers: List[torch.Tensor] = []
+        self.buffer_queue: Optional[Queue] = None
+        self.transfer_block_num = 0
+        self.num_attn_layers = 0
+
+    def init_buffer(
+        self, num_buffer: int, buffer_shape, buffer_dtype, buffer_device, pin_memory
+    ):
+        # (transfer_block_num, num_attn_layers, 2, *kv_cache_shape[2:])
+
+        if buffer_dtype is torch.bfloat16:
+            buffer_dtype = torch.float16
+
+        self.num_buffer = num_buffer
+        self.transfer_block_num = buffer_shape[0]
+        self.num_attn_layers = buffer_shape[1]
+
+        self.buffers = [
+            torch.zeros(
+                size=buffer_shape,
+                dtype=buffer_dtype,
+                device=buffer_device,
+                pin_memory=pin_memory,
+            )
+            for _ in range(self.num_buffer)
+        ]
+
+        self.buffer_queue = Queue()
+        for i in range(self.num_buffer):
+            self.buffer_queue.put_nowait(i)
+        logger.debug(
+            f"Init buffer done. "
+            f"transfer_block_num: {self.transfer_block_num}, "
+            f"num_buffer: {self.num_buffer}, "
+            f"buffer_dtype: {buffer_dtype}, "
+            f"buffer_shape: {buffer_shape}"
+        )
+
+    @no_type_check
+    def get_buffer_index(self) -> int:
+        return self.buffer_queue.get()
+
+    @no_type_check
+    def free_buffer_index(self, index: int) -> None:
+        self.buffer_queue.put_nowait(index)
+
+    def get_swap_buffer(self, index: int, num_blocks: int) -> torch.Tensor:
+        buf = self.buffers[index]
+        buffer = buf[:num_blocks].view(
+            self.num_attn_layers, 2, num_blocks, *buf.shape[3:]
+        )
+        return buffer
+
+    @lru_cache(maxsize=None)
+    def get_gloo_dtype(self, input_dtype: torch.dtype):
+        from xoscar.collective.common import TypeMappingGloo
+
+        return TypeMappingGloo[TORCH_DTYPE_TO_NUMPY_DTYPE[input_dtype]]
+
+
+class TransferActor(xo.StatelessActor, BufferTransferMixin):
+    @classmethod
+    def default_uid(cls):
+        return f"vllm-transfer-actor"
+
+    def __init__(
+        self,
+        rank: int,
+        world_size: int,
+        rank_address: str,
+        store_address: str,
+        store_port: int,
+        world_addresses: List[str],
+    ):
+        super().__init__()
+        self._rank = rank
+        self._world_size = world_size
+        self._store_address = store_address
+        self._rank_address = rank_address
+        self._store_port = store_port
+        self._world_addresses = world_addresses
+        self._context = None
+        self._cache_engine: Optional[List[CacheEngine]] = None
+        self._scheduler: Optional[List[Scheduler]] = None
+        self._swap_stream = torch.cuda.Stream()
+
+    async def __post_create__(self):
+        from xoscar.collective import xoscar_pygloo as xp
+
+        context = xp.rendezvous.Context(self._rank, self._world_size)
+
+        attr = xp.transport.tcp.attr(self._rank_address.split(":")[0])
+        dev = xp.transport.tcp.CreateDevice(attr)
+
+        opt = xp.rendezvous.TCPStoreOptions()
+        opt.port = self._store_port
+        opt.numWorkers = self._world_size
+        opt.isServer = self._rank == 0
+
+        store = xp.rendezvous.TCPStore(self._store_address, opt)
+        store = xp.rendezvous.PrefixStore(str(self._world_size), store)
+
+        context.connectFullMesh(store, dev)
+        self._context = context
+        logger.debug(
+            f"Rank {self._rank} arrives successfully, world addresses: {self._world_addresses}"
+        )
+
+    def setup(
+        self,
+        cache_engine: List[CacheEngine],
+        scheduler: List[Scheduler],
+        num_buffer: int,
+        buffer_shape,
+        buffer_dtype,
+        buffer_device,
+        pin_memory: bool,
+    ):
+        self._cache_engine = cache_engine
+        self._scheduler = scheduler
+        self.init_buffer(
+            num_buffer, buffer_shape, buffer_dtype, buffer_device, pin_memory
+        )
+
+    def _get_cache_engine(self, virtual_engine: int) -> CacheEngine:
+        return self._cache_engine[virtual_engine]  # type: ignore
+
+    @staticmethod
+    def _get_swap_block_ids(src_to_dst: Dict[int, int], is_sender: bool) -> List[int]:
+        return list(sorted([r if is_sender else l for r, l in src_to_dst.items()]))
+
+    def _swap_out_to_buffer(
+        self, cache_engine: CacheEngine, cpu_buf_index: int, block_ids: List[int]
+    ) -> torch.Tensor:
+        num_blocks = len(block_ids)
+        src_to_dst = torch.tensor(
+            [(block_num, idx) for idx, block_num in enumerate(block_ids)],
+            device="cpu",
+            dtype=torch.int64,
+        ).view(-1, 2)
+        cpu_buf = self.get_swap_buffer(cpu_buf_index, num_blocks)
+        with torch.cuda.stream(self._swap_stream):
+            for i in range(self.num_attn_layers):
+                cache_engine.attn_backend.swap_blocks(
+                    cache_engine.gpu_cache[i], cpu_buf[i], src_to_dst
+                )
+        torch.cuda.Stream.synchronize(self._swap_stream)
+        return cpu_buf
+
+    def _swap_in_from_buffer(
+        self, cache_engine: CacheEngine, cpu_buf: torch.Tensor, block_ids: List[int]
+    ) -> None:
+        src_to_dst = torch.tensor(
+            [(idx, block_num) for idx, block_num in enumerate(block_ids)],
+            device="cpu",
+            dtype=torch.int64,
+        ).view(-1, 2)
+        with torch.cuda.stream(self._swap_stream):
+            for i in range(self.num_attn_layers):
+                cache_engine.attn_backend.swap_blocks(
+                    cpu_buf[i], cache_engine.gpu_cache[i], src_to_dst
+                )
+        torch.cuda.Stream.synchronize(self._swap_stream)
+
+    def _incr_count_for_block_id(self, virtual_engine: int, block_ids: List[int]):
+        """
+        The reference count of the `block_id` involved in the transfer is incremented by 1
+        to ensure it is not reclaimed.
+        """
+        scheduler = self._scheduler[virtual_engine]  # type: ignore
+        gpu_allocator = scheduler.block_manager.block_allocator._allocators[Device.GPU]
+
+        for _id in block_ids:
+            gpu_allocator._refcounter.incr(_id)
+
+    def _decr_count_for_block_id(self, virtual_engine: int, block_ids: List[int]):
+        """
+        After the transfer, the reference count is decremented by 1.
+        """
+        scheduler = self._scheduler[virtual_engine]  # type: ignore
+        gpu_allocator = scheduler.block_manager.block_allocator._allocators[Device.GPU]
+
+        for _id in block_ids:
+            gpu_allocator._refcounter.decr(_id)
+
+    async def do_send(
+        self, virtual_engine: int, to_rank: int, src_to_dst: Dict[int, int]
+    ):
+        """
+        Sending logic: GPU -> Buffer -> Gloo send.
+        GPU -> Buffer is directly handled using the internal `swap_out` interface of vllm.
+        """
+        from xoscar.collective import xoscar_pygloo as xp
+
+        cache_engine = self._get_cache_engine(virtual_engine)
+
+        block_ids = self._get_swap_block_ids(src_to_dst, is_sender=True)
+        self._incr_count_for_block_id(virtual_engine, block_ids)
+        cpu_buf_index = self.get_buffer_index()
+        total_blocks: int = len(block_ids)
+
+        try:
+            for start_idx in range(0, total_blocks, self.transfer_block_num):
+                offset = min(self.transfer_block_num, total_blocks - start_idx)
+                send_block_ids = block_ids[start_idx : start_idx + offset]
+                sendbuf = self._swap_out_to_buffer(
+                    cache_engine, cpu_buf_index, send_block_ids
+                )
+                assert sendbuf.is_contiguous()
+                sendptr = sendbuf.numpy().ctypes.data
+                data_size = sendbuf.numel()
+                datatype = self.get_gloo_dtype(sendbuf.dtype)
+                peer = to_rank
+                xp.send(self._context, sendptr, data_size, datatype, peer)
+        finally:
+            self._decr_count_for_block_id(virtual_engine, block_ids)
+            self.free_buffer_index(cpu_buf_index)
+
+    async def do_recv(
+        self, virtual_engine: int, from_rank: int, src_to_dst: Dict[int, int]
+    ):
+        """
+        Receiving logic: Gloo recv -> Buffer -> GPU.
+        Buffer -> GPU is directly handled using the internal `swap_in` interface of vllm.
+        """
+        from xoscar.collective import xoscar_pygloo as xp
+
+        cache_engine = self._get_cache_engine(virtual_engine)
+
+        block_ids = self._get_swap_block_ids(src_to_dst, is_sender=False)
+        self._incr_count_for_block_id(virtual_engine, block_ids)
+        total_blocks = len(block_ids)
+        cpu_buf_index = self.get_buffer_index()
+
+        try:
+            for start_idx in range(0, total_blocks, self.transfer_block_num):
+                offset = min(self.transfer_block_num, total_blocks - start_idx)
+                recv_block_ids = block_ids[start_idx : start_idx + offset]
+                recvbuf = self.get_swap_buffer(cpu_buf_index, len(recv_block_ids))
+                assert recvbuf.is_contiguous()
+                recvptr = recvbuf.numpy().ctypes.data
+                data_size = recvbuf.numel()
+                datatype = self.get_gloo_dtype(recvbuf.dtype)
+                peer = from_rank
+                xp.recv(self._context, recvptr, data_size, datatype, peer)
+
+                self._swap_in_from_buffer(cache_engine, recvbuf, recv_block_ids)
+        finally:
+            self._decr_count_for_block_id(virtual_engine, block_ids)
+            self.free_buffer_index(cpu_buf_index)
+
+    async def recv(
+        self, virtual_engine: int, from_address: str, src_to_dst: Dict[int, int]
+    ):
+        """
+        This is the external entry point for the call.
+        The transfer logic is as follows:
+        the receiver requests the sender to send the data directly to itself in a point-to-point manner.
+        """
+        rank = self._world_addresses.index(from_address)
+        sender_ref = await xo.actor_ref(
+            address=from_address, uid=f"{TransferActor.default_uid()}-{rank}"
+        )
+        await asyncio.gather(
+            sender_ref.do_send(virtual_engine, self._rank, src_to_dst),
+            self.do_recv(virtual_engine, rank, src_to_dst),
+        )
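This file is the point-to-point layer of the new Xavier KV-cache sharing feature: each vllm replica hosts one `TransferActor`, the ranks rendezvous through a TCP store into a Gloo full mesh, and a receiver drives both ends of a pull by `asyncio.gather`ing the sender's `do_send` with its own `do_recv`, moving at most `transfer_block_num` blocks per hop through a pinned CPU staging buffer. The staging bookkeeping in `BufferTransferMixin` is separable from the actor and Gloo machinery and can be exercised on its own. A minimal runnable sketch, assuming the class above is importable and using made-up KV dimensions (4 attention layers, 1024 blocks of 16 tokens x 8 heads x 64 dims):

import torch
# hypothetical import path, assuming the module above is on the path:
# from xinference.model.llm.vllm.xavier.transfer import BufferTransferMixin

num_attn_layers, transfer_block_num = 4, 8
kv_cache_shape = (2, 1024, 16, 8, 64)  # illustrative (K/V, blocks, block, heads, dim)
buffer_shape = (transfer_block_num, num_attn_layers, 2, *kv_cache_shape[2:])

mixin = BufferTransferMixin()
mixin.init_buffer(
    num_buffer=2,
    buffer_shape=buffer_shape,
    buffer_dtype=torch.float16,
    buffer_device="cpu",
    pin_memory=False,  # pinned memory needs CUDA; keep the sketch CPU-only
)

idx = mixin.get_buffer_index()        # blocks until a staging buffer is free
view = mixin.get_swap_buffer(idx, 3)  # reinterpreted view for a 3-block transfer
assert view.shape == (num_attn_layers, 2, 3, 16, 8, 64)
mixin.free_buffer_index(idx)          # hand the buffer back to the pool

Note the `Queue` doubles as back-pressure: if both buffers are in flight, the next transfer simply waits in `get_buffer_index` instead of allocating more pinned memory.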
xinference/model/video/diffusers.py
@@ -91,6 +91,20 @@ class DiffUsersVideoModel:
             pipeline = self._model = CogVideoXPipeline.from_pretrained(
                 self._model_path, **kwargs
             )
+        elif self._model_spec.model_family == "HunyuanVideo":
+            from diffusers import HunyuanVideoPipeline, HunyuanVideoTransformer3DModel
+
+            transformer_torch_dtype = kwargs.pop("transformer_torch_dtype")
+            if isinstance(transformer_torch_dtype, str):
+                transformer_torch_dtype = getattr(torch, transformer_torch_dtype)
+            transformer = HunyuanVideoTransformer3DModel.from_pretrained(
+                self._model_path,
+                subfolder="transformer",
+                torch_dtype=transformer_torch_dtype,
+            )
+            pipeline = self._model = HunyuanVideoPipeline.from_pretrained(
+                self._model_path, transformer=transformer, **kwargs
+            )
         else:
             raise Exception(
                 f"Unsupported model family: {self._model_spec.model_family}"
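Loading the transformer separately lets it use a different dtype from the rest of the pipeline: the spec entries below default `transformer_torch_dtype` to bfloat16 while the pipeline-wide `torch_dtype` stays float16, which matches the upstream diffusers recommendation for HunyuanVideo.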
xinference/model/video/model_spec.json
@@ -30,5 +30,20 @@
     "default_generate_config": {
       "guidance_scale": 7
     }
+  },
+  {
+    "model_name": "HunyuanVideo",
+    "model_family": "HunyuanVideo",
+    "model_id": "hunyuanvideo-community/HunyuanVideo",
+    "model_revision": "e8c2aaa66fe3742a32c11a6766aecbf07c56e773",
+    "model_ability": [
+      "text2video"
+    ],
+    "default_model_config": {
+      "transformer_torch_dtype": "bfloat16",
+      "torch_dtype": "float16"
+    },
+    "default_generate_config": {
+    }
   }
 ]
xinference/model/video/model_spec_modelscope.json
@@ -32,5 +32,21 @@
     "default_generate_config": {
       "guidance_scale": 7
     }
+  },
+  {
+    "model_name": "HunyuanVideo",
+    "model_family": "HunyuanVideo",
+    "model_hub": "modelscope",
+    "model_id": "Xorbits/HunyuanVideo",
+    "model_revision": "master",
+    "model_ability": [
+      "text2video"
+    ],
+    "default_model_config": {
+      "transformer_torch_dtype": "bfloat16",
+      "torch_dtype": "float16"
+    },
+    "default_generate_config": {
+    }
   }
 ]
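With these entries registered, HunyuanVideo becomes launchable like any other built-in video model. A hedged sketch (assumes a running xinference server on the default port and enough VRAM; the handle method mirrors how the existing CogVideoX text2video entries are used):

from xinference.client import Client

client = Client("http://127.0.0.1:9997")
uid = client.launch_model(model_name="HunyuanVideo", model_type="video")
model = client.get_model(uid)
result = model.text_to_video(prompt="a panda playing guitar on a beach")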
xinference/thirdparty/cosyvoice/bin/average_model.py
@@ -0,0 +1,92 @@
+# Copyright (c) 2020 Mobvoi Inc (Di Wu)
+# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import argparse
+import glob
+
+import yaml
+import torch
+
+
+def get_args():
+    parser = argparse.ArgumentParser(description='average model')
+    parser.add_argument('--dst_model', required=True, help='averaged model')
+    parser.add_argument('--src_path',
+                        required=True,
+                        help='src model path for average')
+    parser.add_argument('--val_best',
+                        action="store_true",
+                        help='averaged model')
+    parser.add_argument('--num',
+                        default=5,
+                        type=int,
+                        help='nums for averaged model')
+
+    args = parser.parse_args()
+    print(args)
+    return args
+
+
+def main():
+    args = get_args()
+    val_scores = []
+    if args.val_best:
+        yamls = glob.glob('{}/*.yaml'.format(args.src_path))
+        yamls = [
+            f for f in yamls
+            if not (os.path.basename(f).startswith('train')
+                    or os.path.basename(f).startswith('init'))
+        ]
+        for y in yamls:
+            with open(y, 'r') as f:
+                dic_yaml = yaml.load(f, Loader=yaml.BaseLoader)
+                loss = float(dic_yaml['loss_dict']['loss'])
+                epoch = int(dic_yaml['epoch'])
+                step = int(dic_yaml['step'])
+                tag = dic_yaml['tag']
+                val_scores += [[epoch, step, loss, tag]]
+        sorted_val_scores = sorted(val_scores,
+                                   key=lambda x: x[2],
+                                   reverse=False)
+        print("best val (epoch, step, loss, tag) = " +
+              str(sorted_val_scores[:args.num]))
+        path_list = [
+            args.src_path + '/epoch_{}_whole.pt'.format(score[0])
+            for score in sorted_val_scores[:args.num]
+        ]
+        print(path_list)
+    avg = {}
+    num = args.num
+    assert num == len(path_list)
+    for path in path_list:
+        print('Processing {}'.format(path))
+        states = torch.load(path, map_location=torch.device('cpu'))
+        for k in states.keys():
+            if k not in avg.keys():
+                avg[k] = states[k].clone()
+            else:
+                avg[k] += states[k]
+    # average
+    for k in avg.keys():
+        if avg[k] is not None:
+            # pytorch 1.6 use true_divide instead of /=
+            avg[k] = torch.true_divide(avg[k], num)
+    print('Saving to {}'.format(args.dst_model))
+    torch.save(avg, args.dst_model)
+
+
+if __name__ == '__main__':
+    main()
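Typical invocation: `python average_model.py --dst_model avg_5.pt --src_path exp/cosyvoice --val_best --num 5` averages the five checkpoints with the lowest validation loss. Note that `path_list` is only built under `--val_best` (the script fails with a NameError without it), it expects matching `epoch_{N}_whole.pt` files next to the per-epoch YAML logs, and `assert num == len(path_list)` fails if fewer checkpoints are found than `--num` requests.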
xinference/thirdparty/cosyvoice/bin/export_jit.py
@@ -19,12 +19,13 @@ import logging
 logging.getLogger('matplotlib').setLevel(logging.WARNING)
 import os
 import sys
+import torch
 ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
 sys.path.append('{}/../..'.format(ROOT_DIR))
 sys.path.append('{}/../../third_party/Matcha-TTS'.format(ROOT_DIR))
-import torch
 from cosyvoice.cli.cosyvoice import CosyVoice
 
+
 def get_args():
     parser = argparse.ArgumentParser(description='export your model for deployment')
     parser.add_argument('--model_dir',
@@ -35,6 +36,7 @@ def get_args():
     print(args)
     return args
 
+
 def main():
     args = get_args()
     logging.basicConfig(level=logging.DEBUG,
@@ -44,7 +46,7 @@ def main():
     torch._C._jit_set_profiling_mode(False)
     torch._C._jit_set_profiling_executor(False)
 
-    cosyvoice = CosyVoice(args.model_dir, load_jit=False,
+    cosyvoice = CosyVoice(args.model_dir, load_jit=False, load_onnx=False)
 
     # 1. export llm text_encoder
     llm_text_encoder = cosyvoice.model.llm.text_encoder.half()
@@ -60,5 +62,13 @@ def main():
     script = torch.jit.optimize_for_inference(script)
     script.save('{}/llm.llm.fp16.zip'.format(args.model_dir))
 
+    # 3. export flow encoder
+    flow_encoder = cosyvoice.model.flow.encoder
+    script = torch.jit.script(flow_encoder)
+    script = torch.jit.freeze(script)
+    script = torch.jit.optimize_for_inference(script)
+    script.save('{}/flow.encoder.fp32.zip'.format(args.model_dir))
+
+
 if __name__ == '__main__':
     main()
xinference/thirdparty/cosyvoice/bin/export_onnx.py
@@ -0,0 +1,112 @@
+# Copyright (c) 2024 Antgroup Inc (authors: Zhoubofan, hexisyztem@icloud.com)
+# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import argparse
+import logging
+logging.getLogger('matplotlib').setLevel(logging.WARNING)
+import os
+import sys
+import onnxruntime
+import random
+import torch
+from tqdm import tqdm
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+sys.path.append('{}/../..'.format(ROOT_DIR))
+sys.path.append('{}/../../third_party/Matcha-TTS'.format(ROOT_DIR))
+from cosyvoice.cli.cosyvoice import CosyVoice
+
+
+def get_dummy_input(batch_size, seq_len, out_channels, device):
+    x = torch.rand((batch_size, out_channels, seq_len), dtype=torch.float32, device=device)
+    mask = torch.ones((batch_size, 1, seq_len), dtype=torch.float32, device=device)
+    mu = torch.rand((batch_size, out_channels, seq_len), dtype=torch.float32, device=device)
+    t = torch.rand((batch_size), dtype=torch.float32, device=device)
+    spks = torch.rand((batch_size, out_channels), dtype=torch.float32, device=device)
+    cond = torch.rand((batch_size, out_channels, seq_len), dtype=torch.float32, device=device)
+    return x, mask, mu, t, spks, cond
+
+
+def get_args():
+    parser = argparse.ArgumentParser(description='export your model for deployment')
+    parser.add_argument('--model_dir',
+                        type=str,
+                        default='pretrained_models/CosyVoice-300M',
+                        help='local path')
+    args = parser.parse_args()
+    print(args)
+    return args
+
+
+def main():
+    args = get_args()
+    logging.basicConfig(level=logging.DEBUG,
+                        format='%(asctime)s %(levelname)s %(message)s')
+
+    cosyvoice = CosyVoice(args.model_dir, load_jit=False, load_onnx=False)
+
+    # 1. export flow decoder estimator
+    estimator = cosyvoice.model.flow.decoder.estimator
+
+    device = cosyvoice.model.device
+    batch_size, seq_len = 1, 256
+    out_channels = cosyvoice.model.flow.decoder.estimator.out_channels
+    x, mask, mu, t, spks, cond = get_dummy_input(batch_size, seq_len, out_channels, device)
+    torch.onnx.export(
+        estimator,
+        (x, mask, mu, t, spks, cond),
+        '{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
+        export_params=True,
+        opset_version=18,
+        do_constant_folding=True,
+        input_names=['x', 'mask', 'mu', 't', 'spks', 'cond'],
+        output_names=['estimator_out'],
+        dynamic_axes={
+            'x': {0: 'batch_size', 2: 'seq_len'},
+            'mask': {0: 'batch_size', 2: 'seq_len'},
+            'mu': {0: 'batch_size', 2: 'seq_len'},
+            'cond': {0: 'batch_size', 2: 'seq_len'},
+            't': {0: 'batch_size'},
+            'spks': {0: 'batch_size'},
+            'estimator_out': {0: 'batch_size', 2: 'seq_len'},
+        }
+    )
+
+    # 2. test computation consistency
+    option = onnxruntime.SessionOptions()
+    option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
+    option.intra_op_num_threads = 1
+    providers = ['CUDAExecutionProvider' if torch.cuda.is_available() else 'CPUExecutionProvider']
+    estimator_onnx = onnxruntime.InferenceSession('{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
+                                                  sess_options=option, providers=providers)
+
+    for _ in tqdm(range(10)):
+        x, mask, mu, t, spks, cond = get_dummy_input(random.randint(1, 6), random.randint(16, 512), out_channels, device)
+        output_pytorch = estimator(x, mask, mu, t, spks, cond)
+        ort_inputs = {
+            'x': x.cpu().numpy(),
+            'mask': mask.cpu().numpy(),
+            'mu': mu.cpu().numpy(),
+            't': t.cpu().numpy(),
+            'spks': spks.cpu().numpy(),
+            'cond': cond.cpu().numpy()
+        }
+        output_onnx = estimator_onnx.run(None, ort_inputs)[0]
+        torch.testing.assert_allclose(output_pytorch, torch.from_numpy(output_onnx).to(device), rtol=1e-2, atol=1e-4)
+
+
+if __name__ == "__main__":
+    main()
xinference/thirdparty/cosyvoice/bin/export_trt.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# Copyright 2024 Alibaba Inc. All Rights Reserved.
+# download tensorrt from https://developer.nvidia.com/tensorrt/download/10x, check your system and cuda for compatibability
+# for example for linux + cuda12.4, you can download https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.0.1/tars/TensorRT-10.0.1.6.Linux.x86_64-gnu.cuda-12.4.tar.gz
+TRT_DIR=<YOUR_TRT_DIR>
+MODEL_DIR=<COSYVOICE2_MODEL_DIR>
+
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$TRT_DIR/lib:/usr/local/cuda/lib64
+$TRT_DIR/bin/trtexec --onnx=$MODEL_DIR/flow.decoder.estimator.fp32.onnx --saveEngine=$MODEL_DIR/flow.decoder.estimator.fp16.mygpu.plan --fp16 --minShapes=x:2x80x4,mask:2x1x4,mu:2x80x4,cond:2x80x4 --optShapes=x:2x80x193,mask:2x1x193,mu:2x80x193,cond:2x80x193 --maxShapes=x:2x80x6800,mask:2x1x6800,mu:2x80x6800,cond:2x80x6800 --inputIOFormats=fp16:chw,fp16:chw,fp16:chw,fp16:chw,fp16:chw,fp16:chw --outputIOFormats=fp16:chw
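These two new scripts chain together: export_onnx.py writes flow.decoder.estimator.fp32.onnx into the model directory, and export_trt.sh then compiles that ONNX graph into the fp16 TensorRT engine flow.decoder.estimator.fp16.mygpu.plan; the min/opt/max shapes bound the 80-channel feature sequence lengths trtexec will specialize the engine for.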
xinference/thirdparty/cosyvoice/bin/inference.py
@@ -18,16 +18,15 @@ import argparse
 import logging
 logging.getLogger('matplotlib').setLevel(logging.WARNING)
 import os
-
 import torch
 from torch.utils.data import DataLoader
 import torchaudio
 from hyperpyyaml import load_hyperpyyaml
 from tqdm import tqdm
 from cosyvoice.cli.model import CosyVoiceModel
-
 from cosyvoice.dataset.dataset import Dataset
 
+
 def get_args():
     parser = argparse.ArgumentParser(description='inference with your model')
     parser.add_argument('--config', required=True, help='config file')
@@ -66,7 +65,8 @@ def main():
     model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'])
     model.load(args.llm_model, args.flow_model, args.hifigan_model)
 
-    test_dataset = Dataset(args.prompt_data, data_pipeline=configs['data_pipeline'], mode='inference', shuffle=False, partition=False,
+    test_dataset = Dataset(args.prompt_data, data_pipeline=configs['data_pipeline'], mode='inference', shuffle=False, partition=False,
+                           tts_file=args.tts_text, prompt_utt2data=args.prompt_utt2data)
     test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0)
 
     del configs
@@ -74,13 +74,11 @@ def main():
     fn = os.path.join(args.result_dir, 'wav.scp')
     f = open(fn, 'w')
     with torch.no_grad():
-        for
+        for _, batch in tqdm(enumerate(test_data_loader)):
             utts = batch["utts"]
             assert len(utts) == 1, "inference mode only support batchsize 1"
-            text = batch["text"]
             text_token = batch["text_token"].to(device)
             text_token_len = batch["text_token_len"].to(device)
-            tts_text = batch["tts_text"]
             tts_index = batch["tts_index"]
             tts_text_token = batch["tts_text_token"].to(device)
             tts_text_token_len = batch["tts_text_token_len"].to(device)
@@ -101,7 +99,7 @@ def main():
                            'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
                            'llm_embedding': utt_embedding, 'flow_embedding': utt_embedding}
             tts_speeches = []
-            for model_output in model.
+            for model_output in model.tts(**model_input):
                 tts_speeches.append(model_output['tts_speech'])
             tts_speeches = torch.concat(tts_speeches, dim=1)
             tts_key = '{}_{}'.format(utts[0], tts_index[0])