minicpmo_utils-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148)
  1. cosyvoice/__init__.py +17 -0
  2. cosyvoice/bin/average_model.py +93 -0
  3. cosyvoice/bin/export_jit.py +103 -0
  4. cosyvoice/bin/export_onnx.py +120 -0
  5. cosyvoice/bin/inference_deprecated.py +126 -0
  6. cosyvoice/bin/train.py +195 -0
  7. cosyvoice/cli/__init__.py +0 -0
  8. cosyvoice/cli/cosyvoice.py +209 -0
  9. cosyvoice/cli/frontend.py +238 -0
  10. cosyvoice/cli/model.py +386 -0
  11. cosyvoice/dataset/__init__.py +0 -0
  12. cosyvoice/dataset/dataset.py +151 -0
  13. cosyvoice/dataset/processor.py +434 -0
  14. cosyvoice/flow/decoder.py +494 -0
  15. cosyvoice/flow/flow.py +281 -0
  16. cosyvoice/flow/flow_matching.py +227 -0
  17. cosyvoice/flow/length_regulator.py +70 -0
  18. cosyvoice/hifigan/discriminator.py +230 -0
  19. cosyvoice/hifigan/f0_predictor.py +58 -0
  20. cosyvoice/hifigan/generator.py +582 -0
  21. cosyvoice/hifigan/hifigan.py +67 -0
  22. cosyvoice/llm/llm.py +610 -0
  23. cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +58836 -0
  24. cosyvoice/tokenizer/tokenizer.py +279 -0
  25. cosyvoice/transformer/__init__.py +0 -0
  26. cosyvoice/transformer/activation.py +84 -0
  27. cosyvoice/transformer/attention.py +330 -0
  28. cosyvoice/transformer/convolution.py +145 -0
  29. cosyvoice/transformer/decoder.py +396 -0
  30. cosyvoice/transformer/decoder_layer.py +132 -0
  31. cosyvoice/transformer/embedding.py +302 -0
  32. cosyvoice/transformer/encoder.py +474 -0
  33. cosyvoice/transformer/encoder_layer.py +236 -0
  34. cosyvoice/transformer/label_smoothing_loss.py +96 -0
  35. cosyvoice/transformer/positionwise_feed_forward.py +115 -0
  36. cosyvoice/transformer/subsampling.py +383 -0
  37. cosyvoice/transformer/upsample_encoder.py +320 -0
  38. cosyvoice/utils/__init__.py +0 -0
  39. cosyvoice/utils/class_utils.py +83 -0
  40. cosyvoice/utils/common.py +186 -0
  41. cosyvoice/utils/executor.py +176 -0
  42. cosyvoice/utils/file_utils.py +129 -0
  43. cosyvoice/utils/frontend_utils.py +136 -0
  44. cosyvoice/utils/losses.py +57 -0
  45. cosyvoice/utils/mask.py +265 -0
  46. cosyvoice/utils/scheduler.py +738 -0
  47. cosyvoice/utils/train_utils.py +367 -0
  48. cosyvoice/vllm/cosyvoice2.py +103 -0
  49. matcha/__init__.py +0 -0
  50. matcha/app.py +357 -0
  51. matcha/cli.py +418 -0
  52. matcha/hifigan/__init__.py +0 -0
  53. matcha/hifigan/config.py +28 -0
  54. matcha/hifigan/denoiser.py +64 -0
  55. matcha/hifigan/env.py +17 -0
  56. matcha/hifigan/meldataset.py +217 -0
  57. matcha/hifigan/models.py +368 -0
  58. matcha/hifigan/xutils.py +60 -0
  59. matcha/models/__init__.py +0 -0
  60. matcha/models/baselightningmodule.py +209 -0
  61. matcha/models/components/__init__.py +0 -0
  62. matcha/models/components/decoder.py +443 -0
  63. matcha/models/components/flow_matching.py +132 -0
  64. matcha/models/components/text_encoder.py +410 -0
  65. matcha/models/components/transformer.py +316 -0
  66. matcha/models/matcha_tts.py +239 -0
  67. matcha/onnx/__init__.py +0 -0
  68. matcha/onnx/export.py +181 -0
  69. matcha/onnx/infer.py +168 -0
  70. matcha/text/__init__.py +53 -0
  71. matcha/text/cleaners.py +116 -0
  72. matcha/text/numbers.py +71 -0
  73. matcha/text/symbols.py +17 -0
  74. matcha/train.py +122 -0
  75. matcha/utils/__init__.py +5 -0
  76. matcha/utils/audio.py +82 -0
  77. matcha/utils/generate_data_statistics.py +111 -0
  78. matcha/utils/instantiators.py +56 -0
  79. matcha/utils/logging_utils.py +53 -0
  80. matcha/utils/model.py +90 -0
  81. matcha/utils/monotonic_align/__init__.py +22 -0
  82. matcha/utils/monotonic_align/setup.py +7 -0
  83. matcha/utils/pylogger.py +21 -0
  84. matcha/utils/rich_utils.py +101 -0
  85. matcha/utils/utils.py +219 -0
  86. minicpmo/__init__.py +24 -0
  87. minicpmo/utils.py +636 -0
  88. minicpmo/version.py +2 -0
  89. minicpmo_utils-0.1.0.dist-info/METADATA +72 -0
  90. minicpmo_utils-0.1.0.dist-info/RECORD +148 -0
  91. minicpmo_utils-0.1.0.dist-info/WHEEL +5 -0
  92. minicpmo_utils-0.1.0.dist-info/top_level.txt +5 -0
  93. s3tokenizer/__init__.py +153 -0
  94. s3tokenizer/assets/BAC009S0764W0121.wav +0 -0
  95. s3tokenizer/assets/BAC009S0764W0122.wav +0 -0
  96. s3tokenizer/assets/mel_filters.npz +0 -0
  97. s3tokenizer/cli.py +183 -0
  98. s3tokenizer/model.py +546 -0
  99. s3tokenizer/model_v2.py +605 -0
  100. s3tokenizer/utils.py +390 -0
  101. stepaudio2/__init__.py +40 -0
  102. stepaudio2/cosyvoice2/__init__.py +1 -0
  103. stepaudio2/cosyvoice2/flow/__init__.py +0 -0
  104. stepaudio2/cosyvoice2/flow/decoder_dit.py +585 -0
  105. stepaudio2/cosyvoice2/flow/flow.py +230 -0
  106. stepaudio2/cosyvoice2/flow/flow_matching.py +205 -0
  107. stepaudio2/cosyvoice2/transformer/__init__.py +0 -0
  108. stepaudio2/cosyvoice2/transformer/attention.py +328 -0
  109. stepaudio2/cosyvoice2/transformer/embedding.py +119 -0
  110. stepaudio2/cosyvoice2/transformer/encoder_layer.py +163 -0
  111. stepaudio2/cosyvoice2/transformer/positionwise_feed_forward.py +56 -0
  112. stepaudio2/cosyvoice2/transformer/subsampling.py +79 -0
  113. stepaudio2/cosyvoice2/transformer/upsample_encoder_v2.py +483 -0
  114. stepaudio2/cosyvoice2/utils/__init__.py +1 -0
  115. stepaudio2/cosyvoice2/utils/class_utils.py +41 -0
  116. stepaudio2/cosyvoice2/utils/common.py +101 -0
  117. stepaudio2/cosyvoice2/utils/mask.py +49 -0
  118. stepaudio2/flashcosyvoice/__init__.py +0 -0
  119. stepaudio2/flashcosyvoice/cli.py +424 -0
  120. stepaudio2/flashcosyvoice/config.py +80 -0
  121. stepaudio2/flashcosyvoice/cosyvoice2.py +160 -0
  122. stepaudio2/flashcosyvoice/cosyvoice3.py +1 -0
  123. stepaudio2/flashcosyvoice/engine/__init__.py +0 -0
  124. stepaudio2/flashcosyvoice/engine/block_manager.py +114 -0
  125. stepaudio2/flashcosyvoice/engine/llm_engine.py +125 -0
  126. stepaudio2/flashcosyvoice/engine/model_runner.py +310 -0
  127. stepaudio2/flashcosyvoice/engine/scheduler.py +77 -0
  128. stepaudio2/flashcosyvoice/engine/sequence.py +90 -0
  129. stepaudio2/flashcosyvoice/modules/__init__.py +0 -0
  130. stepaudio2/flashcosyvoice/modules/flow.py +198 -0
  131. stepaudio2/flashcosyvoice/modules/flow_components/__init__.py +0 -0
  132. stepaudio2/flashcosyvoice/modules/flow_components/estimator.py +974 -0
  133. stepaudio2/flashcosyvoice/modules/flow_components/upsample_encoder.py +998 -0
  134. stepaudio2/flashcosyvoice/modules/hifigan.py +249 -0
  135. stepaudio2/flashcosyvoice/modules/hifigan_components/__init__.py +0 -0
  136. stepaudio2/flashcosyvoice/modules/hifigan_components/layers.py +433 -0
  137. stepaudio2/flashcosyvoice/modules/qwen2.py +92 -0
  138. stepaudio2/flashcosyvoice/modules/qwen2_components/__init__.py +0 -0
  139. stepaudio2/flashcosyvoice/modules/qwen2_components/layers.py +616 -0
  140. stepaudio2/flashcosyvoice/modules/sampler.py +231 -0
  141. stepaudio2/flashcosyvoice/utils/__init__.py +0 -0
  142. stepaudio2/flashcosyvoice/utils/audio.py +77 -0
  143. stepaudio2/flashcosyvoice/utils/context.py +28 -0
  144. stepaudio2/flashcosyvoice/utils/loader.py +116 -0
  145. stepaudio2/flashcosyvoice/utils/memory.py +19 -0
  146. stepaudio2/stepaudio2.py +204 -0
  147. stepaudio2/token2wav.py +248 -0
  148. stepaudio2/utils.py +91 -0
stepaudio2/cosyvoice2/utils/common.py
@@ -0,0 +1,101 @@
+ # Copyright (c) 2020 Mobvoi Inc (Binbin Zhang)
+ #               2024 Alibaba Inc (authors: Xiang Lyu)
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # Modified from ESPnet(https://github.com/espnet/espnet)
+ """Utility functions for Transformer."""
+
+ import random
+ from typing import List
+
+ import numpy as np
+ import torch
+
+ IGNORE_ID = -1
+
+
+ def pad_list(xs: List[torch.Tensor], pad_value: int):
+     """Perform padding for the list of tensors.
+
+     Args:
+         xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
+         pad_value (float): Value for padding.
+
+     Returns:
+         Tensor: Padded tensor (B, Tmax, `*`).
+
+     Examples:
+         >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)]
+         >>> x
+         [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])]
+         >>> pad_list(x, 0)
+         tensor([[1., 1., 1., 1.],
+                 [1., 1., 0., 0.],
+                 [1., 0., 0., 0.]])
+
+     """
+     max_len = max([len(item) for item in xs])
+     batchs = len(xs)
+     ndim = xs[0].ndim
+     if ndim == 1:
+         pad_res = torch.zeros(batchs,
+                               max_len,
+                               dtype=xs[0].dtype,
+                               device=xs[0].device)
+     elif ndim == 2:
+         pad_res = torch.zeros(batchs,
+                               max_len,
+                               xs[0].shape[1],
+                               dtype=xs[0].dtype,
+                               device=xs[0].device)
+     elif ndim == 3:
+         pad_res = torch.zeros(batchs,
+                               max_len,
+                               xs[0].shape[1],
+                               xs[0].shape[2],
+                               dtype=xs[0].dtype,
+                               device=xs[0].device)
+     else:
+         raise ValueError(f"Unsupported ndim: {ndim}")
+     pad_res.fill_(pad_value)
+     for i in range(batchs):
+         pad_res[i, :len(xs[i])] = xs[i]
+     return pad_res
+
+
+ def get_padding(kernel_size, dilation=1):
+     return int((kernel_size * dilation - dilation) / 2)
+
+
+ def init_weights(m, mean=0.0, std=0.01):
+     classname = m.__class__.__name__
+     if classname.find("Conv") != -1:
+         m.weight.data.normal_(mean, std)
+
+
+ def fade_in_out(fade_in_mel, fade_out_mel, window):
+     device = fade_in_mel.device
+     fade_in_mel, fade_out_mel = fade_in_mel.cpu(), fade_out_mel.cpu()
+     mel_overlap_len = int(window.shape[0] / 2)
+     if fade_in_mel.device == torch.device('cpu'):
+         fade_in_mel = fade_in_mel.clone()
+     fade_in_mel[..., :mel_overlap_len] = fade_in_mel[..., :mel_overlap_len] * window[:mel_overlap_len] + \
+         fade_out_mel[..., -mel_overlap_len:] * window[mel_overlap_len:]
+     return fade_in_mel.to(device)
+
+
+ def set_all_random_seed(seed):
+     random.seed(seed)
+     np.random.seed(seed)
+     torch.manual_seed(seed)
+     torch.cuda.manual_seed_all(seed)
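
For orientation, a minimal usage sketch of the two less obvious helpers above (illustrative only, not part of the package; the tensor shapes, overlap length, and Hann window are assumed values):

    import torch
    from stepaudio2.cosyvoice2.utils.common import fade_in_out, pad_list

    # pad_list: ragged list of [T_i] tensors -> dense [B, Tmax] tensor.
    padded = pad_list([torch.ones(4), torch.ones(2), torch.ones(1)], pad_value=0)

    # fade_in_out: cross-fade two mel chunks over half the window length.
    # The ascending first half of the window scales the incoming chunk's head,
    # the descending second half scales the outgoing chunk's tail.
    overlap = 8
    window = torch.hann_window(2 * overlap, periodic=False)
    incoming = torch.randn(1, 80, 50)   # [B, num_mels, T], fading in
    outgoing = torch.randn(1, 80, 50)   # previous chunk, fading out
    smoothed = fade_in_out(incoming, outgoing, window)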
stepaudio2/cosyvoice2/utils/mask.py
@@ -0,0 +1,49 @@
+ # Copyright (c) 2019 Shigeki Karita
+ #               2020 Mobvoi Inc (Binbin Zhang)
+ #               2024 Alibaba Inc (authors: Xiang Lyu)
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import math
+ import torch
+ from typing import List
+
+
+ def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor:
+     """Make mask tensor containing indices of padded part.
+
+     See description of make_non_pad_mask.
+
+     Args:
+         lengths (torch.Tensor): Batch of lengths (B,).
+     Returns:
+         torch.Tensor: Mask tensor containing indices of padded part.
+
+     Examples:
+         >>> lengths = [5, 3, 2]
+         >>> make_pad_mask(lengths)
+         masks = [[0, 0, 0, 0, 0],
+                  [0, 0, 0, 1, 1],
+                  [0, 0, 1, 1, 1]]
+     """
+     batch_size = lengths.size(0)
+     max_len = max_len if max_len > 0 else lengths.max().item()
+     seq_range = torch.arange(0,
+                              max_len,
+                              dtype=torch.int64,
+                              device=lengths.device)
+     seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
+     seq_length_expand = lengths.unsqueeze(-1)
+     mask = seq_range_expand >= seq_length_expand
+     return mask
+
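
A short usage sketch (illustrative values): make_pad_mask returns a boolean [B, Tmax] tensor that is True on padded positions, which is the shape masked_fill expects, e.g. for pre-softmax attention masking:

    import torch
    from stepaudio2.cosyvoice2.utils.mask import make_pad_mask

    lengths = torch.tensor([5, 3, 2])
    mask = make_pad_mask(lengths)         # [3, 5] bool, True where padded
    scores = torch.randn(3, 5)
    scores = scores.masked_fill(mask, float('-inf'))  # padding never wins softmax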
stepaudio2/flashcosyvoice/cli.py
@@ -0,0 +1,424 @@
+ # Copyright (c) 2025 Tsinghua Univ. (authors: Xingchen Song)
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Example usage: see README.md
+ """
+
+ import argparse
+ import json
+ import os
+ import random
+ import sys
+ import time
+ from concurrent.futures import ThreadPoolExecutor
+ from datetime import datetime
+
+ import numpy as np
+ import onnxruntime
+ import s3tokenizer
+ import torch
+ import torch.distributed as dist
+ import torchaudio
+ import torchaudio.compliance.kaldi as kaldi
+ from torch.utils.data import DataLoader, Dataset, DistributedSampler
+ from tqdm import tqdm
+
+ from stepaudio2.flashcosyvoice.config import Config, CosyVoice2LLMConfig, SamplingParams
+ from stepaudio2.flashcosyvoice.cosyvoice2 import CosyVoice2
+ from stepaudio2.flashcosyvoice.utils.audio import mel_spectrogram
+
+
+ def set_all_random_seed(seed):
+     random.seed(seed)
+     np.random.seed(seed)
+     torch.manual_seed(seed)
+     torch.cuda.manual_seed_all(seed)
+
+
+ def save_file_async(
+     wav, prompt_speech_tokens, generated_speech_tokens,
+     info, timing_stats
+ ):
+     """Save audio asynchronously."""
+     try:
+         os.makedirs(os.path.dirname(info['wav']), exist_ok=True)
+         if wav is not None:
+             wav = wav.cpu()
+             torchaudio.save(info['wav'], wav, 24000)
+             duration = wav.shape[-1] / 24000.0
+             rtf = ((timing_stats['dataloader_time'] + timing_stats['model_inference_time']) / timing_stats['batch_size']) / duration
+             timing_stats['rtf'] = rtf
+         else:
+             duration = 0.0
+         info['timing_stats'] = timing_stats
+         info['prompt_speech_tokens'] = prompt_speech_tokens
+         info['generated_speech_tokens'] = generated_speech_tokens
+         with open(f"{info['wav'].replace('.wav', '.json')}", "w") as f:
+             json.dump(info, f, ensure_ascii=False, indent=4)
+         return duration
+     except Exception as e:
+         timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3]
+         tqdm.write(f"[{timestamp}] - [ERROR] - Error saving audio {info.get('key', 'unknown')}: {e}")
+         return 0.0
+
+
+ class AudioDataset(Dataset):
+
+     def __init__(self, text_norm, text_tokenizer, data_list, model_config: Config):
+         self.datas = []
+         self.text_norm = text_norm
+         self.model_config = model_config
+
+         """Example data_list:
+         ```
+         {"key": "uttid_1", "prompt_text": "你好,我是小明。", "text": "你好,我是小红。", "prompt_wav": "/mnt/data/audio/00000000.wav", "wav": "/mnt/data/audio_synthetic/uttid_1.wav"}
+         {"key": "uttid_2", "prompt_text": "你好,我是小红。", "text": "你好,我是小明。", "prompt_wav": "/mnt/data/audio/00000001.wav", "wav": "/mnt/data/audio_synthetic/uttid_2.wav"}
+         ```
+         Note:
+             - `key` is the key of this sample.
+             - `prompt_text` is the text used for the prompt.
+             - `text` is the text used for generating real audio.
+             - `prompt_wav` is the audio used for the prompt.
+             - `wav` is the path where the generated audio will be saved (we highly recommend pre-defining the save path before running the script).
+         """
+         missing = 0
+         with open(data_list, 'r', encoding='utf-8') as f:
+             lines = f.readlines()
+         total_lines = len(lines)
+         if torch.distributed.get_node_local_rank() == 0:
+             iterator = tqdm(lines, desc='Loading data')
+         else:
+             iterator = lines
+         for line in iterator:
+             data = json.loads(line.strip())
+             valid = True
+             for k in ['key', 'prompt_text', 'text', 'prompt_wav']:
+                 if k not in data:
+                     valid = False
+                     break
+                 if data[k] is None:
+                     valid = False
+                     break
+             if not os.path.exists(data['prompt_wav']):
+                 valid = False
+             if valid:
+                 self.datas.append(data)
+             else:
+                 missing += 1
+         if torch.distributed.get_node_local_rank() == 0:
+             timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3]
+             tqdm.write(f'[{timestamp}] - [INFO] - Loaded {total_lines} lines, found {missing} missing lines, total valid lines == {len(self.datas)}.')
+
+         self.text_tokenizer = text_tokenizer
+
+         option = onnxruntime.SessionOptions()
+         option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
+         option.intra_op_num_threads = 1
+         self.spk_model = onnxruntime.InferenceSession(f"{self.model_config.model}/campplus.onnx", sess_options=option,
+                                                       providers=["CPUExecutionProvider"])
+
+     def __len__(self):
+         return len(self.datas)
+
+     def __getitem__(self, idx):
+         data = self.datas[idx]
+
+         try:
+             # 1. feature for s3tokenizer
+             audio = s3tokenizer.load_audio(data['prompt_wav'], sr=16000)  # [T]
+             log_mel = s3tokenizer.log_mel_spectrogram(audio)  # [num_mels, T]
+
+             # 2. feature for speaker embedding
+             spk_feat = kaldi.fbank(audio.unsqueeze(0), num_mel_bins=80, dither=0, sample_frequency=16000)
+             spk_feat = spk_feat - spk_feat.mean(dim=0, keepdim=True)
+             spk_emb = self.spk_model.run(
+                 None, {self.spk_model.get_inputs()[0].name: spk_feat.unsqueeze(dim=0).cpu().numpy()}
+             )[0].flatten().tolist()
+
+             # 3. feature for flow
+             audio, sample_rate = torchaudio.load(data['prompt_wav'], backend='soundfile')
+             audio = audio.mean(dim=0, keepdim=True)  # [1, T]
+             if sample_rate != 24000:
+                 audio = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=24000)(audio)
+             mel = mel_spectrogram(audio).transpose(1, 2).squeeze(0)  # [T, num_mels]
+             mel_len = mel.shape[0]
+
+             # 4. feature for llm
+             if self.text_norm is not None:
+                 prompt_texts = [i["text"] for i in json.loads(self.text_norm.do_voicegen_frd(data['prompt_text'].strip()))["sentences"]]
+                 prompt_text = ''.join(prompt_texts)
+                 texts = [i["text"] for i in json.loads(self.text_norm.do_voicegen_frd(data['text'].strip()))["sentences"]]
+                 text = ''.join(texts)
+             else:
+                 prompt_text = data['prompt_text']
+                 text = data['text']
+             prompt_text_ids = self.text_tokenizer.encode(prompt_text)
+             prompt_text_ids = [i + self.model_config.hf_config.speech_vocab_size + 2 for i in prompt_text_ids]
+             text_ids = self.text_tokenizer.encode(text)
+             text_ids = [i + self.model_config.hf_config.speech_vocab_size + 2 for i in text_ids]
+             item = {
+                 "prompt_text_tokens": prompt_text_ids, "text_tokens": text_ids,
+                 "spk_emb": spk_emb, "mel": mel, "mel_len": mel_len, "log_mel": log_mel, "info": data,
+                 "min_tokens": len(text_ids) * self.model_config.min_token_text_ratio,
+                 "max_tokens": len(text_ids) * self.model_config.max_token_text_ratio,
+             }
+         except Exception as e:
+             timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3]
+             tqdm.write(f"[{timestamp}] - [WARNING] - Error processing data item {data.get('key', idx)}: {e}")
+             return None
+         return item
+
+
+ def collate_fn(batch):
+     prompt_mels_for_llm = [item["log_mel"] for item in batch if item is not None]
+     prompt_mels_for_llm, prompt_mels_lens_for_llm = s3tokenizer.padding(prompt_mels_for_llm)  # [B, num_mels=128, T]
+     prompt_text_tokens_for_llm = [item["prompt_text_tokens"] for item in batch if item is not None]
+     text_tokens_for_llm = [item["text_tokens"] for item in batch if item is not None]
+     prompt_mels_for_flow = [item["mel"] for item in batch if item is not None]
+     prompt_mels_for_flow = torch.nn.utils.rnn.pad_sequence(prompt_mels_for_flow, batch_first=True, padding_value=0)  # [B, T', num_mels=80]
+     prompt_mels_lens_for_flow = [item["mel_len"] for item in batch if item is not None]
+     prompt_mels_lens_for_flow = torch.tensor(prompt_mels_lens_for_flow)
+     spk_emb_for_flow = [item["spk_emb"] for item in batch if item is not None]
+     spk_emb_for_flow = torch.tensor(spk_emb_for_flow)
+     sampling_params = [SamplingParams(min_tokens=item["min_tokens"], max_tokens=item["max_tokens"], use_ras=True) for item in batch if item is not None]
+     infos = [item["info"] for item in batch if item is not None]
+     return {
+         "prompt_mels_for_llm": prompt_mels_for_llm,
+         "prompt_mels_lens_for_llm": prompt_mels_lens_for_llm,
+         "prompt_text_tokens_for_llm": prompt_text_tokens_for_llm,
+         "text_tokens_for_llm": text_tokens_for_llm,
+         "prompt_mels_for_flow": prompt_mels_for_flow,
+         "prompt_mels_lens_for_flow": prompt_mels_lens_for_flow,
+         "spk_emb_for_flow": spk_emb_for_flow,
+         "sampling_params": sampling_params,
+         "infos": infos,
+     }
+
+
+ def init_distributed():
+     world_size = int(os.environ.get('WORLD_SIZE', 1))
+     local_rank = int(os.environ.get('LOCAL_RANK', 0))
+     rank = int(os.environ.get('RANK', 0))
+     timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3]
+     tqdm.write(f'[{timestamp}] - [INFO] - Inference on multiple gpus, this gpu {local_rank}, rank {rank}, world_size {world_size}')
+     torch.cuda.set_device(local_rank)
+     dist.init_process_group("nccl")
+     return world_size, local_rank, rank
+
+
+ def get_args():
+     parser = argparse.ArgumentParser(description='FlashCosyVoice')
+     parser.add_argument('--model_path',
+                         required=True,
+                         type=str,
+                         help='model path')
+     parser.add_argument('--data_list',
+                         required=True,
+                         type=str,
+                         help='data list')
+     parser.add_argument('--batch_size_dataloader',
+                         required=True,
+                         type=int,
+                         help='batch size (per-device) for dataloading')
+     parser.add_argument('--batch_size_flow',
+                         required=True,
+                         type=int,
+                         help='batch size (per-device) for flow-matching')
+     parser.add_argument('--num_workers',
+                         type=int,
+                         default=4,
+                         help='workers for dataloader')
+     parser.add_argument('--prefetch',
+                         type=int,
+                         default=5,
+                         help='prefetch for dataloader')
+     parser.add_argument('--enable_tn',
+                         action='store_true',
+                         help='enable text normalization')
+     parser.add_argument('--only_llm',
+                         action='store_true',
+                         help='only generate speech tokens from llm')
+     parser.add_argument('--fp16_flow',
+                         action='store_true',
+                         help='enable fp16 flow')
+     parser.add_argument('--seed',
+                         type=int,
+                         default=1986,
+                         help='random seed for generation')
+     args = parser.parse_args()
+     return args
+
+
+ def main():
+     args = get_args()
+
+     if args.enable_tn:
+         # Check python version; if == 3.10, use ttsfrd
+         if sys.version_info.major == 3 and sys.version_info.minor == 10:
+             # Check if ttsfrd is installed
+             try:
+                 import ttsfrd
+                 from cosyvoice_ttsfrd import get_resource_path
+             except ImportError as e:
+                 raise ImportError("ttsfrd is not installed, please install it first, see `https://github.com/xingchensong/CosyVoice-ttsfrd` for installation guide.") from e
+             text_norm = ttsfrd.TtsFrontendEngine()
+             text_norm.initialize(get_resource_path())
+             text_norm.set_lang_type('pinyinvg')
+         else:
+             timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3]
+             tqdm.write(f"[{timestamp}] - [WARNING] - Only python 3.10 is supported for ttsfrd, see `https://github.com/xingchensong/CosyVoice-ttsfrd` for more info. Setting enable_tn to False...")
+             # TODO: maybe we should use wetext if python version is not 3.10?
+             args.enable_tn = False
+             text_norm = None
+     else:
+         text_norm = None
+
+     assert torch.cuda.is_available()
+     world_size, local_rank, rank = init_distributed()
+     config = Config(model=args.model_path, enforce_eager=True, tensor_parallel_size=1,
+                     max_num_seqs=args.batch_size_dataloader,
+                     hf_config=CosyVoice2LLMConfig(fp16_flow=args.fp16_flow), rank=local_rank)
+     model = CosyVoice2(config)
+
+     set_all_random_seed(args.seed)
+
+     dataset = AudioDataset(text_norm, model.llm.tokenizer, args.data_list, config)
+     sampler = DistributedSampler(dataset,
+                                  num_replicas=world_size,
+                                  rank=rank)
+     dataloader = DataLoader(dataset, batch_size=args.batch_size_dataloader, num_workers=args.num_workers, pin_memory=True,
+                             sampler=sampler, shuffle=False, prefetch_factor=args.prefetch, collate_fn=collate_fn)
+     total_steps = len(dataset)
+
+     if local_rank == 0:
+         timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3]
+         tqdm.write(f"[{timestamp}] - [INFO] - {args}")
+         progress_bar = tqdm(total=total_steps, desc="Processing samples", unit="wav",
+                             position=0, leave=True, dynamic_ncols=True)
+
+     cpu_counts = os.cpu_count()
+     executor = ThreadPoolExecutor(max_workers=min(args.batch_size_dataloader, cpu_counts // 8))
+     pending_futures = []
+     dataloader_iter = iter(dataloader)
+     succeed_duration = 0.01  # avoid division by zero
+     start_time = time.time()
+     estimated_total_wavs = 0
+     succeed_wavs = 0
+     failed_wavs = 0
+     last_print_time = start_time
+
+     while True:
+         try:
+             dataloader_start = time.time()
+             batch = next(dataloader_iter)
+             dataloader_time = time.time() - dataloader_start
+
+             if len(batch['infos']) == 0:
+                 timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3]
+                 tqdm.write(f"[{timestamp}] - [WARNING] - rank {rank} of {world_size}: No valid batch found, skipping this batch...")
+                 continue
+
+             model_start = time.time()
+             results_dict, timing_stats = model(**batch, batch_size_flow=args.batch_size_flow,
+                                                only_llm=args.only_llm)
+             model_time = time.time() - model_start
+
+             estimated_total_wavs += len(results_dict['generated_wavs'])
+
+             timing_stats['dataloader_time'] = dataloader_time
+             timing_stats['model_inference_time'] = model_time
+
+             if args.only_llm:
+                 results_dict['generated_wavs'] = [None] * len(results_dict['prompt_speech_tokens'])
+
+             for i in range(len(results_dict['generated_wavs'])):
+                 future = executor.submit(
+                     save_file_async, results_dict['generated_wavs'][i],
+                     results_dict['prompt_speech_tokens'][i],
+                     results_dict['generated_speech_tokens'][i],
+                     batch['infos'][i].copy(), timing_stats.copy()
+                 )
+                 pending_futures.append(future)
+
+             completed_futures = []
+             for future in pending_futures:
+                 if future.done():
+                     try:
+                         duration = future.result()
+                         succeed_duration += duration
+                         succeed_wavs += 1
+                     except Exception as e:
+                         failed_wavs += 1
+                         timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3]
+                         tqdm.write(f"[{timestamp}] - [ERROR] - rank {rank} of {world_size}: Error in async save task: {e}")
+                     completed_futures.append(future)
+
+             for future in completed_futures:
+                 pending_futures.remove(future)
+
+             if local_rank == 0:
+                 update_n = world_size * len(batch["prompt_text_tokens_for_llm"])
+                 if progress_bar.n + update_n > progress_bar.total:
+                     progress_bar.update(progress_bar.total - progress_bar.n)
+                 else:
+                     progress_bar.update(update_n)
+
+             current_time = time.time()
+             if current_time - last_print_time >= 120 and not args.only_llm:
+                 elapsed_time = current_time - start_time
+                 avg_duration = succeed_duration / succeed_wavs if succeed_wavs > 0 else 0
+                 estimated_total_duration = avg_duration * estimated_total_wavs
+                 current_rtf = elapsed_time / estimated_total_duration if estimated_total_duration > 0.01 else 0
+                 timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3]
+                 tqdm.write(f"[{timestamp}] - [INFO] - rank {rank} of {world_size}: Estimated total wavs: {estimated_total_wavs} ({estimated_total_wavs - succeed_wavs} pending to save), Succeed wavs: {succeed_wavs}, Failed wavs: {failed_wavs}, Estimated total duration: {estimated_total_duration:.2f}s ({estimated_total_duration / 3600:.2f} h), Estimated RTF: {current_rtf:.5f}, Elapsed time: {elapsed_time:.2f}s")  # noqa
+                 last_print_time = current_time
+         except StopIteration:
+             break
+         except Exception as e:
+             failed_wavs += 1
+             timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3]
+             tqdm.write(f"[{timestamp}] - [ERROR] - rank {rank} of {world_size}: Error in main loop: {e}")
+             continue
+
+     total_time = time.time() - start_time
+
+     if local_rank == 0:
+         timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3]
+         tqdm.write(f"[{timestamp}] - [INFO] - Waiting for {len(pending_futures)} pending save tasks to complete...")
+
+     for future in pending_futures:
+         try:
+             duration = future.result(timeout=60)
+             succeed_duration += duration
+             succeed_wavs += 1
+         except Exception as e:
+             failed_wavs += 1
+             timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3]
+             tqdm.write(f"[{timestamp}] - [ERROR] - rank {rank} of {world_size}: Error in final async save task: {e}")
+     executor.shutdown(wait=True)
+
+     if local_rank == 0:
+         timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3]
+         tqdm.write(f"[{timestamp}] - [INFO] - All async save tasks completed.")
+         progress_bar.close()
+
+     if not args.only_llm:
+         timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3]
+         tqdm.write(f"[{timestamp}] - [INFO] - rank {rank} of {world_size}: Final Report - Succeed wavs: {succeed_wavs}, Failed wavs: {failed_wavs}, Total duration: {succeed_duration:.2f}s ({succeed_duration / 3600:.2f} h), RTF: {total_time / succeed_duration:.5f}")  # noqa
+
+     dist.barrier()
+     dist.destroy_process_group()
+
+
+ if __name__ == "__main__":
+     main()
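
Two practical notes, inferred from the code above rather than from packaged docs: the input is the JSONL data_list validated in AudioDataset.__init__, and, because main() reads WORLD_SIZE/RANK/LOCAL_RANK and initializes an NCCL process group, the script is meant to be launched through a distributed launcher such as torchrun (e.g. `torchrun --nproc_per_node=8 -m stepaudio2.flashcosyvoice.cli ...`). A sketch that emits one valid data_list entry; the key, paths, and output filename are placeholders:

    import json

    # Required keys checked by AudioDataset: 'key', 'prompt_text', 'text', 'prompt_wav';
    # 'wav' is where the synthesized audio (and a .json report) will be written.
    entry = {
        "key": "uttid_1",
        "prompt_text": "你好,我是小明。",
        "text": "你好,我是小红。",
        "prompt_wav": "/mnt/data/audio/00000000.wav",
        "wav": "/mnt/data/audio_synthetic/uttid_1.wav",
    }
    with open("data.jsonl", "w", encoding="utf-8") as f:
        f.write(json.dumps(entry, ensure_ascii=False) + "\n")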
stepaudio2/flashcosyvoice/config.py
@@ -0,0 +1,80 @@
+ import os
+ from dataclasses import dataclass, field
+
+ import torch
+ from transformers import AutoConfig
+
+
+ @dataclass
+ class CosyVoice2LLMConfig:
+     architectures: list[str] = field(default_factory=lambda: ["Qwen2ForCausalLM"])
+     attention_dropout: float = 0.0
+     bos_token_id: int = 151643
+     eos_token_id: int = 6561  # speech eos
+     hidden_act: str = "silu"
+     hidden_size: int = 896
+     initializer_range: float = 0.02
+     intermediate_size: int = 4864
+     max_position_embeddings: int = 32768
+     max_window_layers: int = 24
+     model_type: str = "qwen2"
+     num_attention_heads: int = 14
+     num_hidden_layers: int = 24
+     num_key_value_heads: int = 2
+     head_dim: int = 64
+     rms_norm_eps: float = 1e-06
+     rope_scaling: dict | None = None
+     rope_theta: float = 1000000.0
+     sliding_window: int = 32768
+     tie_word_embeddings: bool = False
+     torch_dtype: torch.dtype = torch.bfloat16
+     transformers_version: str = "4.52.0.dev0"
+     use_cache: bool = True
+     use_sliding_window: bool = False
+     vocab_size: int = 158500  # text_vocab_size + speech_vocab_size + 2 (eos and task_id)
+     text_vocab_size: int = 151936
+     speech_vocab_size: int = 6562  # actually 6564, we only care about non-streaming inference, so cut off tokens (6562, 6563) that are only used for streaming TTS
+     lm_head_bias: bool = True
+     qkv_bias: bool = True
+     fp16_flow: bool = True
+
+
+ @dataclass
+ class SamplingParams:
+     temperature: float = 1.0
+     min_tokens: int = 2
+     max_tokens: int = 64
+     ignore_eos: bool = False
+     top_k: int = 25
+     # RasSampler parameters
+     use_ras: bool = False
+     win_size: int = 10
+     tau_r: float = 0.1
+     top_p: float = 0.8
+
+
+ @dataclass
+ class Config:
+     model: str
+     max_num_batched_tokens: int = 1572864
+     max_num_seqs: int = 1024
+     max_model_len: int = 1536  # 15s prompt + 30s generated audio for 25hz audio tokenizer
+     gpu_memory_utilization: float = 0.9
+     tensor_parallel_size: int = 1
+     enforce_eager: bool = False
+     hf_config: CosyVoice2LLMConfig | AutoConfig = field(default_factory=CosyVoice2LLMConfig)
+     eos: int = -1
+     kvcache_block_size: int = 256
+     num_kvcache_blocks: int = -1
+     min_token_text_ratio: int = 2
+     max_token_text_ratio: int = 20
+     rank: int = 0
+
+     def __post_init__(self):
+         assert os.path.isdir(self.model)
+         assert self.kvcache_block_size % 256 == 0
+         assert 1 <= self.tensor_parallel_size <= 8
+
+         max_pos = getattr(self.hf_config, "max_position_embeddings", 4096)
+         self.max_model_len = min(self.max_model_len, max_pos)
+         assert self.max_num_batched_tokens >= self.max_model_len
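
The vocabulary arithmetic here is consistent: speech_vocab_size (6562) + 2 special tokens (eos and task_id) + text_vocab_size (151936) = 158500 = vocab_size, which is also why cli.py offsets text token ids by speech_vocab_size + 2. A minimal construction sketch; the checkpoint directory is a placeholder and must exist, since Config.__post_init__ asserts os.path.isdir:

    from stepaudio2.flashcosyvoice.config import (
        Config, CosyVoice2LLMConfig, SamplingParams,
    )

    # Hypothetical local checkpoint directory; __post_init__ raises if missing.
    cfg = Config(model="/path/to/CosyVoice2-0.5B",
                 enforce_eager=True,
                 hf_config=CosyVoice2LLMConfig(fp16_flow=True))

    # Per-request decoding bounds; cli.py derives min/max_tokens from the text
    # length times min/max_token_text_ratio and enables RAS sampling.
    params = SamplingParams(min_tokens=20, max_tokens=200, use_ras=True)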