minicpmo-utils 0.1.0 (minicpmo_utils-0.1.0-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148)
  1. cosyvoice/__init__.py +17 -0
  2. cosyvoice/bin/average_model.py +93 -0
  3. cosyvoice/bin/export_jit.py +103 -0
  4. cosyvoice/bin/export_onnx.py +120 -0
  5. cosyvoice/bin/inference_deprecated.py +126 -0
  6. cosyvoice/bin/train.py +195 -0
  7. cosyvoice/cli/__init__.py +0 -0
  8. cosyvoice/cli/cosyvoice.py +209 -0
  9. cosyvoice/cli/frontend.py +238 -0
  10. cosyvoice/cli/model.py +386 -0
  11. cosyvoice/dataset/__init__.py +0 -0
  12. cosyvoice/dataset/dataset.py +151 -0
  13. cosyvoice/dataset/processor.py +434 -0
  14. cosyvoice/flow/decoder.py +494 -0
  15. cosyvoice/flow/flow.py +281 -0
  16. cosyvoice/flow/flow_matching.py +227 -0
  17. cosyvoice/flow/length_regulator.py +70 -0
  18. cosyvoice/hifigan/discriminator.py +230 -0
  19. cosyvoice/hifigan/f0_predictor.py +58 -0
  20. cosyvoice/hifigan/generator.py +582 -0
  21. cosyvoice/hifigan/hifigan.py +67 -0
  22. cosyvoice/llm/llm.py +610 -0
  23. cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +58836 -0
  24. cosyvoice/tokenizer/tokenizer.py +279 -0
  25. cosyvoice/transformer/__init__.py +0 -0
  26. cosyvoice/transformer/activation.py +84 -0
  27. cosyvoice/transformer/attention.py +330 -0
  28. cosyvoice/transformer/convolution.py +145 -0
  29. cosyvoice/transformer/decoder.py +396 -0
  30. cosyvoice/transformer/decoder_layer.py +132 -0
  31. cosyvoice/transformer/embedding.py +302 -0
  32. cosyvoice/transformer/encoder.py +474 -0
  33. cosyvoice/transformer/encoder_layer.py +236 -0
  34. cosyvoice/transformer/label_smoothing_loss.py +96 -0
  35. cosyvoice/transformer/positionwise_feed_forward.py +115 -0
  36. cosyvoice/transformer/subsampling.py +383 -0
  37. cosyvoice/transformer/upsample_encoder.py +320 -0
  38. cosyvoice/utils/__init__.py +0 -0
  39. cosyvoice/utils/class_utils.py +83 -0
  40. cosyvoice/utils/common.py +186 -0
  41. cosyvoice/utils/executor.py +176 -0
  42. cosyvoice/utils/file_utils.py +129 -0
  43. cosyvoice/utils/frontend_utils.py +136 -0
  44. cosyvoice/utils/losses.py +57 -0
  45. cosyvoice/utils/mask.py +265 -0
  46. cosyvoice/utils/scheduler.py +738 -0
  47. cosyvoice/utils/train_utils.py +367 -0
  48. cosyvoice/vllm/cosyvoice2.py +103 -0
  49. matcha/__init__.py +0 -0
  50. matcha/app.py +357 -0
  51. matcha/cli.py +418 -0
  52. matcha/hifigan/__init__.py +0 -0
  53. matcha/hifigan/config.py +28 -0
  54. matcha/hifigan/denoiser.py +64 -0
  55. matcha/hifigan/env.py +17 -0
  56. matcha/hifigan/meldataset.py +217 -0
  57. matcha/hifigan/models.py +368 -0
  58. matcha/hifigan/xutils.py +60 -0
  59. matcha/models/__init__.py +0 -0
  60. matcha/models/baselightningmodule.py +209 -0
  61. matcha/models/components/__init__.py +0 -0
  62. matcha/models/components/decoder.py +443 -0
  63. matcha/models/components/flow_matching.py +132 -0
  64. matcha/models/components/text_encoder.py +410 -0
  65. matcha/models/components/transformer.py +316 -0
  66. matcha/models/matcha_tts.py +239 -0
  67. matcha/onnx/__init__.py +0 -0
  68. matcha/onnx/export.py +181 -0
  69. matcha/onnx/infer.py +168 -0
  70. matcha/text/__init__.py +53 -0
  71. matcha/text/cleaners.py +116 -0
  72. matcha/text/numbers.py +71 -0
  73. matcha/text/symbols.py +17 -0
  74. matcha/train.py +122 -0
  75. matcha/utils/__init__.py +5 -0
  76. matcha/utils/audio.py +82 -0
  77. matcha/utils/generate_data_statistics.py +111 -0
  78. matcha/utils/instantiators.py +56 -0
  79. matcha/utils/logging_utils.py +53 -0
  80. matcha/utils/model.py +90 -0
  81. matcha/utils/monotonic_align/__init__.py +22 -0
  82. matcha/utils/monotonic_align/setup.py +7 -0
  83. matcha/utils/pylogger.py +21 -0
  84. matcha/utils/rich_utils.py +101 -0
  85. matcha/utils/utils.py +219 -0
  86. minicpmo/__init__.py +24 -0
  87. minicpmo/utils.py +636 -0
  88. minicpmo/version.py +2 -0
  89. minicpmo_utils-0.1.0.dist-info/METADATA +72 -0
  90. minicpmo_utils-0.1.0.dist-info/RECORD +148 -0
  91. minicpmo_utils-0.1.0.dist-info/WHEEL +5 -0
  92. minicpmo_utils-0.1.0.dist-info/top_level.txt +5 -0
  93. s3tokenizer/__init__.py +153 -0
  94. s3tokenizer/assets/BAC009S0764W0121.wav +0 -0
  95. s3tokenizer/assets/BAC009S0764W0122.wav +0 -0
  96. s3tokenizer/assets/mel_filters.npz +0 -0
  97. s3tokenizer/cli.py +183 -0
  98. s3tokenizer/model.py +546 -0
  99. s3tokenizer/model_v2.py +605 -0
  100. s3tokenizer/utils.py +390 -0
  101. stepaudio2/__init__.py +40 -0
  102. stepaudio2/cosyvoice2/__init__.py +1 -0
  103. stepaudio2/cosyvoice2/flow/__init__.py +0 -0
  104. stepaudio2/cosyvoice2/flow/decoder_dit.py +585 -0
  105. stepaudio2/cosyvoice2/flow/flow.py +230 -0
  106. stepaudio2/cosyvoice2/flow/flow_matching.py +205 -0
  107. stepaudio2/cosyvoice2/transformer/__init__.py +0 -0
  108. stepaudio2/cosyvoice2/transformer/attention.py +328 -0
  109. stepaudio2/cosyvoice2/transformer/embedding.py +119 -0
  110. stepaudio2/cosyvoice2/transformer/encoder_layer.py +163 -0
  111. stepaudio2/cosyvoice2/transformer/positionwise_feed_forward.py +56 -0
  112. stepaudio2/cosyvoice2/transformer/subsampling.py +79 -0
  113. stepaudio2/cosyvoice2/transformer/upsample_encoder_v2.py +483 -0
  114. stepaudio2/cosyvoice2/utils/__init__.py +1 -0
  115. stepaudio2/cosyvoice2/utils/class_utils.py +41 -0
  116. stepaudio2/cosyvoice2/utils/common.py +101 -0
  117. stepaudio2/cosyvoice2/utils/mask.py +49 -0
  118. stepaudio2/flashcosyvoice/__init__.py +0 -0
  119. stepaudio2/flashcosyvoice/cli.py +424 -0
  120. stepaudio2/flashcosyvoice/config.py +80 -0
  121. stepaudio2/flashcosyvoice/cosyvoice2.py +160 -0
  122. stepaudio2/flashcosyvoice/cosyvoice3.py +1 -0
  123. stepaudio2/flashcosyvoice/engine/__init__.py +0 -0
  124. stepaudio2/flashcosyvoice/engine/block_manager.py +114 -0
  125. stepaudio2/flashcosyvoice/engine/llm_engine.py +125 -0
  126. stepaudio2/flashcosyvoice/engine/model_runner.py +310 -0
  127. stepaudio2/flashcosyvoice/engine/scheduler.py +77 -0
  128. stepaudio2/flashcosyvoice/engine/sequence.py +90 -0
  129. stepaudio2/flashcosyvoice/modules/__init__.py +0 -0
  130. stepaudio2/flashcosyvoice/modules/flow.py +198 -0
  131. stepaudio2/flashcosyvoice/modules/flow_components/__init__.py +0 -0
  132. stepaudio2/flashcosyvoice/modules/flow_components/estimator.py +974 -0
  133. stepaudio2/flashcosyvoice/modules/flow_components/upsample_encoder.py +998 -0
  134. stepaudio2/flashcosyvoice/modules/hifigan.py +249 -0
  135. stepaudio2/flashcosyvoice/modules/hifigan_components/__init__.py +0 -0
  136. stepaudio2/flashcosyvoice/modules/hifigan_components/layers.py +433 -0
  137. stepaudio2/flashcosyvoice/modules/qwen2.py +92 -0
  138. stepaudio2/flashcosyvoice/modules/qwen2_components/__init__.py +0 -0
  139. stepaudio2/flashcosyvoice/modules/qwen2_components/layers.py +616 -0
  140. stepaudio2/flashcosyvoice/modules/sampler.py +231 -0
  141. stepaudio2/flashcosyvoice/utils/__init__.py +0 -0
  142. stepaudio2/flashcosyvoice/utils/audio.py +77 -0
  143. stepaudio2/flashcosyvoice/utils/context.py +28 -0
  144. stepaudio2/flashcosyvoice/utils/loader.py +116 -0
  145. stepaudio2/flashcosyvoice/utils/memory.py +19 -0
  146. stepaudio2/stepaudio2.py +204 -0
  147. stepaudio2/token2wav.py +248 -0
  148. stepaudio2/utils.py +91 -0
stepaudio2/flashcosyvoice/modules/sampler.py
@@ -0,0 +1,231 @@
import torch
from torch import nn


class Sampler(nn.Module):
    """
    Optimized sampler implementation using vectorized operations instead of loops, significantly improving performance

    Performance optimizations:
    1. Using batch processing instead of sequence loops, reducing Python loop overhead
    2. Using PyTorch's vectorized operations (like torch.sort, torch.gather) for parallel computation
    3. Using mask operations to apply top-k filtering at once, avoiding per-sequence processing
    """
    def __init__(self):
        super().__init__()

    def forward(self, logits: torch.Tensor, temperatures: torch.Tensor, top_k: int = None):
        """
        Perform sampling operation using vectorized method for top-k filtering

        Args:
            logits: Logits tensor with shape [batch_size, vocab_size]
            temperatures: Temperature parameters with shape [batch_size]
            top_k: Top-k value for filtering (uniform across all sequences)

        Returns:
            Sampled token IDs
        """
        logits = logits.to(torch.float)
        greedy_tokens = logits.argmax(dim=-1)  # Greedy decoding result, used when temperature=0
        logits.div_(temperatures.unsqueeze(dim=1))  # Apply temperature scaling

        # Apply uniform top-k filtering if top_k is provided
        if top_k is not None and top_k > 0:
            vocab_size = logits.size(-1)

            # Create a mask to store which positions should be kept
            mask = torch.zeros_like(logits, dtype=torch.bool)

            # Batch sorting for all sequences at once
            sorted_logits, sorted_indices = torch.sort(logits, dim=-1, descending=True)

            # Get threshold for each sequence (the k-th largest value)
            k_value = min(top_k, vocab_size)  # Ensure k doesn't exceed vocab size
            thresholds = sorted_logits[:, k_value-1:k_value]  # Shape [batch_size, 1]
            thresholds = thresholds.expand(-1, vocab_size)  # Expand to match logits shape

            # Create mask: only keep logits greater than or equal to threshold
            mask = logits >= thresholds

            # Apply mask: set logits not in top-k to negative infinity
            logits = torch.where(mask, logits, torch.tensor(float('-inf'), device=logits.device))

        probs = torch.softmax(logits, dim=-1, dtype=torch.float)
        # logprobs = torch.log_softmax(logits, dim=-1, dtype=torch.float)
        sample_tokens = probs.div_(torch.empty_like(probs).exponential_(1)).argmax(dim=-1)
        return torch.where(temperatures == 0, greedy_tokens, sample_tokens)


class RasSampler(nn.Module):
    """
    Optimized Repetition Aware Sampling implementation

    Performance optimizations:
    1. Using vectorized nucleus sampling instead of loop implementation, improving sampling efficiency
    2. Using tensor operations to calculate repetition rate, reducing Python loop overhead
    3. Optimizing EOS handling logic, reducing unnecessary resampling
    4. Using PyTorch's vectorized operations for parallel computation
    5. Batch processing for all sequences, dramatically improving throughput
    6. Robust handling for sequences of any length, including empty sequences
    """
    def __init__(self):
        super().__init__()

    def forward(self, logits: torch.Tensor, decoded_tokens_list: list,
                win_size: int = 10, tau_r: float = 0.1,
                top_p: float = 0.8, top_k: int = 25,
                eos_token: int = 6561, min_tokens: list[int] = None):
        """
        Execute repetition-aware sampling using optimized vectorized operations with batch processing

        Args:
            logits: Input logits with shape [batch_size, vocab_size]
            decoded_tokens_list: List of decoded tokens, each element is a token list for a batch
            win_size: Window size for repetition detection (uniform across all batch items)
            tau_r: Repetition threshold (uniform across all batch items)
            top_p: Nucleus sampling probability threshold (uniform across all batch items)
            top_k: Nucleus sampling top-k threshold (uniform across all batch items)
            eos_token: End of sequence token ID (uniform across all batch items)
            min_tokens: List of minimum tokens to generate before allowing EOS, one per batch item

        Returns:
            Selected token IDs
        """
        batch_size = logits.size(0)
        device = logits.device
        result = torch.zeros(batch_size, dtype=torch.long, device=device)

        # Set default values if not provided
        if min_tokens is None:
            min_tokens = [2] * batch_size

        # Ensure min_tokens list has the correct length
        assert len(min_tokens) == batch_size, f"min_tokens length {len(min_tokens)} != batch_size {batch_size}"

        # Force continue decode first token
        for i in range(batch_size):
            if i < len(decoded_tokens_list) and len(decoded_tokens_list[i]) == 0:
                logits[i, eos_token] = -float('inf')

        # 1. First, perform nucleus sampling for all sequences
        probs = torch.softmax(logits, dim=-1)

        # Use vectorized nucleus sampling for all sequences
        # This can be done in batch since top_p and top_k are uniform
        sorted_probs, sorted_indices = probs.sort(dim=-1, descending=True)
        cumulative_probs = torch.cumsum(sorted_probs, dim=-1)

        # Create masks for top-p and top-k filtering
        top_p_mask = cumulative_probs <= top_p

        # Create top-k mask (first top_k positions are True)
        top_k_mask = torch.zeros_like(top_p_mask)
        top_k_mask[:, :top_k] = True

        # Combine masks
        mask = top_p_mask & top_k_mask

        # Ensure at least one token is selected per sequence
        first_token_mask = torch.zeros_like(mask)
        first_token_mask[:, 0] = True
        mask = mask | first_token_mask

        # Sample from the filtered distribution
        sample_probs = torch.where(mask, sorted_probs, torch.zeros_like(sorted_probs))
        sample_probs = sample_probs / sample_probs.sum(dim=-1, keepdim=True)

        # Sample indices from the filtered distribution
        sampled_indices = torch.multinomial(sample_probs, 1).squeeze(-1)
        top_ids = torch.gather(sorted_indices, -1, sampled_indices.unsqueeze(-1)).squeeze(-1)

        # 2. Check for repetitions and apply random sampling if needed
        # Extract recent tokens for each sequence, handling empty or short sequences
        recent_tokens_list = []
        for i in range(batch_size):
            # Handle index out of range or empty tokens
            if i < len(decoded_tokens_list):
                tokens = decoded_tokens_list[i]
                if len(tokens) > 0:
                    start_idx = max(0, len(tokens) - win_size)
                    recent_tokens_list.append(tokens[start_idx:])
                else:
                    recent_tokens_list.append([])  # Empty list for empty tokens
            else:
                recent_tokens_list.append([])  # Empty list for missing batch items

        # Check if we have any tokens to process for repetition detection
        if any(len(tokens) > 0 for tokens in recent_tokens_list):
            # Convert to padded tensor for batch processing
            max_recent_len = max(len(tokens) for tokens in recent_tokens_list)
            if max_recent_len > 0:  # Only proceed if we have tokens
                recent_tokens_tensor = torch.zeros((batch_size, max_recent_len), dtype=torch.long, device=device) - 1
                for i, tokens in enumerate(recent_tokens_list):
                    if len(tokens) > 0:
                        recent_tokens_tensor[i, -len(tokens):] = torch.tensor(tokens, device=device)

                # Create a mask for valid positions and to avoid division by zero
                valid_positions_mask = torch.zeros_like(recent_tokens_tensor, dtype=torch.bool)
                for i, tokens in enumerate(recent_tokens_list):
                    if len(tokens) > 0:
                        valid_positions_mask[i, -len(tokens):] = True

                # Check repetition rates
                repetition_counts = torch.zeros(batch_size, device=device)
                for i in range(batch_size):
                    if len(recent_tokens_list[i]) > 0:
                        repetition_counts[i] = (recent_tokens_tensor[i] == top_ids[i]).sum()

                # Calculate repetition rates, avoiding division by zero
                recent_lengths = torch.tensor([max(1, len(tokens)) for tokens in recent_tokens_list], device=device)
                repetition_rates = repetition_counts / recent_lengths

                # Identify sequences needing random sampling
                need_random = repetition_rates >= tau_r

                # Apply random sampling where needed
                if need_random.any():
                    random_indices = torch.multinomial(probs[need_random], 1).squeeze(-1)
                    top_ids[need_random] = random_indices

        # 3. Handle EOS tokens
        # Create mask for sequences that should ignore EOS tokens
        ignore_eos_mask = torch.zeros(batch_size, dtype=torch.bool, device=device)
        for i in range(batch_size):
            if i < len(decoded_tokens_list):
                ignore_eos_mask[i] = len(decoded_tokens_list[i]) < min_tokens[i]
            else:
                ignore_eos_mask[i] = True  # Default to ignoring EOS for missing sequences

        is_eos_mask = top_ids == eos_token
        need_resample = ignore_eos_mask & is_eos_mask

        # Resample for sequences that need it
        if need_resample.any():
            max_trials = 100
            for attempt in range(max_trials):
                # Break if no more resampling needed
                if not need_resample.any():
                    break

                # Sample new tokens for sequences that need resampling
                new_samples = torch.multinomial(probs[need_resample], 1).squeeze(-1)

                # Update top_ids with new samples
                top_ids[need_resample] = new_samples

                # Update which sequences still need resampling
                is_eos_mask = top_ids == eos_token
                need_resample = ignore_eos_mask & is_eos_mask

            # If still have EOS tokens that should be ignored, force them to be non-EOS
            if need_resample.any():
                # Force to a non-EOS token (e.g., the second most likely token)
                for i in range(batch_size):
                    if need_resample[i]:
                        # Get second most likely token (or first if only one token)
                        second_best_idx = 1 if sorted_indices.size(1) > 1 else 0
                        top_ids[i] = sorted_indices[i, second_best_idx]

        result = top_ids

        return result
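A minimal usage sketch (not part of the packaged file) showing how the two samplers above would be called; the batch size, vocabulary size and token history are made-up values, and the import path follows the wheel layout listed above:

import torch
from stepaudio2.flashcosyvoice.modules.sampler import Sampler, RasSampler

sampler = Sampler()
logits = torch.randn(4, 6562)                      # [batch_size, vocab_size]
temperatures = torch.tensor([0.0, 0.7, 1.0, 1.5])  # 0.0 falls back to greedy decoding
tokens = sampler(logits, temperatures, top_k=25)
print(tokens.shape)                                # torch.Size([4])

ras = RasSampler()
history = [[1, 2, 3], [5, 5, 5, 5], [], [7]]       # decoded tokens per batch item
next_tokens = ras(torch.randn(4, 6562), history,
                  win_size=10, tau_r=0.1, top_p=0.8, top_k=25, eos_token=6561)
print(next_tokens.shape)                           # torch.Size([4])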
stepaudio2/flashcosyvoice/utils/__init__.py (file without changes)
stepaudio2/flashcosyvoice/utils/audio.py
@@ -0,0 +1,77 @@
import numpy as np
import torch
from librosa.filters import mel as librosa_mel_fn
from scipy.io.wavfile import read

MAX_WAV_VALUE = 32768.0


def load_wav(full_path):
    sampling_rate, data = read(full_path)
    return data, sampling_rate


def dynamic_range_compression(x, C=1, clip_val=1e-5):
    return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)


def dynamic_range_decompression(x, C=1):
    return np.exp(x) / C


def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    return torch.log(torch.clamp(x, min=clip_val) * C)


def dynamic_range_decompression_torch(x, C=1):
    return torch.exp(x) / C


def spectral_normalize_torch(magnitudes):
    output = dynamic_range_compression_torch(magnitudes)
    return output


def spectral_de_normalize_torch(magnitudes):
    output = dynamic_range_decompression_torch(magnitudes)
    return output


mel_basis = {}
hann_window = {}


def mel_spectrogram(y, n_fft=1920, num_mels=80, sampling_rate=24000, hop_size=480,
                    win_size=1920, fmin=0, fmax=8000, center=False):
    global mel_basis, hann_window  # pylint: disable=global-statement
    if f"{str(fmax)}_{str(y.device)}" not in mel_basis:
        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
        mel_basis[str(fmax) + "_" + str(y.device)] = torch.from_numpy(mel).float().to(y.device)
        hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)

    y = torch.nn.functional.pad(
        y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect"
    )
    y = y.squeeze(1)

    spec = torch.view_as_real(
        torch.stft(
            y,
            n_fft,
            hop_length=hop_size,
            win_length=win_size,
            window=hann_window[str(y.device)],
            center=center,
            pad_mode="reflect",
            normalized=False,
            onesided=True,
            return_complex=True,
        )
    )

    spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))

    spec = torch.matmul(mel_basis[str(fmax) + "_" + str(y.device)], spec)
    spec = spectral_normalize_torch(spec)

    return spec
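A small usage sketch (not part of the packaged file), assuming one second of 24 kHz audio; it exercises mel_spectrogram with the defaults defined above:

import torch
from stepaudio2.flashcosyvoice.utils.audio import mel_spectrogram

wav = torch.randn(1, 24000)   # [batch, samples] at 24 kHz (random noise as a stand-in)
mel = mel_spectrogram(wav)    # defaults: n_fft=1920, hop_size=480, 80 mel bins, fmax=8000
print(mel.shape)              # [1, 80, 50], i.e. roughly samples // hop_size frames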
stepaudio2/flashcosyvoice/utils/context.py
@@ -0,0 +1,28 @@
from dataclasses import dataclass

import torch


@dataclass
class Context:
    is_prefill: bool = False
    cu_seqlens_q: torch.Tensor | None = None
    cu_seqlens_k: torch.Tensor | None = None
    max_seqlen_q: int = 0
    max_seqlen_k: int = 0
    slot_mapping: torch.Tensor | None = None
    context_lens: torch.Tensor | None = None
    block_tables: torch.Tensor | None = None

_CONTEXT = Context()

def get_context():
    return _CONTEXT

def set_context(is_prefill, cu_seqlens_q=None, cu_seqlens_k=None, max_seqlen_q=0, max_seqlen_k=0, slot_mapping=None, context_lens=None, block_tables=None):
    global _CONTEXT
    _CONTEXT = Context(is_prefill, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, slot_mapping, context_lens, block_tables)

def reset_context():
    global _CONTEXT
    _CONTEXT = Context()
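A brief usage sketch (not part of the packaged file) of the module-level context object that engine code would populate before running attention; the tensor values below are purely illustrative:

import torch
from stepaudio2.flashcosyvoice.utils.context import get_context, set_context, reset_context

cu_seqlens = torch.tensor([0, 7, 12], dtype=torch.int32)   # cumulative sequence lengths
set_context(is_prefill=True, cu_seqlens_q=cu_seqlens, cu_seqlens_k=cu_seqlens,
            max_seqlen_q=7, max_seqlen_k=7)
ctx = get_context()
print(ctx.is_prefill, ctx.max_seqlen_q)                    # True 7
reset_context()                                            # back to the default Context()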
stepaudio2/flashcosyvoice/utils/loader.py
@@ -0,0 +1,116 @@
import os
from glob import glob

import torch
from safetensors import safe_open
from torch import nn

from stepaudio2.flashcosyvoice.config import CosyVoice2LLMConfig


def default_weight_loader(param: nn.Parameter, loaded_weight: torch.Tensor):
    param.data.copy_(loaded_weight)


def load_text_llm(model: nn.Module, path: str):
    packed_modules_mapping = getattr(model, "packed_modules_mapping", {})
    for file in glob(os.path.join(path, "*.safetensors")):
        with safe_open(file, "pt", "cpu") as f:
            for weight_name in f.keys():
                for k in packed_modules_mapping:
                    if k in weight_name:
                        v, shard_id = packed_modules_mapping[k]
                        param_name = weight_name.replace(k, v)
                        param = model.get_parameter(param_name)
                        weight_loader = param.weight_loader
                        weight_loader(param, f.get_tensor(weight_name), shard_id)
                        break
                else:
                    param = model.get_parameter(weight_name)
                    weight_loader = getattr(param, "weight_loader", default_weight_loader)
                    weight_loader(param, f.get_tensor(weight_name))


def load_speech_llm(model: nn.Module, path: str, hf_config: CosyVoice2LLMConfig):
    packed_modules_mapping = getattr(model, "packed_modules_mapping", {})

    # NOTE(xcsong): 1. load speech embedding + sos/taskid embedding + lm head
    embedding_weights = {}
    tmp_weights = torch.load(f"{path}/llm.pt", map_location="cpu", weights_only=True)
    missed, missed_names = 0, []
    for k, v in tmp_weights.items():
        if k == "speech_embedding.weight":  # torch.Size([6564, 896])
            speech_embedding_size = hf_config.speech_vocab_size  # 6562
            # NOTE(xcsong): padding to 6592 for vllm tensor parallel
            if speech_embedding_size != v.shape[0]:  # [6564, 896] -> [6562, 896]
                assert speech_embedding_size <= v.shape[0], f"speech_embedding_size should be less than or equal to {v.shape[0]}, but got {speech_embedding_size}"
                v = v[:speech_embedding_size, :]
            embedding_weights["speech_embedding.weight"] = v
        elif k == "llm_embedding.weight":  # torch.Size([2, 896]), eos and task_id
            assert v.shape[0] == 2, f"llm_embedding.weight should be of shape [2, 896], but got {v.shape}"
            embedding_weights["llm_embedding.weight"] = v
        elif k == "llm.model.model.embed_tokens.weight":  # torch.Size([151936, 896])
            embedding_weights["model.embed_tokens.weight"] = v
        elif k == "llm_decoder.weight":  # torch.Size([6564, 896])
            lm_head_size = hf_config.speech_vocab_size  # 6562
            if lm_head_size != v.shape[0]:  # [6564, 896] -> [6562, 896]
                assert lm_head_size <= v.shape[0], f"lm_head_size should be less than or equal to {v.shape[0]}, but got {lm_head_size}"
                v = v[:lm_head_size, :]
            param = model.get_parameter("lm_head.weight")
            weight_loader = getattr(param, "weight_loader", default_weight_loader)
            weight_loader(param, v)
        elif k == "llm_decoder.bias":  # torch.Size([6564])
            lm_head_size = hf_config.speech_vocab_size  # 6562
            if lm_head_size != v.shape[0]:  # [6564] -> [6562]
                assert lm_head_size <= v.shape[0], f"lm_head_size should be less than or equal to {v.shape[0]}, but got {lm_head_size}"
                v = v[:lm_head_size]
            param = model.get_parameter("lm_head.bias")
            weight_loader = getattr(param, "weight_loader", default_weight_loader)
            weight_loader(param, v)
        elif "llm.model." in k:
            weight_name = k.replace("llm.model.", "")
            for kk in packed_modules_mapping:
                if kk in weight_name:
                    vv, shard_id = packed_modules_mapping[kk]
                    param_name = weight_name.replace(kk, vv)
                    try:
                        param = model.get_parameter(param_name)
                        weight_loader = param.weight_loader
                        weight_loader(param, v, shard_id)
                        break
                    except Exception as e:
                        print(e)
                        print(f"skip parameter (1): {weight_name}")
                        continue
            else:
                try:
                    param = model.get_parameter(weight_name)
                    weight_loader = getattr(param, "weight_loader", default_weight_loader)
                    weight_loader(param, v)
                except Exception as e:
                    print(e)
                    print(f"skip parameter (2): {weight_name}")
                    continue
        else:
            missed += 1
            missed_names.append(k)  # record the unmatched checkpoint key
            continue
    print(f"missed {missed} parameters: {missed_names}")

    # NOTE(xcsong): 2. merge text embedding, sos/taskid embedding, and speech embedding
    text_embedding_weight = embedding_weights["model.embed_tokens.weight"].cpu()  # [151936, 896]
    sos_taskid_embedding_weight = embedding_weights["llm_embedding.weight"].cpu()  # [2, 896]
    speech_embedding_weight = embedding_weights["speech_embedding.weight"].cpu()  # [6562, 896]
    final_embedding_weight = torch.cat([speech_embedding_weight, sos_taskid_embedding_weight, text_embedding_weight], dim=0)  # [158500, 896]
    param = model.get_parameter("model.embed_tokens.weight")
    weight_loader = getattr(param, "weight_loader", default_weight_loader)
    weight_loader(param, final_embedding_weight)


def load_model(model: nn.Module, path: str, hf_config: CosyVoice2LLMConfig | None = None):
    if model.model_type == "speech_llm":
        load_speech_llm(model, path, hf_config)
    elif model.model_type == "text_llm":
        load_text_llm(model, path)
    else:
        raise ValueError(f"Unsupported model type: {model.model_type}")
stepaudio2/flashcosyvoice/utils/memory.py
@@ -0,0 +1,19 @@
import os

import torch
from pynvml import *  # noqa


def get_gpu_memory():
    torch.cuda.synchronize()
    nvmlInit()
    visible_device = list(map(int, os.getenv("CUDA_VISIBLE_DEVICES", "0,1,2,3,4,5,6,7").split(',')))
    cuda_device_idx = torch.cuda.current_device()
    cuda_device_idx = visible_device[cuda_device_idx]
    handle = nvmlDeviceGetHandleByIndex(cuda_device_idx)
    mem_info = nvmlDeviceGetMemoryInfo(handle)
    total_memory = mem_info.total
    used_memory = mem_info.used
    free_memory = mem_info.free
    nvmlShutdown()
    return total_memory, used_memory, free_memory
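A short usage sketch (not part of the packaged file); it assumes an NVIDIA GPU visible to both torch and pynvml, and that CUDA_VISIBLE_DEVICES matches the default mapping above:

from stepaudio2.flashcosyvoice.utils.memory import get_gpu_memory

total, used, free = get_gpu_memory()   # byte counts as reported by NVML
print(f"GPU memory: {used / 2**30:.1f} GiB used of {total / 2**30:.1f} GiB")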
stepaudio2/stepaudio2.py
@@ -0,0 +1,204 @@
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

from stepaudio2.utils import compute_token_num, load_audio, log_mel_spectrogram, padding_mels


class StepAudio2Base:

    def __init__(self, model_path: str):
        self.llm_tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, padding_side="right")
        self.llm = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16).cuda()
        self.eos_token_id = self.llm_tokenizer.eos_token_id

    def __call__(self, messages: list, **kwargs):
        messages, mels = self.apply_chat_template(messages)

        # Tokenize prompts
        prompt_ids = []
        for msg in messages:
            if isinstance(msg, str):
                prompt_ids.append(self.llm_tokenizer(text=msg, return_tensors="pt", padding=True)["input_ids"])
            elif isinstance(msg, list):
                prompt_ids.append(torch.tensor([msg], dtype=torch.int32))
            else:
                raise ValueError(f"Unsupported content type: {type(msg)}")
        prompt_ids = torch.cat(prompt_ids, dim=-1).cuda()
        attention_mask = torch.ones_like(prompt_ids)

        # mels = None if len(mels) == 0 else torch.stack(mels).cuda()
        # mel_lengths = None if mels is None else torch.tensor([mel.shape[1] - 2 for mel in mels], dtype=torch.int32, device='cuda')
        if len(mels) == 0:
            mels = None
            mel_lengths = None
        else:
            mels, mel_lengths = padding_mels(mels)
            mels = mels.cuda()
            mel_lengths = mel_lengths.cuda()

        generate_inputs = {
            "input_ids": prompt_ids,
            "wavs": mels,
            "wav_lens": mel_lengths,
            "attention_mask": attention_mask
        }

        generation_config = dict(max_new_tokens=2048,
                                 pad_token_id=self.llm_tokenizer.pad_token_id,
                                 eos_token_id=self.eos_token_id,
                                 )
        generation_config.update(kwargs)
        generation_config = GenerationConfig(**generation_config)

        outputs = self.llm.generate(**generate_inputs, generation_config=generation_config, tokenizer=self.llm_tokenizer)
        output_token_ids = outputs[0, prompt_ids.shape[-1] : -1].tolist()
        output_text_tokens = [i for i in output_token_ids if i < 151688]
        output_audio_tokens = [i - 151696 for i in output_token_ids if i > 151695]
        output_text = self.llm_tokenizer.decode(output_text_tokens)
        return output_token_ids, output_text, output_audio_tokens

    def apply_chat_template(self, messages: list):
        results = []
        mels = []
        for msg in messages:
            content = msg
            if isinstance(content, str):
                text_with_audio = content
                results.append(text_with_audio)
            elif isinstance(content, dict):
                if content["type"] == "text":
                    results.append(f"{content['text']}")
                elif content["type"] == "audio":
                    audio = load_audio(content['audio'])
                    for i in range(0, audio.shape[0], 16000 * 25):
                        mel = log_mel_spectrogram(audio[i:i+16000*25], n_mels=128, padding=479)
                        mels.append(mel)
                        audio_tokens = "<audio_patch>" * compute_token_num(mel.shape[1])
                        results.append(f"<audio_start>{audio_tokens}<audio_end>")
                elif content["type"] == "token":
                    results.append(content["token"])
            else:
                raise ValueError(f"Unsupported content type: {type(content)}")
        # print(results)
        return results, mels


class StepAudio2(StepAudio2Base):

    def __init__(self, model_path: str):
        super().__init__(model_path)
        self.llm_tokenizer.eos_token = "<|EOT|>"
        self.llm.config.eos_token_id = self.llm_tokenizer.convert_tokens_to_ids("<|EOT|>")
        self.eos_token_id = self.llm_tokenizer.convert_tokens_to_ids("<|EOT|>")

    def apply_chat_template(self, messages: list):
        results = []
        mels = []
        for msg in messages:
            role = msg["role"]
            content = msg["content"]
            if role == "user":
                role = "human"
            if isinstance(content, str):
                text_with_audio = f"<|BOT|>{role}\n{content}"
                text_with_audio += '<|EOT|>' if msg.get('eot', True) else ''
                results.append(text_with_audio)
            elif isinstance(content, list):
                results.append(f"<|BOT|>{role}\n")
                for item in content:
                    if item["type"] == "text":
                        results.append(f"{item['text']}")
                    elif item["type"] == "audio":
                        audio = load_audio(item['audio'])
                        for i in range(0, audio.shape[0], 16000 * 25):
                            mel = log_mel_spectrogram(audio[i:i+16000*25], n_mels=128, padding=479)
                            mels.append(mel)
                            audio_tokens = "<audio_patch>" * compute_token_num(mel.shape[1])
                            results.append(f"<audio_start>{audio_tokens}<audio_end>")
                    elif item["type"] == "token":
                        results.append(item["token"])
                if msg.get('eot', True):
                    results.append('<|EOT|>')
            elif content is None:
                results.append(f"<|BOT|>{role}\n")
            else:
                raise ValueError(f"Unsupported content type: {type(content)}")
        # print(results)
        return results, mels


if __name__ == '__main__':
    from stepaudio2.token2wav import Token2wav

    model = StepAudio2('Step-Audio-2-mini')
    token2wav = Token2wav('Step-Audio-2-mini/token2wav')

    # Text-to-text conversation
    print()
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "human", "content": "Give me a brief introduction to the Great Wall."},
        {"role": "assistant", "content": None}
    ]
    tokens, text, _ = model(messages, max_new_tokens=256, temperature=0.7, repetition_penalty=1.05, top_p=0.9, do_sample=True)
    print(text)

    # Text-to-speech conversation
    print()
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "human", "content": "Give me a brief introduction to the Great Wall."},
        {"role": "assistant", "content": "<tts_start>", "eot": False},  # Insert <tts_start> for speech response
    ]
    tokens, text, audio = model(messages, max_new_tokens=4096, temperature=0.7, repetition_penalty=1.05, top_p=0.9, do_sample=True)
    print(text)
    print(tokens)
    audio = token2wav(audio, prompt_wav='assets/default_male.wav')
    with open('output-male.wav', 'wb') as f:
        f.write(audio)

    # Speech-to-text conversation
    print()
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "human", "content": [{"type": "audio", "audio": "assets/give_me_a_brief_introduction_to_the_great_wall.wav"}]},
        {"role": "assistant", "content": None}
    ]
    tokens, text, _ = model(messages, max_new_tokens=256, temperature=0.7, repetition_penalty=1.05, top_p=0.9, do_sample=True)
    print(text)

    # Speech-to-speech conversation
    print()
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "human", "content": [{"type": "audio", "audio": "assets/give_me_a_brief_introduction_to_the_great_wall.wav"}]},
        {"role": "assistant", "content": "<tts_start>", "eot": False},  # Insert <tts_start> for speech response
    ]
    tokens, text, audio = model(messages, max_new_tokens=4096, temperature=0.7, repetition_penalty=1.05, top_p=0.9, do_sample=True)
    print(text)
    print(tokens)
    audio = token2wav(audio, prompt_wav='assets/default_female.wav')
    with open('output-female.wav', 'wb') as f:
        f.write(audio)

    # Multi-turn conversation
    print()
    messages.pop(-1)
    messages += [
        {"role": "assistant", "content": [{"type": "text", "text": "<tts_start>"},
                                          {"type": "token", "token": tokens}]},
        {"role": "human", "content": "Now write a 4-line poem about it."},
        {"role": "assistant", "content": None}
    ]
    tokens, text, audio = model(messages, max_new_tokens=256, temperature=0.7, repetition_penalty=1.05, top_p=0.9, do_sample=True)
    print(text)

    # Multi-modal inputs
    print()
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "human", "content": [{"type": "text", "text": "Translate the speech into Chinese."},
                                      {"type": "audio", "audio": "assets/give_me_a_brief_introduction_to_the_great_wall.wav"}]},
        {"role": "assistant", "content": None}
    ]
    tokens, text, audio = model(messages, max_new_tokens=256, temperature=0.7, repetition_penalty=1.05, top_p=0.9, do_sample=True)
    print(text)