PyPI - minicpmo-utils - Versions diffs - 0.1.0__py3-none-any.whl - Mend

minicpmo-utils 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (148) hide show

cosyvoice/__init__.py +17 -0
cosyvoice/bin/average_model.py +93 -0
cosyvoice/bin/export_jit.py +103 -0
cosyvoice/bin/export_onnx.py +120 -0
cosyvoice/bin/inference_deprecated.py +126 -0
cosyvoice/bin/train.py +195 -0
cosyvoice/cli/__init__.py +0 -0
cosyvoice/cli/cosyvoice.py +209 -0
cosyvoice/cli/frontend.py +238 -0
cosyvoice/cli/model.py +386 -0
cosyvoice/dataset/__init__.py +0 -0
cosyvoice/dataset/dataset.py +151 -0
cosyvoice/dataset/processor.py +434 -0
cosyvoice/flow/decoder.py +494 -0
cosyvoice/flow/flow.py +281 -0
cosyvoice/flow/flow_matching.py +227 -0
cosyvoice/flow/length_regulator.py +70 -0
cosyvoice/hifigan/discriminator.py +230 -0
cosyvoice/hifigan/f0_predictor.py +58 -0
cosyvoice/hifigan/generator.py +582 -0
cosyvoice/hifigan/hifigan.py +67 -0
cosyvoice/llm/llm.py +610 -0
cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +58836 -0
cosyvoice/tokenizer/tokenizer.py +279 -0
cosyvoice/transformer/__init__.py +0 -0
cosyvoice/transformer/activation.py +84 -0
cosyvoice/transformer/attention.py +330 -0
cosyvoice/transformer/convolution.py +145 -0
cosyvoice/transformer/decoder.py +396 -0
cosyvoice/transformer/decoder_layer.py +132 -0
cosyvoice/transformer/embedding.py +302 -0
cosyvoice/transformer/encoder.py +474 -0
cosyvoice/transformer/encoder_layer.py +236 -0
cosyvoice/transformer/label_smoothing_loss.py +96 -0
cosyvoice/transformer/positionwise_feed_forward.py +115 -0
cosyvoice/transformer/subsampling.py +383 -0
cosyvoice/transformer/upsample_encoder.py +320 -0
cosyvoice/utils/__init__.py +0 -0
cosyvoice/utils/class_utils.py +83 -0
cosyvoice/utils/common.py +186 -0
cosyvoice/utils/executor.py +176 -0
cosyvoice/utils/file_utils.py +129 -0
cosyvoice/utils/frontend_utils.py +136 -0
cosyvoice/utils/losses.py +57 -0
cosyvoice/utils/mask.py +265 -0
cosyvoice/utils/scheduler.py +738 -0
cosyvoice/utils/train_utils.py +367 -0
cosyvoice/vllm/cosyvoice2.py +103 -0
matcha/__init__.py +0 -0
matcha/app.py +357 -0
matcha/cli.py +418 -0
matcha/hifigan/__init__.py +0 -0
matcha/hifigan/config.py +28 -0
matcha/hifigan/denoiser.py +64 -0
matcha/hifigan/env.py +17 -0
matcha/hifigan/meldataset.py +217 -0
matcha/hifigan/models.py +368 -0
matcha/hifigan/xutils.py +60 -0
matcha/models/__init__.py +0 -0
matcha/models/baselightningmodule.py +209 -0
matcha/models/components/__init__.py +0 -0
matcha/models/components/decoder.py +443 -0
matcha/models/components/flow_matching.py +132 -0
matcha/models/components/text_encoder.py +410 -0
matcha/models/components/transformer.py +316 -0
matcha/models/matcha_tts.py +239 -0
matcha/onnx/__init__.py +0 -0
matcha/onnx/export.py +181 -0
matcha/onnx/infer.py +168 -0
matcha/text/__init__.py +53 -0
matcha/text/cleaners.py +116 -0
matcha/text/numbers.py +71 -0
matcha/text/symbols.py +17 -0
matcha/train.py +122 -0
matcha/utils/__init__.py +5 -0
matcha/utils/audio.py +82 -0
matcha/utils/generate_data_statistics.py +111 -0
matcha/utils/instantiators.py +56 -0
matcha/utils/logging_utils.py +53 -0
matcha/utils/model.py +90 -0
matcha/utils/monotonic_align/__init__.py +22 -0
matcha/utils/monotonic_align/setup.py +7 -0
matcha/utils/pylogger.py +21 -0
matcha/utils/rich_utils.py +101 -0
matcha/utils/utils.py +219 -0
minicpmo/__init__.py +24 -0
minicpmo/utils.py +636 -0
minicpmo/version.py +2 -0
minicpmo_utils-0.1.0.dist-info/METADATA +72 -0
minicpmo_utils-0.1.0.dist-info/RECORD +148 -0
minicpmo_utils-0.1.0.dist-info/WHEEL +5 -0
minicpmo_utils-0.1.0.dist-info/top_level.txt +5 -0
s3tokenizer/__init__.py +153 -0
s3tokenizer/assets/BAC009S0764W0121.wav +0 -0
s3tokenizer/assets/BAC009S0764W0122.wav +0 -0
s3tokenizer/assets/mel_filters.npz +0 -0
s3tokenizer/cli.py +183 -0
s3tokenizer/model.py +546 -0
s3tokenizer/model_v2.py +605 -0
s3tokenizer/utils.py +390 -0
stepaudio2/__init__.py +40 -0
stepaudio2/cosyvoice2/__init__.py +1 -0
stepaudio2/cosyvoice2/flow/__init__.py +0 -0
stepaudio2/cosyvoice2/flow/decoder_dit.py +585 -0
stepaudio2/cosyvoice2/flow/flow.py +230 -0
stepaudio2/cosyvoice2/flow/flow_matching.py +205 -0
stepaudio2/cosyvoice2/transformer/__init__.py +0 -0
stepaudio2/cosyvoice2/transformer/attention.py +328 -0
stepaudio2/cosyvoice2/transformer/embedding.py +119 -0
stepaudio2/cosyvoice2/transformer/encoder_layer.py +163 -0
stepaudio2/cosyvoice2/transformer/positionwise_feed_forward.py +56 -0
stepaudio2/cosyvoice2/transformer/subsampling.py +79 -0
stepaudio2/cosyvoice2/transformer/upsample_encoder_v2.py +483 -0
stepaudio2/cosyvoice2/utils/__init__.py +1 -0
stepaudio2/cosyvoice2/utils/class_utils.py +41 -0
stepaudio2/cosyvoice2/utils/common.py +101 -0
stepaudio2/cosyvoice2/utils/mask.py +49 -0
stepaudio2/flashcosyvoice/__init__.py +0 -0
stepaudio2/flashcosyvoice/cli.py +424 -0
stepaudio2/flashcosyvoice/config.py +80 -0
stepaudio2/flashcosyvoice/cosyvoice2.py +160 -0
stepaudio2/flashcosyvoice/cosyvoice3.py +1 -0
stepaudio2/flashcosyvoice/engine/__init__.py +0 -0
stepaudio2/flashcosyvoice/engine/block_manager.py +114 -0
stepaudio2/flashcosyvoice/engine/llm_engine.py +125 -0
stepaudio2/flashcosyvoice/engine/model_runner.py +310 -0
stepaudio2/flashcosyvoice/engine/scheduler.py +77 -0
stepaudio2/flashcosyvoice/engine/sequence.py +90 -0
stepaudio2/flashcosyvoice/modules/__init__.py +0 -0
stepaudio2/flashcosyvoice/modules/flow.py +198 -0
stepaudio2/flashcosyvoice/modules/flow_components/__init__.py +0 -0
stepaudio2/flashcosyvoice/modules/flow_components/estimator.py +974 -0
stepaudio2/flashcosyvoice/modules/flow_components/upsample_encoder.py +998 -0
stepaudio2/flashcosyvoice/modules/hifigan.py +249 -0
stepaudio2/flashcosyvoice/modules/hifigan_components/__init__.py +0 -0
stepaudio2/flashcosyvoice/modules/hifigan_components/layers.py +433 -0
stepaudio2/flashcosyvoice/modules/qwen2.py +92 -0
stepaudio2/flashcosyvoice/modules/qwen2_components/__init__.py +0 -0
stepaudio2/flashcosyvoice/modules/qwen2_components/layers.py +616 -0
stepaudio2/flashcosyvoice/modules/sampler.py +231 -0
stepaudio2/flashcosyvoice/utils/__init__.py +0 -0
stepaudio2/flashcosyvoice/utils/audio.py +77 -0
stepaudio2/flashcosyvoice/utils/context.py +28 -0
stepaudio2/flashcosyvoice/utils/loader.py +116 -0
stepaudio2/flashcosyvoice/utils/memory.py +19 -0
stepaudio2/stepaudio2.py +204 -0
stepaudio2/token2wav.py +248 -0
stepaudio2/utils.py +91 -0

stepaudio2/flashcosyvoice/cosyvoice2.py ADDED Viewed

@@ -0,0 +1,160 @@
+# Copyright (c) 2025 Tsinghua Univ. (authors: Xingchen Song)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import time
+from datetime import datetime
+import s3tokenizer
+import torch
+from tqdm import tqdm
+from stepaudio2.flashcosyvoice.config import Config, SamplingParams
+from stepaudio2.flashcosyvoice.engine.llm_engine import LLMEngine
+from stepaudio2.flashcosyvoice.modules.flow import CausalMaskedDiffWithXvec
+from stepaudio2.flashcosyvoice.modules.hifigan import HiFTGenerator
class CosyVoice2(torch.nn.Module):
    """Batch text-to-speech pipeline: speech tokenizer -> LLM -> flow -> HiFT vocoder.

    All sub-models are moved to CUDA and put in eval mode at construction time.
    """

    def __init__(self, config: Config = None):
        """Build every stage of the pipeline and load the flow / HiFT weights.

        Args:
            config: Pipeline configuration; a default ``Config()`` is created
                when omitted. ``config.model`` is used as the directory that
                holds ``flow.pt`` and ``hift.pt``.
        """
        super().__init__()
        if config is None:
            config = Config()
        self.config = config

        # Speech tokenizer used to quantize the prompt mels into speech tokens.
        tokenizer = s3tokenizer.load_model("speech_tokenizer_v2_25hz")
        self.audio_tokenizer = tokenizer.cuda().eval()

        # The LLM engine receives every config attribute as a keyword argument.
        self.llm = LLMEngine(**vars(self.config))
        # Progress bars are only drawn when this process has node-local rank 0.
        self.use_tqdm = torch.distributed.get_node_local_rank() == 0

        # Flow model: optionally cast to fp16 *before* the weights are loaded.
        self.flow = CausalMaskedDiffWithXvec()
        if self.config.hf_config.fp16_flow:
            timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3]
            tqdm.write(f"[{timestamp}] - [INFO] - Casting flow to fp16")
            self.flow.half()
        flow_weights = torch.load(f"{self.config.model}/flow.pt", map_location="cpu", weights_only=True)
        self.flow.load_state_dict(flow_weights, strict=True)
        self.flow.cuda().eval()

        # HiFT vocoder: checkpoint keys have the 'generator.' substring removed.
        self.hift = HiFTGenerator()
        hift_checkpoint = torch.load(f"{self.config.model}/hift.pt", map_location="cpu", weights_only=True)
        hift_state_dict = {}
        for key, value in hift_checkpoint.items():
            hift_state_dict[key.replace('generator.', '')] = value
        self.hift.load_state_dict(hift_state_dict, strict=True)
        self.hift.cuda().eval()

    @torch.inference_mode()
    def forward(
        self, prompt_mels_for_llm: torch.Tensor, prompt_mels_lens_for_llm: torch.Tensor,
        prompt_text_tokens_for_llm: list[list[int]], text_tokens_for_llm: list[list[int]],
        prompt_mels_for_flow: torch.Tensor, prompt_mels_lens_for_flow: torch.Tensor,
        spk_emb_for_flow: torch.Tensor,
        sampling_params: SamplingParams | list[SamplingParams],
        batch_size_flow: int,
        only_llm: bool,
        **kwargs,  # for compatibility
    ):
        """Run the pipeline on a batch and return results plus timing statistics.

        Args:
            prompt_mels_for_llm: Prompt mels fed to the speech tokenizer.
            prompt_mels_lens_for_llm: Lengths passed alongside the prompt mels.
            prompt_text_tokens_for_llm: Per-sample prompt text token ids.
            text_tokens_for_llm: Per-sample target text token ids.
            prompt_mels_for_flow: Prompt mels for the flow model, sliced along
                dim 0 per flow batch.
            prompt_mels_lens_for_flow: Per-sample prompt mel lengths; also used
                to cut the prompt frames off the generated mel.
            spk_emb_for_flow: Speaker embeddings, sliced along dim 0 per flow batch.
            sampling_params: One ``SamplingParams`` or one per sample; forwarded
                to the LLM engine.
            batch_size_flow: Number of samples per Flow/HiFT batch.
            only_llm: When true, return right after LLM generation.
            **kwargs: Ignored; accepted for call-site compatibility.

        Returns:
            ``(results_dict, timing_stats)``. ``results_dict`` always holds
            ``"prompt_speech_tokens"`` and ``"generated_speech_tokens"``; unless
            ``only_llm`` is set it also holds ``"generated_wavs"``.
            ``timing_stats`` maps stage names to wall-clock seconds, plus batch
            size entries when the full pipeline ran.
        """
        timing_stats = {}

        # --- Audio tokenization -------------------------------------------
        tick = time.time()
        prompt_speech_tokens, prompt_speech_tokens_lens = self.audio_tokenizer.quantize(
            prompt_mels_for_llm.cuda(), prompt_mels_lens_for_llm.cuda()
        )
        timing_stats['audio_tokenization'] = time.time() - tick

        batch_size = prompt_speech_tokens.shape[0]
        assert len(prompt_text_tokens_for_llm) == batch_size

        # --- Prepare LLM inputs -------------------------------------------
        tick = time.time()
        speech_vocab_size = self.config.hf_config.speech_vocab_size
        # Keep only the valid (unpadded) part of every prompt token row.
        valid_prompt_speech_tokens = [
            prompt_speech_tokens[row, :prompt_speech_tokens_lens[row].item()].tolist()
            for row in range(batch_size)
        ]
        # Layout: [speech_vocab_size] + prompt text + text + [speech_vocab_size + 1] + prompt speech.
        inputs = [
            [speech_vocab_size]
            + prompt_text_tokens_for_llm[row]
            + text_tokens_for_llm[row]
            + [speech_vocab_size + 1]
            + speech_tokens
            for row, speech_tokens in enumerate(valid_prompt_speech_tokens)
        ]
        timing_stats['prepare_llm_inputs'] = time.time() - tick

        # --- LLM generation -----------------------------------------------
        tick = time.time()
        llm_outputs = self.llm.generate(inputs, sampling_params, use_tqdm=self.use_tqdm)
        timing_stats['llm_generation'] = time.time() - tick

        # The last generated id is dropped (the original code treats it as eos).
        generated_speech_tokens = [output['token_ids'][:-1] for output in llm_outputs]
        results_dict = {
            "prompt_speech_tokens": valid_prompt_speech_tokens,
            "generated_speech_tokens": generated_speech_tokens,
        }
        if only_llm:
            return results_dict, timing_stats

        # --- Prepare Flow inputs ------------------------------------------
        tick = time.time()
        # Each flow input is the prompt speech tokens followed by the generated ones.
        token_sequences = [
            valid_prompt_speech_tokens[row] + generated
            for row, generated in enumerate(generated_speech_tokens)
        ]
        flow_inputs = torch.nn.utils.rnn.pad_sequence(
            [torch.tensor(sequence) for sequence in token_sequences],
            batch_first=True, padding_value=0,
        )
        flow_inputs_lens = torch.tensor([len(sequence) for sequence in token_sequences])
        timing_stats['prepare_flow_inputs'] = time.time() - tick

        # --- Flow + HiFi-GAN generation (batched) ---------------------------
        # NOTE(xcsong): When executing both LLM and Flow on the same GPU,
        #   Flow can easily fill up the SM and memory. Therefore, batch processing is required to avoid OOM.
        total_batch_size = flow_inputs.shape[0]
        num_batches = (total_batch_size + batch_size_flow - 1) // batch_size_flow
        batch_starts = range(0, total_batch_size, batch_size_flow)
        if self.use_tqdm:
            batch_starts = tqdm(batch_starts, desc="Generating wavs (Flow+HiFi-GAN)", leave=False, unit="batch",
                                total=num_batches, dynamic_ncols=True, position=self.config.rank + 1)

        autocast_dtype = torch.float16 if self.config.hf_config.fp16_flow else torch.float32
        generated_wavs = []
        flow_total_time = 0.0
        hifigan_total_time = 0.0
        for start_idx in batch_starts:
            end_idx = min(start_idx + batch_size_flow, total_batch_size)
            window = slice(start_idx, end_idx)
            batch_prompt_mels_lens = prompt_mels_lens_for_flow[window]

            # Flow generation for this batch.
            tick = time.time()
            with torch.amp.autocast("cuda", dtype=autocast_dtype):
                batch_generated_mels, batch_generated_mels_lens = self.flow(
                    flow_inputs[window].cuda(), flow_inputs_lens[window].cuda(),
                    prompt_mels_for_flow[window].cuda(), batch_prompt_mels_lens.cuda(),
                    spk_emb_for_flow[window].cuda(),
                    streaming=False, finalize=True
                )
            flow_total_time += time.time() - tick

            # HiFi-GAN generation, one sample at a time.
            tick = time.time()
            for row in range(end_idx - start_idx):
                first_frame = batch_prompt_mels_lens[row].item()
                last_frame = batch_generated_mels_lens[row].item()
                # Drop the prompt frames and the padding, keep a leading batch dim of 1.
                mel = batch_generated_mels[row, :, first_frame:last_frame].unsqueeze(0)
                wav, _ = self.hift(speech_feat=mel)
                generated_wavs.append(wav)
            hifigan_total_time += time.time() - tick

        timing_stats['flow_generation'] = flow_total_time
        timing_stats['hifigan_generation'] = hifigan_total_time
        # The total is the sum of all stage timings recorded so far.
        timing_stats['model.forward_total'] = sum(timing_stats.values())
        timing_stats['batch_size'] = len(generated_wavs)
        timing_stats['batch_size_flow'] = batch_size_flow

        results_dict['generated_wavs'] = generated_wavs
        return results_dict, timing_stats

stepaudio2/flashcosyvoice/cosyvoice3.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ # TODO(xcsong): Implement CosyVoice3 when it is released

stepaudio2/flashcosyvoice/engine/__init__.py ADDED Viewed

File without changes

stepaudio2/flashcosyvoice/engine/block_manager.py ADDED Viewed

@@ -0,0 +1,114 @@
+from collections import deque
+import numpy as np
+import xxhash
+from stepaudio2.flashcosyvoice.engine.sequence import Sequence
class Block:
    """One fixed-size cache block tracked by :class:`BlockManager`."""

    def __init__(self, block_id):
        self.block_id = block_id
        # Number of sequences whose block table currently references this block.
        self.ref_count = 0
        # Content hash of a completely filled block; -1 means "no hash yet".
        self.hash = -1
        self.token_ids = []

    def update(self, hash: int, token_ids: list[int]):
        """Record the content hash and the token ids stored in this block."""
        self.hash = hash
        self.token_ids = token_ids

    def reset(self):
        """Prepare the block for a new owner: one reference, no hash, no tokens."""
        self.ref_count = 1
        self.hash = -1
        self.token_ids = []


class BlockManager:
    """Hands out cache blocks to sequences and reuses full blocks by content hash.

    Full blocks are indexed in ``hash_to_block_id`` by a hash chained over the
    previous block's hash, so a lookup hit means "same tokens after the same
    prefix". Every entry in that table points at a block that still carries
    exactly that hash; entries are evicted when their block is recycled.
    """

    def __init__(self, num_blocks: int, block_size: int):
        assert num_blocks > 0
        self.block_size = block_size
        self.blocks: list[Block] = [Block(i) for i in range(num_blocks)]
        self.hash_to_block_id: dict[int, int] = dict()
        self.free_block_ids: deque[int] = deque(range(num_blocks))
        self.used_block_ids: set[int] = set()

    @classmethod
    def compute_hash(cls, token_ids: list[int], prefix: int = -1):
        """Return the 64-bit xxhash of ``token_ids``, chained with ``prefix`` unless it is -1."""
        h = xxhash.xxh64()
        if prefix != -1:
            h.update(prefix.to_bytes(8, "little"))
        h.update(np.array(token_ids).tobytes())
        return h.intdigest()

    def _allocate_block(self, block_id: int) -> Block:
        """Move a free block to the used set and return it freshly reset."""
        block = self.blocks[block_id]
        assert block.ref_count == 0
        # Evict the index entry of the content being discarded. Without this a
        # stale hash keeps resolving to this block; if the block is later
        # refilled with the same tokens under a different prefix, `allocate`
        # would report a false cache hit. It also keeps the table bounded by
        # the number of blocks. Only drop the entry if it still points here:
        # another block with identical content may have taken over the hash.
        if block.hash != -1 and self.hash_to_block_id.get(block.hash) == block_id:
            del self.hash_to_block_id[block.hash]
        block.reset()
        self.free_block_ids.remove(block_id)
        self.used_block_ids.add(block_id)
        return block

    def _deallocate_block(self, block_id: int) -> None:
        """Return an unreferenced block to the free list (its hash entry is kept for reuse)."""
        assert self.blocks[block_id].ref_count == 0
        self.used_block_ids.remove(block_id)
        self.free_block_ids.append(block_id)

    def can_allocate(self, seq: Sequence) -> bool:
        """Whether enough free blocks exist even if no block of ``seq`` is a cache hit."""
        return len(self.free_block_ids) >= seq.num_blocks

    def allocate(self, seq: Sequence):
        """Fill ``seq.block_table``, reusing cached full blocks for the matching prefix.

        ``seq.num_cached_tokens`` is increased by ``block_size`` for every
        reused block. After the first miss all remaining blocks are freshly
        allocated. Callers are expected to check :meth:`can_allocate` first.
        """
        assert not seq.block_table
        h = -1
        cache_miss = False
        for i in range(seq.num_blocks):
            token_ids = seq.block(i)
            # Only completely filled blocks get a hash (chained with the previous one).
            h = self.compute_hash(token_ids, h) if len(token_ids) == self.block_size else -1
            block_id = self.hash_to_block_id.get(h, -1)
            # The token comparison guards against hash collisions.
            if block_id == -1 or self.blocks[block_id].token_ids != token_ids:
                cache_miss = True
            if cache_miss:
                block_id = self.free_block_ids[0]
                block = self._allocate_block(block_id)
            else:
                seq.num_cached_tokens += self.block_size
                if block_id in self.used_block_ids:
                    # Shared with another live sequence.
                    block = self.blocks[block_id]
                    block.ref_count += 1
                else:
                    # Cached content sitting in the free list: revive it.
                    block = self._allocate_block(block_id)
            if h != -1:
                block.update(h, token_ids)
                self.hash_to_block_id[h] = block_id
            seq.block_table.append(block_id)

    def deallocate(self, seq: Sequence):
        """Drop ``seq``'s references, freeing blocks whose reference count reaches zero."""
        for block_id in reversed(seq.block_table):
            block = self.blocks[block_id]
            block.ref_count -= 1
            if block.ref_count == 0:
                self._deallocate_block(block_id)
        seq.num_cached_tokens = 0
        seq.block_table.clear()

    def can_append(self, seq: Sequence) -> bool:
        """Whether the next append can be served.

        A new block is needed only when ``len(seq) % block_size == 1``; the
        boolean is compared as 0/1 against the number of free blocks.
        """
        return len(self.free_block_ids) >= (len(seq) % self.block_size == 1)

    def may_append(self, seq: Sequence):
        """Update block bookkeeping for the current length of ``seq``.

        * length % block_size == 1: the previous block is full, open a new one;
        * length % block_size == 0: the last block just filled up, hash and index it;
        * otherwise: the last block is partially filled and must be unhashed.
        """
        block_table = seq.block_table
        last_block = self.blocks[block_table[-1]]
        if len(seq) % self.block_size == 1:
            assert last_block.hash != -1
            block_id = self.free_block_ids[0]
            self._allocate_block(block_id)
            block_table.append(block_id)
        elif len(seq) % self.block_size == 0:
            assert last_block.hash == -1
            token_ids = seq.block(seq.num_blocks - 1)
            prefix = self.blocks[block_table[-2]].hash if len(block_table) > 1 else -1
            h = self.compute_hash(token_ids, prefix)
            last_block.update(h, token_ids)
            self.hash_to_block_id[h] = last_block.block_id
        else:
            assert last_block.hash == -1

stepaudio2/flashcosyvoice/engine/llm_engine.py ADDED Viewed

@@ -0,0 +1,125 @@
+import atexit
+from dataclasses import fields
+from time import perf_counter
+import torch.multiprocessing as mp
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer
+from stepaudio2.flashcosyvoice.config import Config, SamplingParams
+from stepaudio2.flashcosyvoice.engine.model_runner import ModelRunner
+from stepaudio2.flashcosyvoice.engine.scheduler import Scheduler
+from stepaudio2.flashcosyvoice.engine.sequence import Sequence
class LLMEngine:
    """Drives prompt scheduling and token generation on top of ``ModelRunner``."""

    def __init__(self, model, **kwargs):
        """Create the config, tokenizer, model runner and scheduler.

        Args:
            model: Model path; passed to ``Config`` and used to locate the tokenizer.
            **kwargs: Extra options; only keys that are ``Config`` fields are used.
        """
        config_fields = {field.name for field in fields(Config)}
        config_kwargs = {k: v for k, v in kwargs.items() if k in config_fields}
        config = Config(model, **config_kwargs)
        self.ps = []
        self.events = []
        ctx = mp.get_context("spawn")
        assert config.tensor_parallel_size == 1, "NOTE(xcsong): Currently only support tp=1"
        # With tp == 1 this loop is empty; it is kept for future tensor parallelism.
        for i in range(1, config.tensor_parallel_size):
            event = ctx.Event()
            process = ctx.Process(target=ModelRunner, args=(config, i, event))
            process.start()
            self.ps.append(process)
            self.events.append(event)
        if hasattr(config.hf_config, "speech_vocab_size"):
            # NOTE: non-chat model, all these special tokens keep randomly initialized.
            special_tokens = {
                'eos_token': '<|endoftext|>',
                'pad_token': '<|endoftext|>',
                'additional_special_tokens': [
                    '<|im_start|>', '<|im_end|>', '<|endofprompt|>',
                    '[breath]', '<strong>', '</strong>', '[noise]',
                    '[laughter]', '[cough]', '[clucking]', '[accent]',
                    '[quick_breath]',
                    "<laughter>", "</laughter>",
                    "[hissing]", "[sigh]", "[vocalized-noise]",
                    "[lipsmack]", "[mn]"
                ]
            }
            self.tokenizer = AutoTokenizer.from_pretrained(f"{config.model}/CosyVoice-BlankEN")
            self.tokenizer.add_special_tokens(special_tokens)
            self.skip_special_tokens = True
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(config.model, use_fast=True)
        # Prefer the eos id from the HF config, fall back to the tokenizer's.
        if hasattr(config.hf_config, "eos_token_id"):
            config.eos = config.hf_config.eos_token_id
        else:
            config.eos = self.tokenizer.eos_token_id
        self.model_runner = ModelRunner(config, config.rank, self.events)
        self.scheduler = Scheduler(config)
        self.config = config
        atexit.register(self.exit)

    def exit(self):
        """Shut down the model runner and join worker processes.

        Safe to call more than once: the method is also registered with
        ``atexit``, so a manual call must not make the interpreter-exit call fail.
        """
        if not hasattr(self, "model_runner"):
            return  # already shut down
        self.model_runner.call("exit")
        del self.model_runner
        for p in self.ps:
            p.join()

    def add_request(self, prompt: str | list[int], sampling_params: SamplingParams):
        """Queue one prompt (text is tokenized first) with its sampling parameters."""
        if isinstance(prompt, str):
            prompt = self.tokenizer.encode(prompt)
        seq = Sequence(prompt, sampling_params)
        self.scheduler.add(seq)

    def step(self):
        """Run one scheduling + model step.

        Returns:
            ``(outputs, num_tokens)`` where ``outputs`` lists
            ``(seq_id, completion_token_ids)`` for sequences that finished in
            this step. ``num_tokens`` is the positive token count for a prefill
            step and the negated number of sequences for a decode step.
        """
        seqs, is_prefill = self.scheduler.schedule()
        token_ids = self.model_runner.call("run", seqs, is_prefill)
        self.scheduler.postprocess(seqs, token_ids)
        outputs = [(seq.seq_id, seq.completion_token_ids) for seq in seqs if seq.is_finished]
        num_tokens = sum(len(seq) for seq in seqs) if is_prefill else -len(seqs)
        return outputs, num_tokens

    def is_finished(self):
        """Whether the scheduler reports that no work is left."""
        return self.scheduler.is_finished()

    def generate(
        self,
        prompts: list[str] | list[list[int]],
        sampling_params: SamplingParams | list[SamplingParams],
        use_tqdm: bool = True,
    ) -> list[dict]:
        """Generate completions for all prompts.

        Args:
            prompts: Texts or pre-tokenized id lists.
            sampling_params: One object applied to every prompt, or a list with
                exactly one entry per prompt.
            use_tqdm: Show a progress bar with throughput figures.

        Returns:
            One ``{"text": ..., "token_ids": ...}`` dict per finished sequence,
            ordered by sequence id.

        Raises:
            ValueError: If ``sampling_params`` is a list whose length differs
                from ``prompts`` (previously prompts were silently dropped).
        """
        if not isinstance(sampling_params, list):
            sampling_params = [sampling_params] * len(prompts)
        elif len(sampling_params) != len(prompts):
            raise ValueError(
                f"Got {len(prompts)} prompts but {len(sampling_params)} sampling_params; "
                "lengths must match."
            )
        pbar = None
        if use_tqdm:
            pbar = tqdm(total=len(prompts), desc="Generating tokens (LLM)", leave=False,
                        dynamic_ncols=True, position=self.config.rank + 1)
        try:
            for prompt, sp in zip(prompts, sampling_params):
                self.add_request(prompt, sp)
            outputs = {}
            prefill_throughput = decode_throughput = instant_decode_throughput = 0.
            total_decode_tokens = 0
            total_decode_time = 0.
            while not self.is_finished():
                t = perf_counter()
                output, num_tokens = self.step()
                step_time = perf_counter() - t
                if use_tqdm:
                    if num_tokens > 0:
                        # Prefill step: num_tokens is the number of prompt tokens.
                        prefill_throughput = num_tokens / step_time
                    else:
                        # Decode step: -num_tokens sequences each produced one token.
                        instant_decode_throughput = -num_tokens / step_time
                        total_decode_tokens += -num_tokens
                        total_decode_time += step_time
                        decode_throughput = total_decode_tokens / total_decode_time if total_decode_time > 0 else 0
                    pbar.set_postfix({
                        "Prefill": f"{int(prefill_throughput)}tok/s",
                        "AvgDecode": f"{int(decode_throughput)}tok/s",
                        "InstDecode": f"{int(instant_decode_throughput)}tok/s",
                    })
                for seq_id, token_ids in output:
                    outputs[seq_id] = token_ids
                    if use_tqdm:
                        pbar.update(1)
            # Sequences may finish out of order; return them ordered by sequence id.
            outputs = [outputs[seq_id] for seq_id in sorted(outputs)]
            outputs = [{"text": self.tokenizer.decode(token_ids), "token_ids": token_ids} for token_ids in outputs]
        finally:
            # Close the bar even when a step raises, so the terminal is left clean.
            if pbar is not None:
                pbar.close()
        return outputs