voxtream 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. voxtream-0.1.0/MANIFEST.in +6 -0
  2. voxtream-0.1.0/PKG-INFO +143 -0
  3. voxtream-0.1.0/README.md +103 -0
  4. voxtream-0.1.0/assets/audio/female.wav +0 -0
  5. voxtream-0.1.0/assets/audio/male.wav +0 -0
  6. voxtream-0.1.0/assets/benchmark/common_voice_en_10119832.wav +0 -0
  7. voxtream-0.1.0/assets/benchmark/common_voice_en_103675.wav +0 -0
  8. voxtream-0.1.0/assets/benchmark/common_voice_en_10933823.wav +0 -0
  9. voxtream-0.1.0/assets/benchmark/common_voice_en_120405.wav +0 -0
  10. voxtream-0.1.0/assets/benchmark/common_voice_en_1205005.wav +0 -0
  11. voxtream-0.1.0/assets/benchmark/common_voice_en_123125.wav +0 -0
  12. voxtream-0.1.0/assets/benchmark/meta.csv +12 -0
  13. voxtream-0.1.0/configs/generator.json +49 -0
  14. voxtream-0.1.0/pyproject.toml +81 -0
  15. voxtream-0.1.0/requirements.txt +15 -0
  16. voxtream-0.1.0/setup.cfg +4 -0
  17. voxtream-0.1.0/voxtream/__init__.py +0 -0
  18. voxtream-0.1.0/voxtream/benchmark.py +70 -0
  19. voxtream-0.1.0/voxtream/dataset.py +235 -0
  20. voxtream-0.1.0/voxtream/generator.py +452 -0
  21. voxtream-0.1.0/voxtream/model.py +333 -0
  22. voxtream-0.1.0/voxtream/run.py +69 -0
  23. voxtream-0.1.0/voxtream/train.py +73 -0
  24. voxtream-0.1.0/voxtream/trainer.py +111 -0
  25. voxtream-0.1.0/voxtream/utils/__init__.py +0 -0
  26. voxtream-0.1.0/voxtream/utils/aligner.py +492 -0
  27. voxtream-0.1.0/voxtream/utils/generator.py +349 -0
  28. voxtream-0.1.0/voxtream/utils/model.py +100 -0
  29. voxtream-0.1.0/voxtream/utils/sampling.py +126 -0
  30. voxtream-0.1.0/voxtream/utils/trainer.py +31 -0
  31. voxtream-0.1.0/voxtream.egg-info/PKG-INFO +143 -0
  32. voxtream-0.1.0/voxtream.egg-info/SOURCES.txt +34 -0
  33. voxtream-0.1.0/voxtream.egg-info/dependency_links.txt +1 -0
  34. voxtream-0.1.0/voxtream.egg-info/entry_points.txt +3 -0
  35. voxtream-0.1.0/voxtream.egg-info/requires.txt +22 -0
  36. voxtream-0.1.0/voxtream.egg-info/top_level.txt +1 -0
@@ -0,0 +1,6 @@
1
+ include README.md
2
+ include voxtream/VERSION
3
+ include requirements.txt
4
+ recursive-include assets *.wav
5
+ recursive-include assets *.csv
6
+ recursive-include configs *.json
@@ -0,0 +1,143 @@
1
+ Metadata-Version: 2.4
2
+ Name: voxtream
3
+ Version: 0.1.0
4
+ Summary: Full-Stream Zero-shot TTS model with Extremely Low Latency
5
+ Author-email: Nikita Torgashov <torgaschov.nikita@gmail.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://herimor.github.io/voxtream
8
+ Project-URL: Bug Reports, https://github.com/herimor/voxtream/issues
9
+ Project-URL: Source, https://github.com/herimor/voxtream
10
+ Keywords: text-to-speech,streaming,tts,speech-synthesis,voice-cloning
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Topic :: Multimedia :: Sound/Audio
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Requires-Python: >=3.11
18
+ Description-Content-Type: text/markdown
19
+ Requires-Dist: torch==2.4.0
20
+ Requires-Dist: torchaudio==2.4.0
21
+ Requires-Dist: torchtune==0.4.0
22
+ Requires-Dist: torchao==0.9.0
23
+ Requires-Dist: lightning==2.4.0
24
+ Requires-Dist: moshi==0.2.2
25
+ Requires-Dist: huggingface_hub==0.28.1
26
+ Requires-Dist: g2p-en==2.1.0
27
+ Requires-Dist: librosa==0.11.0
28
+ Requires-Dist: soundfile==0.13.1
29
+ Requires-Dist: inflect==7.5.0
30
+ Requires-Dist: nltk==3.9.1
31
+ Requires-Dist: hydra-core==1.3.2
32
+ Requires-Dist: tensorboard==2.19.0
33
+ Requires-Dist: transformers==4.50.0
34
+ Provides-Extra: dev
35
+ Requires-Dist: black; extra == "dev"
36
+ Requires-Dist: isort; extra == "dev"
37
+ Requires-Dist: flake8; extra == "dev"
38
+ Requires-Dist: mypy; extra == "dev"
39
+ Requires-Dist: pytest; extra == "dev"
40
+
41
+ # VoXtream: Full-Stream Text-to-Speech with Extremely Low Latency
42
+
43
+ [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/pdf/2509.15969)
44
+ [![demo](https://img.shields.io/badge/VoXtream-Demo-red)](https://herimor.github.io/voxtream)
45
+ [![model](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Model-yellow)](https://huggingface.co/herimor/voxtream)
46
+ [![python](https://img.shields.io/badge/-Python_3.11-blue?logo=python&logoColor=white)](https://www.python.org/downloads/release/python-3119)
47
+ [![pytorch](https://img.shields.io/badge/PyTorch_2.4+-ee4c2c?logo=pytorch&logoColor=white)](https://pytorch.org/get-started/locally)
48
+
49
+ We present VoXtream, a fully autoregressive, zero-shot streaming text-to-speech system for real-time use that begins speaking from the first word.
50
+
51
+ ### Key features
52
+
53
+ - **Streaming**: Supports a full-stream scenario, where the full sentence is not known in advance. The model takes the text stream coming word-by-word as input and outputs an audio stream in 80ms chunks.
54
+ - **Speed**: Works **5x** faster than real time and achieves **102 ms** first packet latency on GPU.
55
+ - **Quality and efficiency**: With only 9k hours of training data, it matches or surpasses the quality and intelligibility of larger models or models trained on large datasets.
56
+
57
+ ## Installation
58
+
59
+ ```bash
60
+ pip install voxtream
61
+ ```
62
+
63
+ ## Usage
64
+
65
+ ### Output streaming
66
+ ```bash
67
+ voxtream \
68
+ --prompt-audio assets/audio/male.wav \
69
+ --prompt-text "The liquor was first created as 'Brandy Milk', produced with milk, brandy and vanilla." \
70
+ --text "In general, however, some method is then needed to evaluate each approximation." \
71
+ --output "output_stream.wav"
72
+ ```
73
+ * Note: Initial run may take some additional time to download model weights.
74
+
75
+ ### Full streaming
76
+ ```bash
77
+ voxtream \
78
+ --prompt-audio assets/audio/female.wav \
79
+ --prompt-text "Betty Cooper helps Archie with cleaning a store room, when Reggie attacks her." \
80
+ --text "Staff do not always do enough to prevent violence." \
81
+ --output "full_stream.wav" \
82
+ --full-stream
83
+ ```
84
+
85
+ ## Training
86
+
87
+ - Build the Docker container. If you have another version of Docker compose installed use `docker compose -f ...` instead.
88
+ ```bash
89
+ docker-compose -f .devcontainer/docker-compose.yaml build voxtream
90
+ ```
91
+
92
+ - Run training using the `train.py` script. You should specify GPU IDs that will be seen inside the container, ex. `GPU_IDS=0,1`. Specify the batch size according to your GPU. The default batch size is 32 (tested on RTX3090), 64 fits into A100-40Gb, and 128 fits into A100-80Gb. The dataset will be downloaded automatically to the HF cache directory. Dataset size is 20Gb. The data will be loaded to RAM during training, make sure you can allocate ~20Gb of RAM per GPU. Results will be stored at the `./experiments` directory.
93
+
94
+ Example of running the training using 2 GPUs with batch size 32:
95
+ ```bash
96
+ GPU_IDS=0,1 docker-compose -f .devcontainer/docker-compose.yaml run voxtream python voxtream/train.py batch_size=32
97
+ ```
98
+
99
+ ## Benchmark
100
+
101
+ To evaluate the model's real-time factor (RTF) and first packet latency (FPL), run `voxtream-benchmark`. You can compile the model for faster inference using the `--compile` flag (note that initial compilation takes some time).
102
+
103
+ | Device | Compiled | FPL, ms | RTF |
104
+ | :-: | :-: | :-: | :-: |
105
+ | A100 | | 176 | 1.00 |
106
+ | A100 | :heavy_check_mark: | 102 | 0.17 |
107
+ | RTX3090 | | 205 | 1.19 |
108
+ | RTX3090 | :heavy_check_mark: | 123 | 0.19 |
109
+
110
+ ## TODO
111
+
112
+ - [x] Add a neural phoneme aligner. Remove MFA dependency
113
+ - [x] Add PyPI package
114
+ - [ ] Gradio demo
115
+ - [ ] HuggingFace Spaces demo
116
+ - [ ] Evaluation scripts
117
+
118
+ ## License
119
+
120
+ The code in this repository is provided under the MIT License.
121
+
122
+ The Depth Transformer component from SesameAI-CSM is included under the Apache 2.0 License (see LICENSE-APACHE and NOTICE).
123
+
124
+ The model weights were trained on data licensed under the Creative Commons Attribution 4.0 International (CC BY 4.0). Redistribution of the weights must include proper attribution to the original dataset creators (see ATTRIBUTION.md).
125
+
126
+ ## Acknowledgements
127
+
128
+ - [Mimi](https://huggingface.co/kyutai/mimi): Streaming audio codec from [Kyutai](https://kyutai.org)
129
+ - [CSM](https://github.com/SesameAILabs/csm): Conversation speech model from [Sesame](https://www.sesame.com)
130
+ - [ReDimNet](https://github.com/IDRnD/redimnet): Speaker recognition model from [IDR&D](https://www.idrnd.ai)
131
+
132
+ ## Citation
133
+ ```
134
+ @article{torgashov2025voxtream,
135
+ author = {Torgashov, Nikita and Henter, Gustav Eje and Skantze, Gabriel},
136
+ title = {Vo{X}tream: Full-Stream Text-to-Speech with Extremely Low Latency},
137
+ journal = {arXiv:2509.15969},
138
+ year = {2025}
139
+ }
140
+ ```
141
+
142
+ ## Disclaimer
143
+ Any organization or individual is prohibited from using any technology mentioned in this paper to generate someone's speech without his/her consent, including but not limited to government leaders, political figures, and celebrities. If you do not comply with this item, you could be in violation of copyright laws.
@@ -0,0 +1,103 @@
1
+ # VoXtream: Full-Stream Text-to-Speech with Extremely Low Latency
2
+
3
+ [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/pdf/2509.15969)
4
+ [![demo](https://img.shields.io/badge/VoXtream-Demo-red)](https://herimor.github.io/voxtream)
5
+ [![model](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Model-yellow)](https://huggingface.co/herimor/voxtream)
6
+ [![python](https://img.shields.io/badge/-Python_3.11-blue?logo=python&logoColor=white)](https://www.python.org/downloads/release/python-3119)
7
+ [![pytorch](https://img.shields.io/badge/PyTorch_2.4+-ee4c2c?logo=pytorch&logoColor=white)](https://pytorch.org/get-started/locally)
8
+
9
+ We present VoXtream, a fully autoregressive, zero-shot streaming text-to-speech system for real-time use that begins speaking from the first word.
10
+
11
+ ### Key features
12
+
13
+ - **Streaming**: Supports a full-stream scenario, where the full sentence is not known in advance. The model takes the text stream coming word-by-word as input and outputs an audio stream in 80ms chunks.
14
+ - **Speed**: Works **5x** faster than real time and achieves **102 ms** first packet latency on GPU.
15
+ - **Quality and efficiency**: With only 9k hours of training data, it matches or surpasses the quality and intelligibility of larger models or models trained on large datasets.
16
+
17
+ ## Installation
18
+
19
+ ```bash
20
+ pip install voxtream
21
+ ```
22
+
23
+ ## Usage
24
+
25
+ ### Output streaming
26
+ ```bash
27
+ voxtream \
28
+ --prompt-audio assets/audio/male.wav \
29
+ --prompt-text "The liquor was first created as 'Brandy Milk', produced with milk, brandy and vanilla." \
30
+ --text "In general, however, some method is then needed to evaluate each approximation." \
31
+ --output "output_stream.wav"
32
+ ```
33
+ * Note: Initial run may take some additional time to download model weights.
34
+
35
+ ### Full streaming
36
+ ```bash
37
+ voxtream \
38
+ --prompt-audio assets/audio/female.wav \
39
+ --prompt-text "Betty Cooper helps Archie with cleaning a store room, when Reggie attacks her." \
40
+ --text "Staff do not always do enough to prevent violence." \
41
+ --output "full_stream.wav" \
42
+ --full-stream
43
+ ```
44
+
45
+ ## Training
46
+
47
+ - Build the Docker container. If you have another version of Docker compose installed use `docker compose -f ...` instead.
48
+ ```bash
49
+ docker-compose -f .devcontainer/docker-compose.yaml build voxtream
50
+ ```
51
+
52
+ - Run training using the `train.py` script. You should specify GPU IDs that will be seen inside the container, ex. `GPU_IDS=0,1`. Specify the batch size according to your GPU. The default batch size is 32 (tested on RTX3090), 64 fits into A100-40Gb, and 128 fits into A100-80Gb. The dataset will be downloaded automatically to the HF cache directory. Dataset size is 20Gb. The data will be loaded to RAM during training, make sure you can allocate ~20Gb of RAM per GPU. Results will be stored at the `./experiments` directory.
53
+
54
+ Example of running the training using 2 GPUs with batch size 32:
55
+ ```bash
56
+ GPU_IDS=0,1 docker-compose -f .devcontainer/docker-compose.yaml run voxtream python voxtream/train.py batch_size=32
57
+ ```
58
+
59
+ ## Benchmark
60
+
61
+ To evaluate the model's real-time factor (RTF) and first packet latency (FPL), run `voxtream-benchmark`. You can compile the model for faster inference using the `--compile` flag (note that initial compilation takes some time).
62
+
63
+ | Device | Compiled | FPL, ms | RTF |
64
+ | :-: | :-: | :-: | :-: |
65
+ | A100 | | 176 | 1.00 |
66
+ | A100 | :heavy_check_mark: | 102 | 0.17 |
67
+ | RTX3090 | | 205 | 1.19 |
68
+ | RTX3090 | :heavy_check_mark: | 123 | 0.19 |
69
+
70
+ ## TODO
71
+
72
+ - [x] Add a neural phoneme aligner. Remove MFA dependency
73
+ - [x] Add PyPI package
74
+ - [ ] Gradio demo
75
+ - [ ] HuggingFace Spaces demo
76
+ - [ ] Evaluation scripts
77
+
78
+ ## License
79
+
80
+ The code in this repository is provided under the MIT License.
81
+
82
+ The Depth Transformer component from SesameAI-CSM is included under the Apache 2.0 License (see LICENSE-APACHE and NOTICE).
83
+
84
+ The model weights were trained on data licensed under the Creative Commons Attribution 4.0 International (CC BY 4.0). Redistribution of the weights must include proper attribution to the original dataset creators (see ATTRIBUTION.md).
85
+
86
+ ## Acknowledgements
87
+
88
+ - [Mimi](https://huggingface.co/kyutai/mimi): Streaming audio codec from [Kyutai](https://kyutai.org)
89
+ - [CSM](https://github.com/SesameAILabs/csm): Conversation speech model from [Sesame](https://www.sesame.com)
90
+ - [ReDimNet](https://github.com/IDRnD/redimnet): Speaker recognition model from [IDR&D](https://www.idrnd.ai)
91
+
92
+ ## Citation
93
+ ```
94
+ @article{torgashov2025voxtream,
95
+ author = {Torgashov, Nikita and Henter, Gustav Eje and Skantze, Gabriel},
96
+ title = {Vo{X}tream: Full-Stream Text-to-Speech with Extremely Low Latency},
97
+ journal = {arXiv:2509.15969},
98
+ year = {2025}
99
+ }
100
+ ```
101
+
102
+ ## Disclaimer
103
+ Any organization or individual is prohibited from using any technology mentioned in this paper to generate someone's speech without his/her consent, including but not limited to government leaders, political figures, and celebrities. If you do not comply with this item, you could be in violation of copyright laws.
Binary file
Binary file
@@ -0,0 +1,12 @@
1
+ prompt_audio,prompt_text,text
2
+ assets/benchmark/common_voice_en_10119832.wav,"We asked over twenty different people, and they all said it was his.",Get the trust fund to the bank early.
3
+ assets/benchmark/common_voice_en_10119832.wav,"We asked over twenty different people, and they all said it was his.",The stained glass offered a hypnotic atmosphere.
4
+ assets/benchmark/common_voice_en_103675.wav,I'm never more aware of a room's acoustics than when I'm trying to enjoy a snack I have no intention of sharing.,"One by one, the campfires were extinguished, and the oasis fell as quiet as the desert."
5
+ assets/benchmark/common_voice_en_103675.wav,I'm never more aware of a room's acoustics than when I'm trying to enjoy a snack I have no intention of sharing.,The boy knew the desert sensed his fear.
6
+ assets/benchmark/common_voice_en_10933823.wav,Sometimes I overthink things which leads me to postpone and ultimately never achieve the goal I had in mind.,"When it comes to the crunch, our company will become insolvent."
7
+ assets/benchmark/common_voice_en_10933823.wav,Sometimes I overthink things which leads me to postpone and ultimately never achieve the goal I had in mind.,The primary coil has fifty turns.
8
+ assets/benchmark/common_voice_en_120405.wav,He approached the mass and was surprised at the size and the shape.,I'm never more aware of a room's acoustics than when I'm trying to enjoy a snack I have no intention of sharing.
9
+ assets/benchmark/common_voice_en_120405.wav,He approached the mass and was surprised at the size and the shape.,The only shadow was that of the few scattered pine trees.
10
+ assets/benchmark/common_voice_en_1205005.wav,"Roaming endlessly around the park, she wants to go home.",The work of the tailor is seen on each side.
11
+ assets/benchmark/common_voice_en_1205005.wav,"Roaming endlessly around the park, she wants to go home.",NASA plans to launch the rocket tomorrow.
12
+ assets/benchmark/common_voice_en_123125.wav,"There's no danger, the boy said, when they had moved on past the encampment.","After all, who doesn’t want to overcome new challenges and achieve great heights?"
@@ -0,0 +1,49 @@
1
+ {
2
+ "sil_token": 69,
3
+ "bos_token": 71,
4
+ "eos_token": 72,
5
+ "end_pad": 5,
6
+ "num_codebooks": 12,
7
+ "num_phones_per_frame": 2,
8
+ "audio_delay_frames": 1,
9
+ "temperature": 0.9,
10
+ "topk": 5,
11
+ "max_audio_length_ms": 60000,
12
+ "device": "cuda",
13
+ "model_repo": "herimor/voxtream",
14
+ "model_name": "model.safetensors",
15
+ "model_config_name": "config.json",
16
+ "mimi_sr": 24000,
17
+ "mimi_vocab_size": 2048,
18
+ "mimi_frame_ms": 80,
19
+ "mimi_repo": "kyutai/moshiko-pytorch-bf16",
20
+ "mimi_name": "tokenizer-e351c8d8-checkpoint125.safetensors",
21
+ "spk_enc_sr": 16000,
22
+ "spk_enc_repo": "IDRnD/ReDimNet",
23
+ "spk_enc_model": "ReDimNet",
24
+ "spk_enc_model_name": "M",
25
+ "spk_enc_train_type": "ft_mix",
26
+ "spk_enc_dataset": "vb2+vox2+cnc",
27
+ "phoneme_dict_name": "phoneme_to_token.json",
28
+ "nltk_resource": "taggers/averaged_perceptron_tagger_eng",
29
+ "aligner": "charsiu/en_w2v2_fc_10ms",
30
+ "cache_prompt": false,
31
+ "phoneme_index_map": {
32
+ "0": [
33
+ 0,
34
+ 1
35
+ ],
36
+ "1": [
37
+ 0,
38
+ 2
39
+ ],
40
+ "2": [
41
+ 1,
42
+ 1
43
+ ],
44
+ "3": [
45
+ 1,
46
+ 2
47
+ ]
48
+ }
49
+ }
@@ -0,0 +1,81 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [tool.setuptools]
6
+ license-files = ["LICENSE"]
7
+
8
+ [project]
9
+ name = "voxtream"
10
+ version = "0.1.0"
11
+ description = "Full-Stream Zero-shot TTS model with Extremely Low Latency"
12
+ readme = "README.md"
13
+ license = { text = "MIT" }
14
+ authors = [
15
+ { name = "Nikita Torgashov", email = "torgaschov.nikita@gmail.com" }
16
+ ]
17
+ requires-python = ">=3.11"
18
+ keywords = [
19
+ "text-to-speech",
20
+ "streaming",
21
+ "tts",
22
+ "speech-synthesis",
23
+ "voice-cloning"
24
+ ]
25
+ classifiers = [
26
+ "Development Status :: 3 - Alpha",
27
+ "Intended Audience :: Developers",
28
+ "Topic :: Multimedia :: Sound/Audio",
29
+ "License :: OSI Approved :: MIT License",
30
+ "Programming Language :: Python :: 3",
31
+ "Programming Language :: Python :: 3.11",
32
+ ]
33
+
34
+ # install_requires (from requirements.txt)
35
+ dependencies = [
36
+ "torch==2.4.0",
37
+ "torchaudio==2.4.0",
38
+ "torchtune==0.4.0",
39
+ "torchao==0.9.0",
40
+ "lightning==2.4.0",
41
+ "moshi==0.2.2",
42
+ "huggingface_hub==0.28.1",
43
+ "g2p-en==2.1.0",
44
+ "librosa==0.11.0",
45
+ "soundfile==0.13.1",
46
+ "inflect==7.5.0",
47
+ "nltk==3.9.1",
48
+ "hydra-core==1.3.2",
49
+ "tensorboard==2.19.0",
50
+ "transformers==4.50.0"
51
+ ]
52
+
53
+ [project.optional-dependencies]
54
+ dev = ["black", "isort", "flake8", "mypy", "pytest"]
55
+
56
+ [project.urls]
57
+ Homepage = "https://herimor.github.io/voxtream"
58
+ "Bug Reports" = "https://github.com/herimor/voxtream/issues"
59
+ Source = "https://github.com/herimor/voxtream"
60
+
61
+ [project.scripts]
62
+ voxtream = "voxtream.run:main"
63
+ voxtream-benchmark = "voxtream.benchmark:main"
64
+
65
+ [tool.setuptools.packages.find]
66
+ where = ["."]
67
+ include = ["voxtream*"]
68
+
69
+ # Tool configs merged from your original pyproject.toml
70
+ [tool.black]
71
+ line-length = 88
72
+ target-version = ["py311"]
73
+ skip-string-normalization = false
74
+
75
+ [tool.isort]
76
+ profile = "black"
77
+
78
+ [tool.ruff]
79
+ line-length = 88
80
+ lint.select = ["E", "F", "W", "C90", "B", "I"]
81
+ lint.ignore = ["E501"]
@@ -0,0 +1,15 @@
1
+ torch==2.4.0
2
+ torchaudio==2.4.0
3
+ torchtune==0.4.0
4
+ torchao==0.9.0
5
+ lightning==2.4.0
6
+ moshi==0.2.2
7
+ huggingface_hub==0.28.1
8
+ g2p-en==2.1.0
9
+ librosa==0.11.0
10
+ soundfile==0.13.1
11
+ inflect==7.5.0
12
+ nltk==3.9.1
13
+ hydra-core==1.3.2
14
+ tensorboard==2.19.0
15
+ transformers==4.50.0
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
File without changes
@@ -0,0 +1,70 @@
1
+ import argparse
2
+ import json
3
+ from pathlib import Path
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ import torch._inductor.config
8
+ from tqdm.auto import tqdm
9
+
10
+ from voxtream.generator import SpeechGenerator, SpeechGeneratorConfig
11
+ from voxtream.utils.generator import existing_file, set_seed, text_generator
12
+
# Inductor (torch.compile backend) settings, applied at import time:
# coordinate-descent autotuning searches kernel configs more aggressively,
# and the FX graph cache reuses compiled graphs across process runs —
# both reduce the cost of benchmarking with --compile.
torch._inductor.config.coordinate_descent_tuning = True
torch._inductor.config.fx_graph_cache = True
15
+
16
+
def main():
    """Benchmark VoXtream inference speed.

    Reports first packet latency (FPL) and real-time factor (RTF) averaged
    over the utterances listed in a metadata CSV. The first metadata row is
    used only as a warmup pass (weight loading, optional graph compilation)
    and is excluded from the reported statistics.

    Raises:
        SystemExit: if the metadata yields no measurable rows after warmup.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--compile", action="store_true", help="Compile graph")
    parser.add_argument(
        "-cfg",
        "--config",
        type=existing_file,
        help="Path to the config file",
        default="configs/generator.json",
    )
    parser.add_argument(
        "-m",
        "--meta",
        type=existing_file,
        help="Path to the metadata file",
        default="assets/benchmark/meta.csv",
    )
    args = parser.parse_args()

    set_seed()
    with open(args.config) as f:
        config = SpeechGeneratorConfig(**json.load(f))
    speech_generator = SpeechGenerator(config, compile=args.compile)

    meta = pd.read_csv(args.meta)

    first_packet_latency, gen_times = [], []
    for idx, row in tqdm(meta.iterrows(), total=len(meta)):
        speech_stream = speech_generator.generate_stream(
            prompt_text=row.prompt_text,
            prompt_audio_path=Path(row.prompt_audio),
            text=text_generator(row.text),
        )

        if idx == 0:
            # Warmup pass: triggers model download/compilation; its timings
            # are not representative, so drain the stream and discard it.
            for _, _ in speech_stream:
                pass
            continue

        # Only timings matter for the benchmark; generated audio frames are
        # intentionally discarded (previously they were accumulated in a list
        # that was never read, growing memory for the whole run).
        for i, (_audio_frame, gen_time) in enumerate(speech_stream):
            if i == 0:
                first_packet_latency.append(gen_time)
            else:
                gen_times.append(gen_time)

    if not first_packet_latency or not gen_times:
        # With fewer than two metadata rows everything is consumed by the
        # warmup pass; np.mean([]) would print "nan" with a RuntimeWarning.
        raise SystemExit(
            "Not enough benchmark data: metadata must contain at least 2 rows."
        )

    # RTF = mean per-frame generation time (ms) / audio frame duration (ms);
    # values below 1.0 mean faster than real time.
    rtf = (np.mean(gen_times) * 1000) / config.mimi_frame_ms
    print(f"First packet latency: {round(np.mean(first_packet_latency) * 1000)} ms")
    print(f"RTF: {round(rtf, 2)}")


if __name__ == "__main__":
    main()