voxtream 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. voxtream-0.1.0/MANIFEST.in +6 -0
  2. voxtream-0.1.0/PKG-INFO +143 -0
  3. voxtream-0.1.0/README.md +103 -0
  4. voxtream-0.1.0/assets/audio/female.wav +0 -0
  5. voxtream-0.1.0/assets/audio/male.wav +0 -0
  6. voxtream-0.1.0/assets/benchmark/common_voice_en_10119832.wav +0 -0
  7. voxtream-0.1.0/assets/benchmark/common_voice_en_103675.wav +0 -0
  8. voxtream-0.1.0/assets/benchmark/common_voice_en_10933823.wav +0 -0
  9. voxtream-0.1.0/assets/benchmark/common_voice_en_120405.wav +0 -0
  10. voxtream-0.1.0/assets/benchmark/common_voice_en_1205005.wav +0 -0
  11. voxtream-0.1.0/assets/benchmark/common_voice_en_123125.wav +0 -0
  12. voxtream-0.1.0/assets/benchmark/meta.csv +12 -0
  13. voxtream-0.1.0/configs/generator.json +49 -0
  14. voxtream-0.1.0/pyproject.toml +81 -0
  15. voxtream-0.1.0/requirements.txt +15 -0
  16. voxtream-0.1.0/setup.cfg +4 -0
  17. voxtream-0.1.0/voxtream/__init__.py +0 -0
  18. voxtream-0.1.0/voxtream/benchmark.py +70 -0
  19. voxtream-0.1.0/voxtream/dataset.py +235 -0
  20. voxtream-0.1.0/voxtream/generator.py +452 -0
  21. voxtream-0.1.0/voxtream/model.py +333 -0
  22. voxtream-0.1.0/voxtream/run.py +69 -0
  23. voxtream-0.1.0/voxtream/train.py +73 -0
  24. voxtream-0.1.0/voxtream/trainer.py +111 -0
  25. voxtream-0.1.0/voxtream/utils/__init__.py +0 -0
  26. voxtream-0.1.0/voxtream/utils/aligner.py +492 -0
  27. voxtream-0.1.0/voxtream/utils/generator.py +349 -0
  28. voxtream-0.1.0/voxtream/utils/model.py +100 -0
  29. voxtream-0.1.0/voxtream/utils/sampling.py +126 -0
  30. voxtream-0.1.0/voxtream/utils/trainer.py +31 -0
  31. voxtream-0.1.0/voxtream.egg-info/PKG-INFO +143 -0
  32. voxtream-0.1.0/voxtream.egg-info/SOURCES.txt +34 -0
  33. voxtream-0.1.0/voxtream.egg-info/dependency_links.txt +1 -0
  34. voxtream-0.1.0/voxtream.egg-info/entry_points.txt +3 -0
  35. voxtream-0.1.0/voxtream.egg-info/requires.txt +22 -0
  36. voxtream-0.1.0/voxtream.egg-info/top_level.txt +1 -0
@@ -0,0 +1,6 @@
1
+ include README.md
2
+ include voxtream/VERSION
3
+ include requirements.txt
4
+ recursive-include assets *.wav
5
+ recursive-include assets *.csv
6
+ recursive-include configs *.json
@@ -0,0 +1,143 @@
1
+ Metadata-Version: 2.4
2
+ Name: voxtream
3
+ Version: 0.1.0
4
+ Summary: Full-Stream Zero-shot TTS model with Extremely Low Latency
5
+ Author-email: Nikita Torgashov <torgaschov.nikita@gmail.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://herimor.github.io/voxtream
8
+ Project-URL: Bug Reports, https://github.com/herimor/voxtream/issues
9
+ Project-URL: Source, https://github.com/herimor/voxtream
10
+ Keywords: text-to-speech,streaming,tts,speech-synthesis,voice-cloning
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Topic :: Multimedia :: Sound/Audio
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Requires-Python: >=3.11
18
+ Description-Content-Type: text/markdown
19
+ Requires-Dist: torch==2.4.0
20
+ Requires-Dist: torchaudio==2.4.0
21
+ Requires-Dist: torchtune==0.4.0
22
+ Requires-Dist: torchao==0.9.0
23
+ Requires-Dist: lightning==2.4.0
24
+ Requires-Dist: moshi==0.2.2
25
+ Requires-Dist: huggingface_hub==0.28.1
26
+ Requires-Dist: g2p-en==2.1.0
27
+ Requires-Dist: librosa==0.11.0
28
+ Requires-Dist: soundfile==0.13.1
29
+ Requires-Dist: inflect==7.5.0
30
+ Requires-Dist: nltk==3.9.1
31
+ Requires-Dist: hydra-core==1.3.2
32
+ Requires-Dist: tensorboard==2.19.0
33
+ Requires-Dist: transformers==4.50.0
34
+ Provides-Extra: dev
35
+ Requires-Dist: black; extra == "dev"
36
+ Requires-Dist: isort; extra == "dev"
37
+ Requires-Dist: flake8; extra == "dev"
38
+ Requires-Dist: mypy; extra == "dev"
39
+ Requires-Dist: pytest; extra == "dev"
40
+
41
+ # VoXtream: Full-Stream Text-to-Speech with Extremely Low Latency
42
+
43
+ [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/pdf/2509.15969)
44
+ [![demo](https://img.shields.io/badge/VoXtream-Demo-red)](https://herimor.github.io/voxtream)
45
+ [![model](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Model-yellow)](https://huggingface.co/herimor/voxtream)
46
+ [![python](https://img.shields.io/badge/-Python_3.11-blue?logo=python&logoColor=white)](https://www.python.org/downloads/release/python-3119)
47
+ [![pytorch](https://img.shields.io/badge/PyTorch_2.4+-ee4c2c?logo=pytorch&logoColor=white)](https://pytorch.org/get-started/locally)
48
+
49
+ We present VoXtream, a fully autoregressive, zero-shot streaming text-to-speech system for real-time use that begins speaking from the first word.
50
+
51
+ ### Key features
52
+
53
+ - **Streaming**: Supports a full-stream scenario, where the full sentence is not known in advance. The model takes the text stream coming word-by-word as input and outputs an audio stream in 80ms chunks.
54
+ - **Speed**: Works **5x** faster than real time and achieves **102 ms** first packet latency on GPU.
55
+ - **Quality and efficiency**: With only 9k hours of training data, it matches or surpasses the quality and intelligibility of larger models or models trained on large datasets.
56
+
57
+ ## Installation
58
+
59
+ ```bash
60
+ pip install voxtream
61
+ ```
62
+
63
+ ## Usage
64
+
65
+ ### Output streaming
66
+ ```bash
67
+ voxtream \
68
+ --prompt-audio assets/audio/male.wav \
69
+ --prompt-text "The liquor was first created as 'Brandy Milk', produced with milk, brandy and vanilla." \
70
+ --text "In general, however, some method is then needed to evaluate each approximation." \
71
+ --output "output_stream.wav"
72
+ ```
73
+ * Note: Initial run may take some additional time to download model weights.
74
+
75
+ ### Full streaming
76
+ ```bash
77
+ voxtream \
78
+ --prompt-audio assets/audio/female.wav \
79
+ --prompt-text "Betty Cooper helps Archie with cleaning a store room, when Reggie attacks her." \
80
+ --text "Staff do not always do enough to prevent violence." \
81
+ --output "full_stream.wav" \
82
+ --full-stream
83
+ ```
84
+
85
+ ## Training
86
+
87
+ - Build the Docker container. If you have another version of Docker compose installed use `docker compose -f ...` instead.
88
+ ```bash
89
+ docker-compose -f .devcontainer/docker-compose.yaml build voxtream
90
+ ```
91
+
92
+ - Run training using the `train.py` script. You should specify GPU IDs that will be seen inside the container, ex. `GPU_IDS=0,1`. Specify the batch size according to your GPU. The default batch size is 32 (tested on RTX3090), 64 fits into A100-40Gb, and 128 fits into A100-80Gb. The dataset will be downloaded automatically to the HF cache directory. Dataset size is 20Gb. The data will be loaded to RAM during training, make sure you can allocate ~20Gb of RAM per GPU. Results will be stored at the `./experiments` directory.
93
+
94
+ Example of running the training using 2 GPUs with batch size 32:
95
+ ```bash
96
+ GPU_IDS=0,1 docker-compose -f .devcontainer/docker-compose.yaml run voxtream python voxtream/train.py batch_size=32
97
+ ```
98
+
99
+ ## Benchmark
100
+
101
+ To evaluate the model's real-time factor (RTF) and first packet latency (FPL), run `voxtream-benchmark`. You can compile the model for faster inference using the `--compile` flag (note that initial compilation takes some time).
102
+
103
+ | Device | Compiled | FPL, ms | RTF |
104
+ | :-: | :-: | :-: | :-: |
105
+ | A100 | | 176 | 1.00 |
106
+ | A100 | :heavy_check_mark: | 102 | 0.17 |
107
+ | RTX3090 | | 205 | 1.19 |
108
+ | RTX3090 | :heavy_check_mark: | 123 | 0.19 |
109
+
110
+ ## TODO
111
+
112
+ - [x] Add a neural phoneme aligner. Remove MFA dependency
113
+ - [x] Add PyPI package
114
+ - [ ] Gradio demo
115
+ - [ ] HuggingFace Spaces demo
116
+ - [ ] Evaluation scripts
117
+
118
+ ## License
119
+
120
+ The code in this repository is provided under the MIT License.
121
+
122
+ The Depth Transformer component from SesameAI-CSM is included under the Apache 2.0 License (see LICENSE-APACHE and NOTICE).
123
+
124
+ The model weights were trained on data licensed under the Creative Commons Attribution 4.0 International (CC BY 4.0). Redistribution of the weights must include proper attribution to the original dataset creators (see ATTRIBUTION.md).
125
+
126
+ ## Acknowledgements
127
+
128
+ - [Mimi](https://huggingface.co/kyutai/mimi): Streaming audio codec from [Kyutai](https://kyutai.org)
129
+ - [CSM](https://github.com/SesameAILabs/csm): Conversation speech model from [Sesame](https://www.sesame.com)
130
+ - [ReDimNet](https://github.com/IDRnD/redimnet): Speaker recognition model from [IDR&D](https://www.idrnd.ai)
131
+
132
+ ## Citation
133
+ ```
134
+ @article{torgashov2025voxtream,
135
+ author = {Torgashov, Nikita and Henter, Gustav Eje and Skantze, Gabriel},
136
+ title = {Vo{X}tream: Full-Stream Text-to-Speech with Extremely Low Latency},
137
+ journal = {arXiv:2509.15969},
138
+ year = {2025}
139
+ }
140
+ ```
141
+
142
+ ## Disclaimer
143
+ Any organization or individual is prohibited from using any technology mentioned in this paper to generate someone's speech without his/her consent, including but not limited to government leaders, political figures, and celebrities. If you do not comply with this item, you could be in violation of copyright laws.
@@ -0,0 +1,103 @@
1
+ # VoXtream: Full-Stream Text-to-Speech with Extremely Low Latency
2
+
3
+ [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/pdf/2509.15969)
4
+ [![demo](https://img.shields.io/badge/VoXtream-Demo-red)](https://herimor.github.io/voxtream)
5
+ [![model](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Model-yellow)](https://huggingface.co/herimor/voxtream)
6
+ [![python](https://img.shields.io/badge/-Python_3.11-blue?logo=python&logoColor=white)](https://www.python.org/downloads/release/python-3119)
7
+ [![pytorch](https://img.shields.io/badge/PyTorch_2.4+-ee4c2c?logo=pytorch&logoColor=white)](https://pytorch.org/get-started/locally)
8
+
9
+ We present VoXtream, a fully autoregressive, zero-shot streaming text-to-speech system for real-time use that begins speaking from the first word.
10
+
11
+ ### Key features
12
+
13
+ - **Streaming**: Supports a full-stream scenario, where the full sentence is not known in advance. The model takes the text stream coming word-by-word as input and outputs an audio stream in 80ms chunks.
14
+ - **Speed**: Works **5x** faster than real time and achieves **102 ms** first packet latency on GPU.
15
+ - **Quality and efficiency**: With only 9k hours of training data, it matches or surpasses the quality and intelligibility of larger models or models trained on large datasets.
16
+
17
+ ## Installation
18
+
19
+ ```bash
20
+ pip install voxtream
21
+ ```
22
+
23
+ ## Usage
24
+
25
+ ### Output streaming
26
+ ```bash
27
+ voxtream \
28
+ --prompt-audio assets/audio/male.wav \
29
+ --prompt-text "The liquor was first created as 'Brandy Milk', produced with milk, brandy and vanilla." \
30
+ --text "In general, however, some method is then needed to evaluate each approximation." \
31
+ --output "output_stream.wav"
32
+ ```
33
+ * Note: Initial run may take some additional time to download model weights.
34
+
35
+ ### Full streaming
36
+ ```bash
37
+ voxtream \
38
+ --prompt-audio assets/audio/female.wav \
39
+ --prompt-text "Betty Cooper helps Archie with cleaning a store room, when Reggie attacks her." \
40
+ --text "Staff do not always do enough to prevent violence." \
41
+ --output "full_stream.wav" \
42
+ --full-stream
43
+ ```
44
+
45
+ ## Training
46
+
47
+ - Build the Docker container. If you have another version of Docker compose installed use `docker compose -f ...` instead.
48
+ ```bash
49
+ docker-compose -f .devcontainer/docker-compose.yaml build voxtream
50
+ ```
51
+
52
+ - Run training using the `train.py` script. You should specify GPU IDs that will be seen inside the container, ex. `GPU_IDS=0,1`. Specify the batch size according to your GPU. The default batch size is 32 (tested on RTX3090), 64 fits into A100-40Gb, and 128 fits into A100-80Gb. The dataset will be downloaded automatically to the HF cache directory. Dataset size is 20Gb. The data will be loaded to RAM during training, make sure you can allocate ~20Gb of RAM per GPU. Results will be stored at the `./experiments` directory.
53
+
54
+ Example of running the training using 2 GPUs with batch size 32:
55
+ ```bash
56
+ GPU_IDS=0,1 docker-compose -f .devcontainer/docker-compose.yaml run voxtream python voxtream/train.py batch_size=32
57
+ ```
58
+
59
+ ## Benchmark
60
+
61
+ To evaluate the model's real-time factor (RTF) and first packet latency (FPL), run `voxtream-benchmark`. You can compile the model for faster inference using the `--compile` flag (note that initial compilation takes some time).
62
+
63
+ | Device | Compiled | FPL, ms | RTF |
64
+ | :-: | :-: | :-: | :-: |
65
+ | A100 | | 176 | 1.00 |
66
+ | A100 | :heavy_check_mark: | 102 | 0.17 |
67
+ | RTX3090 | | 205 | 1.19 |
68
+ | RTX3090 | :heavy_check_mark: | 123 | 0.19 |
69
+
70
+ ## TODO
71
+
72
+ - [x] Add a neural phoneme aligner. Remove MFA dependency
73
+ - [x] Add PyPI package
74
+ - [ ] Gradio demo
75
+ - [ ] HuggingFace Spaces demo
76
+ - [ ] Evaluation scripts
77
+
78
+ ## License
79
+
80
+ The code in this repository is provided under the MIT License.
81
+
82
+ The Depth Transformer component from SesameAI-CSM is included under the Apache 2.0 License (see LICENSE-APACHE and NOTICE).
83
+
84
+ The model weights were trained on data licensed under the Creative Commons Attribution 4.0 International (CC BY 4.0). Redistribution of the weights must include proper attribution to the original dataset creators (see ATTRIBUTION.md).
85
+
86
+ ## Acknowledgements
87
+
88
+ - [Mimi](https://huggingface.co/kyutai/mimi): Streaming audio codec from [Kyutai](https://kyutai.org)
89
+ - [CSM](https://github.com/SesameAILabs/csm): Conversation speech model from [Sesame](https://www.sesame.com)
90
+ - [ReDimNet](https://github.com/IDRnD/redimnet): Speaker recognition model from [IDR&D](https://www.idrnd.ai)
91
+
92
+ ## Citation
93
+ ```
94
+ @article{torgashov2025voxtream,
95
+ author = {Torgashov, Nikita and Henter, Gustav Eje and Skantze, Gabriel},
96
+ title = {Vo{X}tream: Full-Stream Text-to-Speech with Extremely Low Latency},
97
+ journal = {arXiv:2509.15969},
98
+ year = {2025}
99
+ }
100
+ ```
101
+
102
+ ## Disclaimer
103
+ Any organization or individual is prohibited from using any technology mentioned in this paper to generate someone's speech without his/her consent, including but not limited to government leaders, political figures, and celebrities. If you do not comply with this item, you could be in violation of copyright laws.
Binary file
Binary file
@@ -0,0 +1,12 @@
1
+ prompt_audio,prompt_text,text
2
+ assets/benchmark/common_voice_en_10119832.wav,"We asked over twenty different people, and they all said it was his.",Get the trust fund to the bank early.
3
+ assets/benchmark/common_voice_en_10119832.wav,"We asked over twenty different people, and they all said it was his.",The stained glass offered a hypnotic atmosphere.
4
+ assets/benchmark/common_voice_en_103675.wav,I'm never more aware of a room's acoustics than when I'm trying to enjoy a snack I have no intention of sharing.,"One by one, the campfires were extinguished, and the oasis fell as quiet as the desert."
5
+ assets/benchmark/common_voice_en_103675.wav,I'm never more aware of a room's acoustics than when I'm trying to enjoy a snack I have no intention of sharing.,The boy knew the desert sensed his fear.
6
+ assets/benchmark/common_voice_en_10933823.wav,Sometimes I overthink things which leads me to postpone and ultimately never achieve the goal I had in mind.,"When it comes to the crunch, our company will become insolvent."
7
+ assets/benchmark/common_voice_en_10933823.wav,Sometimes I overthink things which leads me to postpone and ultimately never achieve the goal I had in mind.,The primary coil has fifty turns.
8
+ assets/benchmark/common_voice_en_120405.wav,He approached the mass and was surprised at the size and the shape.,I'm never more aware of a room's acoustics than when I'm trying to enjoy a snack I have no intention of sharing.
9
+ assets/benchmark/common_voice_en_120405.wav,He approached the mass and was surprised at the size and the shape.,The only shadow was that of the few scattered pine trees.
10
+ assets/benchmark/common_voice_en_1205005.wav,"Roaming endlessly around the park, she wants to go home.",The work of the tailor is seen on each side.
11
+ assets/benchmark/common_voice_en_1205005.wav,"Roaming endlessly around the park, she wants to go home.",NASA plans to launch the rocket tomorrow.
12
+ assets/benchmark/common_voice_en_123125.wav,"There's no danger, the boy said, when they had moved on past the encampment.","After all, who doesn’t want to overcome new challenges and achieve great heights?"
@@ -0,0 +1,49 @@
1
+ {
2
+ "sil_token": 69,
3
+ "bos_token": 71,
4
+ "eos_token": 72,
5
+ "end_pad": 5,
6
+ "num_codebooks": 12,
7
+ "num_phones_per_frame": 2,
8
+ "audio_delay_frames": 1,
9
+ "temperature": 0.9,
10
+ "topk": 5,
11
+ "max_audio_length_ms": 60000,
12
+ "device": "cuda",
13
+ "model_repo": "herimor/voxtream",
14
+ "model_name": "model.safetensors",
15
+ "model_config_name": "config.json",
16
+ "mimi_sr": 24000,
17
+ "mimi_vocab_size": 2048,
18
+ "mimi_frame_ms": 80,
19
+ "mimi_repo": "kyutai/moshiko-pytorch-bf16",
20
+ "mimi_name": "tokenizer-e351c8d8-checkpoint125.safetensors",
21
+ "spk_enc_sr": 16000,
22
+ "spk_enc_repo": "IDRnD/ReDimNet",
23
+ "spk_enc_model": "ReDimNet",
24
+ "spk_enc_model_name": "M",
25
+ "spk_enc_train_type": "ft_mix",
26
+ "spk_enc_dataset": "vb2+vox2+cnc",
27
+ "phoneme_dict_name": "phoneme_to_token.json",
28
+ "nltk_resource": "taggers/averaged_perceptron_tagger_eng",
29
+ "aligner": "charsiu/en_w2v2_fc_10ms",
30
+ "cache_prompt": false,
31
+ "phoneme_index_map": {
32
+ "0": [
33
+ 0,
34
+ 1
35
+ ],
36
+ "1": [
37
+ 0,
38
+ 2
39
+ ],
40
+ "2": [
41
+ 1,
42
+ 1
43
+ ],
44
+ "3": [
45
+ 1,
46
+ 2
47
+ ]
48
+ }
49
+ }
@@ -0,0 +1,81 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [tool.setuptools]
6
+ license-files = ["LICENSE"]
7
+
8
+ [project]
9
+ name = "voxtream"
10
+ version = "0.1.0"
11
+ description = "Full-Stream Zero-shot TTS model with Extremely Low Latency"
12
+ readme = "README.md"
13
+ license = { text = "MIT" }
14
+ authors = [
15
+ { name = "Nikita Torgashov", email = "torgaschov.nikita@gmail.com" }
16
+ ]
17
+ requires-python = ">=3.11"
18
+ keywords = [
19
+ "text-to-speech",
20
+ "streaming",
21
+ "tts",
22
+ "speech-synthesis",
23
+ "voice-cloning"
24
+ ]
25
+ classifiers = [
26
+ "Development Status :: 3 - Alpha",
27
+ "Intended Audience :: Developers",
28
+ "Topic :: Multimedia :: Sound/Audio",
29
+ "License :: OSI Approved :: MIT License",
30
+ "Programming Language :: Python :: 3",
31
+ "Programming Language :: Python :: 3.11",
32
+ ]
33
+
34
+ # install_requires (from requirements.txt)
35
+ dependencies = [
36
+ "torch==2.4.0",
37
+ "torchaudio==2.4.0",
38
+ "torchtune==0.4.0",
39
+ "torchao==0.9.0",
40
+ "lightning==2.4.0",
41
+ "moshi==0.2.2",
42
+ "huggingface_hub==0.28.1",
43
+ "g2p-en==2.1.0",
44
+ "librosa==0.11.0",
45
+ "soundfile==0.13.1",
46
+ "inflect==7.5.0",
47
+ "nltk==3.9.1",
48
+ "hydra-core==1.3.2",
49
+ "tensorboard==2.19.0",
50
+ "transformers==4.50.0"
51
+ ]
52
+
53
+ [project.optional-dependencies]
54
+ dev = ["black", "isort", "flake8", "mypy", "pytest"]
55
+
56
+ [project.urls]
57
+ Homepage = "https://herimor.github.io/voxtream"
58
+ "Bug Reports" = "https://github.com/herimor/voxtream/issues"
59
+ Source = "https://github.com/herimor/voxtream"
60
+
61
+ [project.scripts]
62
+ voxtream = "voxtream.run:main"
63
+ voxtream-benchmark = "voxtream.benchmark:main"
64
+
65
+ [tool.setuptools.packages.find]
66
+ where = ["."]
67
+ include = ["voxtream*"]
68
+
69
+ # Tool configs merged from your original pyproject.toml
70
+ [tool.black]
71
+ line-length = 88
72
+ target-version = ["py311"]
73
+ skip-string-normalization = false
74
+
75
+ [tool.isort]
76
+ profile = "black"
77
+
78
+ [tool.ruff]
79
+ line-length = 88
80
+ lint.select = ["E", "F", "W", "C90", "B", "I"]
81
+ lint.ignore = ["E501"]
@@ -0,0 +1,15 @@
1
+ torch==2.4.0
2
+ torchaudio==2.4.0
3
+ torchtune==0.4.0
4
+ torchao==0.9.0
5
+ lightning==2.4.0
6
+ moshi==0.2.2
7
+ huggingface_hub==0.28.1
8
+ g2p-en==2.1.0
9
+ librosa==0.11.0
10
+ soundfile==0.13.1
11
+ inflect==7.5.0
12
+ nltk==3.9.1
13
+ hydra-core==1.3.2
14
+ tensorboard==2.19.0
15
+ transformers==4.50.0
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
File without changes
@@ -0,0 +1,70 @@
1
+ import argparse
2
+ import json
3
+ from pathlib import Path
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ import torch._inductor.config
8
+ from tqdm.auto import tqdm
9
+
10
+ from voxtream.generator import SpeechGenerator, SpeechGeneratorConfig
11
+ from voxtream.utils.generator import existing_file, set_seed, text_generator
12
+
# Inductor (torch.compile backend) settings, applied at import time:
# coordinate-descent autotuning searches kernel configs more aggressively,
# and the FX graph cache reuses compiled graphs across process runs —
# both reduce the cost of benchmarking with --compile.
torch._inductor.config.coordinate_descent_tuning = True
torch._inductor.config.fx_graph_cache = True
15
+
16
+
def main():
    """Benchmark VoXtream inference speed.

    Reports first packet latency (FPL) and real-time factor (RTF) averaged
    over the utterances listed in a metadata CSV. The first metadata row is
    used only as a warmup pass (weight loading, optional graph compilation)
    and is excluded from the reported statistics.

    Raises:
        SystemExit: if the metadata yields no measurable rows after warmup.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--compile", action="store_true", help="Compile graph")
    parser.add_argument(
        "-cfg",
        "--config",
        type=existing_file,
        help="Path to the config file",
        default="configs/generator.json",
    )
    parser.add_argument(
        "-m",
        "--meta",
        type=existing_file,
        help="Path to the metadata file",
        default="assets/benchmark/meta.csv",
    )
    args = parser.parse_args()

    set_seed()
    with open(args.config) as f:
        config = SpeechGeneratorConfig(**json.load(f))
    speech_generator = SpeechGenerator(config, compile=args.compile)

    meta = pd.read_csv(args.meta)

    first_packet_latency, gen_times = [], []
    for idx, row in tqdm(meta.iterrows(), total=len(meta)):
        speech_stream = speech_generator.generate_stream(
            prompt_text=row.prompt_text,
            prompt_audio_path=Path(row.prompt_audio),
            text=text_generator(row.text),
        )

        if idx == 0:
            # Warmup pass: triggers model download/compilation; its timings
            # are not representative, so drain the stream and discard it.
            for _, _ in speech_stream:
                pass
            continue

        # Only timings matter for the benchmark; generated audio frames are
        # intentionally discarded (previously they were accumulated in a list
        # that was never read, growing memory for the whole run).
        for i, (_audio_frame, gen_time) in enumerate(speech_stream):
            if i == 0:
                first_packet_latency.append(gen_time)
            else:
                gen_times.append(gen_time)

    if not first_packet_latency or not gen_times:
        # With fewer than two metadata rows everything is consumed by the
        # warmup pass; np.mean([]) would print "nan" with a RuntimeWarning.
        raise SystemExit(
            "Not enough benchmark data: metadata must contain at least 2 rows."
        )

    # RTF = mean per-frame generation time (ms) / audio frame duration (ms);
    # values below 1.0 mean faster than real time.
    rtf = (np.mean(gen_times) * 1000) / config.mimi_frame_ms
    print(f"First packet latency: {round(np.mean(first_packet_latency) * 1000)} ms")
    print(f"RTF: {round(rtf, 2)}")


if __name__ == "__main__":
    main()