pocket-tts 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,35 @@
1
+ from pathlib import Path
2
+
3
+ import safetensors
4
+
5
+
6
def get_flow_lm_state_dict(path: Path) -> dict:
    """Load the flow LM tensors from a safetensors checkpoint.

    The checkpoint at ``path`` is opened on CPU with the PyTorch framework.
    Every key is copied into the returned dict except the lookup table
    weights (keys starting with ``flow.w_s_t.`` and the two
    ``learnt_padding`` entries), which are skipped. Two keys are stored
    under a different name; all others keep their checkpoint name.

    Args:
        path: Location of the safetensors file to read.

    Returns:
        A dict mapping (possibly renamed) parameter names to tensors.
    """
    # Lookup table weights that are matched by their exact key.
    skipped_exact = (
        "condition_provider.conditioners.transcript_in_segment.learnt_padding",
        "condition_provider.conditioners.speaker_wavs.learnt_padding",
    )
    # Checkpoint key -> name used in the returned state dict.
    renamed = {
        "condition_provider.conditioners.transcript_in_segment.embed.weight": "conditioner.embed.weight",
        "condition_provider.conditioners.speaker_wavs.output_proj.weight": "speaker_proj_weight",
    }

    with safetensors.safe_open(path, framework="pt", device="cpu") as checkpoint:
        return {
            renamed.get(name, name): checkpoint.get_tensor(name)
            for name in checkpoint.keys()
            # skip lookup table weights
            if not (name.startswith("flow.w_s_t.") or name in skipped_exact)
        }
24
+
25
+
26
def get_mimi_state_dict(path: Path) -> dict:
    """Load the Mimi tensors from a safetensors checkpoint.

    The checkpoint at ``path`` is opened on CPU with the PyTorch framework.
    The vq weights (keys starting with ``model.quantizer.vq.`` and the key
    ``model.quantizer.logvar_proj.weight``) are skipped. Every other tensor
    is stored under its key with a leading ``model.`` prefix removed.

    Args:
        path: Location of the safetensors file to read.

    Returns:
        A dict mapping the prefix-stripped parameter names to tensors.
    """

    def _is_vq_weight(name: str) -> bool:
        # True for the vq weights that must not be loaded.
        return name.startswith("model.quantizer.vq.") or name == "model.quantizer.logvar_proj.weight"

    tensors = {}
    with safetensors.safe_open(path, framework="pt", device="cpu") as checkpoint:
        for name in checkpoint.keys():
            if not _is_vq_weight(name):
                tensors[name.removeprefix("model.")] = checkpoint.get_tensor(name)
    return tensors
@@ -0,0 +1,174 @@
1
+ Metadata-Version: 2.4
2
+ Name: pocket-tts
3
+ Version: 1.0.2
4
+ Summary: Kyutai's pocket-sized text-to-speech!
5
+ License-File: LICENSE
6
+ Requires-Python: <3.15,>=3.10
7
+ Requires-Dist: beartype>=0.22.5
8
+ Requires-Dist: einops>=0.4.0
9
+ Requires-Dist: fastapi>=0.100
10
+ Requires-Dist: huggingface-hub>=0.10
11
+ Requires-Dist: numpy>=2
12
+ Requires-Dist: pydantic>=2
13
+ Requires-Dist: python-multipart>=0.0.21
14
+ Requires-Dist: requests>=2.20.0
15
+ Requires-Dist: safetensors>=0.4.0
16
+ Requires-Dist: scipy>=1.5.0
17
+ Requires-Dist: sentencepiece>=0.2.1
18
+ Requires-Dist: torch>=2.5.0
19
+ Requires-Dist: typer>=0.10.0
20
+ Requires-Dist: typing-extensions>=4.0.0
21
+ Requires-Dist: uvicorn>=0.13.0
22
+ Provides-Extra: audio
23
+ Requires-Dist: soundfile>=0.12.0; extra == 'audio'
24
+ Description-Content-Type: text/markdown
25
+
26
+ # Pocket TTS
27
+
28
+ <img width="1446" height="622" alt="pocket-tts-logo-v2-transparent" src="https://github.com/user-attachments/assets/637b5ed6-831f-4023-9b4c-741be21ab238" />
29
+
30
+ A lightweight text-to-speech (TTS) application designed to run efficiently on CPUs.
31
+ Forget about the hassle of using GPUs and web APIs serving TTS models. With Kyutai's Pocket TTS, generating audio is just a pip install and a function call away.
32
+
33
+ Supports Python 3.10, 3.11, 3.12, 3.13 and 3.14. Requires PyTorch 2.5+. Does not require the GPU version of PyTorch.
34
+
35
+ [🔊 Demo](https://kyutai.org/pocket-tts) |
36
+ [🐱‍💻GitHub Repository](https://github.com/kyutai-labs/pocket-tts) |
37
+ [🤗 Hugging Face Model Card](https://huggingface.co/kyutai/pocket-tts) |
38
+ [⚙️ Tech report](https://kyutai.org/blog/2026-01-13-pocket-tts) |
39
+ [📄 Paper](https://arxiv.org/abs/2509.06926) |
40
+ [📚 Documentation](https://github.com/kyutai-labs/pocket-tts/tree/main/docs)
41
+
42
+
43
+ ## Main takeaways
44
+ * Runs on CPU
45
+ * Small model size, 100M parameters
46
+ * Audio streaming
47
+ * Low latency, ~200ms to get the first audio chunk
48
+ * Faster than real-time, ~6x real-time on the CPU of a MacBook Air M4
49
+ * Uses only 2 CPU cores
50
+ * Python API and CLI
51
+ * Voice cloning
52
+ * English only at the moment
53
+ * Can handle infinitely long text inputs
54
+
55
+ ## Trying it from the website, without installing anything
56
+
57
+ Navigate to the [Kyutai website](https://kyutai.org/pocket-tts) to try it out directly in your browser. You can input text, select different voices, and generate speech without any installation.
58
+
59
+ ## Trying it with the CLI
60
+
61
+ ### The `generate` command
62
+ You can use pocket-tts directly from the command line. We recommend using
63
+ `uv` as it installs any dependencies on the fly in an isolated environment (uv installation instructions [here](https://docs.astral.sh/uv/getting-started/installation/#standalone-installer)).
64
+ You can also use `pip install pocket-tts` to install it manually.
65
+
66
+ The following command generates a WAV file `./tts_output.wav` saying the default text with the default voice, and displays some speed statistics.
67
+ ```bash
68
+ uvx pocket-tts generate
69
+ # or if you installed it manually with pip:
70
+ pocket-tts generate
71
+ ```
72
+ Modify the voice with `--voice` and the text with `--text`. We provide a small catalog of voices.
73
+
74
+ You can take a look at [this page](https://huggingface.co/kyutai/tts-voices) which details the licenses
75
+ for each voice.
76
+
77
+ * [alba](https://huggingface.co/kyutai/tts-voices/blob/main/alba-mackenna/casual.wav)
78
+ * [marius](https://huggingface.co/kyutai/tts-voices/blob/main/voice-donations/Selfie.wav)
79
+ * [javert](https://huggingface.co/kyutai/tts-voices/blob/main/voice-donations/Butter.wav)
80
+ * [jean](https://huggingface.co/kyutai/tts-voices/blob/main/ears/p010/freeform_speech_01.wav)
81
+ * [fantine](https://huggingface.co/kyutai/tts-voices/blob/main/vctk/p244_023.wav)
82
+ * [cosette](https://huggingface.co/kyutai/tts-voices/blob/main/expresso/ex04-ex02_confused_001_channel1_499s.wav)
83
+ * [eponine](https://huggingface.co/kyutai/tts-voices/blob/main/vctk/p262_023.wav)
84
+ * [azelma](https://huggingface.co/kyutai/tts-voices/blob/main/vctk/p303_023.wav)
85
+
86
+ The `--voice` argument can also take a plain wav file as input for voice cloning.
87
+ You can use your own or check out our [voice repository](https://huggingface.co/kyutai/tts-voices).
88
+
89
+ Feel free to check out the [generate documentation](https://github.com/kyutai-labs/pocket-tts/tree/main/docs/generate.md) for more details and examples.
90
+ For trying multiple voices and prompts quickly, prefer using the `serve` command.
91
+
92
+ ### The `serve` command
93
+
94
+ You can also run a local server to generate audio via HTTP requests.
95
+ ```bash
96
+ uvx pocket-tts serve
97
+ # or if you installed it manually with pip:
98
+ pocket-tts serve
99
+ ```
100
+ Navigate to `http://localhost:8000` to try the web interface. It's faster than the command line because the model is kept in memory between requests.
101
+
102
+ You can check out the [serve documentation](https://github.com/kyutai-labs/pocket-tts/tree/main/docs/serve.md) for more details and examples.
103
+
104
+ ## Using it as a Python library
105
+
106
+ You can try out the Python library on Colab [here](https://colab.research.google.com/github/kyutai-labs/pocket-tts/blob/main/docs/pocket-tts-example.ipynb).
107
+
108
+ Install the package with
109
+ ```bash
110
+ pip install pocket-tts
111
+ # or
112
+ uv add pocket-tts
113
+ ```
114
+
115
+ You can use this package as a simple Python library to generate audio from text.
116
+ ```python
117
+ from pocket_tts import TTSModel
118
+ import scipy.io.wavfile
119
+
120
+ tts_model = TTSModel.load_model()
121
+ voice_state = tts_model.get_state_for_audio_prompt(
122
+ "alba" # One of the pre-made voices, see above
123
+ # You can also use any voice file you have locally or from Hugging Face:
124
+ # "./some_audio.wav"
125
+ # or "hf://kyutai/tts-voices/expresso/ex01-ex02_default_001_channel2_198s.wav"
126
+ )
127
+ audio = tts_model.generate_audio(voice_state, "Hello world, this is a test.")
128
+ # Audio is a 1D torch tensor containing PCM data.
129
+ scipy.io.wavfile.write("output.wav", tts_model.sample_rate, audio.numpy())
130
+ ```
131
+
132
+ You can have multiple voice states around if
133
+ you have multiple voices you want to use. `load_model()`
134
+ and `get_state_for_audio_prompt()` are relatively slow operations,
135
+ so we recommend keeping the model and voice states in memory if you can.
136
+
137
+ You can check out the [Python API documentation](https://github.com/kyutai-labs/pocket-tts/tree/main/docs/python-api.md) for more details and examples.
138
+
139
+ ## Unsupported features
140
+
141
+ At the moment, we do not support (but would love pull requests adding):
142
+ - [Running the TTS inside a web browser (WebAssembly)](https://github.com/kyutai-labs/pocket-tts/issues/1)
143
+ - [A compiled version with, for example, `torch.compile()` or `candle`.](https://github.com/kyutai-labs/pocket-tts/issues/2)
144
+ - [Adding silence in the text input to generate pauses.](https://github.com/kyutai-labs/pocket-tts/issues/6)
145
+ - [Quantization to run the computation in int8.](https://github.com/kyutai-labs/pocket-tts/issues/7)
146
+
147
+ We tried running this TTS model on the GPU but did not observe a speedup compared to CPU execution,
148
+ notably because we use a batch size of 1 and a very small model.
149
+
150
+ ## Development and local setup
151
+
152
+ We accept contributions! Feel free to open issues or pull requests on GitHub.
153
+
154
+ You can find development instructions in the [CONTRIBUTING.md](https://github.com/kyutai-labs/pocket-tts/tree/main/CONTRIBUTING.md) file. It also explains how to set up an editable install of the package for local development.
155
+
156
+ ## Alternative implementations
157
+
158
+ - [babybirdprd/pocket-tts](https://github.com/babybirdprd/pocket-tts) - Candle version (Rust) with WebAssembly and PyO3 bindings. Can run in the browser!
159
+
160
+ ## Projects using pocket-tts
161
+
162
+ - [lukasmwerner/pocket-reader](https://github.com/lukasmwerner/pocket-reader) - Browser screen reader
163
+ - [ikidd/pocket-tts-wyoming](https://github.com/ikidd/pocket-tts-wyoming) - Docker container for pocket-tts using Wyoming protocol, ready for Home Assistant Voice use.
164
+
165
+ ## Prohibited use
166
+
167
+ Use of our model must comply with all applicable laws and regulations and must not result in, involve, or facilitate any illegal, harmful, deceptive, fraudulent, or unauthorized activity. Prohibited uses include, without limitation, voice impersonation or cloning without explicit and lawful consent; misinformation, disinformation, or deception (including fake news, fraudulent calls, or presenting generated content as genuine recordings of real people or events); and the generation of unlawful, harmful, libelous, abusive, harassing, discriminatory, hateful, or privacy-invasive content. We disclaim all liability for any non-compliant use.
168
+
169
+
170
+ ## Authors
171
+
172
+ Manu Orsini*, Simon Rouard*, Gabriel De Marmiesse*, Václav Volhejn, Neil Zeghidour, Alexandre Défossez
173
+
174
+ *equal contribution
@@ -0,0 +1,38 @@
1
+ pocket_tts/__init__.py,sha256=ieUh2FGHNweWMg54vrRXFFkcJlbFK7VkdgUsuw0Rgqg,407
2
+ pocket_tts/__main__.py,sha256=dkNeZB6TY6pgIlgqXqz0wsDw9nTL-ZxqhQ0RRIw2u5U,102
3
+ pocket_tts/default_parameters.py,sha256=fw1FtELqqKSDMLk-Igp0d2Y2eV5VQL4cxlwIoiIzt9A,202
4
+ pocket_tts/main.py,sha256=ws2j5LdJbWMtFvf1nd8FcPnzR3SUb3m_8IECzq4AEKo,8828
5
+ pocket_tts/conditioners/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ pocket_tts/conditioners/base.py,sha256=1pXfrUQNGzOeti0LwShRrSzQmcMCwYcuXb0tUDgyqfE,1181
7
+ pocket_tts/conditioners/text.py,sha256=RSHD-NA_E_Cx70Ao5qlbu2HKkiLlKyulcDtQQFCRS9k,2285
8
+ pocket_tts/config/b6369a24.yaml,sha256=g2bRvmXfvqa5_0Oax4MKwpXuTYXnHVJVVone0abMlKU,1306
9
+ pocket_tts/data/__init__.py,sha256=szxWXTBcZH57eTtfrd5WLTZ9TWOR-uBgpOOkjFgkHiI,97
10
+ pocket_tts/data/audio.py,sha256=pR73nJTA9yw8T4tJbKng8rkD0Xxfn3INnHHSvPZNIr8,5019
11
+ pocket_tts/data/audio_utils.py,sha256=57ySXzAUgtY0GNgjQZQJfoVpA8hYQmK8H2gOUwt2P2k,947
12
+ pocket_tts/models/__init__.py,sha256=3zt8ACLBwyH6i-w8nkHXdKBuica2W2lq5XsNp0vPxF0,56
13
+ pocket_tts/models/flow_lm.py,sha256=ZJPQ45HgcNn5JM3Z5KpK1u9KP9WaNWBk05qZVPfIKeU,8176
14
+ pocket_tts/models/mimi.py,sha256=dIAiTO68vxGAQE9Z_4_E7p1OrZ7AdbsKP4YdejBrioQ,4232
15
+ pocket_tts/models/tts_model.py,sha256=IxxsVrCqeZfiKC6C_v4bzTdC45ItyvBDaITxzO_d6Yg,33692
16
+ pocket_tts/modules/__init__.py,sha256=0PvYo5-vS2OK-ZJXf4mibx0p_rhlrJjO7Jf_fvdWv-s,44
17
+ pocket_tts/modules/conv.py,sha256=Wu7kzgFQdjW3U8on5DQcGozxnXZHXawUFk1vzWuf55E,5573
18
+ pocket_tts/modules/dummy_quantizer.py,sha256=aKWpV-ad9ypWqoJyehEux8kTRW-pYXM-Na3OsfKQEz8,608
19
+ pocket_tts/modules/layer_scale.py,sha256=RdE2wh64XQCvppJyecKXY7s7cK4yh0yW_yfXmhFcYeQ,281
20
+ pocket_tts/modules/mimi_transformer.py,sha256=NwfHW9vEIKYcGyUnNowlXoIQ7Vb3UjuVN0Db8Ebjiic,10081
21
+ pocket_tts/modules/mlp.py,sha256=qTbuPuMhN5sf9H4sO67mdF1p5ZXSPqHvcgyMqJkFaiA,6798
22
+ pocket_tts/modules/resample.py,sha256=YuVWfTHEDxcIlGSkAP1dyNSYbM7_ddRmfgXqfj6TrOk,1244
23
+ pocket_tts/modules/rope.py,sha256=SPvwNKl0FbFEezqXGw94W8bIp_WcaizAQAP1tzKPaCQ,2063
24
+ pocket_tts/modules/seanet.py,sha256=OezlPIQQCOSEmMfAYOWy_-B1SWRr1YUVK9xXcAGZ8V0,5945
25
+ pocket_tts/modules/stateful_module.py,sha256=_lMh2b0JdcuJQb1-pLQ2vROXYV6f5xSyZ45QBV21QX0,1514
26
+ pocket_tts/modules/transformer.py,sha256=SRexJzP4IvdgZg9vNmdWaY3njA3KLH1Fc-LytjwNwbs,4770
27
+ pocket_tts/static/index.html,sha256=bd0alW81Rv0F2NA6l7YZ9wuLkbl3B9BsTapotqC2rMQ,14556
28
+ pocket_tts/utils/__init__.py,sha256=CANSY8vGq_6v8rWhWRIdnk-Wo5LA2R9Wjg1nqbWqLOw,17
29
+ pocket_tts/utils/config.py,sha256=p5uKRzvmaefgQo7vCn8R5jzrLT9muAc9s2okvZGdjZ4,2492
30
+ pocket_tts/utils/debugging.py,sha256=ZD69I3kNcTvRTRgkB0IVMLZhR30zgNtMiy52sjXUedk,858
31
+ pocket_tts/utils/logging_utils.py,sha256=Rgfzk1dJ3OV2O0AGQfP5ZZBTyJm_UFhHcXtP7qYGNYs,1261
32
+ pocket_tts/utils/utils.py,sha256=dP9OaqPpVx5Rn_qu5fQHgi7WvMM5tHWH8iXM9SGHBCI,3579
33
+ pocket_tts/utils/weights_loading.py,sha256=xJfa1iKZsicFDf9YX0hbUdNOOJ5TaUoHyTTAtccGhoM,1358
34
+ pocket_tts-1.0.2.dist-info/METADATA,sha256=sJc4qiDGUtOKPtzSMBKBHW5X1FgQQnCLBCGgZP8HPmc,8420
35
+ pocket_tts-1.0.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
36
+ pocket_tts-1.0.2.dist-info/entry_points.txt,sha256=FKSc-DmMNsl35jUe_TnmIh74B12cfIPAy4Xeqnmps54,55
37
+ pocket_tts-1.0.2.dist-info/licenses/LICENSE,sha256=I_GOA9xJ35FiL-KnYXZJdATkbO2KcV2dK2enRGVxzKM,1023
38
+ pocket_tts-1.0.2.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.28.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ pocket-tts = pocket_tts.main:cli_app
@@ -0,0 +1,23 @@
1
+ Permission is hereby granted, free of charge, to any
2
+ person obtaining a copy of this software and associated
3
+ documentation files (the "Software"), to deal in the
4
+ Software without restriction, including without
5
+ limitation the rights to use, copy, modify, merge,
6
+ publish, distribute, sublicense, and/or sell copies of
7
+ the Software, and to permit persons to whom the Software
8
+ is furnished to do so, subject to the following
9
+ conditions:
10
+
11
+ The above copyright notice and this permission notice
12
+ shall be included in all copies or substantial portions
13
+ of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
16
+ ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
17
+ TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
18
+ PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
19
+ SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
20
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
22
+ IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23
+ DEALINGS IN THE SOFTWARE.