sopro 1.0.2__tar.gz → 1.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sopro-1.0.2 → sopro-1.5.0}/PKG-INFO +30 -7
- {sopro-1.0.2 → sopro-1.5.0}/README.md +29 -6
- {sopro-1.0.2 → sopro-1.5.0}/pyproject.toml +1 -1
- {sopro-1.0.2 → sopro-1.5.0}/src/sopro/__init__.py +1 -1
- {sopro-1.0.2 → sopro-1.5.0}/src/sopro/cli.py +37 -53
- {sopro-1.0.2 → sopro-1.5.0}/src/sopro/config.py +15 -20
- {sopro-1.0.2 → sopro-1.5.0}/src/sopro/hub.py +2 -3
- sopro-1.5.0/src/sopro/model.py +583 -0
- {sopro-1.0.2 → sopro-1.5.0}/src/sopro/nn/__init__.py +7 -3
- {sopro-1.0.2 → sopro-1.5.0}/src/sopro/nn/blocks.py +78 -0
- {sopro-1.0.2 → sopro-1.5.0}/src/sopro/nn/embeddings.py +16 -0
- sopro-1.5.0/src/sopro/nn/generator.py +130 -0
- sopro-1.5.0/src/sopro/nn/nar.py +116 -0
- sopro-1.5.0/src/sopro/nn/ref.py +160 -0
- {sopro-1.0.2 → sopro-1.5.0}/src/sopro/nn/speaker.py +14 -17
- sopro-1.5.0/src/sopro/nn/text.py +132 -0
- {sopro-1.0.2 → sopro-1.5.0}/src/sopro/sampling.py +3 -3
- {sopro-1.0.2 → sopro-1.5.0}/src/sopro/streaming.py +25 -38
- {sopro-1.0.2/src/Sopro.egg-info → sopro-1.5.0/src/sopro.egg-info}/PKG-INFO +30 -7
- {sopro-1.0.2/src/Sopro.egg-info → sopro-1.5.0/src/sopro.egg-info}/SOURCES.txt +4 -7
- sopro-1.0.2/src/sopro/model.py +0 -853
- sopro-1.0.2/src/sopro/nn/xattn.py +0 -98
- {sopro-1.0.2 → sopro-1.5.0}/LICENSE.txt +0 -0
- {sopro-1.0.2 → sopro-1.5.0}/setup.cfg +0 -0
- {sopro-1.0.2 → sopro-1.5.0}/src/sopro/audio.py +0 -0
- {sopro-1.0.2 → sopro-1.5.0}/src/sopro/codec/__init__.py +0 -0
- {sopro-1.0.2 → sopro-1.5.0}/src/sopro/codec/mimi.py +0 -0
- {sopro-1.0.2 → sopro-1.5.0}/src/sopro/constants.py +0 -0
- {sopro-1.0.2 → sopro-1.5.0}/src/sopro/tokenizer.py +0 -0
- {sopro-1.0.2/src/Sopro.egg-info → sopro-1.5.0/src/sopro.egg-info}/dependency_links.txt +0 -0
- {sopro-1.0.2/src/Sopro.egg-info → sopro-1.5.0/src/sopro.egg-info}/entry_points.txt +0 -0
- {sopro-1.0.2/src/Sopro.egg-info → sopro-1.5.0/src/sopro.egg-info}/requires.txt +0 -0
- {sopro-1.0.2/src/Sopro.egg-info → sopro-1.5.0/src/sopro.egg-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sopro
|
|
3
|
-
Version: 1.0.2
|
|
3
|
+
Version: 1.5.0
|
|
4
4
|
Summary: A lightweight text-to-speech model with zero-shot voice cloning.
|
|
5
5
|
Author-email: Samuel Vitorino <samvitorino@gmail.com>
|
|
6
6
|
License: Apache 2.0
|
|
@@ -27,14 +27,18 @@ https://github.com/user-attachments/assets/40254391-248f-45ff-b9a4-107d64fbb95f
|
|
|
27
27
|
|
|
28
28
|
[Sopro on Hugging Face](https://huggingface.co/samuel-vitorino/sopro)
|
|
29
29
|
|
|
30
|
+
### 📰 News
|
|
31
|
+
|
|
32
|
+
**2026.02.04 – SoproTTS v1.5 is out: more stable, faster, and smaller. Trained for just $100, it reaches 250 ms TTFA streaming and 0.05 RTF (~20× realtime) on CPU.**
|
|
33
|
+
|
|
30
34
|
Sopro (from the Portuguese word for “breath/blow”) is a lightweight English text-to-speech model I trained as a side project. Sopro is composed of dilated convs (à la WaveNet) and lightweight cross-attention layers, instead of the common Transformer architecture. Even though Sopro is not SOTA across most voices and situations, I still think it’s a cool project made with a very low budget (trained on a single L40S GPU), and it can be improved with better data.
|
|
31
35
|
|
|
32
36
|
Some of the main features are:
|
|
33
37
|
|
|
34
|
-
- **
|
|
38
|
+
- **147M parameters**
|
|
35
39
|
- **Streaming**
|
|
36
40
|
- **Zero-shot voice cloning**
|
|
37
|
-
- **0.
|
|
41
|
+
- **0.05 RTF on CPU** (measured on an M3 base model), meaning it generates 32 seconds of audio in 1.77 seconds
|
|
38
42
|
- **3-12 seconds of reference audio** for voice cloning
|
|
39
43
|
|
|
40
44
|
---
|
|
@@ -53,7 +57,7 @@ conda activate soprotts
|
|
|
53
57
|
### From PyPI
|
|
54
58
|
|
|
55
59
|
```bash
|
|
56
|
-
pip install sopro
|
|
60
|
+
pip install -U sopro
|
|
57
61
|
```
|
|
58
62
|
|
|
59
63
|
### From the repo
|
|
@@ -79,9 +83,7 @@ soprotts \
|
|
|
79
83
|
|
|
80
84
|
You have the expected `temperature` and `top_p` parameters, alongside:
|
|
81
85
|
|
|
82
|
-
- `--style_strength` (controls the FiLM strength; increasing it can improve or reduce voice similarity; default `1.0`)
|
|
83
|
-
- `--no_stop_head` to disable early stopping
|
|
84
|
-
- `--stop_threshold` and `--stop_patience` (number of consecutive frames that must be classified as final before **stopping**). For short sentences, the stop head may fail to trigger, in which case you can lower these values. Likewise, if the model stops before producing the full text, adjusting these parameters up can help.
|
|
86
|
+
- `--style_strength` (controls the FiLM strength; increasing it can improve or reduce voice similarity; default `1.2`)
|
|
85
87
|
|
|
86
88
|
### Python
|
|
87
89
|
|
|
@@ -119,6 +121,27 @@ wav = torch.cat(chunks, dim=-1)
|
|
|
119
121
|
tts.save_wav("out_stream.wav", wav)
|
|
120
122
|
```
|
|
121
123
|
|
|
124
|
+
You can also precalculate the reference to reduce TTFA:
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
import torch
|
|
128
|
+
from sopro import SoproTTS
|
|
129
|
+
|
|
130
|
+
tts = SoproTTS.from_pretrained("samuel-vitorino/sopro", device="cpu")
|
|
131
|
+
|
|
132
|
+
ref = tts.prepare_reference(ref_audio_path="ref.mp3")
|
|
133
|
+
|
|
134
|
+
chunks = []
|
|
135
|
+
for chunk in tts.stream(
|
|
136
|
+
"Hello! This is a streaming Sopro TTS example.",
|
|
137
|
+
ref=ref,
|
|
138
|
+
):
|
|
139
|
+
chunks.append(chunk.cpu())
|
|
140
|
+
|
|
141
|
+
wav = torch.cat(chunks, dim=-1)
|
|
142
|
+
tts.save_wav("out_stream.wav", wav)
|
|
143
|
+
```
|
|
144
|
+
|
|
122
145
|
---
|
|
123
146
|
|
|
124
147
|
## Interactive streaming demo
|
|
@@ -4,14 +4,18 @@ https://github.com/user-attachments/assets/40254391-248f-45ff-b9a4-107d64fbb95f
|
|
|
4
4
|
|
|
5
5
|
[Sopro on Hugging Face](https://huggingface.co/samuel-vitorino/sopro)
|
|
6
6
|
|
|
7
|
+
### 📰 News
|
|
8
|
+
|
|
9
|
+
**2026.02.04 – SoproTTS v1.5 is out: more stable, faster, and smaller. Trained for just $100, it reaches 250 ms TTFA streaming and 0.05 RTF (~20× realtime) on CPU.**
|
|
10
|
+
|
|
7
11
|
Sopro (from the Portuguese word for “breath/blow”) is a lightweight English text-to-speech model I trained as a side project. Sopro is composed of dilated convs (à la WaveNet) and lightweight cross-attention layers, instead of the common Transformer architecture. Even though Sopro is not SOTA across most voices and situations, I still think it’s a cool project made with a very low budget (trained on a single L40S GPU), and it can be improved with better data.
|
|
8
12
|
|
|
9
13
|
Some of the main features are:
|
|
10
14
|
|
|
11
|
-
- **
|
|
15
|
+
- **147M parameters**
|
|
12
16
|
- **Streaming**
|
|
13
17
|
- **Zero-shot voice cloning**
|
|
14
|
-
- **0.
|
|
18
|
+
- **0.05 RTF on CPU** (measured on an M3 base model), meaning it generates 32 seconds of audio in 1.77 seconds
|
|
15
19
|
- **3-12 seconds of reference audio** for voice cloning
|
|
16
20
|
|
|
17
21
|
---
|
|
@@ -30,7 +34,7 @@ conda activate soprotts
|
|
|
30
34
|
### From PyPI
|
|
31
35
|
|
|
32
36
|
```bash
|
|
33
|
-
pip install sopro
|
|
37
|
+
pip install -U sopro
|
|
34
38
|
```
|
|
35
39
|
|
|
36
40
|
### From the repo
|
|
@@ -56,9 +60,7 @@ soprotts \
|
|
|
56
60
|
|
|
57
61
|
You have the expected `temperature` and `top_p` parameters, alongside:
|
|
58
62
|
|
|
59
|
-
- `--style_strength` (controls the FiLM strength; increasing it can improve or reduce voice similarity; default `1.0`)
|
|
60
|
-
- `--no_stop_head` to disable early stopping
|
|
61
|
-
- `--stop_threshold` and `--stop_patience` (number of consecutive frames that must be classified as final before **stopping**). For short sentences, the stop head may fail to trigger, in which case you can lower these values. Likewise, if the model stops before producing the full text, adjusting these parameters up can help.
|
|
63
|
+
- `--style_strength` (controls the FiLM strength; increasing it can improve or reduce voice similarity; default `1.2`)
|
|
62
64
|
|
|
63
65
|
### Python
|
|
64
66
|
|
|
@@ -96,6 +98,27 @@ wav = torch.cat(chunks, dim=-1)
|
|
|
96
98
|
tts.save_wav("out_stream.wav", wav)
|
|
97
99
|
```
|
|
98
100
|
|
|
101
|
+
You can also precalculate the reference to reduce TTFA:
|
|
102
|
+
|
|
103
|
+
```python
|
|
104
|
+
import torch
|
|
105
|
+
from sopro import SoproTTS
|
|
106
|
+
|
|
107
|
+
tts = SoproTTS.from_pretrained("samuel-vitorino/sopro", device="cpu")
|
|
108
|
+
|
|
109
|
+
ref = tts.prepare_reference(ref_audio_path="ref.mp3")
|
|
110
|
+
|
|
111
|
+
chunks = []
|
|
112
|
+
for chunk in tts.stream(
|
|
113
|
+
"Hello! This is a streaming Sopro TTS example.",
|
|
114
|
+
ref=ref,
|
|
115
|
+
):
|
|
116
|
+
chunks.append(chunk.cpu())
|
|
117
|
+
|
|
118
|
+
wav = torch.cat(chunks, dim=-1)
|
|
119
|
+
tts.save_wav("out_stream.wav", wav)
|
|
120
|
+
```
|
|
121
|
+
|
|
99
122
|
---
|
|
100
123
|
|
|
101
124
|
## Interactive streaming demo
|
|
@@ -32,8 +32,6 @@ def main() -> None:
|
|
|
32
32
|
ap.add_argument("--temperature", type=float, default=1.05)
|
|
33
33
|
ap.add_argument("--no_anti_loop", action="store_true")
|
|
34
34
|
|
|
35
|
-
ap.add_argument("--no_prefix", action="store_true")
|
|
36
|
-
ap.add_argument("--prefix_sec", type=float, default=None)
|
|
37
35
|
ap.add_argument("--style_strength", type=float, default=None)
|
|
38
36
|
ap.add_argument("--ref_seconds", type=float, default=None)
|
|
39
37
|
|
|
@@ -77,6 +75,7 @@ def main() -> None:
|
|
|
77
75
|
torch.cuda.manual_seed_all(args.seed)
|
|
78
76
|
|
|
79
77
|
t0 = time.perf_counter()
|
|
78
|
+
|
|
80
79
|
tts = SoproTTS.from_pretrained(
|
|
81
80
|
args.repo_id,
|
|
82
81
|
revision=args.revision,
|
|
@@ -84,6 +83,7 @@ def main() -> None:
|
|
|
84
83
|
token=args.hf_token,
|
|
85
84
|
device=device,
|
|
86
85
|
)
|
|
86
|
+
|
|
87
87
|
t1 = time.perf_counter()
|
|
88
88
|
if not args.quiet:
|
|
89
89
|
print(f"[Load] {t1 - t0:.2f}s")
|
|
@@ -99,7 +99,7 @@ def main() -> None:
|
|
|
99
99
|
|
|
100
100
|
with torch.inference_mode():
|
|
101
101
|
text_ids = tts.encode_text(args.text)
|
|
102
|
-
ref = tts.
|
|
102
|
+
ref = tts.prepare_reference(
|
|
103
103
|
ref_audio_path=args.ref_audio,
|
|
104
104
|
ref_tokens_tq=ref_tokens_tq,
|
|
105
105
|
ref_seconds=args.ref_seconds,
|
|
@@ -119,58 +119,42 @@ def main() -> None:
|
|
|
119
119
|
|
|
120
120
|
t_start = time.perf_counter()
|
|
121
121
|
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
unit="frame",
|
|
127
|
-
disable=args.quiet,
|
|
128
|
-
)
|
|
122
|
+
hist_A: list[int] = []
|
|
123
|
+
pbar = tqdm(
|
|
124
|
+
total=args.max_frames + 1, desc="AR sampling", unit="step", disable=args.quiet
|
|
125
|
+
)
|
|
129
126
|
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
stop_patience=args.stop_patience,
|
|
140
|
-
stop_threshold=args.stop_threshold,
|
|
141
|
-
):
|
|
142
|
-
hist_A.append(int(rvq1))
|
|
127
|
+
for _t, tok, is_eos in tts.model.ar_stream(
|
|
128
|
+
prep,
|
|
129
|
+
max_frames=args.max_frames,
|
|
130
|
+
top_p=args.top_p,
|
|
131
|
+
temperature=args.temperature,
|
|
132
|
+
anti_loop=(not args.no_anti_loop),
|
|
133
|
+
):
|
|
134
|
+
if is_eos:
|
|
135
|
+
pbar.set_postfix(eos="yes")
|
|
143
136
|
pbar.update(1)
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
return
|
|
166
|
-
|
|
167
|
-
tokens_A = torch.tensor(hist_A, device=tts.device, dtype=torch.long).unsqueeze(0)
|
|
168
|
-
cond_seq = prep["cond_all"][:, :T, :]
|
|
169
|
-
tokens_1xTQ = tts.model.nar_refine(cond_seq, tokens_A)
|
|
170
|
-
tokens_tq = tokens_1xTQ.squeeze(0)
|
|
171
|
-
|
|
172
|
-
wav = tts.codec.decode_full(tokens_tq)
|
|
173
|
-
save_audio(args.out, wav, sr=TARGET_SR)
|
|
137
|
+
break
|
|
138
|
+
hist_A.append(int(tok))
|
|
139
|
+
pbar.update(1)
|
|
140
|
+
|
|
141
|
+
t_after_sampling = time.perf_counter()
|
|
142
|
+
|
|
143
|
+
pbar.n = len(hist_A)
|
|
144
|
+
pbar.close()
|
|
145
|
+
|
|
146
|
+
T = len(hist_A)
|
|
147
|
+
if T == 0:
|
|
148
|
+
save_audio(args.out, torch.zeros(1, 0), sr=TARGET_SR)
|
|
149
|
+
return
|
|
150
|
+
|
|
151
|
+
tokens_A = torch.tensor(hist_A, device=tts.device, dtype=torch.long).unsqueeze(0)
|
|
152
|
+
cond_seq = prep["cond_ar"][:, :T, :]
|
|
153
|
+
tokens_1xTQ = tts.model.nar_refine(cond_seq, tokens_A)
|
|
154
|
+
tokens_tq = tokens_1xTQ.squeeze(0)
|
|
155
|
+
|
|
156
|
+
wav = tts.codec.decode_full(tokens_tq)
|
|
157
|
+
save_audio(args.out, wav, sr=TARGET_SR)
|
|
174
158
|
|
|
175
159
|
t_end = time.perf_counter()
|
|
176
160
|
if not args.quiet:
|
|
@@ -13,36 +13,31 @@ class SoproTTSConfig:
|
|
|
13
13
|
audio_sr: int = TARGET_SR
|
|
14
14
|
|
|
15
15
|
d_model: int = 384
|
|
16
|
-
n_layers_text: int =
|
|
17
|
-
n_layers_ar: int = 6
|
|
18
|
-
n_layers_nar: int = 6
|
|
16
|
+
n_layers_text: int = 2
|
|
19
17
|
dropout: float = 0.05
|
|
20
|
-
|
|
21
18
|
pos_emb_max: int = 4096
|
|
22
19
|
max_text_len: int = 2048
|
|
23
20
|
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
stop_patience: int = 5
|
|
21
|
+
n_layers_ar: int = 6
|
|
22
|
+
ar_kernel: int = 13
|
|
23
|
+
ar_dilation_cycle: Tuple[int, ...] = (1, 2, 4, 1)
|
|
24
|
+
ar_text_attn_freq: int = 2
|
|
29
25
|
min_gen_frames: int = 12
|
|
30
26
|
|
|
27
|
+
n_layers_nar: int = 6
|
|
28
|
+
nar_head_dim: int = 256
|
|
29
|
+
nar_kernel_size: int = 11
|
|
30
|
+
nar_dilation_cycle: Tuple[int, ...] = (1, 2, 4, 8)
|
|
31
|
+
|
|
31
32
|
stage_B: Tuple[int, int] = (2, 4)
|
|
32
33
|
stage_C: Tuple[int, int] = (5, 8)
|
|
33
34
|
stage_D: Tuple[int, int] = (9, 16)
|
|
34
35
|
stage_E: Tuple[int, int] = (17, 32)
|
|
35
36
|
|
|
36
|
-
ar_lookback: int = 4
|
|
37
|
-
ar_kernel: int = 13
|
|
38
|
-
ar_dilation_cycle: Tuple[int, ...] = (1, 2, 4, 1)
|
|
39
|
-
|
|
40
|
-
ar_text_attn_freq: int = 2
|
|
41
|
-
|
|
42
|
-
ref_attn_heads: int = 2
|
|
43
|
-
ref_seconds_max: float = 12.0
|
|
44
|
-
|
|
45
|
-
preprompt_sec_max: float = 4.0
|
|
46
|
-
|
|
47
37
|
sv_student_dim: int = 192
|
|
48
38
|
style_strength: float = 1.0
|
|
39
|
+
|
|
40
|
+
ref_enc_layers: int = 2
|
|
41
|
+
ref_xattn_heads: int = 2
|
|
42
|
+
ref_xattn_layers: int = 3
|
|
43
|
+
ref_xattn_gmax: float = 0.35
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import json
|
|
4
|
+
import os
|
|
4
5
|
import struct
|
|
5
6
|
from typing import Any, Dict, Optional
|
|
6
7
|
|
|
@@ -44,9 +45,7 @@ def load_cfg_from_safetensors(path: str) -> SoproTTSConfig:
|
|
|
44
45
|
for k in SoproTTSConfig.__annotations__.keys():
|
|
45
46
|
if k in cfg_dict:
|
|
46
47
|
init[k] = cfg_dict[k]
|
|
47
|
-
|
|
48
|
-
cfg = SoproTTSConfig(**init)
|
|
49
|
-
return cfg
|
|
48
|
+
return SoproTTSConfig(**init)
|
|
50
49
|
|
|
51
50
|
|
|
52
51
|
def load_state_dict_from_safetensors(path: str) -> Dict[str, torch.Tensor]:
|