sopro 1.0.1__tar.gz → 1.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sopro-1.0.1 → sopro-1.5.0}/PKG-INFO +30 -7
- {sopro-1.0.1 → sopro-1.5.0}/README.md +29 -6
- {sopro-1.0.1 → sopro-1.5.0}/pyproject.toml +1 -1
- {sopro-1.0.1 → sopro-1.5.0}/src/sopro/__init__.py +1 -1
- {sopro-1.0.1 → sopro-1.5.0}/src/sopro/cli.py +31 -46
- {sopro-1.0.1 → sopro-1.5.0}/src/sopro/config.py +15 -20
- {sopro-1.0.1 → sopro-1.5.0}/src/sopro/hub.py +2 -3
- sopro-1.5.0/src/sopro/model.py +583 -0
- {sopro-1.0.1 → sopro-1.5.0}/src/sopro/nn/__init__.py +7 -3
- {sopro-1.0.1 → sopro-1.5.0}/src/sopro/nn/blocks.py +78 -0
- {sopro-1.0.1 → sopro-1.5.0}/src/sopro/nn/embeddings.py +16 -0
- sopro-1.5.0/src/sopro/nn/generator.py +130 -0
- sopro-1.5.0/src/sopro/nn/nar.py +116 -0
- sopro-1.5.0/src/sopro/nn/ref.py +160 -0
- {sopro-1.0.1 → sopro-1.5.0}/src/sopro/nn/speaker.py +14 -17
- sopro-1.5.0/src/sopro/nn/text.py +132 -0
- {sopro-1.0.1 → sopro-1.5.0}/src/sopro/sampling.py +3 -3
- {sopro-1.0.1 → sopro-1.5.0}/src/sopro/streaming.py +25 -38
- {sopro-1.0.1/src/Sopro.egg-info → sopro-1.5.0/src/sopro.egg-info}/PKG-INFO +30 -7
- {sopro-1.0.1/src/Sopro.egg-info → sopro-1.5.0/src/sopro.egg-info}/SOURCES.txt +4 -7
- sopro-1.0.1/src/sopro/model.py +0 -853
- sopro-1.0.1/src/sopro/nn/xattn.py +0 -98
- {sopro-1.0.1 → sopro-1.5.0}/LICENSE.txt +0 -0
- {sopro-1.0.1 → sopro-1.5.0}/setup.cfg +0 -0
- {sopro-1.0.1 → sopro-1.5.0}/src/sopro/audio.py +0 -0
- {sopro-1.0.1 → sopro-1.5.0}/src/sopro/codec/__init__.py +0 -0
- {sopro-1.0.1 → sopro-1.5.0}/src/sopro/codec/mimi.py +0 -0
- {sopro-1.0.1 → sopro-1.5.0}/src/sopro/constants.py +0 -0
- {sopro-1.0.1 → sopro-1.5.0}/src/sopro/tokenizer.py +0 -0
- {sopro-1.0.1/src/Sopro.egg-info → sopro-1.5.0/src/sopro.egg-info}/dependency_links.txt +0 -0
- {sopro-1.0.1/src/Sopro.egg-info → sopro-1.5.0/src/sopro.egg-info}/entry_points.txt +0 -0
- {sopro-1.0.1/src/Sopro.egg-info → sopro-1.5.0/src/sopro.egg-info}/requires.txt +0 -0
- {sopro-1.0.1/src/Sopro.egg-info → sopro-1.5.0/src/sopro.egg-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sopro
|
|
3
|
-
Version: 1.0.1
|
|
3
|
+
Version: 1.5.0
|
|
4
4
|
Summary: A lightweight text-to-speech model with zero-shot voice cloning.
|
|
5
5
|
Author-email: Samuel Vitorino <samvitorino@gmail.com>
|
|
6
6
|
License: Apache 2.0
|
|
@@ -27,14 +27,18 @@ https://github.com/user-attachments/assets/40254391-248f-45ff-b9a4-107d64fbb95f
|
|
|
27
27
|
|
|
28
28
|
[Sopro on Hugging Face](https://huggingface.co/samuel-vitorino/sopro)
|
|
29
29
|
|
|
30
|
+
### 📰 News
|
|
31
|
+
|
|
32
|
+
**2026.02.04 – SoproTTS v1.5 is out: more stable, faster, and smaller. Trained for just $100, it reaches a 250 ms time to first audio (TTFA) when streaming and a 0.05 real-time factor (RTF, ~20× realtime) on CPU.**
|
|
33
|
+
|
|
30
34
|
Sopro (from the Portuguese word for “breath/blow”) is a lightweight English text-to-speech model I trained as a side project. Sopro is composed of dilated convs (à la WaveNet) and lightweight cross-attention layers, instead of the common Transformer architecture. Even though Sopro is not SOTA across most voices and situations, I still think it’s a cool project made with a very low budget (trained on a single L40S GPU), and it can be improved with better data.
|
|
31
35
|
|
|
32
36
|
Some of the main features are:
|
|
33
37
|
|
|
34
|
-
- **
|
|
38
|
+
- **147M parameters**
|
|
35
39
|
- **Streaming**
|
|
36
40
|
- **Zero-shot voice cloning**
|
|
37
|
-
- **0.
|
|
41
|
+
- **0.05 RTF on CPU** (measured on an M3 base model), meaning it generates 32 seconds of audio in 1.77 seconds
|
|
38
42
|
- **3-12 seconds of reference audio** for voice cloning
|
|
39
43
|
|
|
40
44
|
---
|
|
@@ -53,7 +57,7 @@ conda activate soprotts
|
|
|
53
57
|
### From PyPI
|
|
54
58
|
|
|
55
59
|
```bash
|
|
56
|
-
pip install sopro
|
|
60
|
+
pip install -U sopro
|
|
57
61
|
```
|
|
58
62
|
|
|
59
63
|
### From the repo
|
|
@@ -79,9 +83,7 @@ soprotts \
|
|
|
79
83
|
|
|
80
84
|
You have the expected `temperature` and `top_p` parameters, alongside:
|
|
81
85
|
|
|
82
|
-
- `--style_strength` (controls the FiLM strength; increasing it can improve or reduce voice similarity; default `1.
|
|
83
|
-
- `--no_stop_head` to disable early stopping
|
|
84
|
-
- `--stop_threshold` and `--stop_patience` (number of consecutive frames that must be classified as final before **stopping**). For short sentences, the stop head may fail to trigger, in which case you can lower these values. Likewise, if the model stops before producing the full text, adjusting these parameters up can help.
|
|
86
|
+
- `--style_strength` (controls the FiLM strength; increasing it can improve or reduce voice similarity; default `1.2`)
|
|
85
87
|
|
|
86
88
|
### Python
|
|
87
89
|
|
|
@@ -119,6 +121,27 @@ wav = torch.cat(chunks, dim=-1)
|
|
|
119
121
|
tts.save_wav("out_stream.wav", wav)
|
|
120
122
|
```
|
|
121
123
|
|
|
124
|
+
You can also precompute the reference ahead of time to reduce the time to first audio (TTFA):
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
import torch
|
|
128
|
+
from sopro import SoproTTS
|
|
129
|
+
|
|
130
|
+
tts = SoproTTS.from_pretrained("samuel-vitorino/sopro", device="cpu")
|
|
131
|
+
|
|
132
|
+
ref = tts.prepare_reference(ref_audio_path="ref.mp3")
|
|
133
|
+
|
|
134
|
+
chunks = []
|
|
135
|
+
for chunk in tts.stream(
|
|
136
|
+
"Hello! This is a streaming Sopro TTS example.",
|
|
137
|
+
ref=ref,
|
|
138
|
+
):
|
|
139
|
+
chunks.append(chunk.cpu())
|
|
140
|
+
|
|
141
|
+
wav = torch.cat(chunks, dim=-1)
|
|
142
|
+
tts.save_wav("out_stream.wav", wav)
|
|
143
|
+
```
|
|
144
|
+
|
|
122
145
|
---
|
|
123
146
|
|
|
124
147
|
## Interactive streaming demo
|
|
@@ -4,14 +4,18 @@ https://github.com/user-attachments/assets/40254391-248f-45ff-b9a4-107d64fbb95f
|
|
|
4
4
|
|
|
5
5
|
[Sopro on Hugging Face](https://huggingface.co/samuel-vitorino/sopro)
|
|
6
6
|
|
|
7
|
+
### 📰 News
|
|
8
|
+
|
|
9
|
+
**2026.02.04 – SoproTTS v1.5 is out: more stable, faster, and smaller. Trained for just $100, it reaches a 250 ms time to first audio (TTFA) when streaming and a 0.05 real-time factor (RTF, ~20× realtime) on CPU.**
|
|
10
|
+
|
|
7
11
|
Sopro (from the Portuguese word for “breath/blow”) is a lightweight English text-to-speech model I trained as a side project. Sopro is composed of dilated convs (à la WaveNet) and lightweight cross-attention layers, instead of the common Transformer architecture. Even though Sopro is not SOTA across most voices and situations, I still think it’s a cool project made with a very low budget (trained on a single L40S GPU), and it can be improved with better data.
|
|
8
12
|
|
|
9
13
|
Some of the main features are:
|
|
10
14
|
|
|
11
|
-
- **
|
|
15
|
+
- **147M parameters**
|
|
12
16
|
- **Streaming**
|
|
13
17
|
- **Zero-shot voice cloning**
|
|
14
|
-
- **0.
|
|
18
|
+
- **0.05 RTF on CPU** (measured on an M3 base model), meaning it generates 32 seconds of audio in 1.77 seconds
|
|
15
19
|
- **3-12 seconds of reference audio** for voice cloning
|
|
16
20
|
|
|
17
21
|
---
|
|
@@ -30,7 +34,7 @@ conda activate soprotts
|
|
|
30
34
|
### From PyPI
|
|
31
35
|
|
|
32
36
|
```bash
|
|
33
|
-
pip install sopro
|
|
37
|
+
pip install -U sopro
|
|
34
38
|
```
|
|
35
39
|
|
|
36
40
|
### From the repo
|
|
@@ -56,9 +60,7 @@ soprotts \
|
|
|
56
60
|
|
|
57
61
|
You have the expected `temperature` and `top_p` parameters, alongside:
|
|
58
62
|
|
|
59
|
-
- `--style_strength` (controls the FiLM strength; increasing it can improve or reduce voice similarity; default `1.
|
|
60
|
-
- `--no_stop_head` to disable early stopping
|
|
61
|
-
- `--stop_threshold` and `--stop_patience` (number of consecutive frames that must be classified as final before **stopping**). For short sentences, the stop head may fail to trigger, in which case you can lower these values. Likewise, if the model stops before producing the full text, adjusting these parameters up can help.
|
|
63
|
+
- `--style_strength` (controls the FiLM strength; increasing it can improve or reduce voice similarity; default `1.2`)
|
|
62
64
|
|
|
63
65
|
### Python
|
|
64
66
|
|
|
@@ -96,6 +98,27 @@ wav = torch.cat(chunks, dim=-1)
|
|
|
96
98
|
tts.save_wav("out_stream.wav", wav)
|
|
97
99
|
```
|
|
98
100
|
|
|
101
|
+
You can also precompute the reference ahead of time to reduce the time to first audio (TTFA):
|
|
102
|
+
|
|
103
|
+
```python
|
|
104
|
+
import torch
|
|
105
|
+
from sopro import SoproTTS
|
|
106
|
+
|
|
107
|
+
tts = SoproTTS.from_pretrained("samuel-vitorino/sopro", device="cpu")
|
|
108
|
+
|
|
109
|
+
ref = tts.prepare_reference(ref_audio_path="ref.mp3")
|
|
110
|
+
|
|
111
|
+
chunks = []
|
|
112
|
+
for chunk in tts.stream(
|
|
113
|
+
"Hello! This is a streaming Sopro TTS example.",
|
|
114
|
+
ref=ref,
|
|
115
|
+
):
|
|
116
|
+
chunks.append(chunk.cpu())
|
|
117
|
+
|
|
118
|
+
wav = torch.cat(chunks, dim=-1)
|
|
119
|
+
tts.save_wav("out_stream.wav", wav)
|
|
120
|
+
```
|
|
121
|
+
|
|
99
122
|
---
|
|
100
123
|
|
|
101
124
|
## Interactive streaming demo
|
|
@@ -32,8 +32,6 @@ def main() -> None:
|
|
|
32
32
|
ap.add_argument("--temperature", type=float, default=1.05)
|
|
33
33
|
ap.add_argument("--no_anti_loop", action="store_true")
|
|
34
34
|
|
|
35
|
-
ap.add_argument("--no_prefix", action="store_true")
|
|
36
|
-
ap.add_argument("--prefix_sec", type=float, default=None)
|
|
37
35
|
ap.add_argument("--style_strength", type=float, default=None)
|
|
38
36
|
ap.add_argument("--ref_seconds", type=float, default=None)
|
|
39
37
|
|
|
@@ -77,6 +75,7 @@ def main() -> None:
|
|
|
77
75
|
torch.cuda.manual_seed_all(args.seed)
|
|
78
76
|
|
|
79
77
|
t0 = time.perf_counter()
|
|
78
|
+
|
|
80
79
|
tts = SoproTTS.from_pretrained(
|
|
81
80
|
args.repo_id,
|
|
82
81
|
revision=args.revision,
|
|
@@ -84,6 +83,7 @@ def main() -> None:
|
|
|
84
83
|
token=args.hf_token,
|
|
85
84
|
device=device,
|
|
86
85
|
)
|
|
86
|
+
|
|
87
87
|
t1 = time.perf_counter()
|
|
88
88
|
if not args.quiet:
|
|
89
89
|
print(f"[Load] {t1 - t0:.2f}s")
|
|
@@ -97,74 +97,59 @@ def main() -> None:
|
|
|
97
97
|
arr = np.load(args.ref_tokens)
|
|
98
98
|
ref_tokens_tq = torch.from_numpy(arr).long()
|
|
99
99
|
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
100
|
+
with torch.inference_mode():
|
|
101
|
+
text_ids = tts.encode_text(args.text)
|
|
102
|
+
ref = tts.prepare_reference(
|
|
103
|
+
ref_audio_path=args.ref_audio,
|
|
104
|
+
ref_tokens_tq=ref_tokens_tq,
|
|
105
|
+
ref_seconds=args.ref_seconds,
|
|
106
|
+
)
|
|
106
107
|
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
108
|
+
prep = tts.model.prepare_conditioning(
|
|
109
|
+
text_ids,
|
|
110
|
+
ref,
|
|
111
|
+
max_frames=args.max_frames,
|
|
112
|
+
device=tts.device,
|
|
113
|
+
style_strength=float(
|
|
114
|
+
args.style_strength
|
|
115
|
+
if args.style_strength is not None
|
|
116
|
+
else cfg.style_strength
|
|
117
|
+
),
|
|
118
|
+
)
|
|
118
119
|
|
|
119
|
-
|
|
120
|
+
t_start = time.perf_counter()
|
|
120
121
|
|
|
121
122
|
hist_A: list[int] = []
|
|
122
123
|
pbar = tqdm(
|
|
123
|
-
total=args.max_frames,
|
|
124
|
-
desc="AR sampling",
|
|
125
|
-
unit="frame",
|
|
126
|
-
disable=args.quiet,
|
|
124
|
+
total=args.max_frames + 1, desc="AR sampling", unit="step", disable=args.quiet
|
|
127
125
|
)
|
|
128
126
|
|
|
129
|
-
for _t,
|
|
127
|
+
for _t, tok, is_eos in tts.model.ar_stream(
|
|
130
128
|
prep,
|
|
131
129
|
max_frames=args.max_frames,
|
|
132
130
|
top_p=args.top_p,
|
|
133
131
|
temperature=args.temperature,
|
|
134
132
|
anti_loop=(not args.no_anti_loop),
|
|
135
|
-
use_prefix=(not args.no_prefix),
|
|
136
|
-
prefix_sec_fixed=args.prefix_sec,
|
|
137
|
-
use_stop_head=(False if args.no_stop_head else None),
|
|
138
|
-
stop_patience=args.stop_patience,
|
|
139
|
-
stop_threshold=args.stop_threshold,
|
|
140
133
|
):
|
|
141
|
-
|
|
134
|
+
if is_eos:
|
|
135
|
+
pbar.set_postfix(eos="yes")
|
|
136
|
+
pbar.update(1)
|
|
137
|
+
break
|
|
138
|
+
hist_A.append(int(tok))
|
|
142
139
|
pbar.update(1)
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
else:
|
|
146
|
-
pbar.set_postfix(p_stop=f"{float(p_stop):.2f}")
|
|
140
|
+
|
|
141
|
+
t_after_sampling = time.perf_counter()
|
|
147
142
|
|
|
148
143
|
pbar.n = len(hist_A)
|
|
149
144
|
pbar.close()
|
|
150
145
|
|
|
151
|
-
t_after_sampling = time.perf_counter()
|
|
152
|
-
|
|
153
146
|
T = len(hist_A)
|
|
154
147
|
if T == 0:
|
|
155
148
|
save_audio(args.out, torch.zeros(1, 0), sr=TARGET_SR)
|
|
156
|
-
t_end = time.perf_counter()
|
|
157
|
-
if not args.quiet:
|
|
158
|
-
print(
|
|
159
|
-
f"[Timing] sampling={t_after_sampling - t_start:.2f}s, "
|
|
160
|
-
f"postproc+decode+save={t_end - t_after_sampling:.2f}s, "
|
|
161
|
-
f"total={t_end - t_start:.2f}s"
|
|
162
|
-
)
|
|
163
|
-
print(f"[Done] Wrote {args.out}")
|
|
164
149
|
return
|
|
165
150
|
|
|
166
151
|
tokens_A = torch.tensor(hist_A, device=tts.device, dtype=torch.long).unsqueeze(0)
|
|
167
|
-
cond_seq = prep["
|
|
152
|
+
cond_seq = prep["cond_ar"][:, :T, :]
|
|
168
153
|
tokens_1xTQ = tts.model.nar_refine(cond_seq, tokens_A)
|
|
169
154
|
tokens_tq = tokens_1xTQ.squeeze(0)
|
|
170
155
|
|
|
@@ -13,36 +13,31 @@ class SoproTTSConfig:
|
|
|
13
13
|
audio_sr: int = TARGET_SR
|
|
14
14
|
|
|
15
15
|
d_model: int = 384
|
|
16
|
-
n_layers_text: int =
|
|
17
|
-
n_layers_ar: int = 6
|
|
18
|
-
n_layers_nar: int = 6
|
|
16
|
+
n_layers_text: int = 2
|
|
19
17
|
dropout: float = 0.05
|
|
20
|
-
|
|
21
18
|
pos_emb_max: int = 4096
|
|
22
19
|
max_text_len: int = 2048
|
|
23
20
|
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
stop_patience: int = 5
|
|
21
|
+
n_layers_ar: int = 6
|
|
22
|
+
ar_kernel: int = 13
|
|
23
|
+
ar_dilation_cycle: Tuple[int, ...] = (1, 2, 4, 1)
|
|
24
|
+
ar_text_attn_freq: int = 2
|
|
29
25
|
min_gen_frames: int = 12
|
|
30
26
|
|
|
27
|
+
n_layers_nar: int = 6
|
|
28
|
+
nar_head_dim: int = 256
|
|
29
|
+
nar_kernel_size: int = 11
|
|
30
|
+
nar_dilation_cycle: Tuple[int, ...] = (1, 2, 4, 8)
|
|
31
|
+
|
|
31
32
|
stage_B: Tuple[int, int] = (2, 4)
|
|
32
33
|
stage_C: Tuple[int, int] = (5, 8)
|
|
33
34
|
stage_D: Tuple[int, int] = (9, 16)
|
|
34
35
|
stage_E: Tuple[int, int] = (17, 32)
|
|
35
36
|
|
|
36
|
-
ar_lookback: int = 4
|
|
37
|
-
ar_kernel: int = 13
|
|
38
|
-
ar_dilation_cycle: Tuple[int, ...] = (1, 2, 4, 1)
|
|
39
|
-
|
|
40
|
-
ar_text_attn_freq: int = 2
|
|
41
|
-
|
|
42
|
-
ref_attn_heads: int = 2
|
|
43
|
-
ref_seconds_max: float = 12.0
|
|
44
|
-
|
|
45
|
-
preprompt_sec_max: float = 4.0
|
|
46
|
-
|
|
47
37
|
sv_student_dim: int = 192
|
|
48
38
|
style_strength: float = 1.0
|
|
39
|
+
|
|
40
|
+
ref_enc_layers: int = 2
|
|
41
|
+
ref_xattn_heads: int = 2
|
|
42
|
+
ref_xattn_layers: int = 3
|
|
43
|
+
ref_xattn_gmax: float = 0.35
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import json
|
|
4
|
+
import os
|
|
4
5
|
import struct
|
|
5
6
|
from typing import Any, Dict, Optional
|
|
6
7
|
|
|
@@ -44,9 +45,7 @@ def load_cfg_from_safetensors(path: str) -> SoproTTSConfig:
|
|
|
44
45
|
for k in SoproTTSConfig.__annotations__.keys():
|
|
45
46
|
if k in cfg_dict:
|
|
46
47
|
init[k] = cfg_dict[k]
|
|
47
|
-
|
|
48
|
-
cfg = SoproTTSConfig(**init)
|
|
49
|
-
return cfg
|
|
48
|
+
return SoproTTSConfig(**init)
|
|
50
49
|
|
|
51
50
|
|
|
52
51
|
def load_state_dict_from_safetensors(path: str) -> Dict[str, torch.Tensor]:
|