sopro 1.0.1__tar.gz → 1.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {sopro-1.0.1 → sopro-1.5.0}/PKG-INFO +30 -7
  2. {sopro-1.0.1 → sopro-1.5.0}/README.md +29 -6
  3. {sopro-1.0.1 → sopro-1.5.0}/pyproject.toml +1 -1
  4. {sopro-1.0.1 → sopro-1.5.0}/src/sopro/__init__.py +1 -1
  5. {sopro-1.0.1 → sopro-1.5.0}/src/sopro/cli.py +31 -46
  6. {sopro-1.0.1 → sopro-1.5.0}/src/sopro/config.py +15 -20
  7. {sopro-1.0.1 → sopro-1.5.0}/src/sopro/hub.py +2 -3
  8. sopro-1.5.0/src/sopro/model.py +583 -0
  9. {sopro-1.0.1 → sopro-1.5.0}/src/sopro/nn/__init__.py +7 -3
  10. {sopro-1.0.1 → sopro-1.5.0}/src/sopro/nn/blocks.py +78 -0
  11. {sopro-1.0.1 → sopro-1.5.0}/src/sopro/nn/embeddings.py +16 -0
  12. sopro-1.5.0/src/sopro/nn/generator.py +130 -0
  13. sopro-1.5.0/src/sopro/nn/nar.py +116 -0
  14. sopro-1.5.0/src/sopro/nn/ref.py +160 -0
  15. {sopro-1.0.1 → sopro-1.5.0}/src/sopro/nn/speaker.py +14 -17
  16. sopro-1.5.0/src/sopro/nn/text.py +132 -0
  17. {sopro-1.0.1 → sopro-1.5.0}/src/sopro/sampling.py +3 -3
  18. {sopro-1.0.1 → sopro-1.5.0}/src/sopro/streaming.py +25 -38
  19. {sopro-1.0.1/src/Sopro.egg-info → sopro-1.5.0/src/sopro.egg-info}/PKG-INFO +30 -7
  20. {sopro-1.0.1/src/Sopro.egg-info → sopro-1.5.0/src/sopro.egg-info}/SOURCES.txt +4 -7
  21. sopro-1.0.1/src/sopro/model.py +0 -853
  22. sopro-1.0.1/src/sopro/nn/xattn.py +0 -98
  23. {sopro-1.0.1 → sopro-1.5.0}/LICENSE.txt +0 -0
  24. {sopro-1.0.1 → sopro-1.5.0}/setup.cfg +0 -0
  25. {sopro-1.0.1 → sopro-1.5.0}/src/sopro/audio.py +0 -0
  26. {sopro-1.0.1 → sopro-1.5.0}/src/sopro/codec/__init__.py +0 -0
  27. {sopro-1.0.1 → sopro-1.5.0}/src/sopro/codec/mimi.py +0 -0
  28. {sopro-1.0.1 → sopro-1.5.0}/src/sopro/constants.py +0 -0
  29. {sopro-1.0.1 → sopro-1.5.0}/src/sopro/tokenizer.py +0 -0
  30. {sopro-1.0.1/src/Sopro.egg-info → sopro-1.5.0/src/sopro.egg-info}/dependency_links.txt +0 -0
  31. {sopro-1.0.1/src/Sopro.egg-info → sopro-1.5.0/src/sopro.egg-info}/entry_points.txt +0 -0
  32. {sopro-1.0.1/src/Sopro.egg-info → sopro-1.5.0/src/sopro.egg-info}/requires.txt +0 -0
  33. {sopro-1.0.1/src/Sopro.egg-info → sopro-1.5.0/src/sopro.egg-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sopro
3
- Version: 1.0.1
3
+ Version: 1.5.0
4
4
  Summary: A lightweight text-to-speech model with zero-shot voice cloning.
5
5
  Author-email: Samuel Vitorino <samvitorino@gmail.com>
6
6
  License: Apache 2.0
@@ -27,14 +27,18 @@ https://github.com/user-attachments/assets/40254391-248f-45ff-b9a4-107d64fbb95f
27
27
 
28
28
  [![Alt Text](https://img.shields.io/badge/HuggingFace-Model-orange?logo=huggingface)](https://huggingface.co/samuel-vitorino/sopro)
29
29
 
30
+ ### 📰 News
31
+
32
+ **2026.02.04 – SoproTTS v1.5 is out: more stable, faster, and smaller. Trained for just $100, it reaches 250 ms TTFA streaming and 0.05 RTF (~20× realtime) on CPU.**
33
+
30
34
  Sopro (from the Portuguese word for “breath/blow”) is a lightweight English text-to-speech model I trained as a side project. Sopro is composed of dilated convs (à la WaveNet) and lightweight cross-attention layers, instead of the common Transformer architecture. Even though Sopro is not SOTA across most voices and situations, I still think it’s a cool project made with a very low budget (trained on a single L40S GPU), and it can be improved with better data.
31
35
 
32
36
  Some of the main features are:
33
37
 
34
- - **169M parameters**
38
+ - **147M parameters**
35
39
  - **Streaming**
36
40
  - **Zero-shot voice cloning**
37
- - **0.25 RTF on CPU** (measured on an M3 base model), meaning it generates 30 seconds of audio in 7.5 seconds
41
+ - **0.05 RTF on CPU** (measured on an M3 base model), meaning it generates 32 seconds of audio in 1.77 seconds
38
42
  - **3-12 seconds of reference audio** for voice cloning
39
43
 
40
44
  ---
@@ -53,7 +57,7 @@ conda activate soprotts
53
57
  ### From PyPI
54
58
 
55
59
  ```bash
56
- pip install sopro
60
+ pip install -U sopro
57
61
  ```
58
62
 
59
63
  ### From the repo
@@ -79,9 +83,7 @@ soprotts \
79
83
 
80
84
  You have the expected `temperature` and `top_p` parameters, alongside:
81
85
 
82
- - `--style_strength` (controls the FiLM strength; increasing it can improve or reduce voice similarity; default `1.0`)
83
- - `--no_stop_head` to disable early stopping
84
- - `--stop_threshold` and `--stop_patience` (number of consecutive frames that must be classified as final before **stopping**). For short sentences, the stop head may fail to trigger, in which case you can lower these values. Likewise, if the model stops before producing the full text, adjusting these parameters up can help.
86
+ - `--style_strength` (controls the FiLM strength; increasing it can improve or reduce voice similarity; default `1.2`)
85
87
 
86
88
  ### Python
87
89
 
@@ -119,6 +121,27 @@ wav = torch.cat(chunks, dim=-1)
119
121
  tts.save_wav("out_stream.wav", wav)
120
122
  ```
121
123
 
124
+ You can also precalculate the reference to reduce TTFA:
125
+
126
+ ```python
127
+ import torch
128
+ from sopro import SoproTTS
129
+
130
+ tts = SoproTTS.from_pretrained("samuel-vitorino/sopro", device="cpu")
131
+
132
+ ref = tts.prepare_reference(ref_audio_path="ref.mp3")
133
+
134
+ chunks = []
135
+ for chunk in tts.stream(
136
+ "Hello! This is a streaming Sopro TTS example.",
137
+ ref=ref,
138
+ ):
139
+ chunks.append(chunk.cpu())
140
+
141
+ wav = torch.cat(chunks, dim=-1)
142
+ tts.save_wav("out_stream.wav", wav)
143
+ ```
144
+
122
145
  ---
123
146
 
124
147
  ## Interactive streaming demo
@@ -4,14 +4,18 @@ https://github.com/user-attachments/assets/40254391-248f-45ff-b9a4-107d64fbb95f
4
4
 
5
5
  [![Alt Text](https://img.shields.io/badge/HuggingFace-Model-orange?logo=huggingface)](https://huggingface.co/samuel-vitorino/sopro)
6
6
 
7
+ ### 📰 News
8
+
9
+ **2026.02.04 – SoproTTS v1.5 is out: more stable, faster, and smaller. Trained for just $100, it reaches 250 ms TTFA streaming and 0.05 RTF (~20× realtime) on CPU.**
10
+
7
11
  Sopro (from the Portuguese word for “breath/blow”) is a lightweight English text-to-speech model I trained as a side project. Sopro is composed of dilated convs (à la WaveNet) and lightweight cross-attention layers, instead of the common Transformer architecture. Even though Sopro is not SOTA across most voices and situations, I still think it’s a cool project made with a very low budget (trained on a single L40S GPU), and it can be improved with better data.
8
12
 
9
13
  Some of the main features are:
10
14
 
11
- - **169M parameters**
15
+ - **147M parameters**
12
16
  - **Streaming**
13
17
  - **Zero-shot voice cloning**
14
- - **0.25 RTF on CPU** (measured on an M3 base model), meaning it generates 30 seconds of audio in 7.5 seconds
18
+ - **0.05 RTF on CPU** (measured on an M3 base model), meaning it generates 32 seconds of audio in 1.77 seconds
15
19
  - **3-12 seconds of reference audio** for voice cloning
16
20
 
17
21
  ---
@@ -30,7 +34,7 @@ conda activate soprotts
30
34
  ### From PyPI
31
35
 
32
36
  ```bash
33
- pip install sopro
37
+ pip install -U sopro
34
38
  ```
35
39
 
36
40
  ### From the repo
@@ -56,9 +60,7 @@ soprotts \
56
60
 
57
61
  You have the expected `temperature` and `top_p` parameters, alongside:
58
62
 
59
- - `--style_strength` (controls the FiLM strength; increasing it can improve or reduce voice similarity; default `1.0`)
60
- - `--no_stop_head` to disable early stopping
61
- - `--stop_threshold` and `--stop_patience` (number of consecutive frames that must be classified as final before **stopping**). For short sentences, the stop head may fail to trigger, in which case you can lower these values. Likewise, if the model stops before producing the full text, adjusting these parameters up can help.
63
+ - `--style_strength` (controls the FiLM strength; increasing it can improve or reduce voice similarity; default `1.2`)
62
64
 
63
65
  ### Python
64
66
 
@@ -96,6 +98,27 @@ wav = torch.cat(chunks, dim=-1)
96
98
  tts.save_wav("out_stream.wav", wav)
97
99
  ```
98
100
 
101
+ You can also precalculate the reference to reduce TTFA:
102
+
103
+ ```python
104
+ import torch
105
+ from sopro import SoproTTS
106
+
107
+ tts = SoproTTS.from_pretrained("samuel-vitorino/sopro", device="cpu")
108
+
109
+ ref = tts.prepare_reference(ref_audio_path="ref.mp3")
110
+
111
+ chunks = []
112
+ for chunk in tts.stream(
113
+ "Hello! This is a streaming Sopro TTS example.",
114
+ ref=ref,
115
+ ):
116
+ chunks.append(chunk.cpu())
117
+
118
+ wav = torch.cat(chunks, dim=-1)
119
+ tts.save_wav("out_stream.wav", wav)
120
+ ```
121
+
99
122
  ---
100
123
 
101
124
  ## Interactive streaming demo
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "sopro"
7
- version = "1.0.1"
7
+ version = "1.5.0"
8
8
  description = "A lightweight text-to-speech model with zero-shot voice cloning."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.9"
@@ -3,4 +3,4 @@ from __future__ import annotations
3
3
  from .model import SoproTTS
4
4
 
5
5
  __all__ = ["SoproTTS"]
6
- __version__ = "1.0.1"
6
+ __version__ = "1.5.0"
@@ -32,8 +32,6 @@ def main() -> None:
32
32
  ap.add_argument("--temperature", type=float, default=1.05)
33
33
  ap.add_argument("--no_anti_loop", action="store_true")
34
34
 
35
- ap.add_argument("--no_prefix", action="store_true")
36
- ap.add_argument("--prefix_sec", type=float, default=None)
37
35
  ap.add_argument("--style_strength", type=float, default=None)
38
36
  ap.add_argument("--ref_seconds", type=float, default=None)
39
37
 
@@ -77,6 +75,7 @@ def main() -> None:
77
75
  torch.cuda.manual_seed_all(args.seed)
78
76
 
79
77
  t0 = time.perf_counter()
78
+
80
79
  tts = SoproTTS.from_pretrained(
81
80
  args.repo_id,
82
81
  revision=args.revision,
@@ -84,6 +83,7 @@ def main() -> None:
84
83
  token=args.hf_token,
85
84
  device=device,
86
85
  )
86
+
87
87
  t1 = time.perf_counter()
88
88
  if not args.quiet:
89
89
  print(f"[Load] {t1 - t0:.2f}s")
@@ -97,74 +97,59 @@ def main() -> None:
97
97
  arr = np.load(args.ref_tokens)
98
98
  ref_tokens_tq = torch.from_numpy(arr).long()
99
99
 
100
- text_ids = tts.encode_text(args.text)
101
- ref = tts.encode_reference(
102
- ref_audio_path=args.ref_audio,
103
- ref_tokens_tq=ref_tokens_tq,
104
- ref_seconds=args.ref_seconds,
105
- )
100
+ with torch.inference_mode():
101
+ text_ids = tts.encode_text(args.text)
102
+ ref = tts.prepare_reference(
103
+ ref_audio_path=args.ref_audio,
104
+ ref_tokens_tq=ref_tokens_tq,
105
+ ref_seconds=args.ref_seconds,
106
+ )
106
107
 
107
- prep = tts.model.prepare_conditioning(
108
- text_ids,
109
- ref,
110
- max_frames=args.max_frames,
111
- device=tts.device,
112
- style_strength=float(
113
- args.style_strength
114
- if args.style_strength is not None
115
- else cfg.style_strength
116
- ),
117
- )
108
+ prep = tts.model.prepare_conditioning(
109
+ text_ids,
110
+ ref,
111
+ max_frames=args.max_frames,
112
+ device=tts.device,
113
+ style_strength=float(
114
+ args.style_strength
115
+ if args.style_strength is not None
116
+ else cfg.style_strength
117
+ ),
118
+ )
118
119
 
119
- t_start = time.perf_counter()
120
+ t_start = time.perf_counter()
120
121
 
121
122
  hist_A: list[int] = []
122
123
  pbar = tqdm(
123
- total=args.max_frames,
124
- desc="AR sampling",
125
- unit="frame",
126
- disable=args.quiet,
124
+ total=args.max_frames + 1, desc="AR sampling", unit="step", disable=args.quiet
127
125
  )
128
126
 
129
- for _t, rvq1, p_stop in tts.model.ar_stream(
127
+ for _t, tok, is_eos in tts.model.ar_stream(
130
128
  prep,
131
129
  max_frames=args.max_frames,
132
130
  top_p=args.top_p,
133
131
  temperature=args.temperature,
134
132
  anti_loop=(not args.no_anti_loop),
135
- use_prefix=(not args.no_prefix),
136
- prefix_sec_fixed=args.prefix_sec,
137
- use_stop_head=(False if args.no_stop_head else None),
138
- stop_patience=args.stop_patience,
139
- stop_threshold=args.stop_threshold,
140
133
  ):
141
- hist_A.append(int(rvq1))
134
+ if is_eos:
135
+ pbar.set_postfix(eos="yes")
136
+ pbar.update(1)
137
+ break
138
+ hist_A.append(int(tok))
142
139
  pbar.update(1)
143
- if p_stop is None:
144
- pbar.set_postfix(p_stop="off")
145
- else:
146
- pbar.set_postfix(p_stop=f"{float(p_stop):.2f}")
140
+
141
+ t_after_sampling = time.perf_counter()
147
142
 
148
143
  pbar.n = len(hist_A)
149
144
  pbar.close()
150
145
 
151
- t_after_sampling = time.perf_counter()
152
-
153
146
  T = len(hist_A)
154
147
  if T == 0:
155
148
  save_audio(args.out, torch.zeros(1, 0), sr=TARGET_SR)
156
- t_end = time.perf_counter()
157
- if not args.quiet:
158
- print(
159
- f"[Timing] sampling={t_after_sampling - t_start:.2f}s, "
160
- f"postproc+decode+save={t_end - t_after_sampling:.2f}s, "
161
- f"total={t_end - t_start:.2f}s"
162
- )
163
- print(f"[Done] Wrote {args.out}")
164
149
  return
165
150
 
166
151
  tokens_A = torch.tensor(hist_A, device=tts.device, dtype=torch.long).unsqueeze(0)
167
- cond_seq = prep["cond_all"][:, :T, :]
152
+ cond_seq = prep["cond_ar"][:, :T, :]
168
153
  tokens_1xTQ = tts.model.nar_refine(cond_seq, tokens_A)
169
154
  tokens_tq = tokens_1xTQ.squeeze(0)
170
155
 
@@ -13,36 +13,31 @@ class SoproTTSConfig:
13
13
  audio_sr: int = TARGET_SR
14
14
 
15
15
  d_model: int = 384
16
- n_layers_text: int = 4
17
- n_layers_ar: int = 6
18
- n_layers_nar: int = 6
16
+ n_layers_text: int = 2
19
17
  dropout: float = 0.05
20
-
21
18
  pos_emb_max: int = 4096
22
19
  max_text_len: int = 2048
23
20
 
24
- nar_head_dim: int = 256
25
-
26
- use_stop_head: bool = True
27
- stop_threshold: float = 0.8
28
- stop_patience: int = 5
21
+ n_layers_ar: int = 6
22
+ ar_kernel: int = 13
23
+ ar_dilation_cycle: Tuple[int, ...] = (1, 2, 4, 1)
24
+ ar_text_attn_freq: int = 2
29
25
  min_gen_frames: int = 12
30
26
 
27
+ n_layers_nar: int = 6
28
+ nar_head_dim: int = 256
29
+ nar_kernel_size: int = 11
30
+ nar_dilation_cycle: Tuple[int, ...] = (1, 2, 4, 8)
31
+
31
32
  stage_B: Tuple[int, int] = (2, 4)
32
33
  stage_C: Tuple[int, int] = (5, 8)
33
34
  stage_D: Tuple[int, int] = (9, 16)
34
35
  stage_E: Tuple[int, int] = (17, 32)
35
36
 
36
- ar_lookback: int = 4
37
- ar_kernel: int = 13
38
- ar_dilation_cycle: Tuple[int, ...] = (1, 2, 4, 1)
39
-
40
- ar_text_attn_freq: int = 2
41
-
42
- ref_attn_heads: int = 2
43
- ref_seconds_max: float = 12.0
44
-
45
- preprompt_sec_max: float = 4.0
46
-
47
37
  sv_student_dim: int = 192
48
38
  style_strength: float = 1.0
39
+
40
+ ref_enc_layers: int = 2
41
+ ref_xattn_heads: int = 2
42
+ ref_xattn_layers: int = 3
43
+ ref_xattn_gmax: float = 0.35
@@ -1,6 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import json
4
+ import os
4
5
  import struct
5
6
  from typing import Any, Dict, Optional
6
7
 
@@ -44,9 +45,7 @@ def load_cfg_from_safetensors(path: str) -> SoproTTSConfig:
44
45
  for k in SoproTTSConfig.__annotations__.keys():
45
46
  if k in cfg_dict:
46
47
  init[k] = cfg_dict[k]
47
-
48
- cfg = SoproTTSConfig(**init)
49
- return cfg
48
+ return SoproTTSConfig(**init)
50
49
 
51
50
 
52
51
  def load_state_dict_from_safetensors(path: str) -> Dict[str, torch.Tensor]: