PyPI - sopro - Versions diffs - 1.0.1__py3-none-any.whl → 1.0.2__py3-none-any.whl - Mend

sopro-1.0.1-py3-none-any.whl → sopro-1.0.2-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (10)

sopro/__init__.py +1 -1
sopro/cli.py +70 -69
sopro/model.py +1 -1
sopro/streaming.py +1 -1
{sopro-1.0.1.dist-info → sopro-1.0.2.dist-info}/METADATA +1 -1
{sopro-1.0.1.dist-info → sopro-1.0.2.dist-info}/RECORD +10 -10
{sopro-1.0.1.dist-info → sopro-1.0.2.dist-info}/WHEEL +0 -0
{sopro-1.0.1.dist-info → sopro-1.0.2.dist-info}/entry_points.txt +0 -0
{sopro-1.0.1.dist-info → sopro-1.0.2.dist-info}/licenses/LICENSE.txt +0 -0
{sopro-1.0.1.dist-info → sopro-1.0.2.dist-info}/top_level.txt +0 -0

sopro/__init__.py CHANGED Viewed

@@ -3,4 +3,4 @@ from __future__ import annotations
 from .model import SoproTTS
 __all__ = ["SoproTTS"]
-__version__ = "1.0.1"
+__version__ = "1.0.2"

sopro/cli.py CHANGED Viewed

@@ -97,79 +97,80 @@ def main() -> None:
         arr = np.load(args.ref_tokens)
         ref_tokens_tq = torch.from_numpy(arr).long()
-    text_ids = tts.encode_text(args.text)
-    ref = tts.encode_reference(
-        ref_audio_path=args.ref_audio,
-        ref_tokens_tq=ref_tokens_tq,
-        ref_seconds=args.ref_seconds,
-    )
+    with torch.inference_mode():
+        text_ids = tts.encode_text(args.text)
+        ref = tts.encode_reference(
+            ref_audio_path=args.ref_audio,
+            ref_tokens_tq=ref_tokens_tq,
+            ref_seconds=args.ref_seconds,
+        )
-    prep = tts.model.prepare_conditioning(
-        text_ids,
-        ref,
-        max_frames=args.max_frames,
-        device=tts.device,
-        style_strength=float(
-            args.style_strength
-            if args.style_strength is not None
-            else cfg.style_strength
-        ),
-    )
+        prep = tts.model.prepare_conditioning(
+            text_ids,
+            ref,
+            max_frames=args.max_frames,
+            device=tts.device,
+            style_strength=float(
+                args.style_strength
+                if args.style_strength is not None
+                else cfg.style_strength
+            ),
+        )
-    t_start = time.perf_counter()
+        t_start = time.perf_counter()
-    hist_A: list[int] = []
-    pbar = tqdm(
-        total=args.max_frames,
-        desc="AR sampling",
-        unit="frame",
-        disable=args.quiet,
-    )
+        hist_A: list[int] = []
+        pbar = tqdm(
+            total=args.max_frames,
+            desc="AR sampling",
+            unit="frame",
+            disable=args.quiet,
+        )
-    for _t, rvq1, p_stop in tts.model.ar_stream(
-        prep,
-        max_frames=args.max_frames,
-        top_p=args.top_p,
-        temperature=args.temperature,
-        anti_loop=(not args.no_anti_loop),
-        use_prefix=(not args.no_prefix),
-        prefix_sec_fixed=args.prefix_sec,
-        use_stop_head=(False if args.no_stop_head else None),
-        stop_patience=args.stop_patience,
-        stop_threshold=args.stop_threshold,
-    ):
-        hist_A.append(int(rvq1))
-        pbar.update(1)
-        if p_stop is None:
-            pbar.set_postfix(p_stop="off")
-        else:
-            pbar.set_postfix(p_stop=f"{float(p_stop):.2f}")
-    pbar.n = len(hist_A)
-    pbar.close()
-    t_after_sampling = time.perf_counter()
-    T = len(hist_A)
-    if T == 0:
-        save_audio(args.out, torch.zeros(1, 0), sr=TARGET_SR)
-        t_end = time.perf_counter()
-        if not args.quiet:
-            print(
-                f"[Timing] sampling={t_after_sampling - t_start:.2f}s, "
-                f"postproc+decode+save={t_end - t_after_sampling:.2f}s, "
-                f"total={t_end - t_start:.2f}s"
-            )
-            print(f"[Done] Wrote {args.out}")
-        return
-    tokens_A = torch.tensor(hist_A, device=tts.device, dtype=torch.long).unsqueeze(0)
-    cond_seq = prep["cond_all"][:, :T, :]
-    tokens_1xTQ = tts.model.nar_refine(cond_seq, tokens_A)
-    tokens_tq = tokens_1xTQ.squeeze(0)
-    wav = tts.codec.decode_full(tokens_tq)
-    save_audio(args.out, wav, sr=TARGET_SR)
+        for _t, rvq1, p_stop in tts.model.ar_stream(
+            prep,
+            max_frames=args.max_frames,
+            top_p=args.top_p,
+            temperature=args.temperature,
+            anti_loop=(not args.no_anti_loop),
+            use_prefix=(not args.no_prefix),
+            prefix_sec_fixed=args.prefix_sec,
+            use_stop_head=(False if args.no_stop_head else None),
+            stop_patience=args.stop_patience,
+            stop_threshold=args.stop_threshold,
+        ):
+            hist_A.append(int(rvq1))
+            pbar.update(1)
+            if p_stop is None:
+                pbar.set_postfix(p_stop="off")
+            else:
+                pbar.set_postfix(p_stop=f"{float(p_stop):.2f}")
+        pbar.n = len(hist_A)
+        pbar.close()
+        t_after_sampling = time.perf_counter()
+        T = len(hist_A)
+        if T == 0:
+            save_audio(args.out, torch.zeros(1, 0), sr=TARGET_SR)
+            t_end = time.perf_counter()
+            if not args.quiet:
+                print(
+                    f"[Timing] sampling={t_after_sampling - t_start:.2f}s, "
+                    f"postproc+decode+save={t_end - t_after_sampling:.2f}s, "
+                    f"total={t_end - t_start:.2f}s"
+                )
+                print(f"[Done] Wrote {args.out}")
+            return
+        tokens_A = torch.tensor(hist_A, device=tts.device, dtype=torch.long).unsqueeze(0)
+        cond_seq = prep["cond_all"][:, :T, :]
+        tokens_1xTQ = tts.model.nar_refine(cond_seq, tokens_A)
+        tokens_tq = tokens_1xTQ.squeeze(0)
+        wav = tts.codec.decode_full(tokens_tq)
+        save_audio(args.out, wav, sr=TARGET_SR)
     t_end = time.perf_counter()
     if not args.quiet:

sopro/model.py CHANGED Viewed

@@ -793,7 +793,7 @@ class SoproTTS:
         )
         return ref
-    @torch.no_grad()
+    @torch.inference_mode()
     def synthesize(
         self,
         text: str,

sopro/streaming.py CHANGED Viewed

@@ -145,7 +145,7 @@ class SoproTTSStreamer:
             if wav is not None:
                 yield wav
+@torch.inference_mode()
 def stream(
     tts: SoproTTS,
     text: str,

{sopro-1.0.1.dist-info → sopro-1.0.2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sopro
-Version: 1.0.1
+Version: 1.0.2
 Summary: A lightweight text-to-speech model with zero-shot voice cloning.
 Author-email: Samuel Vitorino <samvitorino@gmail.com>
 License: Apache 2.0

{sopro-1.0.1.dist-info → sopro-1.0.2.dist-info}/RECORD RENAMED Viewed

@@ -1,12 +1,12 @@
-sopro/__init__.py,sha256=NFZuESqdCL7bGXuTB8c61XxUJqhkHPUOSTqzH4pyUfU,110
+sopro/__init__.py,sha256=SreucP3h4V4KsUU8PskOofqKEAmL8RvXYA6Ma53zb8Y,110
 sopro/audio.py,sha256=xlp6aYzzGlOMcNZ-p9lDeeU0TUkSHMcvmLantwg_4-0,4162
-sopro/cli.py,sha256=YKfGalyhbRuvjVrGJuo1NlIC7h8CszlMxuTwhYgUSwQ,5751
+sopro/cli.py,sha256=v4mZ_zfYt6BusVFMIHo3ae32xLw2_O_v5SDkfmt_Cvc,6040
 sopro/config.py,sha256=OBD-k2z5GUdjFS545MyBXx-dAGhwnhRG11LW-zQt1-g,1063
 sopro/constants.py,sha256=wSjFKeFIcLCxyVUVb3njxMK666IuxjlNzVT4_jfPovQ,97
 sopro/hub.py,sha256=xsHfeO8X7v__FELvaQxWHYG8P39ygrgbluPs5GQjoCM,1391
-sopro/model.py,sha256=YXwcVGN3v5T0kvKttmo9WNPpewF-b5aOZoTMVypkzO8,28624
+sopro/model.py,sha256=Rj10OPdx8UEhH2reU2SQ4oTztNJCOvLS7pm84V0E_xo,28631
 sopro/sampling.py,sha256=Q5rbuef_BIuy12cv5J7v6k9ob3zQ0OFJIlMHssOkiuU,2951
-sopro/streaming.py,sha256=O5Kkl4cUBjzgjTrEwQK2ka5h6sgcYaEZmIp66-obcPM,4975
+sopro/streaming.py,sha256=AgPzaxdimeDT-8potXAMXuRi7zrWUGgxViwHJw2R2Lk,4998
 sopro/tokenizer.py,sha256=ucb86Jr-EaAyD9OHDoCmwB9Nh9AFIZK_TlZmMkv46KQ,1325
 sopro/codec/__init__.py,sha256=6D6Q0M-SUZZnq79OT1nATenEc8zIZDrhZBpm7zdPEE4,129
 sopro/codec/mimi.py,sha256=RNKnXfhWXUqHiU27C90wj18Rb3R2IZHpm5_cS_XAs9Y,5798
@@ -15,9 +15,9 @@ sopro/nn/blocks.py,sha256=zDEVUH2LXapXuQ4DyhplNh1I0iJYrNUL20IxHoz8ucs,3221
 sopro/nn/embeddings.py,sha256=7YfYKj1v1oafTV4-iucJG4fmeT43fP_rQiJ6ACRKPNI,3185
 sopro/nn/speaker.py,sha256=L2bs-bPlyxoWZyMTctBBuMTaEWm6FP7K1udrXehnTGM,2964
 sopro/nn/xattn.py,sha256=OeRo1HbRZs0AkQ6AV6Q8cqYZP9K4vI-IwT3uVn9jOqg,2939
-sopro-1.0.1.dist-info/licenses/LICENSE.txt,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-sopro-1.0.1.dist-info/METADATA,sha256=tlq9mTTsNEFgMyCtle7om5hqKRm5LwrVCFLo4olQ3_s,6470
-sopro-1.0.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-sopro-1.0.1.dist-info/entry_points.txt,sha256=OWcKgC5Syk8rzOhNzTZ3QR5GJEG88UfiShkovrwb2cI,44
-sopro-1.0.1.dist-info/top_level.txt,sha256=Tik26_lEwzSKDuwQdqwoqA_O0b7CDATzousa0Q17PBo,6
-sopro-1.0.1.dist-info/RECORD,,
+sopro-1.0.2.dist-info/licenses/LICENSE.txt,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sopro-1.0.2.dist-info/METADATA,sha256=LPMr5tnwQx3Rq5FX9CCMq6s4IvreA-EWQvt-OzQkm7g,6470
+sopro-1.0.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+sopro-1.0.2.dist-info/entry_points.txt,sha256=OWcKgC5Syk8rzOhNzTZ3QR5GJEG88UfiShkovrwb2cI,44
+sopro-1.0.2.dist-info/top_level.txt,sha256=Tik26_lEwzSKDuwQdqwoqA_O0b7CDATzousa0Q17PBo,6
+sopro-1.0.2.dist-info/RECORD,,

{sopro-1.0.1.dist-info → sopro-1.0.2.dist-info}/WHEEL RENAMED Viewed

File without changes

{sopro-1.0.1.dist-info → sopro-1.0.2.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{sopro-1.0.1.dist-info → sopro-1.0.2.dist-info}/licenses/LICENSE.txt RENAMED Viewed

File without changes

{sopro-1.0.1.dist-info → sopro-1.0.2.dist-info}/top_level.txt RENAMED Viewed

File without changes

sopro 1.0.1__py3-none-any.whl → 1.0.2__py3-none-any.whl

sopro-1.0.1-py3-none-any.whl → sopro-1.0.2-py3-none-any.whl