PyPI - sopro - Versions diffs - 1.0.1__tar.gz → 1.0.2__tar.gz - Mend

sopro 1.0.1tar.gz → 1.0.2tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

{sopro-1.0.1 → sopro-1.0.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sopro
-Version: 1.0.1
+Version: 1.0.2
 Summary: A lightweight text-to-speech model with zero-shot voice cloning.
 Author-email: Samuel Vitorino <samvitorino@gmail.com>
 License: Apache 2.0

{sopro-1.0.1 → sopro-1.0.2}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "sopro"
-version = "1.0.1"
+version = "1.0.2"
 description = "A lightweight text-to-speech model with zero-shot voice cloning."
 readme = "README.md"
 requires-python = ">=3.9"

{sopro-1.0.1 → sopro-1.0.2}/src/Sopro.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sopro
-Version: 1.0.1
+Version: 1.0.2
 Summary: A lightweight text-to-speech model with zero-shot voice cloning.
 Author-email: Samuel Vitorino <samvitorino@gmail.com>
 License: Apache 2.0

{sopro-1.0.1 → sopro-1.0.2}/src/sopro/__init__.py RENAMED Viewed

@@ -3,4 +3,4 @@ from __future__ import annotations
 from .model import SoproTTS
 __all__ = ["SoproTTS"]
-__version__ = "1.0.1"
+__version__ = "1.0.2"

{sopro-1.0.1 → sopro-1.0.2}/src/sopro/cli.py RENAMED Viewed

@@ -97,79 +97,80 @@ def main() -> None:
         arr = np.load(args.ref_tokens)
         ref_tokens_tq = torch.from_numpy(arr).long()
-    text_ids = tts.encode_text(args.text)
-    ref = tts.encode_reference(
-        ref_audio_path=args.ref_audio,
-        ref_tokens_tq=ref_tokens_tq,
-        ref_seconds=args.ref_seconds,
-    )
+    with torch.inference_mode():
+        text_ids = tts.encode_text(args.text)
+        ref = tts.encode_reference(
+            ref_audio_path=args.ref_audio,
+            ref_tokens_tq=ref_tokens_tq,
+            ref_seconds=args.ref_seconds,
+        )
-    prep = tts.model.prepare_conditioning(
-        text_ids,
-        ref,
-        max_frames=args.max_frames,
-        device=tts.device,
-        style_strength=float(
-            args.style_strength
-            if args.style_strength is not None
-            else cfg.style_strength
-        ),
-    )
+        prep = tts.model.prepare_conditioning(
+            text_ids,
+            ref,
+            max_frames=args.max_frames,
+            device=tts.device,
+            style_strength=float(
+                args.style_strength
+                if args.style_strength is not None
+                else cfg.style_strength
+            ),
+        )
-    t_start = time.perf_counter()
+        t_start = time.perf_counter()
-    hist_A: list[int] = []
-    pbar = tqdm(
-        total=args.max_frames,
-        desc="AR sampling",
-        unit="frame",
-        disable=args.quiet,
-    )
+        hist_A: list[int] = []
+        pbar = tqdm(
+            total=args.max_frames,
+            desc="AR sampling",
+            unit="frame",
+            disable=args.quiet,
+        )
-    for _t, rvq1, p_stop in tts.model.ar_stream(
-        prep,
-        max_frames=args.max_frames,
-        top_p=args.top_p,
-        temperature=args.temperature,
-        anti_loop=(not args.no_anti_loop),
-        use_prefix=(not args.no_prefix),
-        prefix_sec_fixed=args.prefix_sec,
-        use_stop_head=(False if args.no_stop_head else None),
-        stop_patience=args.stop_patience,
-        stop_threshold=args.stop_threshold,
-    ):
-        hist_A.append(int(rvq1))
-        pbar.update(1)
-        if p_stop is None:
-            pbar.set_postfix(p_stop="off")
-        else:
-            pbar.set_postfix(p_stop=f"{float(p_stop):.2f}")
-    pbar.n = len(hist_A)
-    pbar.close()
-    t_after_sampling = time.perf_counter()
-    T = len(hist_A)
-    if T == 0:
-        save_audio(args.out, torch.zeros(1, 0), sr=TARGET_SR)
-        t_end = time.perf_counter()
-        if not args.quiet:
-            print(
-                f"[Timing] sampling={t_after_sampling - t_start:.2f}s, "
-                f"postproc+decode+save={t_end - t_after_sampling:.2f}s, "
-                f"total={t_end - t_start:.2f}s"
-            )
-            print(f"[Done] Wrote {args.out}")
-        return
-    tokens_A = torch.tensor(hist_A, device=tts.device, dtype=torch.long).unsqueeze(0)
-    cond_seq = prep["cond_all"][:, :T, :]
-    tokens_1xTQ = tts.model.nar_refine(cond_seq, tokens_A)
-    tokens_tq = tokens_1xTQ.squeeze(0)
-    wav = tts.codec.decode_full(tokens_tq)
-    save_audio(args.out, wav, sr=TARGET_SR)
+        for _t, rvq1, p_stop in tts.model.ar_stream(
+            prep,
+            max_frames=args.max_frames,
+            top_p=args.top_p,
+            temperature=args.temperature,
+            anti_loop=(not args.no_anti_loop),
+            use_prefix=(not args.no_prefix),
+            prefix_sec_fixed=args.prefix_sec,
+            use_stop_head=(False if args.no_stop_head else None),
+            stop_patience=args.stop_patience,
+            stop_threshold=args.stop_threshold,
+        ):
+            hist_A.append(int(rvq1))
+            pbar.update(1)
+            if p_stop is None:
+                pbar.set_postfix(p_stop="off")
+            else:
+                pbar.set_postfix(p_stop=f"{float(p_stop):.2f}")
+        pbar.n = len(hist_A)
+        pbar.close()
+        t_after_sampling = time.perf_counter()
+        T = len(hist_A)
+        if T == 0:
+            save_audio(args.out, torch.zeros(1, 0), sr=TARGET_SR)
+            t_end = time.perf_counter()
+            if not args.quiet:
+                print(
+                    f"[Timing] sampling={t_after_sampling - t_start:.2f}s, "
+                    f"postproc+decode+save={t_end - t_after_sampling:.2f}s, "
+                    f"total={t_end - t_start:.2f}s"
+                )
+                print(f"[Done] Wrote {args.out}")
+            return
+        tokens_A = torch.tensor(hist_A, device=tts.device, dtype=torch.long).unsqueeze(0)
+        cond_seq = prep["cond_all"][:, :T, :]
+        tokens_1xTQ = tts.model.nar_refine(cond_seq, tokens_A)
+        tokens_tq = tokens_1xTQ.squeeze(0)
+        wav = tts.codec.decode_full(tokens_tq)
+        save_audio(args.out, wav, sr=TARGET_SR)
     t_end = time.perf_counter()
     if not args.quiet:

{sopro-1.0.1 → sopro-1.0.2}/src/sopro/model.py RENAMED Viewed

@@ -793,7 +793,7 @@ class SoproTTS:
         )
         return ref
-    @torch.no_grad()
+    @torch.inference_mode()
     def synthesize(
         self,
         text: str,

{sopro-1.0.1 → sopro-1.0.2}/src/sopro/streaming.py RENAMED Viewed

@@ -145,7 +145,7 @@ class SoproTTSStreamer:
             if wav is not None:
                 yield wav
+@torch.inference_mode()
 def stream(
     tts: SoproTTS,
     text: str,