lectura-multispeaker 3.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24) hide show
  1. lectura_multispeaker-3.0.0/PKG-INFO +22 -0
  2. lectura_multispeaker-3.0.0/pyproject.toml +37 -0
  3. lectura_multispeaker-3.0.0/setup.cfg +4 -0
  4. lectura_multispeaker-3.0.0/src/lectura_multispeaker/__init__.py +181 -0
  5. lectura_multispeaker-3.0.0/src/lectura_multispeaker/_chargeur.py +127 -0
  6. lectura_multispeaker-3.0.0/src/lectura_multispeaker/_crypto.py +48 -0
  7. lectura_multispeaker-3.0.0/src/lectura_multispeaker/_enhance.py +148 -0
  8. lectura_multispeaker-3.0.0/src/lectura_multispeaker/data/__init__.py +0 -0
  9. lectura_multispeaker-3.0.0/src/lectura_multispeaker/data/phoneme_vocab.json +108 -0
  10. lectura_multispeaker-3.0.0/src/lectura_multispeaker/data/speakers.json +51 -0
  11. lectura_multispeaker-3.0.0/src/lectura_multispeaker/inference_api.py +193 -0
  12. lectura_multispeaker-3.0.0/src/lectura_multispeaker/inference_onnx.py +815 -0
  13. lectura_multispeaker-3.0.0/src/lectura_multispeaker/phonemes.py +160 -0
  14. lectura_multispeaker-3.0.0/src/lectura_multispeaker.egg-info/PKG-INFO +22 -0
  15. lectura_multispeaker-3.0.0/src/lectura_multispeaker.egg-info/SOURCES.txt +22 -0
  16. lectura_multispeaker-3.0.0/src/lectura_multispeaker.egg-info/dependency_links.txt +1 -0
  17. lectura_multispeaker-3.0.0/src/lectura_multispeaker.egg-info/requires.txt +4 -0
  18. lectura_multispeaker-3.0.0/src/lectura_multispeaker.egg-info/top_level.txt +1 -0
  19. lectura_multispeaker-3.0.0/tests/test_chargeur.py +207 -0
  20. lectura_multispeaker-3.0.0/tests/test_crypto.py +46 -0
  21. lectura_multispeaker-3.0.0/tests/test_enhance.py +55 -0
  22. lectura_multispeaker-3.0.0/tests/test_inference.py +512 -0
  23. lectura_multispeaker-3.0.0/tests/test_integration.py +218 -0
  24. lectura_multispeaker-3.0.0/tests/test_phonemes.py +65 -0
@@ -0,0 +1,22 @@
1
+ Metadata-Version: 2.4
2
+ Name: lectura-multispeaker
3
+ Version: 3.0.0
4
+ Summary: Synthese vocale neuronale multi-speaker francais — Matcha-Conformer + HiFi-GAN (ONNX)
5
+ Author-email: Max Carriere <admin@lectura.world>
6
+ License: AGPL-3.0-or-later
7
+ Project-URL: Homepage, https://www.lectura.world/developpement/modules/outils/tts-multi/
8
+ Project-URL: Repository, https://github.com/maxcarriere/lectura-modules/tree/main/MultiSpeaker
9
+ Project-URL: Issues, https://github.com/maxcarriere/lectura-modules/issues
10
+ Keywords: tts,french,speech-synthesis,onnx,matcha,conformer,multi-speaker
11
+ Classifier: Development Status :: 5 - Production/Stable
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
17
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
18
+ Requires-Python: >=3.10
19
+ Description-Content-Type: text/markdown
20
+ Provides-Extra: onnx
21
+ Requires-Dist: onnxruntime>=1.16; extra == "onnx"
22
+ Requires-Dist: numpy>=1.24; extra == "onnx"
@@ -0,0 +1,37 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "lectura-multispeaker"
7
+ version = "3.0.0"
8
+ description = "Synthese vocale neuronale multi-speaker francais — Matcha-Conformer + HiFi-GAN (ONNX)"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = {text = "AGPL-3.0-or-later"}
12
+ authors = [{name = "Max Carriere", email = "admin@lectura.world"}]
13
+ keywords = ["tts", "french", "speech-synthesis", "onnx", "matcha", "conformer", "multi-speaker"]
14
+ classifiers = [
15
+ "Development Status :: 5 - Production/Stable",
16
+ "Intended Audience :: Developers",
17
+ "Programming Language :: Python :: 3.10",
18
+ "Programming Language :: Python :: 3.11",
19
+ "Programming Language :: Python :: 3.12",
20
+ "Topic :: Multimedia :: Sound/Audio :: Speech",
21
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
22
+ ]
23
+ dependencies = []
24
+
25
+ [project.optional-dependencies]
26
+ onnx = ["onnxruntime>=1.16", "numpy>=1.24"]
27
+
28
+ [project.urls]
29
+ Homepage = "https://www.lectura.world/developpement/modules/outils/tts-multi/"
30
+ Repository = "https://github.com/maxcarriere/lectura-modules/tree/main/MultiSpeaker"
31
+ Issues = "https://github.com/maxcarriere/lectura-modules/issues"
32
+
33
+ [tool.setuptools.packages.find]
34
+ where = ["src"]
35
+
36
+ [tool.setuptools.package-data]
37
+ lectura_multispeaker = ["data/*.json", "modeles/*.onnx", "modeles/*.enc", "modeles/*.json"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,181 @@
1
+ """lectura-multispeaker — Synthese vocale neuronale multi-speaker francais.
2
+
3
+ Exports publics :
4
+ - creer_engine(mode, speaker, models_dir, ...) -> engine
5
+ - synthetiser(texte, speaker, **kwargs) -> numpy array float32
6
+ - liste_speakers() -> list[dict]
7
+ - OnnxTTSEngine, ApiTTSEngine
8
+ - TTSResult, PhonemeTiming
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import json
14
+ import logging
15
+ from pathlib import Path
16
+ from typing import Any
17
+
18
+ log = logging.getLogger(__name__)
19
+
20
+ __version__ = "3.0.0"
21
+
22
+ _SPEAKERS_DATA: list[dict] | None = None
23
+ _DEFAULT_SPEAKER: str | None = None
24
+
25
+
26
def _load_speakers() -> tuple[list[dict], str]:
    """Return the speaker catalogue and the default speaker id.

    The bundled ``data/speakers.json`` (located next to this module) is parsed
    on the first call only; the parsed values are stored in the module-level
    globals and handed back unchanged on later calls.
    """
    global _SPEAKERS_DATA, _DEFAULT_SPEAKER
    if _SPEAKERS_DATA is None:
        catalogue_file = Path(__file__).parent / "data" / "speakers.json"
        with open(catalogue_file, encoding="utf-8") as handle:
            payload = json.load(handle)
        _SPEAKERS_DATA = payload["speakers"]
        # "siwis" is used when the file does not name a default speaker.
        _DEFAULT_SPEAKER = payload.get("default", "siwis")
    return _SPEAKERS_DATA, _DEFAULT_SPEAKER
38
+
39
+
40
def liste_speakers() -> list[dict]:
    """Return the available speakers together with their metadata.

    Every entry is copied before being returned, so callers may freely modify
    the result without altering the catalogue cached by ``_load_speakers``.

    Returns
    -------
    list[dict]
        One dict per speaker with the keys: id, name, gender, label
    """
    speakers, _ = _load_speakers()
    # Copy each entry: the cached dicts are shared by every caller.
    return [dict(entry) for entry in speakers]
50
+
51
+
52
def creer_engine(
    mode: str = "auto",
    speaker: str = "siwis",
    models_dir: str | Path | None = None,
    api_url: str | None = None,
    api_key: str | None = None,
):
    """Build a multi-speaker TTS inference engine.

    Parameters
    ----------
    mode : str
        "auto"  : local ONNX engine when one can be created, otherwise the API
        "local" : local ONNX inference only
        "api"   : remote API only
        Any other value goes straight to the API engine.
    speaker : str
        Speaker name (siwis, ezwa, nadine, bernard, gilles, zeckou)
    models_dir : Path | None
        Directory holding the ONNX models (overrides automatic detection)
    api_url : str | None
        URL of the API server
    api_key : str | None
        API key

    Returns
    -------
    OnnxTTSEngine | ApiTTSEngine
        Engine exposing the unified interface (synthesize, synthesize_phonemes)

    Raises
    ------
    FileNotFoundError
        If ``mode`` is "local" and no local engine could be created.
    """
    local_allowed = mode in ("auto", "local")
    local_engine = _try_local(speaker, models_dir) if local_allowed else None
    if local_engine is not None:
        return local_engine

    if mode == "local":
        raise FileNotFoundError(
            "Modeles ONNX introuvables. Verifiez l'installation ou "
            "specifiez models_dir. Voir README pour les emplacements."
        )
    if local_allowed:
        log.info("Modeles locaux non disponibles, fallback vers API")

    # Imported lazily so the API client is only loaded when it is needed.
    from lectura_multispeaker.inference_api import ApiTTSEngine
    return ApiTTSEngine(api_url=api_url, api_key=api_key, speaker=speaker)
94
+
95
+
96
def _try_local(speaker: str, models_dir: str | Path | None = None):
    """Try to create a local ONNX engine.

    Parameters
    ----------
    speaker : str
        Speaker name forwarded to the model lookup and to the engine.
    models_dir : str | Path | None
        Optional explicit models directory forwarded to ``find_models_dir``.

    Returns
    -------
    OnnxTTSEngine | None
        The engine, or None when onnxruntime/numpy cannot be imported or when
        no models directory is found.
    """
    try:
        import onnxruntime  # noqa: F401
        import numpy  # noqa: F401
    except ImportError as exc:
        # Record the cause: otherwise a missing dependency cannot be told
        # apart from missing model files.
        log.info(
            "Dependances ONNX indisponibles (%s), inference locale ignoree",
            exc,
        )
        return None

    from lectura_multispeaker._chargeur import find_models_dir

    resolved = find_models_dir(speaker, models_dir)
    if resolved is None:
        return None

    from lectura_multispeaker.inference_onnx import OnnxTTSEngine
    return OnnxTTSEngine(resolved, speaker=speaker)
112
+
113
+
114
def synthetiser(
    texte: str,
    speaker: str = "siwis",
    mode: str = "auto",
    models_dir: str | Path | None = None,
    api_url: str | None = None,
    api_key: str | None = None,
    phrase_type: int | None = None,
    duration_scale: float = 1.0,
    pitch_shift: float = 0.0,
    pitch_range: float = 1.0,
    energy_scale: float = 1.0,
    pause_scale: float = 1.0,
    style: str | None = None,
    style_vector: list[float] | None = None,
    n_ode_steps: int | None = None,
    duration_noise: float | None = None,
) -> Any:
    """Convenience wrapper: text in, audio samples out.

    A new engine is created through ``creer_engine`` on every call, then its
    ``synthesize`` method is invoked and the ``samples`` attribute of the
    result is returned.

    Parameters
    ----------
    texte : str
        French text to synthesise
    speaker : str
        Speaker name (siwis, ezwa, nadine, bernard, gilles, zeckou)
    mode, models_dir, api_url, api_key :
        Forwarded to creer_engine()
    phrase_type, duration_scale, pitch_shift, pitch_range, energy_scale, pause_scale :
        Prosody controls
    style : str | None
        Name of a style preset (e.g. "narratif", "dialogue")
    style_vector : list[float] | None
        Explicit style vector [n_style_dims]
    n_ode_steps : int | None
        Number of ODE steps (Matcha-Conformer only, default: config)
    duration_noise : float | None
        Smooth duration noise (0.0=off, 0.1=subtle, 0.2=pronounced)

    Returns
    -------
    numpy.ndarray
        Audio float32 mono, 22050 Hz
    """
    engine = creer_engine(
        mode=mode,
        speaker=speaker,
        models_dir=models_dir,
        api_url=api_url,
        api_key=api_key,
    )
    synthesis_options = {
        "phrase_type": phrase_type,
        "duration_scale": duration_scale,
        "pitch_shift": pitch_shift,
        "pitch_range": pitch_range,
        "energy_scale": energy_scale,
        "pause_scale": pause_scale,
        "style": style,
        "style_vector": style_vector,
        "n_ode_steps": n_ode_steps,
        "duration_noise": duration_noise,
    }
    return engine.synthesize(texte, **synthesis_options).samples
174
+
175
+
176
+ __all__ = [
177
+ "creer_engine",
178
+ "synthetiser",
179
+ "liste_speakers",
180
+ "__version__",
181
+ ]
@@ -0,0 +1,127 @@
1
+ """Localisateur de modeles multi-speaker — cascade de chemins.
2
+
3
+ Ordre de recherche :
4
+ 1. Parametre explicite models_dir
5
+ 2. Variable d'environnement LECTURA_MODELS_DIR/tts_multispeaker
6
+ 3. Repertoire utilisateur ~/.lectura/models/tts_multispeaker/
7
+ 4. Modeles embarques dans le package (site-packages, version privee)
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import logging
13
+ import os
14
+ from pathlib import Path
15
+
16
+ log = logging.getLogger(__name__)
17
+
18
+ _PACKAGE_MODELS = Path(__file__).parent / "modeles"
19
+
20
+ # Fichiers partages requis pour l'inference ONNX locale
21
+ # Matcha-Conformer (v2) : matcha_unet.onnx + hifigan.onnx
22
+ # FastPitch (v1 legacy) : decoder.onnx + hifigan.onnx
23
+ SHARED_FILES_MATCHA = [
24
+ "matcha_unet.onnx",
25
+ "hifigan.onnx",
26
+ ]
27
+ SHARED_FILES_FASTPITCH = [
28
+ "decoder.onnx",
29
+ "hifigan.onnx",
30
+ ]
31
+
32
+
33
def find_models_dir(
    speaker: str = "siwis",
    models_dir: str | Path | None = None,
) -> Path | None:
    """Locate the directory that holds the ONNX models.

    Locations are tried in this order and the first one accepted by
    ``_has_models`` wins: the explicit ``models_dir``, then
    ``$LECTURA_MODELS_DIR/tts_multispeaker``, then
    ``~/.lectura/models/tts_multispeaker``, then the ``modeles`` directory
    shipped inside the package.

    Args:
        speaker: Speaker name whose encoder presence is checked.
        models_dir: Explicit override, tried first when given.

    Returns:
        Path of the directory, or None when no location contains the models.
    """
    search_path: list[Path] = []

    if models_dir is not None:
        search_path.append(Path(models_dir))

    env_root = os.environ.get("LECTURA_MODELS_DIR", "")
    if env_root:
        search_path.append(Path(env_root) / "tts_multispeaker")

    search_path += [
        Path.home() / ".lectura" / "models" / "tts_multispeaker",
        _PACKAGE_MODELS,
    ]

    for location in search_path:
        if not _has_models(location, speaker):
            continue
        log.debug("Modeles trouves : %s", location)
        return location

    return None
69
+
70
+
71
def _file_exists(directory: Path, filename: str) -> bool:
    """Tell whether ``filename`` or its ``.enc`` counterpart is in ``directory``.

    The counterpart name is obtained by replacing ".onnx" with ".enc" in
    ``filename``.
    """
    plain = directory / filename
    if plain.exists():
        return True
    encrypted = directory / filename.replace(".onnx", ".enc")
    return encrypted.exists()
75
+
76
+
77
def _has_models(directory: Path, speaker: str = "siwis") -> bool:
    """Check that ``directory`` contains a usable set of model files.

    Two model families are probed in order, Matcha-Conformer first and then
    the legacy FastPitch one. A family is usable when all of its shared files
    are present together with either its unified encoder or the encoder of
    ``speaker``. Each file may be present as ``.onnx`` or as ``.enc``.
    """
    if not directory.is_dir():
        return False

    # (shared files, unified encoder, per-speaker encoder) for each family
    families = (
        (
            SHARED_FILES_MATCHA,
            "matcha_encoder.onnx",
            f"matcha_encoder_{speaker}.onnx",
        ),
        (
            SHARED_FILES_FASTPITCH,
            "encoder.onnx",
            f"encoder_{speaker}.onnx",
        ),
    )
    for shared_files, unified_encoder, speaker_encoder in families:
        if not all(_file_exists(directory, name) for name in shared_files):
            continue
        if _file_exists(directory, unified_encoder):
            return True
        if _file_exists(directory, speaker_encoder):
            return True

    return False
110
+
111
+
112
+ def load_model_bytes(models_dir: Path, filename: str) -> bytes | None:
113
+ """Charge un modele (clair ou chiffre) depuis le repertoire.
114
+
115
+ Returns:
116
+ bytes du modele ONNX ou None si introuvable.
117
+ """
118
+ onnx_path = models_dir / filename
119
+ enc_path = models_dir / filename.replace(".onnx", ".enc")
120
+
121
+ if onnx_path.exists():
122
+ return onnx_path.read_bytes()
123
+ elif enc_path.exists():
124
+ from lectura_multispeaker._crypto import load_encrypted_model
125
+ return load_encrypted_model(enc_path)
126
+
127
+ return None
@@ -0,0 +1,48 @@
1
+ """Dechiffrement des modeles ONNX chiffres (.enc) au runtime.
2
+
3
+ Les constantes de derivation sont specifiques au module TTS multi-speaker.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import hashlib
9
+ from pathlib import Path
10
+
11
# Salt fragments — specific to the multi-speaker TTS module
_P1 = "Lectura-TTS-MultiSpeaker"
_P2 = 0x4D53
_P3 = "2025-FastPitch-HiFiGAN-6voices"
_P4 = "ONNX-PerSpeaker-Encoder"


def _derive_key() -> bytes:
    """Derive the 256-byte key from the module constants.

    Eight chained SHA-256 digests (32 bytes each) are concatenated: the first
    hashes the salt material, each following one hashes the previous digest
    followed by the material.
    """
    material = f"{_P1}:{_P2:#06x}:{_P3}:{_P4}".encode("utf-8")
    digest = hashlib.sha256(material).digest()
    chunks = [digest]
    for _ in range(7):
        digest = hashlib.sha256(digest + material).digest()
        chunks.append(digest)
    return b"".join(chunks)[:256]


def _xor_with_key(data: bytes) -> bytes:
    """XOR ``data`` with the derived key, repeated cyclically.

    The operation is its own inverse, so it is used both to encrypt and to
    decrypt. NOTE: the key is derived only from constants of this module.

    The data is processed in fixed-size chunks with big-integer XOR rather
    than one byte at a time in Python; the output is byte-for-byte the same.
    """
    key = _derive_key()
    # The chunk size is a multiple of the key length so that every chunk
    # starts at key offset 0.
    chunk_size = len(key) * 256
    full_mask = int.from_bytes(key * 256, "big")

    total = len(data)
    out = bytearray(total)
    view = memoryview(data)
    for offset in range(0, total, chunk_size):
        part = view[offset:offset + chunk_size]
        size = len(part)
        if size == chunk_size:
            mask = full_mask
        else:
            # Final, shorter chunk: build a key stream of exactly `size` bytes.
            repeats, extra = divmod(size, len(key))
            mask = int.from_bytes(key * repeats + key[:extra], "big")
        mixed = int.from_bytes(part, "big") ^ mask
        out[offset:offset + size] = mixed.to_bytes(size, "big")
    return bytes(out)


def load_encrypted_model(enc_path: Path) -> bytes:
    """Read a ``.enc`` file and return the decrypted ONNX bytes."""
    return _xor_with_key(enc_path.read_bytes())


def encrypt_model(onnx_path: Path, enc_path: Path) -> None:
    """Encrypt an ONNX file into ``enc_path`` (used when preparing the package)."""
    enc_path.write_bytes(_xor_with_key(onnx_path.read_bytes()))
@@ -0,0 +1,148 @@
1
+ """Enhancement mel-spectrogramme — numpy pur (pas de scipy).
2
+
3
+ Compense le lissage L1 du FastPitch :
4
+ - Contraste spectral (inter-bande par frame)
5
+ - Sharpening temporel (unsharp mask avec kernel gaussien numpy)
6
+ - Noise gate (frames silencieuses)
7
+ - Fade-out (anti-pop fin d'utterance)
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import numpy as np
13
+
14
+
15
def _gaussian_kernel_1d(sigma: float) -> np.ndarray:
    """Build a normalised 1-D Gaussian kernel.

    The kernel has ``int(sigma * 4) | 1`` taps (always an odd count), is
    centred on its middle tap and sums to 1.
    """
    width = int(sigma * 4) | 1  # forcing the low bit makes the size odd
    offsets = np.arange(width) - width // 2
    weights = np.exp(-0.5 * (offsets / sigma) ** 2)
    return weights / weights.sum()
22
+
23
+
24
def _smooth_temporal(mel: np.ndarray, sigma: float = 3.0) -> np.ndarray:
    """Smooth ``mel`` along its second axis with a Gaussian, row by row.

    Args:
        mel: 2-D array; the convolution runs along axis 1 (length ``T``).
        sigma: standard deviation of the Gaussian kernel.

    Returns:
        A new array with the same shape and dtype as ``mel``.
    """
    T = mel.shape[1]
    # Nothing to smooth, and np.convolve rejects empty inputs.
    if T == 0:
        return np.zeros_like(mel)

    kernel = _gaussian_kernel_1d(sigma)
    # For sequences shorter than the kernel, keep only the central T taps
    # and renormalise them.
    if T < len(kernel):
        center = len(kernel) // 2
        half = T // 2
        kernel = kernel[center - half : center + half + (T % 2)]
        if kernel.sum() > 0:
            kernel /= kernel.sum()

    smoothed = np.zeros_like(mel)
    for i in range(mel.shape[0]):
        smoothed[i] = np.convolve(mel[i], kernel, mode="same")[:T]
    return smoothed
39
+
40
+
41
def enhance_mel(
    mel: np.ndarray,
    spectral_alpha: float = 0.20,
    temporal_alpha: float = 0.20,
    clip_min: float = -11.5,
    clip_max: float = 2.0,
) -> np.ndarray:
    """Apply spectral and temporal enhancement to a mel-spectrogram.

    Args:
        mel: [n_mels, T] mel-spectrogram
        spectral_alpha: strength of the inter-band contrast boost
        temporal_alpha: strength of the temporal sharpening
        clip_min: lower bound (log(1e-5) ~ -11.5)
        clip_max: upper bound

    Returns:
        The enhanced mel, same dimensions, clipped to [clip_min, clip_max].
    """
    # 1. Spectral contrast: push each band away from its frame mean.
    per_frame_mean = mel.mean(axis=0, keepdims=True)  # [1, T]
    contrasted = per_frame_mean + (mel - per_frame_mean) * (1.0 + spectral_alpha)

    # 2. Temporal sharpening (unsharp mask): add back a fraction of the
    #    difference between the signal and its smoothed version.
    blurred = _smooth_temporal(contrasted, sigma=3.0)
    sharpened = contrasted + (contrasted - blurred) * temporal_alpha

    return np.clip(sharpened, clip_min, clip_max)
71
+
72
+
73
def noise_gate(
    mel: np.ndarray,
    threshold: float = -8.0,
    silence_val: float = -11.5,
) -> np.ndarray:
    """Blend frames whose mean energy is below ``threshold`` towards silence.

    The gate strength of a frame rises linearly from 0 at ``threshold`` to 1
    at ``silence_val`` (clipped to [0, 1]); the frame is mixed with
    ``silence_val`` in that proportion.
    """
    frame_energy = mel.mean(axis=0)  # [T]
    span = threshold - silence_val
    strength = np.clip((threshold - frame_energy) / span, 0, 1)
    keep = 1 - strength
    return mel * keep + silence_val * strength
84
+
85
+
86
def fade_out(
    mel: np.ndarray,
    n_frames: int = 5,
    silence_val: float = -11.5,
) -> np.ndarray:
    """Linearly fade the last frames of ``mel`` towards ``silence_val``.

    The input array is left untouched; a faded copy is returned.

    Args:
        mel: 2-D array; the fade runs along axis 1.
        n_frames: number of trailing frames to fade (capped at the number of
            frames available; values <= 0 disable the fade).
        silence_val: value reached on the very last frame.

    Returns:
        A new array with the same shape and dtype as ``mel``.
    """
    # Work on a copy so the caller's array is never modified.
    faded = mel.copy()
    n = min(n_frames, faded.shape[1])
    if n > 0:
        weights = np.linspace(1.0, 0.0, n)
        faded[:, -n:] = faded[:, -n:] * weights + silence_val * (1.0 - weights)
    return faded
97
+
98
+
99
def waveform_silence_gate(
    audio: np.ndarray,
    sample_rate: int = 22050,
    window_ms: float = 15.0,
    threshold_db: float = -35.0,
    fade_samples: int = 128,
) -> np.ndarray:
    """Mute the silent stretches of a waveform (post-vocoder).

    The signal is cut into consecutive windows of ``window_ms``; every window
    (including a shorter trailing one) whose RMS is below ``threshold_db`` is
    gated to zero. Linear ramps of at most ``fade_samples`` samples are then
    written next to each gate transition. Audio that is empty or shorter than
    one window is returned as is.
    """
    total = len(audio)
    if total == 0:
        return audio

    win_size = max(1, int(sample_rate * window_ms / 1000))
    threshold_lin = 10 ** (threshold_db / 20)  # dB -> linear amplitude

    if total < win_size:
        return audio

    gate = np.ones(total, dtype=np.float32)

    # One pass over all windows; the last slice is simply shorter when the
    # length is not a multiple of win_size.
    for begin in range(0, total, win_size):
        stop = begin + win_size
        segment = audio[begin:stop]
        if np.sqrt(np.mean(segment ** 2)) < threshold_lin:
            gate[begin:stop] = 0.0

    # Transitions are located on the hard 0/1 gate, before any ramp is written.
    step = np.diff(gate, prepend=gate[0])

    # Gate opening (0 -> 1): ramp up over the samples just before it.
    for rise in np.where(step > 0.5)[0]:
        ramp_start = max(0, rise - fade_samples)
        ramp_len = rise - ramp_start
        if ramp_len > 0:
            gate[ramp_start:rise] = np.linspace(0.0, 1.0, ramp_len)

    # Gate closing (1 -> 0): ramp down over the samples just after it.
    for fall in np.where(step < -0.5)[0]:
        ramp_end = min(total, fall + fade_samples)
        ramp_len = ramp_end - fall
        if ramp_len > 0:
            gate[fall:ramp_end] = np.linspace(1.0, 0.0, ramp_len)

    return audio * gate
@@ -0,0 +1,108 @@
1
+ {
2
+ "vocab": [
3
+ "<PAD>",
4
+ "<UNK>",
5
+ "#",
6
+ "|",
7
+ "a",
8
+ "e",
9
+ "i",
10
+ "o",
11
+ "u",
12
+ "y",
13
+ "ɛ",
14
+ "ɔ",
15
+ "ø",
16
+ "œ",
17
+ "ə",
18
+ "ɑ",
19
+ "ɛ̃",
20
+ "ɔ̃",
21
+ "ɑ̃",
22
+ "œ̃",
23
+ "p",
24
+ "b",
25
+ "t",
26
+ "d",
27
+ "k",
28
+ "ɡ",
29
+ "f",
30
+ "v",
31
+ "s",
32
+ "z",
33
+ "ʃ",
34
+ "ʒ",
35
+ "m",
36
+ "n",
37
+ "ɲ",
38
+ "l",
39
+ "ʁ",
40
+ "j",
41
+ "w",
42
+ "ɥ",
43
+ "tʃ",
44
+ "dʒ",
45
+ "x",
46
+ "ɣ",
47
+ "ŋ",
48
+ "ɹ",
49
+ ",",
50
+ ".",
51
+ "?",
52
+ "!",
53
+ "…"
54
+ ],
55
+ "phone2id": {
56
+ "<PAD>": 0,
57
+ "<UNK>": 1,
58
+ "#": 2,
59
+ "|": 3,
60
+ "a": 4,
61
+ "e": 5,
62
+ "i": 6,
63
+ "o": 7,
64
+ "u": 8,
65
+ "y": 9,
66
+ "ɛ": 10,
67
+ "ɔ": 11,
68
+ "ø": 12,
69
+ "œ": 13,
70
+ "ə": 14,
71
+ "ɑ": 15,
72
+ "ɛ̃": 16,
73
+ "ɔ̃": 17,
74
+ "ɑ̃": 18,
75
+ "œ̃": 19,
76
+ "p": 20,
77
+ "b": 21,
78
+ "t": 22,
79
+ "d": 23,
80
+ "k": 24,
81
+ "ɡ": 25,
82
+ "f": 26,
83
+ "v": 27,
84
+ "s": 28,
85
+ "z": 29,
86
+ "ʃ": 30,
87
+ "ʒ": 31,
88
+ "m": 32,
89
+ "n": 33,
90
+ "ɲ": 34,
91
+ "l": 35,
92
+ "ʁ": 36,
93
+ "j": 37,
94
+ "w": 38,
95
+ "ɥ": 39,
96
+ "tʃ": 40,
97
+ "dʒ": 41,
98
+ "x": 42,
99
+ "ɣ": 43,
100
+ "ŋ": 44,
101
+ "ɹ": 45,
102
+ ",": 46,
103
+ ".": 47,
104
+ "?": 48,
105
+ "!": 49,
106
+ "…": 50
107
+ }
108
+ }