lectura-multispeaker 3.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lectura_multispeaker-3.0.0/PKG-INFO +22 -0
- lectura_multispeaker-3.0.0/pyproject.toml +37 -0
- lectura_multispeaker-3.0.0/setup.cfg +4 -0
- lectura_multispeaker-3.0.0/src/lectura_multispeaker/__init__.py +181 -0
- lectura_multispeaker-3.0.0/src/lectura_multispeaker/_chargeur.py +127 -0
- lectura_multispeaker-3.0.0/src/lectura_multispeaker/_crypto.py +48 -0
- lectura_multispeaker-3.0.0/src/lectura_multispeaker/_enhance.py +148 -0
- lectura_multispeaker-3.0.0/src/lectura_multispeaker/data/__init__.py +0 -0
- lectura_multispeaker-3.0.0/src/lectura_multispeaker/data/phoneme_vocab.json +108 -0
- lectura_multispeaker-3.0.0/src/lectura_multispeaker/data/speakers.json +51 -0
- lectura_multispeaker-3.0.0/src/lectura_multispeaker/inference_api.py +193 -0
- lectura_multispeaker-3.0.0/src/lectura_multispeaker/inference_onnx.py +815 -0
- lectura_multispeaker-3.0.0/src/lectura_multispeaker/phonemes.py +160 -0
- lectura_multispeaker-3.0.0/src/lectura_multispeaker.egg-info/PKG-INFO +22 -0
- lectura_multispeaker-3.0.0/src/lectura_multispeaker.egg-info/SOURCES.txt +22 -0
- lectura_multispeaker-3.0.0/src/lectura_multispeaker.egg-info/dependency_links.txt +1 -0
- lectura_multispeaker-3.0.0/src/lectura_multispeaker.egg-info/requires.txt +4 -0
- lectura_multispeaker-3.0.0/src/lectura_multispeaker.egg-info/top_level.txt +1 -0
- lectura_multispeaker-3.0.0/tests/test_chargeur.py +207 -0
- lectura_multispeaker-3.0.0/tests/test_crypto.py +46 -0
- lectura_multispeaker-3.0.0/tests/test_enhance.py +55 -0
- lectura_multispeaker-3.0.0/tests/test_inference.py +512 -0
- lectura_multispeaker-3.0.0/tests/test_integration.py +218 -0
- lectura_multispeaker-3.0.0/tests/test_phonemes.py +65 -0
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: lectura-multispeaker
|
|
3
|
+
Version: 3.0.0
|
|
4
|
+
Summary: Synthese vocale neuronale multi-speaker francais — Matcha-Conformer + HiFi-GAN (ONNX)
|
|
5
|
+
Author-email: Max Carriere <admin@lectura.world>
|
|
6
|
+
License: AGPL-3.0-or-later
|
|
7
|
+
Project-URL: Homepage, https://www.lectura.world/developpement/modules/outils/tts-multi/
|
|
8
|
+
Project-URL: Repository, https://github.com/maxcarriere/lectura-modules/tree/main/MultiSpeaker
|
|
9
|
+
Project-URL: Issues, https://github.com/maxcarriere/lectura-modules/issues
|
|
10
|
+
Keywords: tts,french,speech-synthesis,onnx,matcha,conformer,multi-speaker
|
|
11
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
Provides-Extra: onnx
|
|
21
|
+
Requires-Dist: onnxruntime>=1.16; extra == "onnx"
|
|
22
|
+
Requires-Dist: numpy>=1.24; extra == "onnx"
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "lectura-multispeaker"
|
|
7
|
+
version = "3.0.0"
|
|
8
|
+
description = "Synthese vocale neuronale multi-speaker francais — Matcha-Conformer + HiFi-GAN (ONNX)"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = {text = "AGPL-3.0-or-later"}
|
|
12
|
+
authors = [{name = "Max Carriere", email = "admin@lectura.world"}]
|
|
13
|
+
keywords = ["tts", "french", "speech-synthesis", "onnx", "matcha", "conformer", "multi-speaker"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 5 - Production/Stable",
|
|
16
|
+
"Intended Audience :: Developers",
|
|
17
|
+
"Programming Language :: Python :: 3.10",
|
|
18
|
+
"Programming Language :: Python :: 3.11",
|
|
19
|
+
"Programming Language :: Python :: 3.12",
|
|
20
|
+
"Topic :: Multimedia :: Sound/Audio :: Speech",
|
|
21
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
22
|
+
]
|
|
23
|
+
dependencies = []
|
|
24
|
+
|
|
25
|
+
[project.optional-dependencies]
|
|
26
|
+
onnx = ["onnxruntime>=1.16", "numpy>=1.24"]
|
|
27
|
+
|
|
28
|
+
[project.urls]
|
|
29
|
+
Homepage = "https://www.lectura.world/developpement/modules/outils/tts-multi/"
|
|
30
|
+
Repository = "https://github.com/maxcarriere/lectura-modules/tree/main/MultiSpeaker"
|
|
31
|
+
Issues = "https://github.com/maxcarriere/lectura-modules/issues"
|
|
32
|
+
|
|
33
|
+
[tool.setuptools.packages.find]
|
|
34
|
+
where = ["src"]
|
|
35
|
+
|
|
36
|
+
[tool.setuptools.package-data]
|
|
37
|
+
lectura_multispeaker = ["data/*.json", "modeles/*.onnx", "modeles/*.enc", "modeles/*.json"]
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
"""lectura-multispeaker — Synthese vocale neuronale multi-speaker francais.
|
|
2
|
+
|
|
3
|
+
Exports publics :
|
|
4
|
+
- creer_engine(mode, speaker, models_dir, ...) -> engine
|
|
5
|
+
- synthetiser(texte, speaker, **kwargs) -> numpy array float32
|
|
6
|
+
- liste_speakers() -> list[dict]
|
|
7
|
+
- OnnxTTSEngine, ApiTTSEngine
|
|
8
|
+
- TTSResult, PhonemeTiming
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import json
|
|
14
|
+
import logging
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from typing import Any
|
|
17
|
+
|
|
18
|
+
log = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
__version__ = "3.0.0"
|
|
21
|
+
|
|
22
|
+
_SPEAKERS_DATA: list[dict] | None = None
|
|
23
|
+
_DEFAULT_SPEAKER: str | None = None
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _load_speakers() -> tuple[list[dict], str]:
    """Return the speaker catalogue and the default speaker id, loading once.

    The first call reads ``data/speakers.json`` located next to this module
    and stores both values in the module-level cache; every later call
    returns the cached objects without touching the disk.

    Returns
    -------
    tuple[list[dict], str]
        The value of the ``"speakers"`` key of the JSON file, and the value
        of its ``"default"`` key (``"siwis"`` when that key is absent).

    Raises
    ------
    KeyError
        If the JSON document has no ``"speakers"`` key.
    """
    global _SPEAKERS_DATA, _DEFAULT_SPEAKER
    if _SPEAKERS_DATA is None:
        catalogue_file = Path(__file__).parent / "data" / "speakers.json"
        with open(catalogue_file, encoding="utf-8") as handle:
            payload = json.load(handle)
        _SPEAKERS_DATA = payload["speakers"]
        _DEFAULT_SPEAKER = payload.get("default", "siwis")
    return _SPEAKERS_DATA, _DEFAULT_SPEAKER
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def liste_speakers() -> list[dict]:
    """Return the available speakers together with their metadata.

    Returns
    -------
    list[dict]
        A new list holding the entries of the cached speaker catalogue.
        The list itself is a copy, but the dict entries are the cached
        objects (shallow copy). The original author documents each entry
        as carrying ``id``, ``name``, ``gender`` and ``label``.
    """
    catalogue = _load_speakers()[0]
    return [*catalogue]
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def creer_engine(
    mode: str = "auto",
    speaker: str = "siwis",
    models_dir: str | Path | None = None,
    api_url: str | None = None,
    api_key: str | None = None,
):
    """Build a multi-speaker TTS inference engine.

    Parameters
    ----------
    mode : str
        ``"auto"``  : use local ONNX inference when available, else the API.
        ``"local"`` : require local ONNX inference.
        Any other value (``"api"`` is the documented one) goes straight to
        the remote API engine.
    speaker : str
        Speaker name (the original documentation lists siwis, ezwa, nadine,
        bernard, gilles, zeckou).
    models_dir : str | Path | None
        Explicit ONNX model directory, forwarded to the local lookup.
    api_url : str | None
        API server URL, forwarded to the API engine.
    api_key : str | None
        API key, forwarded to the API engine.

    Returns
    -------
    OnnxTTSEngine | ApiTTSEngine
        The engine that was selected.

    Raises
    ------
    FileNotFoundError
        When ``mode == "local"`` and no local engine could be created.
    """
    local_required = mode == "local"
    if local_required or mode == "auto":
        local_engine = _try_local(speaker, models_dir)
        if local_engine is not None:
            return local_engine
        if local_required:
            raise FileNotFoundError(
                "Modeles ONNX introuvables. Verifiez l'installation ou "
                "specifiez models_dir. Voir README pour les emplacements."
            )
        log.info("Modeles locaux non disponibles, fallback vers API")

    # Imported lazily so the API client module is only loaded when needed.
    from lectura_multispeaker.inference_api import ApiTTSEngine

    return ApiTTSEngine(api_url=api_url, api_key=api_key, speaker=speaker)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _try_local(speaker: str, models_dir: str | Path | None = None):
    """Try to build a local ONNX engine; return ``None`` when not possible.

    ``None`` is returned in two situations: ``onnxruntime`` or ``numpy``
    cannot be imported, or ``find_models_dir`` finds no usable directory
    for ``speaker``.

    Parameters
    ----------
    speaker : str
        Speaker name, passed to the model lookup and to the engine.
    models_dir : str | Path | None
        Optional explicit model directory, passed to the model lookup.
    """
    # Probe the optional runtime dependencies before importing the
    # project modules that rely on them.
    try:
        import onnxruntime  # noqa: F401
        import numpy  # noqa: F401
    except ImportError:
        return None

    from lectura_multispeaker._chargeur import find_models_dir

    location = find_models_dir(speaker, models_dir)
    if location is None:
        return None

    from lectura_multispeaker.inference_onnx import OnnxTTSEngine

    return OnnxTTSEngine(location, speaker=speaker)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def synthetiser(
    texte: str,
    speaker: str = "siwis",
    mode: str = "auto",
    models_dir: str | Path | None = None,
    api_url: str | None = None,
    api_key: str | None = None,
    phrase_type: int | None = None,
    duration_scale: float = 1.0,
    pitch_shift: float = 0.0,
    pitch_range: float = 1.0,
    energy_scale: float = 1.0,
    pause_scale: float = 1.0,
    style: str | None = None,
    style_vector: list[float] | None = None,
    n_ode_steps: int | None = None,
    duration_noise: float | None = None,
) -> Any:
    """Convenience wrapper: text in, audio samples out.

    A new engine is created through ``creer_engine`` on every call, its
    ``synthesize`` method is invoked with the controls below, and the
    ``samples`` attribute of the result is returned.

    Parameters
    ----------
    texte : str
        French text to synthesise.
    speaker : str
        Speaker name (the original documentation lists siwis, ezwa, nadine,
        bernard, gilles, zeckou).
    mode, models_dir, api_url, api_key :
        Forwarded unchanged to ``creer_engine``.
    phrase_type, duration_scale, pitch_shift, pitch_range, energy_scale, pause_scale :
        Prosody controls, forwarded unchanged to ``engine.synthesize``.
    style : str | None
        Name of a style preset (the original documentation gives
        "narratif" and "dialogue" as examples).
    style_vector : list[float] | None
        Explicit style vector.
    n_ode_steps : int | None
        Number of ODE steps (documented as Matcha-Conformer only, with the
        default coming from the model configuration).
    duration_noise : float | None
        Smoothed duration noise (documented scale: 0.0 off, 0.1 subtle,
        0.2 pronounced).

    Returns
    -------
    numpy.ndarray
        ``result.samples``; documented by the original author as float32
        mono audio at 22050 Hz — not verifiable from this module.
    """
    engine = creer_engine(
        mode=mode,
        speaker=speaker,
        models_dir=models_dir,
        api_url=api_url,
        api_key=api_key,
    )
    controls = {
        "phrase_type": phrase_type,
        "duration_scale": duration_scale,
        "pitch_shift": pitch_shift,
        "pitch_range": pitch_range,
        "energy_scale": energy_scale,
        "pause_scale": pause_scale,
        "style": style,
        "style_vector": style_vector,
        "n_ode_steps": n_ode_steps,
        "duration_noise": duration_noise,
    }
    return engine.synthesize(texte, **controls).samples
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
__all__ = [
|
|
177
|
+
"creer_engine",
|
|
178
|
+
"synthetiser",
|
|
179
|
+
"liste_speakers",
|
|
180
|
+
"__version__",
|
|
181
|
+
]
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
"""Localisateur de modeles multi-speaker — cascade de chemins.
|
|
2
|
+
|
|
3
|
+
Ordre de recherche :
|
|
4
|
+
1. Parametre explicite models_dir
|
|
5
|
+
2. Variable d'environnement LECTURA_MODELS_DIR/tts_multispeaker
|
|
6
|
+
3. Repertoire utilisateur ~/.lectura/models/tts_multispeaker/
|
|
7
|
+
4. Modeles embarques dans le package (site-packages, version privee)
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import logging
|
|
13
|
+
import os
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
log = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
_PACKAGE_MODELS = Path(__file__).parent / "modeles"
|
|
19
|
+
|
|
20
|
+
# Fichiers partages requis pour l'inference ONNX locale
|
|
21
|
+
# Matcha-Conformer (v2) : matcha_unet.onnx + hifigan.onnx
|
|
22
|
+
# FastPitch (v1 legacy) : decoder.onnx + hifigan.onnx
|
|
23
|
+
SHARED_FILES_MATCHA = [
|
|
24
|
+
"matcha_unet.onnx",
|
|
25
|
+
"hifigan.onnx",
|
|
26
|
+
]
|
|
27
|
+
SHARED_FILES_FASTPITCH = [
|
|
28
|
+
"decoder.onnx",
|
|
29
|
+
"hifigan.onnx",
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def find_models_dir(
    speaker: str = "siwis",
    models_dir: str | Path | None = None,
) -> Path | None:
    """Locate the first directory that holds a usable set of ONNX models.

    Candidates are probed in this order:

    1. ``models_dir`` when it is not ``None``;
    2. ``$LECTURA_MODELS_DIR/tts_multispeaker`` when the variable is set
       and non-empty;
    3. ``~/.lectura/models/tts_multispeaker``;
    4. the ``modeles`` directory shipped inside the package.

    An explicit ``models_dir`` that does not qualify is not an error: the
    search simply continues with the remaining candidates.

    Args:
        speaker: Speaker name used when checking for a per-speaker encoder.
        models_dir: Optional explicit directory, tried first.

    Returns:
        The first candidate accepted by ``_has_models``, or ``None``.
    """
    search_path: list[Path] = []
    if models_dir is not None:
        search_path.append(Path(models_dir))

    env_root = os.environ.get("LECTURA_MODELS_DIR", "")
    if env_root:
        search_path.append(Path(env_root) / "tts_multispeaker")

    search_path += [
        Path.home() / ".lectura" / "models" / "tts_multispeaker",
        _PACKAGE_MODELS,
    ]

    # The generator is lazy, so probing stops at the first hit.
    match = next((c for c in search_path if _has_models(c, speaker)), None)
    if match is not None:
        log.debug("Modeles trouves : %s", match)
    return match
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _file_exists(directory: Path, filename: str) -> bool:
    """Tell whether ``filename`` or its ``.enc`` counterpart is in ``directory``.

    The counterpart name is built with ``str.replace``, so every ``.onnx``
    occurrence in ``filename`` is turned into ``.enc``; a name without
    ``.onnx`` is simply checked under its own name.
    """
    plain_name = filename
    obfuscated_name = filename.replace(".onnx", ".enc")
    return any(
        (directory / name).exists() for name in (plain_name, obfuscated_name)
    )
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _has_models(directory: Path, speaker: str = "siwis") -> bool:
    """Check that ``directory`` holds the files needed for local inference.

    Two model families are accepted, tried in this order: Matcha-Conformer,
    then the legacy FastPitch layout. A family qualifies when all of its
    shared files are present and either its unified encoder or the encoder
    dedicated to ``speaker`` is present. Each file may be available in
    plain (``.onnx``) or ``.enc`` form.

    Returns ``False`` when ``directory`` is not an existing directory.
    """
    if not directory.is_dir():
        return False

    def present(name: str) -> bool:
        return _file_exists(directory, name)

    # (shared files, unified encoder, per-speaker encoder) for each family.
    families = (
        (
            SHARED_FILES_MATCHA,
            "matcha_encoder.onnx",
            f"matcha_encoder_{speaker}.onnx",
        ),
        (
            SHARED_FILES_FASTPITCH,
            "encoder.onnx",
            f"encoder_{speaker}.onnx",
        ),
    )
    for shared, unified, per_speaker in families:
        if not all(present(name) for name in shared):
            continue
        if present(unified) or present(per_speaker):
            return True
    return False
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def load_model_bytes(models_dir: Path, filename: str) -> bytes | None:
    """Read a model from ``models_dir``, in plain or ``.enc`` form.

    The plain file takes precedence. When only the ``.enc`` counterpart
    exists (same name with ``.onnx`` replaced by ``.enc``), it is decoded
    through ``lectura_multispeaker._crypto.load_encrypted_model``.

    Returns:
        The ONNX model bytes, or ``None`` when neither file exists.
    """
    plain_file = models_dir / filename
    if plain_file.exists():
        return plain_file.read_bytes()

    obfuscated_file = models_dir / filename.replace(".onnx", ".enc")
    if not obfuscated_file.exists():
        return None

    # Lazy import: the decoder is only needed for .enc files.
    from lectura_multispeaker._crypto import load_encrypted_model

    return load_encrypted_model(obfuscated_file)
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""Dechiffrement des modeles ONNX chiffres (.enc) au runtime.
|
|
2
|
+
|
|
3
|
+
Les constantes de derivation sont specifiques au module TTS multi-speaker.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import hashlib
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
# Salt fragments — specific to the multi-speaker TTS module.
_P1 = "Lectura-TTS-MultiSpeaker"
_P2 = 0x4D53
_P3 = "2025-FastPitch-HiFiGAN-6voices"
_P4 = "ONNX-PerSpeaker-Encoder"

# Number of key repetitions processed per XOR step (256 * 256 B = 64 KiB).
# Working on whole multiples of the key keeps every chunk key-aligned.
_KEY_REPEATS_PER_CHUNK = 256


def _derive_key() -> bytes:
    """Derive the 256-byte repeating key from the module constants.

    The key is the concatenation of eight chained SHA-256 digests: the
    first hashes the salt material, each following one hashes the previous
    digest followed by the material (8 x 32 = 256 bytes).
    """
    material = f"{_P1}:{_P2:#06x}:{_P3}:{_P4}".encode("utf-8")
    digest = hashlib.sha256(material).digest()
    parts = [digest]
    for _ in range(7):
        digest = hashlib.sha256(digest + material).digest()
        parts.append(digest)
    return b"".join(parts)[:256]


def _xor_with_key(data: bytes) -> bytes:
    """XOR ``data`` with the repeating derived key.

    The transform is its own inverse, so the same routine both encodes and
    decodes. Each chunk is XORed as one big integer, which runs in C
    instead of a per-byte Python loop; the output is byte-for-byte the
    same as ``data[i] ^ key[i % len(key)]``.
    """
    key = _derive_key()
    pad = key * _KEY_REPEATS_PER_CHUNK
    chunk_size = len(pad)
    pad_as_int = int.from_bytes(pad, "little")

    source = memoryview(data)
    result = bytearray(len(data))
    for start in range(0, len(data), chunk_size):
        piece = source[start:start + chunk_size]
        size = len(piece)
        # Only the final chunk can be shorter than the precomputed pad.
        mask = (
            pad_as_int
            if size == chunk_size
            else int.from_bytes(pad[:size], "little")
        )
        mixed = int.from_bytes(piece, "little") ^ mask
        result[start:start + size] = mixed.to_bytes(size, "little")
    return bytes(result)


def load_encrypted_model(enc_path: Path) -> bytes:
    """Read a ``.enc`` file and return the decoded ONNX bytes.

    NOTE: the key is derived only from constants in this module and is
    applied as a repeating XOR, so this is obfuscation rather than
    cryptographic protection.
    """
    return _xor_with_key(enc_path.read_bytes())


def encrypt_model(onnx_path: Path, enc_path: Path) -> None:
    """Encode an ONNX file into its ``.enc`` form (package preparation).

    Reads ``onnx_path`` and writes the XOR-encoded bytes to ``enc_path``.
    """
    enc_path.write_bytes(_xor_with_key(onnx_path.read_bytes()))
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
"""Enhancement mel-spectrogramme — numpy pur (pas de scipy).
|
|
2
|
+
|
|
3
|
+
Compense le lissage L1 du FastPitch :
|
|
4
|
+
- Contraste spectral (inter-bande par frame)
|
|
5
|
+
- Sharpening temporel (unsharp mask avec kernel gaussien numpy)
|
|
6
|
+
- Noise gate (frames silencieuses)
|
|
7
|
+
- Fade-out (anti-pop fin d'utterance)
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import numpy as np
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _gaussian_kernel_1d(sigma: float) -> np.ndarray:
    """Build a normalised 1-D Gaussian kernel.

    The kernel has ``int(sigma * 4) | 1`` taps (forced odd so that there is
    a centre tap) and its weights sum to 1.
    """
    width = int(sigma * 4) | 1
    offsets = np.arange(width) - width // 2
    weights = np.exp(-0.5 * (offsets / sigma) ** 2)
    return weights / weights.sum()
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _smooth_temporal(mel: np.ndarray, sigma: float = 3.0) -> np.ndarray:
    """Smooth each band of ``mel`` along time with a Gaussian kernel.

    Args:
        mel: 2-D array indexed as ``[band, frame]``.
        sigma: Standard deviation of the Gaussian, in frames.

    Returns:
        A new array with the same shape and dtype as ``mel``. A ``mel``
        with no frames yields an empty result instead of the ``ValueError``
        that ``np.convolve`` raises on empty operands.
    """
    n_frames = mel.shape[1]
    if n_frames == 0:
        # Nothing to smooth; truncating the kernel to zero taps would make
        # np.convolve fail.
        return np.zeros_like(mel)

    kernel = _gaussian_kernel_1d(sigma)
    if n_frames < len(kernel):
        # Very short sequence: keep only the central n_frames taps and
        # renormalise them.
        center = len(kernel) // 2
        half = n_frames // 2
        kernel = kernel[center - half : center + half + (n_frames % 2)]
        total = kernel.sum()
        if total > 0:
            kernel = kernel / total

    smoothed = np.zeros_like(mel)
    for band, row in enumerate(mel):
        # The kernel is never longer than the row here, so "same" mode
        # returns exactly n_frames samples.
        smoothed[band] = np.convolve(row, kernel, mode="same")
    return smoothed
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def enhance_mel(
    mel: np.ndarray,
    spectral_alpha: float = 0.20,
    temporal_alpha: float = 0.20,
    clip_min: float = -11.5,
    clip_max: float = 2.0,
) -> np.ndarray:
    """Apply spectral contrast and temporal sharpening to a mel-spectrogram.

    Args:
        mel: mel-spectrogram indexed as ``[n_mels, T]``.
        spectral_alpha: strength of the inter-band contrast boost.
        temporal_alpha: strength of the temporal sharpening.
        clip_min: lower clipping bound (log(1e-5) is about -11.5).
        clip_max: upper clipping bound.

    Returns:
        The enhanced mel as a new array with the same dimensions.
    """
    # Step 1 - spectral contrast: push every band away from the mean of
    # its frame.
    per_frame_avg = mel.mean(axis=0, keepdims=True)  # [1, T]
    contrasted = per_frame_avg + (mel - per_frame_avg) * (1.0 + spectral_alpha)

    # Step 2 - temporal sharpening (unsharp mask): add back a fraction of
    # what the Gaussian blur removed.
    blurred = _smooth_temporal(contrasted, sigma=3.0)
    sharpened = contrasted + (contrasted - blurred) * temporal_alpha

    return np.clip(sharpened, clip_min, clip_max)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def noise_gate(
    mel: np.ndarray,
    threshold: float = -8.0,
    silence_val: float = -11.5,
) -> np.ndarray:
    """Pull frames whose mean level is below ``threshold`` toward silence.

    The gate is soft: it is 0 for a frame mean at or above ``threshold``,
    1 at or below ``silence_val``, and linear in between. Gated frames are
    blended with ``silence_val`` in that proportion. The input array is
    left untouched.
    """
    level = mel.mean(axis=0)  # one value per frame, [T]
    span = threshold - silence_val
    strength = np.clip((threshold - level) / span, 0, 1)
    kept = mel * (1 - strength)
    return kept + silence_val * strength
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def fade_out(
    mel: np.ndarray,
    n_frames: int = 5,
    silence_val: float = -11.5,
) -> np.ndarray:
    """Linearly fade the last frames of ``mel`` toward ``silence_val``.

    At most ``n_frames`` trailing frames are affected (fewer when the mel
    is shorter). The blend goes from the original value on the first faded
    frame to ``silence_val`` on the very last frame.

    The array is modified in place and the same object is returned.
    """
    count = min(n_frames, mel.shape[1])
    if count <= 0:
        return mel
    weights = np.linspace(1.0, 0.0, count)
    tail = mel[:, -count:]
    mel[:, -count:] = tail * weights + silence_val * (1.0 - weights)
    return mel
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def waveform_silence_gate(
    audio: np.ndarray,
    sample_rate: int = 22050,
    window_ms: float = 15.0,
    threshold_db: float = -35.0,
    fade_samples: int = 128,
) -> np.ndarray:
    """Mute the quiet stretches of a waveform (post-vocoder clean-up).

    The signal is cut into consecutive windows of ``window_ms``; a window
    whose RMS is below ``threshold_db`` (converted to linear amplitude) is
    muted. Linear ramps of up to ``fade_samples`` samples are then written
    over the muted side of each transition so the gate opens and closes
    smoothly.

    Args:
        audio: waveform samples (indexed along its first axis).
        sample_rate: sampling rate used to convert ``window_ms`` to samples.
        window_ms: analysis window length in milliseconds.
        threshold_db: RMS level below which a window is muted.
        fade_samples: maximum ramp length at each gate transition.

    Returns:
        ``audio`` multiplied by the gate. The input object itself is
        returned unchanged when it is empty or shorter than one window.
    """
    total = len(audio)
    if total == 0:
        return audio

    win_size = max(1, int(sample_rate * window_ms / 1000))
    if total // win_size == 0:
        return audio
    threshold_lin = 10 ** (threshold_db / 20)

    # Hard gate: 1 keeps a window, 0 mutes it. The last chunk may be a
    # shorter remainder; slicing clamps it to the end of the signal.
    gate = np.ones(total, dtype=np.float32)
    for begin in range(0, total, win_size):
        chunk = audio[begin:begin + win_size]
        if np.sqrt(np.mean(chunk ** 2)) < threshold_lin:
            gate[begin:begin + win_size] = 0.0

    # Transitions are located on the hard gate, before any ramp is written.
    steps = np.diff(gate, prepend=gate[0])

    # NOTE(review): when a muted stretch is shorter than fade_samples the
    # ramps below can reach into neighbouring kept samples - confirm this
    # is acceptable for non-default window/fade settings.
    # Gate opening: ramp 0 -> 1 over the samples just before it opens.
    for rise in np.flatnonzero(steps > 0.5):
        ramp_start = max(0, rise - fade_samples)
        if rise > ramp_start:
            gate[ramp_start:rise] = np.linspace(0.0, 1.0, rise - ramp_start)

    # Gate closing: ramp 1 -> 0 over the samples just after it closes.
    for fall in np.flatnonzero(steps < -0.5):
        ramp_end = min(total, fall + fade_samples)
        if ramp_end > fall:
            gate[fall:ramp_end] = np.linspace(1.0, 0.0, ramp_end - fall)

    return audio * gate
|
|
File without changes
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
{
|
|
2
|
+
"vocab": [
|
|
3
|
+
"<PAD>",
|
|
4
|
+
"<UNK>",
|
|
5
|
+
"#",
|
|
6
|
+
"|",
|
|
7
|
+
"a",
|
|
8
|
+
"e",
|
|
9
|
+
"i",
|
|
10
|
+
"o",
|
|
11
|
+
"u",
|
|
12
|
+
"y",
|
|
13
|
+
"ɛ",
|
|
14
|
+
"ɔ",
|
|
15
|
+
"ø",
|
|
16
|
+
"œ",
|
|
17
|
+
"ə",
|
|
18
|
+
"ɑ",
|
|
19
|
+
"ɛ̃",
|
|
20
|
+
"ɔ̃",
|
|
21
|
+
"ɑ̃",
|
|
22
|
+
"œ̃",
|
|
23
|
+
"p",
|
|
24
|
+
"b",
|
|
25
|
+
"t",
|
|
26
|
+
"d",
|
|
27
|
+
"k",
|
|
28
|
+
"ɡ",
|
|
29
|
+
"f",
|
|
30
|
+
"v",
|
|
31
|
+
"s",
|
|
32
|
+
"z",
|
|
33
|
+
"ʃ",
|
|
34
|
+
"ʒ",
|
|
35
|
+
"m",
|
|
36
|
+
"n",
|
|
37
|
+
"ɲ",
|
|
38
|
+
"l",
|
|
39
|
+
"ʁ",
|
|
40
|
+
"j",
|
|
41
|
+
"w",
|
|
42
|
+
"ɥ",
|
|
43
|
+
"tʃ",
|
|
44
|
+
"dʒ",
|
|
45
|
+
"x",
|
|
46
|
+
"ɣ",
|
|
47
|
+
"ŋ",
|
|
48
|
+
"ɹ",
|
|
49
|
+
",",
|
|
50
|
+
".",
|
|
51
|
+
"?",
|
|
52
|
+
"!",
|
|
53
|
+
"…"
|
|
54
|
+
],
|
|
55
|
+
"phone2id": {
|
|
56
|
+
"<PAD>": 0,
|
|
57
|
+
"<UNK>": 1,
|
|
58
|
+
"#": 2,
|
|
59
|
+
"|": 3,
|
|
60
|
+
"a": 4,
|
|
61
|
+
"e": 5,
|
|
62
|
+
"i": 6,
|
|
63
|
+
"o": 7,
|
|
64
|
+
"u": 8,
|
|
65
|
+
"y": 9,
|
|
66
|
+
"ɛ": 10,
|
|
67
|
+
"ɔ": 11,
|
|
68
|
+
"ø": 12,
|
|
69
|
+
"œ": 13,
|
|
70
|
+
"ə": 14,
|
|
71
|
+
"ɑ": 15,
|
|
72
|
+
"ɛ̃": 16,
|
|
73
|
+
"ɔ̃": 17,
|
|
74
|
+
"ɑ̃": 18,
|
|
75
|
+
"œ̃": 19,
|
|
76
|
+
"p": 20,
|
|
77
|
+
"b": 21,
|
|
78
|
+
"t": 22,
|
|
79
|
+
"d": 23,
|
|
80
|
+
"k": 24,
|
|
81
|
+
"ɡ": 25,
|
|
82
|
+
"f": 26,
|
|
83
|
+
"v": 27,
|
|
84
|
+
"s": 28,
|
|
85
|
+
"z": 29,
|
|
86
|
+
"ʃ": 30,
|
|
87
|
+
"ʒ": 31,
|
|
88
|
+
"m": 32,
|
|
89
|
+
"n": 33,
|
|
90
|
+
"ɲ": 34,
|
|
91
|
+
"l": 35,
|
|
92
|
+
"ʁ": 36,
|
|
93
|
+
"j": 37,
|
|
94
|
+
"w": 38,
|
|
95
|
+
"ɥ": 39,
|
|
96
|
+
"tʃ": 40,
|
|
97
|
+
"dʒ": 41,
|
|
98
|
+
"x": 42,
|
|
99
|
+
"ɣ": 43,
|
|
100
|
+
"ŋ": 44,
|
|
101
|
+
"ɹ": 45,
|
|
102
|
+
",": 46,
|
|
103
|
+
".": 47,
|
|
104
|
+
"?": 48,
|
|
105
|
+
"!": 49,
|
|
106
|
+
"…": 50
|
|
107
|
+
}
|
|
108
|
+
}
|