piper_tts_plus-1.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- piper/__init__.py +5 -0
- piper/__main__.py +159 -0
- piper/config.py +54 -0
- piper/const.py +5 -0
- piper/download.py +139 -0
- piper/file_hash.py +46 -0
- piper/http_server.py +127 -0
- piper/util.py +12 -0
- piper/voice.py +216 -0
- piper/voices.json +4222 -0
- piper_tts_plus-1.2.0.dist-info/METADATA +32 -0
- piper_tts_plus-1.2.0.dist-info/RECORD +15 -0
- piper_tts_plus-1.2.0.dist-info/WHEEL +5 -0
- piper_tts_plus-1.2.0.dist-info/entry_points.txt +2 -0
- piper_tts_plus-1.2.0.dist-info/top_level.txt +1 -0
piper/voice.py
ADDED
@@ -0,0 +1,216 @@
+import json
+import logging
+import wave
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+
+import numpy as np
+import onnxruntime
+import pyopenjtalk
+from piper_phonemize import phonemize_codepoints, phonemize_espeak, tashkeel_run
+
+from .config import PhonemeType, PiperConfig
+from .const import BOS, EOS, PAD
+from .util import audio_float_to_int16
+
+_LOGGER = logging.getLogger(__name__)
+
+
+@dataclass
+class PiperVoice:
+    session: onnxruntime.InferenceSession
+    config: PiperConfig
+
+    @staticmethod
+    def load(
+        model_path: Union[str, Path],
+        config_path: Optional[Union[str, Path]] = None,
+        use_cuda: bool = False,
+    ) -> "PiperVoice":
+        """Load an ONNX model and config."""
+        if config_path is None:
+            config_path = f"{model_path}.json"
+
+        with open(config_path, "r", encoding="utf-8") as config_file:
+            config_dict = json.load(config_file)
+
+        providers: List[Union[str, Tuple[str, Dict[str, Any]]]]
+        if use_cuda:
+            providers = [
+                (
+                    "CUDAExecutionProvider",
+                    {"cudnn_conv_algo_search": "HEURISTIC"},
+                )
+            ]
+        else:
+            providers = ["CPUExecutionProvider"]
+
+        return PiperVoice(
+            config=PiperConfig.from_dict(config_dict),
+            session=onnxruntime.InferenceSession(
+                str(model_path),
+                sess_options=onnxruntime.SessionOptions(),
+                providers=providers,
+            ),
+        )
+
+    def phonemize(self, text: str) -> List[List[str]]:
+        """Text to phonemes grouped by sentence."""
+        if self.config.phoneme_type == PhonemeType.ESPEAK:
+            if self.config.espeak_voice == "ar":
+                # Arabic diacritization
+                # https://github.com/mush42/libtashkeel/
+                text = tashkeel_run(text)
+
+            return phonemize_espeak(text, self.config.espeak_voice)
+
+        if self.config.phoneme_type == PhonemeType.TEXT:
+            return phonemize_codepoints(text)
+
+        if self.config.phoneme_type == PhonemeType.OPENJTALK:
+            # Phonemize with the same algorithm used when training Piper (with accent/prosody marks)
+            try:
+                # Use the dedicated implementation if `piper_train` is installed
+                from piper_train.phonemize.japanese import phonemize_japanese  # type: ignore
+
+                tokens = phonemize_japanese(text)
+                return [tokens]
+            except Exception:  # pragma: no cover - fallback
+                # Simple fallback for environments where piper_train is not available
+                phonemes = pyopenjtalk.g2p(text, kana=False).split()
+
+                converted = []
+                for ph in phonemes:
+                    if ph == "pau":
+                        converted.append("_")
+                        continue
+
+                    # Devoiced vowels come back as upper-case (A,I,U,E,O)
+                    if ph in {"A", "I", "U", "E", "O"}:
+                        ph = ph.lower()
+
+                    converted.append(ph)
+
+                return [converted]
+
+        raise ValueError(f"Unexpected phoneme type: {self.config.phoneme_type}")
+
+    def phonemes_to_ids(self, phonemes: List[str]) -> List[int]:
+        """Phonemes to ids."""
+        id_map = self.config.phoneme_id_map
+        ids: List[int] = list(id_map[BOS])
+
+        for phoneme in phonemes:
+            if phoneme not in id_map:
+                _LOGGER.warning("Missing phoneme from id map: %s", phoneme)
+                continue
+
+            ids.extend(id_map[phoneme])
+
+            # Only eSpeak-style training data includes PAD ("_") after each phoneme.
+            # Models trained with openjtalk do not include PAD explicitly, so it is not added here.
+            if self.config.phoneme_type != PhonemeType.OPENJTALK:
+                ids.extend(id_map[PAD])
+
+        ids.extend(id_map[EOS])
+
+        return ids
+
+    def synthesize(
+        self,
+        text: str,
+        wav_file: wave.Wave_write,
+        speaker_id: Optional[int] = None,
+        length_scale: Optional[float] = None,
+        noise_scale: Optional[float] = None,
+        noise_w: Optional[float] = None,
+        sentence_silence: float = 0.0,
+    ):
+        """Synthesize WAV audio from text."""
+        wav_file.setframerate(self.config.sample_rate)
+        wav_file.setsampwidth(2)  # 16-bit
+        wav_file.setnchannels(1)  # mono
+
+        for audio_bytes in self.synthesize_stream_raw(
+            text,
+            speaker_id=speaker_id,
+            length_scale=length_scale,
+            noise_scale=noise_scale,
+            noise_w=noise_w,
+            sentence_silence=sentence_silence,
+        ):
+            wav_file.writeframes(audio_bytes)
+
+    def synthesize_stream_raw(
+        self,
+        text: str,
+        speaker_id: Optional[int] = None,
+        length_scale: Optional[float] = None,
+        noise_scale: Optional[float] = None,
+        noise_w: Optional[float] = None,
+        sentence_silence: float = 0.0,
+    ) -> Iterable[bytes]:
+        """Synthesize raw audio per sentence from text."""
+        sentence_phonemes = self.phonemize(text)
+
+        # 16-bit mono
+        num_silence_samples = int(sentence_silence * self.config.sample_rate)
+        silence_bytes = bytes(num_silence_samples * 2)
+
+        for phonemes in sentence_phonemes:
+            phoneme_ids = self.phonemes_to_ids(phonemes)
+            yield self.synthesize_ids_to_raw(
+                phoneme_ids,
+                speaker_id=speaker_id,
+                length_scale=length_scale,
+                noise_scale=noise_scale,
+                noise_w=noise_w,
+            ) + silence_bytes
+
+    def synthesize_ids_to_raw(
+        self,
+        phoneme_ids: List[int],
+        speaker_id: Optional[int] = None,
+        length_scale: Optional[float] = None,
+        noise_scale: Optional[float] = None,
+        noise_w: Optional[float] = None,
+    ) -> bytes:
+        """Synthesize raw audio from phoneme ids."""
+        if length_scale is None:
+            length_scale = self.config.length_scale
+
+        if noise_scale is None:
+            noise_scale = self.config.noise_scale
+
+        if noise_w is None:
+            noise_w = self.config.noise_w
+
+        phoneme_ids_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
+        phoneme_ids_lengths = np.array([phoneme_ids_array.shape[1]], dtype=np.int64)
+        scales = np.array(
+            [noise_scale, length_scale, noise_w],
+            dtype=np.float32,
+        )
+
+        args = {
+            "input": phoneme_ids_array,
+            "input_lengths": phoneme_ids_lengths,
+            "scales": scales,
+        }
+
+        if self.config.num_speakers <= 1:
+            speaker_id = None
+
+        if (self.config.num_speakers > 1) and (speaker_id is None):
+            # Default speaker
+            speaker_id = 0
+
+        if speaker_id is not None:
+            sid = np.array([speaker_id], dtype=np.int64)
+            args["sid"] = sid
+
+        # Synthesize through ONNX
+        audio = self.session.run(None, args)[0].squeeze((0, 1))
+        audio = audio_float_to_int16(audio.squeeze())
+        return audio.tobytes()
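
For orientation, a minimal usage sketch of the API added above. The voice filename ja_JP-example-medium.onnx is a placeholder; the config is assumed to sit next to the model as ja_JP-example-medium.onnx.json, which is the default that load() falls back to when config_path is omitted.

    # Sketch: load a Piper voice and write synthesized speech to a WAV file.
    # "ja_JP-example-medium.onnx" is a hypothetical model path.
    import wave

    from piper.voice import PiperVoice

    voice = PiperVoice.load("ja_JP-example-medium.onnx")  # config defaults to <model>.json

    with wave.open("output.wav", "wb") as wav_file:
        # synthesize() sets the frame rate, 16-bit sample width, and mono channel itself.
        voice.synthesize("こんにちは、世界。", wav_file, sentence_silence=0.25)

For streaming consumers, synthesize_stream_raw yields raw 16-bit mono PCM per sentence (each chunk followed by the optional sentence_silence padding) instead of writing a WAV container.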