piper-tts-plus 1.2.0__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.
piper/voice.py ADDED
@@ -0,0 +1,216 @@
+ import json
+ import logging
+ import wave
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+
+ import numpy as np
+ import onnxruntime
+ import pyopenjtalk
+ from piper_phonemize import phonemize_codepoints, phonemize_espeak, tashkeel_run
+
+ from .config import PhonemeType, PiperConfig
+ from .const import BOS, EOS, PAD
+ from .util import audio_float_to_int16
+
+ _LOGGER = logging.getLogger(__name__)
+
+
+ @dataclass
+ class PiperVoice:
+     session: onnxruntime.InferenceSession
+     config: PiperConfig
+
+     @staticmethod
+     def load(
+         model_path: Union[str, Path],
+         config_path: Optional[Union[str, Path]] = None,
+         use_cuda: bool = False,
+     ) -> "PiperVoice":
+         """Load an ONNX model and config."""
+         if config_path is None:
+             config_path = f"{model_path}.json"
+
+         with open(config_path, "r", encoding="utf-8") as config_file:
+             config_dict = json.load(config_file)
+
+         providers: List[Union[str, Tuple[str, Dict[str, Any]]]]
+         if use_cuda:
+             providers = [
+                 (
+                     "CUDAExecutionProvider",
+                     {"cudnn_conv_algo_search": "HEURISTIC"},
+                 )
+             ]
+         else:
+             providers = ["CPUExecutionProvider"]
+
+         return PiperVoice(
+             config=PiperConfig.from_dict(config_dict),
+             session=onnxruntime.InferenceSession(
+                 str(model_path),
+                 sess_options=onnxruntime.SessionOptions(),
+                 providers=providers,
+             ),
+         )
+
+     def phonemize(self, text: str) -> List[List[str]]:
+         """Text to phonemes grouped by sentence."""
+         if self.config.phoneme_type == PhonemeType.ESPEAK:
+             if self.config.espeak_voice == "ar":
+                 # Arabic diacritization
+                 # https://github.com/mush42/libtashkeel/
+                 text = tashkeel_run(text)
+
+             return phonemize_espeak(text, self.config.espeak_voice)
+
+         if self.config.phoneme_type == PhonemeType.TEXT:
+             return phonemize_codepoints(text)
+
+         if self.config.phoneme_type == PhonemeType.OPENJTALK:
+             # Phonemize with the same algorithm used when training Piper (with accent/prosody)
+             try:
+                 # Use the dedicated implementation when piper_train is installed
+                 from piper_train.phonemize.japanese import phonemize_japanese  # type: ignore
+
+                 tokens = phonemize_japanese(text)
+                 return [tokens]
+             except Exception:  # pragma: no cover - fallback
+                 # Simple fallback for environments without piper_train
+                 phonemes = pyopenjtalk.g2p(text, kana=False).split()
+
+                 converted = []
+                 for ph in phonemes:
+                     if ph == "pau":
+                         converted.append("_")
+                         continue
+
+                     # Devoiced vowels come back as upper-case (A,I,U,E,O)
+                     if ph in {"A", "I", "U", "E", "O"}:
+                         ph = ph.lower()
+
+                     converted.append(ph)
+
+                 return [converted]
+
+         raise ValueError(f"Unexpected phoneme type: {self.config.phoneme_type}")
+
+     def phonemes_to_ids(self, phonemes: List[str]) -> List[int]:
+         """Phonemes to ids."""
+         id_map = self.config.phoneme_id_map
+         ids: List[int] = list(id_map[BOS])
+
+         for phoneme in phonemes:
+             if phoneme not in id_map:
+                 _LOGGER.warning("Missing phoneme from id map: %s", phoneme)
+                 continue
+
+             ids.extend(id_map[phoneme])
+
+             # Only eSpeak-style training data interleaves PAD ("_") after each phoneme.
+             # Models trained with openjtalk phonemes do not include PAD explicitly, so don't add it.
+             if self.config.phoneme_type != PhonemeType.OPENJTALK:
+                 ids.extend(id_map[PAD])
+
+         ids.extend(id_map[EOS])
+
+         return ids
+
+     def synthesize(
+         self,
+         text: str,
+         wav_file: wave.Wave_write,
+         speaker_id: Optional[int] = None,
+         length_scale: Optional[float] = None,
+         noise_scale: Optional[float] = None,
+         noise_w: Optional[float] = None,
+         sentence_silence: float = 0.0,
+     ):
+         """Synthesize WAV audio from text."""
+         wav_file.setframerate(self.config.sample_rate)
+         wav_file.setsampwidth(2)  # 16-bit
+         wav_file.setnchannels(1)  # mono
+
+         for audio_bytes in self.synthesize_stream_raw(
+             text,
+             speaker_id=speaker_id,
+             length_scale=length_scale,
+             noise_scale=noise_scale,
+             noise_w=noise_w,
+             sentence_silence=sentence_silence,
+         ):
+             wav_file.writeframes(audio_bytes)
+
+     def synthesize_stream_raw(
+         self,
+         text: str,
+         speaker_id: Optional[int] = None,
+         length_scale: Optional[float] = None,
+         noise_scale: Optional[float] = None,
+         noise_w: Optional[float] = None,
+         sentence_silence: float = 0.0,
+     ) -> Iterable[bytes]:
+         """Synthesize raw audio per sentence from text."""
+         sentence_phonemes = self.phonemize(text)
+
+         # 16-bit mono
+         num_silence_samples = int(sentence_silence * self.config.sample_rate)
+         silence_bytes = bytes(num_silence_samples * 2)
+
+         for phonemes in sentence_phonemes:
+             phoneme_ids = self.phonemes_to_ids(phonemes)
+             yield self.synthesize_ids_to_raw(
+                 phoneme_ids,
+                 speaker_id=speaker_id,
+                 length_scale=length_scale,
+                 noise_scale=noise_scale,
+                 noise_w=noise_w,
+             ) + silence_bytes
+
+     def synthesize_ids_to_raw(
+         self,
+         phoneme_ids: List[int],
+         speaker_id: Optional[int] = None,
+         length_scale: Optional[float] = None,
+         noise_scale: Optional[float] = None,
+         noise_w: Optional[float] = None,
+     ) -> bytes:
+         """Synthesize raw audio from phoneme ids."""
+         if length_scale is None:
+             length_scale = self.config.length_scale
+
+         if noise_scale is None:
+             noise_scale = self.config.noise_scale
+
+         if noise_w is None:
+             noise_w = self.config.noise_w
+
+         phoneme_ids_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
+         phoneme_ids_lengths = np.array([phoneme_ids_array.shape[1]], dtype=np.int64)
+         scales = np.array(
+             [noise_scale, length_scale, noise_w],
+             dtype=np.float32,
+         )
+
+         args = {
+             "input": phoneme_ids_array,
+             "input_lengths": phoneme_ids_lengths,
+             "scales": scales,
+         }
+
+         if self.config.num_speakers <= 1:
+             speaker_id = None
+
+         if (self.config.num_speakers > 1) and (speaker_id is None):
+             # Default speaker
+             speaker_id = 0
+
+         if speaker_id is not None:
+             sid = np.array([speaker_id], dtype=np.int64)
+             args["sid"] = sid
+
+         # Synthesize through ONNX
+         audio = self.session.run(None, args)[0].squeeze((0, 1))
+         audio = audio_float_to_int16(audio.squeeze())
+         return audio.tobytes()
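
For reference, a minimal usage sketch of the API added above (not part of the package diff). The model filename is a placeholder for any Piper ONNX voice, and the config is assumed to sit next to the model as <model_path>.json, which is the load() default.

import wave

from piper.voice import PiperVoice

# Placeholder voice model; load() reads "<model_path>.json" when no config path is given.
voice = PiperVoice.load("en_US-lessac-medium.onnx")

# Write a 16-bit mono WAV at the voice's sample rate.
with wave.open("hello.wav", "wb") as wav_file:
    voice.synthesize("Hello from Piper!", wav_file, sentence_silence=0.2)

# Alternatively, stream raw 16-bit PCM sentence by sentence.
for pcm_bytes in voice.synthesize_stream_raw("Streaming also works."):
    pass  # feed pcm_bytes to an audio sink of your choice

Each yielded chunk is one sentence of 16-bit mono PCM with sentence_silence seconds of silence appended, as implemented in synthesize_stream_raw above.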