wyoming-microsoft-tts 1.3.5__tar.gz → 1.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {wyoming_microsoft_tts-1.3.5 → wyoming_microsoft_tts-1.4.0}/PKG-INFO +1 -1
- {wyoming_microsoft_tts-1.3.5 → wyoming_microsoft_tts-1.4.0}/pyproject.toml +1 -1
- {wyoming_microsoft_tts-1.3.5 → wyoming_microsoft_tts-1.4.0}/requirements.txt +2 -2
- {wyoming_microsoft_tts-1.3.5 → wyoming_microsoft_tts-1.4.0}/setup.py +1 -1
- {wyoming_microsoft_tts-1.3.5 → wyoming_microsoft_tts-1.4.0}/tests/conftest.py +5 -0
- wyoming_microsoft_tts-1.4.0/tests/test_microsoft_tts.py +352 -0
- {wyoming_microsoft_tts-1.3.5 → wyoming_microsoft_tts-1.4.0}/wyoming_microsoft_tts/__main__.py +26 -0
- wyoming_microsoft_tts-1.4.0/wyoming_microsoft_tts/microsoft_tts.py +115 -0
- {wyoming_microsoft_tts-1.3.5 → wyoming_microsoft_tts-1.4.0}/wyoming_microsoft_tts/version.py +1 -1
- {wyoming_microsoft_tts-1.3.5 → wyoming_microsoft_tts-1.4.0}/wyoming_microsoft_tts.egg-info/PKG-INFO +1 -1
- wyoming_microsoft_tts-1.3.5/tests/test_microsoft_tts.py +0 -17
- wyoming_microsoft_tts-1.3.5/wyoming_microsoft_tts/microsoft_tts.py +0 -62
- {wyoming_microsoft_tts-1.3.5 → wyoming_microsoft_tts-1.4.0}/MANIFEST.in +0 -0
- {wyoming_microsoft_tts-1.3.5 → wyoming_microsoft_tts-1.4.0}/README.md +0 -0
- {wyoming_microsoft_tts-1.3.5 → wyoming_microsoft_tts-1.4.0}/setup.cfg +0 -0
- {wyoming_microsoft_tts-1.3.5 → wyoming_microsoft_tts-1.4.0}/tests/__init__.py +0 -0
- {wyoming_microsoft_tts-1.3.5 → wyoming_microsoft_tts-1.4.0}/tests/test_download.py +0 -0
- {wyoming_microsoft_tts-1.3.5 → wyoming_microsoft_tts-1.4.0}/tests/test_voice_parsing.py +0 -0
- {wyoming_microsoft_tts-1.3.5 → wyoming_microsoft_tts-1.4.0}/wyoming_microsoft_tts/__init__.py +0 -0
- {wyoming_microsoft_tts-1.3.5 → wyoming_microsoft_tts-1.4.0}/wyoming_microsoft_tts/download.py +0 -0
- {wyoming_microsoft_tts-1.3.5 → wyoming_microsoft_tts-1.4.0}/wyoming_microsoft_tts/handler.py +0 -0
- {wyoming_microsoft_tts-1.3.5 → wyoming_microsoft_tts-1.4.0}/wyoming_microsoft_tts/sentence_boundary.py +0 -0
- {wyoming_microsoft_tts-1.3.5 → wyoming_microsoft_tts-1.4.0}/wyoming_microsoft_tts/voices.json +0 -0
- {wyoming_microsoft_tts-1.3.5 → wyoming_microsoft_tts-1.4.0}/wyoming_microsoft_tts.egg-info/SOURCES.txt +0 -0
- {wyoming_microsoft_tts-1.3.5 → wyoming_microsoft_tts-1.4.0}/wyoming_microsoft_tts.egg-info/dependency_links.txt +0 -0
- {wyoming_microsoft_tts-1.3.5 → wyoming_microsoft_tts-1.4.0}/wyoming_microsoft_tts.egg-info/requires.txt +0 -0
- {wyoming_microsoft_tts-1.3.5 → wyoming_microsoft_tts-1.4.0}/wyoming_microsoft_tts.egg-info/top_level.txt +0 -0
|
@@ -20,7 +20,7 @@ data_files = [module_dir / "voices.json"]
|
|
|
20
20
|
|
|
21
21
|
setup(
|
|
22
22
|
name="wyoming_microsoft_tts",
|
|
23
|
-
version="1.3.5",
|
|
23
|
+
version="1.4.0",
|
|
24
24
|
description="Wyoming Server for Microsoft TTS",
|
|
25
25
|
url="https://github.com/hugobloem/wyoming-microsoft-tts",
|
|
26
26
|
author="Hugo Bloem",
|
|
@@ -21,6 +21,11 @@ def microsoft_tts(configuration):
|
|
|
21
21
|
subscription_key=os.environ.get("SPEECH_KEY"),
|
|
22
22
|
service_region=os.environ.get("SPEECH_REGION"),
|
|
23
23
|
download_dir="/tmp/",
|
|
24
|
+
rate=None,
|
|
25
|
+
pitch=None,
|
|
26
|
+
volume=None,
|
|
27
|
+
style=None,
|
|
28
|
+
style_degree=None,
|
|
24
29
|
**configuration,
|
|
25
30
|
)
|
|
26
31
|
return MicrosoftTTS(args)
|
|
@@ -0,0 +1,352 @@
|
|
|
1
|
+
"""Tests for the MicrosoftTTS class."""
|
|
2
|
+
|
|
3
|
+
from types import SimpleNamespace
|
|
4
|
+
import os
|
|
5
|
+
import pytest
|
|
6
|
+
from wyoming_microsoft_tts.microsoft_tts import MicrosoftTTS
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def test_initialize(microsoft_tts, configuration):
|
|
10
|
+
"""Test initialization."""
|
|
11
|
+
assert microsoft_tts.args.voice == configuration["voice"]
|
|
12
|
+
assert microsoft_tts.speech_config is not None
|
|
13
|
+
assert microsoft_tts.output_dir is not None
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@pytest.mark.skipif(
|
|
17
|
+
not os.environ.get("SPEECH_KEY") or not os.environ.get("SPEECH_REGION"),
|
|
18
|
+
reason="SPEECH_KEY and SPEECH_REGION environment variables required",
|
|
19
|
+
)
|
|
20
|
+
def test_synthesize(microsoft_tts):
|
|
21
|
+
"""Test synthesize."""
|
|
22
|
+
text = "Hello, world!"
|
|
23
|
+
voice = "en-US-JennyNeural"
|
|
24
|
+
|
|
25
|
+
result = microsoft_tts.synthesize(text, voice)
|
|
26
|
+
assert result.endswith(".wav")
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# SSML Building Tests
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def test_build_ssml_with_rate():
|
|
33
|
+
"""Test SSML generation with rate parameter."""
|
|
34
|
+
args = SimpleNamespace(
|
|
35
|
+
subscription_key=os.environ.get("SPEECH_KEY"),
|
|
36
|
+
service_region=os.environ.get("SPEECH_REGION"),
|
|
37
|
+
download_dir="/tmp/",
|
|
38
|
+
voice="en-US-JennyNeural",
|
|
39
|
+
rate="+30%",
|
|
40
|
+
pitch=None,
|
|
41
|
+
volume=None,
|
|
42
|
+
style=None,
|
|
43
|
+
style_degree=None,
|
|
44
|
+
)
|
|
45
|
+
tts = MicrosoftTTS(args)
|
|
46
|
+
ssml = tts._build_ssml("Hello, world!", "en-US-JennyNeural")
|
|
47
|
+
|
|
48
|
+
assert '<?xml version="1.0" encoding="UTF-8"?>' in ssml
|
|
49
|
+
assert '<speak version="1.0"' in ssml
|
|
50
|
+
assert '<prosody rate="+30%">' in ssml
|
|
51
|
+
assert "</prosody>" in ssml
|
|
52
|
+
assert "Hello, world!" in ssml
|
|
53
|
+
assert "xmlns:mstts" not in ssml # No style, so no mstts namespace
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def test_build_ssml_with_pitch():
|
|
57
|
+
"""Test SSML generation with pitch parameter."""
|
|
58
|
+
args = SimpleNamespace(
|
|
59
|
+
subscription_key=os.environ.get("SPEECH_KEY"),
|
|
60
|
+
service_region=os.environ.get("SPEECH_REGION"),
|
|
61
|
+
download_dir="/tmp/",
|
|
62
|
+
voice="en-US-JennyNeural",
|
|
63
|
+
rate=None,
|
|
64
|
+
pitch="+10%",
|
|
65
|
+
volume=None,
|
|
66
|
+
style=None,
|
|
67
|
+
style_degree=None,
|
|
68
|
+
)
|
|
69
|
+
tts = MicrosoftTTS(args)
|
|
70
|
+
ssml = tts._build_ssml("Testing pitch", "en-US-JennyNeural")
|
|
71
|
+
|
|
72
|
+
assert '<prosody pitch="+10%">' in ssml
|
|
73
|
+
assert "</prosody>" in ssml
|
|
74
|
+
assert "Testing pitch" in ssml
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def test_build_ssml_with_volume():
|
|
78
|
+
"""Test SSML generation with volume parameter."""
|
|
79
|
+
args = SimpleNamespace(
|
|
80
|
+
subscription_key=os.environ.get("SPEECH_KEY"),
|
|
81
|
+
service_region=os.environ.get("SPEECH_REGION"),
|
|
82
|
+
download_dir="/tmp/",
|
|
83
|
+
voice="en-US-JennyNeural",
|
|
84
|
+
rate=None,
|
|
85
|
+
pitch=None,
|
|
86
|
+
volume="loud",
|
|
87
|
+
style=None,
|
|
88
|
+
style_degree=None,
|
|
89
|
+
)
|
|
90
|
+
tts = MicrosoftTTS(args)
|
|
91
|
+
ssml = tts._build_ssml("Volume test", "en-US-JennyNeural")
|
|
92
|
+
|
|
93
|
+
assert '<prosody volume="loud">' in ssml
|
|
94
|
+
assert "</prosody>" in ssml
|
|
95
|
+
assert "Volume test" in ssml
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def test_build_ssml_with_all_prosody():
|
|
99
|
+
"""Test SSML generation with all prosody parameters."""
|
|
100
|
+
args = SimpleNamespace(
|
|
101
|
+
subscription_key=os.environ.get("SPEECH_KEY"),
|
|
102
|
+
service_region=os.environ.get("SPEECH_REGION"),
|
|
103
|
+
download_dir="/tmp/",
|
|
104
|
+
voice="en-US-JennyNeural",
|
|
105
|
+
rate="fast",
|
|
106
|
+
pitch="high",
|
|
107
|
+
volume="+20%",
|
|
108
|
+
style=None,
|
|
109
|
+
style_degree=None,
|
|
110
|
+
)
|
|
111
|
+
tts = MicrosoftTTS(args)
|
|
112
|
+
ssml = tts._build_ssml("All prosody", "en-US-JennyNeural")
|
|
113
|
+
|
|
114
|
+
assert '<prosody rate="fast" pitch="high" volume="+20%">' in ssml
|
|
115
|
+
assert "</prosody>" in ssml
|
|
116
|
+
assert "All prosody" in ssml
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def test_build_ssml_with_style():
|
|
120
|
+
"""Test SSML generation with style parameter."""
|
|
121
|
+
args = SimpleNamespace(
|
|
122
|
+
subscription_key=os.environ.get("SPEECH_KEY"),
|
|
123
|
+
service_region=os.environ.get("SPEECH_REGION"),
|
|
124
|
+
download_dir="/tmp/",
|
|
125
|
+
voice="en-US-JennyNeural",
|
|
126
|
+
rate=None,
|
|
127
|
+
pitch=None,
|
|
128
|
+
volume=None,
|
|
129
|
+
style="cheerful",
|
|
130
|
+
style_degree=None,
|
|
131
|
+
)
|
|
132
|
+
tts = MicrosoftTTS(args)
|
|
133
|
+
ssml = tts._build_ssml("Style test", "en-US-JennyNeural")
|
|
134
|
+
|
|
135
|
+
assert 'xmlns:mstts="https://www.w3.org/2001/mstts"' in ssml
|
|
136
|
+
assert '<mstts:express-as style="cheerful">' in ssml
|
|
137
|
+
assert "</mstts:express-as>" in ssml
|
|
138
|
+
assert "Style test" in ssml
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def test_build_ssml_with_style_and_degree():
|
|
142
|
+
"""Test SSML generation with style and style_degree parameters."""
|
|
143
|
+
args = SimpleNamespace(
|
|
144
|
+
subscription_key=os.environ.get("SPEECH_KEY"),
|
|
145
|
+
service_region=os.environ.get("SPEECH_REGION"),
|
|
146
|
+
download_dir="/tmp/",
|
|
147
|
+
voice="en-US-JennyNeural",
|
|
148
|
+
rate=None,
|
|
149
|
+
pitch=None,
|
|
150
|
+
volume=None,
|
|
151
|
+
style="sad",
|
|
152
|
+
style_degree=1.5,
|
|
153
|
+
)
|
|
154
|
+
tts = MicrosoftTTS(args)
|
|
155
|
+
ssml = tts._build_ssml("Sad voice", "en-US-JennyNeural")
|
|
156
|
+
|
|
157
|
+
assert 'xmlns:mstts="https://www.w3.org/2001/mstts"' in ssml
|
|
158
|
+
assert '<mstts:express-as style="sad" styledegree="1.5">' in ssml
|
|
159
|
+
assert "</mstts:express-as>" in ssml
|
|
160
|
+
assert "Sad voice" in ssml
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def test_build_ssml_with_prosody_and_style():
|
|
164
|
+
"""Test SSML generation with both prosody and style parameters."""
|
|
165
|
+
args = SimpleNamespace(
|
|
166
|
+
subscription_key=os.environ.get("SPEECH_KEY"),
|
|
167
|
+
service_region=os.environ.get("SPEECH_REGION"),
|
|
168
|
+
download_dir="/tmp/",
|
|
169
|
+
voice="en-US-JennyNeural",
|
|
170
|
+
rate="slow",
|
|
171
|
+
pitch="low",
|
|
172
|
+
volume="soft",
|
|
173
|
+
style="calm",
|
|
174
|
+
style_degree=0.5,
|
|
175
|
+
)
|
|
176
|
+
tts = MicrosoftTTS(args)
|
|
177
|
+
ssml = tts._build_ssml("Combined test", "en-US-JennyNeural")
|
|
178
|
+
|
|
179
|
+
assert 'xmlns:mstts="https://www.w3.org/2001/mstts"' in ssml
|
|
180
|
+
assert '<mstts:express-as style="calm" styledegree="0.5">' in ssml
|
|
181
|
+
assert '<prosody rate="slow" pitch="low" volume="soft">' in ssml
|
|
182
|
+
assert "</prosody>" in ssml
|
|
183
|
+
assert "</mstts:express-as>" in ssml
|
|
184
|
+
assert "Combined test" in ssml
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def test_build_ssml_voice_key_and_lang():
|
|
188
|
+
"""Test that SSML uses correct voice key and language."""
|
|
189
|
+
args = SimpleNamespace(
|
|
190
|
+
subscription_key=os.environ.get("SPEECH_KEY"),
|
|
191
|
+
service_region=os.environ.get("SPEECH_REGION"),
|
|
192
|
+
download_dir="/tmp/",
|
|
193
|
+
voice="en-GB-SoniaNeural",
|
|
194
|
+
rate="+10%",
|
|
195
|
+
pitch=None,
|
|
196
|
+
volume=None,
|
|
197
|
+
style=None,
|
|
198
|
+
style_degree=None,
|
|
199
|
+
)
|
|
200
|
+
tts = MicrosoftTTS(args)
|
|
201
|
+
ssml = tts._build_ssml("UK voice", "en-GB-SoniaNeural")
|
|
202
|
+
|
|
203
|
+
# Should contain the voice key from the voices.json
|
|
204
|
+
assert 'xml:lang="en-GB"' in ssml
|
|
205
|
+
assert '<voice name="en-GB-SoniaNeural">' in ssml
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
# Integration Tests with Synthesize
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
@pytest.mark.skipif(
|
|
212
|
+
not os.environ.get("SPEECH_KEY") or not os.environ.get("SPEECH_REGION"),
|
|
213
|
+
reason="SPEECH_KEY and SPEECH_REGION environment variables required",
|
|
214
|
+
)
|
|
215
|
+
def test_synthesize_with_rate():
|
|
216
|
+
"""Test synthesize with rate parameter."""
|
|
217
|
+
args = SimpleNamespace(
|
|
218
|
+
subscription_key=os.environ.get("SPEECH_KEY"),
|
|
219
|
+
service_region=os.environ.get("SPEECH_REGION"),
|
|
220
|
+
download_dir="/tmp/",
|
|
221
|
+
voice="en-US-JennyNeural",
|
|
222
|
+
rate="+30%",
|
|
223
|
+
pitch=None,
|
|
224
|
+
volume=None,
|
|
225
|
+
style=None,
|
|
226
|
+
style_degree=None,
|
|
227
|
+
)
|
|
228
|
+
tts = MicrosoftTTS(args)
|
|
229
|
+
result = tts.synthesize("Testing rate parameter", "en-US-JennyNeural")
|
|
230
|
+
|
|
231
|
+
assert result is not None
|
|
232
|
+
assert result.endswith(".wav")
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
@pytest.mark.skipif(
|
|
236
|
+
not os.environ.get("SPEECH_KEY") or not os.environ.get("SPEECH_REGION"),
|
|
237
|
+
reason="SPEECH_KEY and SPEECH_REGION environment variables required",
|
|
238
|
+
)
|
|
239
|
+
def test_synthesize_with_pitch():
|
|
240
|
+
"""Test synthesize with pitch parameter."""
|
|
241
|
+
args = SimpleNamespace(
|
|
242
|
+
subscription_key=os.environ.get("SPEECH_KEY"),
|
|
243
|
+
service_region=os.environ.get("SPEECH_REGION"),
|
|
244
|
+
download_dir="/tmp/",
|
|
245
|
+
voice="en-US-JennyNeural",
|
|
246
|
+
rate=None,
|
|
247
|
+
pitch="+5%",
|
|
248
|
+
volume=None,
|
|
249
|
+
style=None,
|
|
250
|
+
style_degree=None,
|
|
251
|
+
)
|
|
252
|
+
tts = MicrosoftTTS(args)
|
|
253
|
+
result = tts.synthesize("Testing pitch parameter", "en-US-JennyNeural")
|
|
254
|
+
|
|
255
|
+
assert result is not None
|
|
256
|
+
assert result.endswith(".wav")
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
@pytest.mark.skipif(
|
|
260
|
+
not os.environ.get("SPEECH_KEY") or not os.environ.get("SPEECH_REGION"),
|
|
261
|
+
reason="SPEECH_KEY and SPEECH_REGION environment variables required",
|
|
262
|
+
)
|
|
263
|
+
def test_synthesize_with_volume():
|
|
264
|
+
"""Test synthesize with volume parameter."""
|
|
265
|
+
args = SimpleNamespace(
|
|
266
|
+
subscription_key=os.environ.get("SPEECH_KEY"),
|
|
267
|
+
service_region=os.environ.get("SPEECH_REGION"),
|
|
268
|
+
download_dir="/tmp/",
|
|
269
|
+
voice="en-US-JennyNeural",
|
|
270
|
+
rate=None,
|
|
271
|
+
pitch=None,
|
|
272
|
+
volume="loud",
|
|
273
|
+
style=None,
|
|
274
|
+
style_degree=None,
|
|
275
|
+
)
|
|
276
|
+
tts = MicrosoftTTS(args)
|
|
277
|
+
result = tts.synthesize("Testing volume parameter", "en-US-JennyNeural")
|
|
278
|
+
|
|
279
|
+
assert result is not None
|
|
280
|
+
assert result.endswith(".wav")
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
@pytest.mark.skipif(
|
|
284
|
+
not os.environ.get("SPEECH_KEY") or not os.environ.get("SPEECH_REGION"),
|
|
285
|
+
reason="SPEECH_KEY and SPEECH_REGION environment variables required",
|
|
286
|
+
)
|
|
287
|
+
def test_synthesize_with_style():
|
|
288
|
+
"""Test synthesize with style parameter."""
|
|
289
|
+
args = SimpleNamespace(
|
|
290
|
+
subscription_key=os.environ.get("SPEECH_KEY"),
|
|
291
|
+
service_region=os.environ.get("SPEECH_REGION"),
|
|
292
|
+
download_dir="/tmp/",
|
|
293
|
+
voice="en-US-JennyNeural",
|
|
294
|
+
rate=None,
|
|
295
|
+
pitch=None,
|
|
296
|
+
volume=None,
|
|
297
|
+
style="cheerful",
|
|
298
|
+
style_degree=None,
|
|
299
|
+
)
|
|
300
|
+
tts = MicrosoftTTS(args)
|
|
301
|
+
result = tts.synthesize("Testing style parameter", "en-US-JennyNeural")
|
|
302
|
+
|
|
303
|
+
assert result is not None
|
|
304
|
+
assert result.endswith(".wav")
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
@pytest.mark.skipif(
|
|
308
|
+
not os.environ.get("SPEECH_KEY") or not os.environ.get("SPEECH_REGION"),
|
|
309
|
+
reason="SPEECH_KEY and SPEECH_REGION environment variables required",
|
|
310
|
+
)
|
|
311
|
+
def test_synthesize_with_combined_parameters():
|
|
312
|
+
"""Test synthesize with multiple parameters combined."""
|
|
313
|
+
args = SimpleNamespace(
|
|
314
|
+
subscription_key=os.environ.get("SPEECH_KEY"),
|
|
315
|
+
service_region=os.environ.get("SPEECH_REGION"),
|
|
316
|
+
download_dir="/tmp/",
|
|
317
|
+
voice="en-US-JennyNeural",
|
|
318
|
+
rate="fast",
|
|
319
|
+
pitch="+10%",
|
|
320
|
+
volume="loud",
|
|
321
|
+
style="excited",
|
|
322
|
+
style_degree=1.2,
|
|
323
|
+
)
|
|
324
|
+
tts = MicrosoftTTS(args)
|
|
325
|
+
result = tts.synthesize("Testing all parameters together", "en-US-JennyNeural")
|
|
326
|
+
|
|
327
|
+
assert result is not None
|
|
328
|
+
assert result.endswith(".wav")
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
@pytest.mark.skipif(
|
|
332
|
+
not os.environ.get("SPEECH_KEY") or not os.environ.get("SPEECH_REGION"),
|
|
333
|
+
reason="SPEECH_KEY and SPEECH_REGION environment variables required",
|
|
334
|
+
)
|
|
335
|
+
def test_synthesize_without_parameters_still_works():
|
|
336
|
+
"""Test that synthesize still works without any new parameters."""
|
|
337
|
+
args = SimpleNamespace(
|
|
338
|
+
subscription_key=os.environ.get("SPEECH_KEY"),
|
|
339
|
+
service_region=os.environ.get("SPEECH_REGION"),
|
|
340
|
+
download_dir="/tmp/",
|
|
341
|
+
voice="en-US-JennyNeural",
|
|
342
|
+
rate=None,
|
|
343
|
+
pitch=None,
|
|
344
|
+
volume=None,
|
|
345
|
+
style=None,
|
|
346
|
+
style_degree=None,
|
|
347
|
+
)
|
|
348
|
+
tts = MicrosoftTTS(args)
|
|
349
|
+
result = tts.synthesize("Testing without parameters", "en-US-JennyNeural")
|
|
350
|
+
|
|
351
|
+
assert result is not None
|
|
352
|
+
assert result.endswith(".wav")
|
{wyoming_microsoft_tts-1.3.5 → wyoming_microsoft_tts-1.4.0}/wyoming_microsoft_tts/__main__.py
RENAMED
|
@@ -68,6 +68,32 @@ def parse_arguments():
|
|
|
68
68
|
)
|
|
69
69
|
parser.add_argument("--samples-per-chunk", type=int, default=1024)
|
|
70
70
|
#
|
|
71
|
+
parser.add_argument(
|
|
72
|
+
"--rate",
|
|
73
|
+
type=str,
|
|
74
|
+
help="Speech rate (e.g., '+30%', '0.5', 'fast', 'slow')",
|
|
75
|
+
)
|
|
76
|
+
parser.add_argument(
|
|
77
|
+
"--pitch",
|
|
78
|
+
type=str,
|
|
79
|
+
help="Speech pitch (e.g., '+10%', 'high', 'low', '+80Hz')",
|
|
80
|
+
)
|
|
81
|
+
parser.add_argument(
|
|
82
|
+
"--volume",
|
|
83
|
+
type=str,
|
|
84
|
+
help="Speech volume (e.g., '+20%', 'loud', 'soft', '75')",
|
|
85
|
+
)
|
|
86
|
+
parser.add_argument(
|
|
87
|
+
"--style",
|
|
88
|
+
type=str,
|
|
89
|
+
help="Speaking style (e.g., 'cheerful', 'sad', 'angry', 'calm')",
|
|
90
|
+
)
|
|
91
|
+
parser.add_argument(
|
|
92
|
+
"--style-degree",
|
|
93
|
+
type=float,
|
|
94
|
+
help="Style intensity from 0.01 to 2 (default: 1)",
|
|
95
|
+
)
|
|
96
|
+
#
|
|
71
97
|
parser.add_argument(
|
|
72
98
|
"--update-voices",
|
|
73
99
|
action="store_true",
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
"""Microsoft TTS."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import tempfile
|
|
5
|
+
import time
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import azure.cognitiveservices.speech as speechsdk
|
|
9
|
+
|
|
10
|
+
from .download import get_voices
|
|
11
|
+
|
|
12
|
+
_LOGGER = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class MicrosoftTTS:
|
|
16
|
+
"""Class to handle Microsoft TTS."""
|
|
17
|
+
|
|
18
|
+
def __init__(self, args) -> None:
|
|
19
|
+
"""Initialize."""
|
|
20
|
+
_LOGGER.debug("Initialize Microsoft TTS")
|
|
21
|
+
self.args = args
|
|
22
|
+
self.speech_config = speechsdk.SpeechConfig(
|
|
23
|
+
subscription=args.subscription_key, region=args.service_region
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
output_dir = str(tempfile.TemporaryDirectory())
|
|
27
|
+
output_dir = Path(output_dir)
|
|
28
|
+
output_dir.mkdir(parents=True, exist_ok=True)
|
|
29
|
+
self.output_dir = output_dir
|
|
30
|
+
|
|
31
|
+
self.voices = get_voices(args.download_dir)
|
|
32
|
+
|
|
33
|
+
def _build_ssml(self, text, voice):
|
|
34
|
+
"""Build SSML with prosody and style parameters."""
|
|
35
|
+
voice_key = self.voices[voice]["key"]
|
|
36
|
+
voice_lang = self.voices[voice]["language"]["code"]
|
|
37
|
+
|
|
38
|
+
ssml_parts = [
|
|
39
|
+
'<?xml version="1.0" encoding="UTF-8"?>',
|
|
40
|
+
'<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis"',
|
|
41
|
+
]
|
|
42
|
+
|
|
43
|
+
if self.args.style or self.args.style_degree:
|
|
44
|
+
ssml_parts.append(' xmlns:mstts="https://www.w3.org/2001/mstts"')
|
|
45
|
+
|
|
46
|
+
ssml_parts.append(f' xml:lang="{voice_lang}">')
|
|
47
|
+
ssml_parts.append(f'<voice name="{voice_key}">')
|
|
48
|
+
|
|
49
|
+
has_style = self.args.style is not None
|
|
50
|
+
has_prosody = any([self.args.rate, self.args.pitch, self.args.volume])
|
|
51
|
+
|
|
52
|
+
if has_style:
|
|
53
|
+
style_attrs = [f'style="{self.args.style}"']
|
|
54
|
+
if self.args.style_degree is not None:
|
|
55
|
+
style_attrs.append(f'styledegree="{self.args.style_degree}"')
|
|
56
|
+
ssml_parts.append(f'<mstts:express-as {" ".join(style_attrs)}>')
|
|
57
|
+
|
|
58
|
+
if has_prosody:
|
|
59
|
+
prosody_attrs = []
|
|
60
|
+
if self.args.rate:
|
|
61
|
+
prosody_attrs.append(f'rate="{self.args.rate}"')
|
|
62
|
+
if self.args.pitch:
|
|
63
|
+
prosody_attrs.append(f'pitch="{self.args.pitch}"')
|
|
64
|
+
if self.args.volume:
|
|
65
|
+
prosody_attrs.append(f'volume="{self.args.volume}"')
|
|
66
|
+
ssml_parts.append(f'<prosody {" ".join(prosody_attrs)}>')
|
|
67
|
+
|
|
68
|
+
ssml_parts.append(text)
|
|
69
|
+
|
|
70
|
+
if has_prosody:
|
|
71
|
+
ssml_parts.append('</prosody>')
|
|
72
|
+
|
|
73
|
+
if has_style:
|
|
74
|
+
ssml_parts.append('</mstts:express-as>')
|
|
75
|
+
|
|
76
|
+
ssml_parts.append('</voice>')
|
|
77
|
+
ssml_parts.append('</speak>')
|
|
78
|
+
|
|
79
|
+
return ''.join(ssml_parts)
|
|
80
|
+
|
|
81
|
+
def synthesize(self, text, voice=None):
|
|
82
|
+
"""Synthesize text to speech."""
|
|
83
|
+
_LOGGER.debug(f"Requested TTS for [{text}]")
|
|
84
|
+
if voice is None:
|
|
85
|
+
voice = self.args.voice
|
|
86
|
+
|
|
87
|
+
# Convert the requested voice to the key microsoft use.
|
|
88
|
+
self.speech_config.speech_synthesis_voice_name = self.voices[voice]["key"]
|
|
89
|
+
|
|
90
|
+
file_name = self.output_dir / f"{time.monotonic_ns()}.wav"
|
|
91
|
+
audio_config = speechsdk.audio.AudioOutputConfig(filename=str(file_name))
|
|
92
|
+
|
|
93
|
+
speech_synthesizer = speechsdk.SpeechSynthesizer(
|
|
94
|
+
speech_config=self.speech_config, audio_config=audio_config
|
|
95
|
+
)
|
|
96
|
+
|
|
97
|
+
if any([self.args.rate, self.args.pitch, self.args.volume, self.args.style, self.args.style_degree]):
|
|
98
|
+
ssml = self._build_ssml(text, voice)
|
|
99
|
+
_LOGGER.debug(f"Using SSML: {ssml}")
|
|
100
|
+
speech_synthesis_result = speech_synthesizer.speak_ssml_async(ssml).get()
|
|
101
|
+
else:
|
|
102
|
+
speech_synthesis_result = speech_synthesizer.speak_text_async(text).get()
|
|
103
|
+
|
|
104
|
+
if (
|
|
105
|
+
speech_synthesis_result.reason
|
|
106
|
+
== speechsdk.ResultReason.SynthesizingAudioCompleted
|
|
107
|
+
):
|
|
108
|
+
_LOGGER.debug(f"Speech synthesized for text [{text}]")
|
|
109
|
+
return str(file_name)
|
|
110
|
+
|
|
111
|
+
elif speech_synthesis_result.reason == speechsdk.ResultReason.Canceled:
|
|
112
|
+
cancellation_details = speech_synthesis_result.cancellation_details
|
|
113
|
+
_LOGGER.warning(f"Speech synthesis canceled: {cancellation_details.reason}")
|
|
114
|
+
if cancellation_details.reason == speechsdk.CancellationReason.Error:
|
|
115
|
+
_LOGGER.warning(f"Error details: {cancellation_details.error_details}")
|
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
"""Tests for the MicrosoftTTS class."""
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
def test_initialize(microsoft_tts, configuration):
|
|
5
|
-
"""Test initialization."""
|
|
6
|
-
assert microsoft_tts.args.voice == configuration["voice"]
|
|
7
|
-
assert microsoft_tts.speech_config is not None
|
|
8
|
-
assert microsoft_tts.output_dir is not None
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
def test_synthesize(microsoft_tts):
|
|
12
|
-
"""Test synthesize."""
|
|
13
|
-
text = "Hello, world!"
|
|
14
|
-
voice = "en-US-JennyNeural"
|
|
15
|
-
|
|
16
|
-
result = microsoft_tts.synthesize(text, voice)
|
|
17
|
-
assert result.endswith(".wav")
|
|
@@ -1,62 +0,0 @@
|
|
|
1
|
-
"""Microsoft TTS."""
|
|
2
|
-
|
|
3
|
-
import logging
|
|
4
|
-
import tempfile
|
|
5
|
-
import time
|
|
6
|
-
from pathlib import Path
|
|
7
|
-
|
|
8
|
-
import azure.cognitiveservices.speech as speechsdk
|
|
9
|
-
|
|
10
|
-
from .download import get_voices
|
|
11
|
-
|
|
12
|
-
_LOGGER = logging.getLogger(__name__)
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
class MicrosoftTTS:
|
|
16
|
-
"""Class to handle Microsoft TTS."""
|
|
17
|
-
|
|
18
|
-
def __init__(self, args) -> None:
|
|
19
|
-
"""Initialize."""
|
|
20
|
-
_LOGGER.debug("Initialize Microsoft TTS")
|
|
21
|
-
self.args = args
|
|
22
|
-
self.speech_config = speechsdk.SpeechConfig(
|
|
23
|
-
subscription=args.subscription_key, region=args.service_region
|
|
24
|
-
)
|
|
25
|
-
|
|
26
|
-
output_dir = str(tempfile.TemporaryDirectory())
|
|
27
|
-
output_dir = Path(output_dir)
|
|
28
|
-
output_dir.mkdir(parents=True, exist_ok=True)
|
|
29
|
-
self.output_dir = output_dir
|
|
30
|
-
|
|
31
|
-
self.voices = get_voices(args.download_dir)
|
|
32
|
-
|
|
33
|
-
def synthesize(self, text, voice=None):
|
|
34
|
-
"""Synthesize text to speech."""
|
|
35
|
-
_LOGGER.debug(f"Requested TTS for [{text}]")
|
|
36
|
-
if voice is None:
|
|
37
|
-
voice = self.args.voice
|
|
38
|
-
|
|
39
|
-
# Convert the requested voice to the key microsoft use.
|
|
40
|
-
self.speech_config.speech_synthesis_voice_name = self.voices[voice]["key"]
|
|
41
|
-
|
|
42
|
-
file_name = self.output_dir / f"{time.monotonic_ns()}.wav"
|
|
43
|
-
audio_config = speechsdk.audio.AudioOutputConfig(filename=str(file_name))
|
|
44
|
-
|
|
45
|
-
speech_synthesizer = speechsdk.SpeechSynthesizer(
|
|
46
|
-
speech_config=self.speech_config, audio_config=audio_config
|
|
47
|
-
)
|
|
48
|
-
|
|
49
|
-
speech_synthesis_result = speech_synthesizer.speak_text_async(text).get()
|
|
50
|
-
|
|
51
|
-
if (
|
|
52
|
-
speech_synthesis_result.reason
|
|
53
|
-
== speechsdk.ResultReason.SynthesizingAudioCompleted
|
|
54
|
-
):
|
|
55
|
-
_LOGGER.debug(f"Speech synthesized for text [{text}]")
|
|
56
|
-
return str(file_name)
|
|
57
|
-
|
|
58
|
-
elif speech_synthesis_result.reason == speechsdk.ResultReason.Canceled:
|
|
59
|
-
cancellation_details = speech_synthesis_result.cancellation_details
|
|
60
|
-
_LOGGER.warning(f"Speech synthesis canceled: {cancellation_details.reason}")
|
|
61
|
-
if cancellation_details.reason == speechsdk.CancellationReason.Error:
|
|
62
|
-
_LOGGER.warning(f"Error details: {cancellation_details.error_details}")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{wyoming_microsoft_tts-1.3.5 → wyoming_microsoft_tts-1.4.0}/wyoming_microsoft_tts/__init__.py
RENAMED
|
File without changes
|
{wyoming_microsoft_tts-1.3.5 → wyoming_microsoft_tts-1.4.0}/wyoming_microsoft_tts/download.py
RENAMED
|
File without changes
|
{wyoming_microsoft_tts-1.3.5 → wyoming_microsoft_tts-1.4.0}/wyoming_microsoft_tts/handler.py
RENAMED
|
File without changes
|
|
File without changes
|
{wyoming_microsoft_tts-1.3.5 → wyoming_microsoft_tts-1.4.0}/wyoming_microsoft_tts/voices.json
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|