wyoming-microsoft-tts 1.3.4__tar.gz → 1.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. {wyoming_microsoft_tts-1.3.4 → wyoming_microsoft_tts-1.4.0}/PKG-INFO +1 -1
  2. {wyoming_microsoft_tts-1.3.4 → wyoming_microsoft_tts-1.4.0}/pyproject.toml +1 -1
  3. wyoming_microsoft_tts-1.4.0/requirements.txt +7 -0
  4. {wyoming_microsoft_tts-1.3.4 → wyoming_microsoft_tts-1.4.0}/setup.py +1 -1
  5. {wyoming_microsoft_tts-1.3.4 → wyoming_microsoft_tts-1.4.0}/tests/conftest.py +5 -0
  6. wyoming_microsoft_tts-1.4.0/tests/test_microsoft_tts.py +352 -0
  7. {wyoming_microsoft_tts-1.3.4 → wyoming_microsoft_tts-1.4.0}/wyoming_microsoft_tts/__main__.py +26 -0
  8. wyoming_microsoft_tts-1.4.0/wyoming_microsoft_tts/microsoft_tts.py +115 -0
  9. {wyoming_microsoft_tts-1.3.4 → wyoming_microsoft_tts-1.4.0}/wyoming_microsoft_tts/version.py +1 -1
  10. {wyoming_microsoft_tts-1.3.4 → wyoming_microsoft_tts-1.4.0}/wyoming_microsoft_tts.egg-info/PKG-INFO +1 -1
  11. wyoming_microsoft_tts-1.3.4/requirements.txt +0 -7
  12. wyoming_microsoft_tts-1.3.4/tests/test_microsoft_tts.py +0 -17
  13. wyoming_microsoft_tts-1.3.4/wyoming_microsoft_tts/microsoft_tts.py +0 -62
  14. {wyoming_microsoft_tts-1.3.4 → wyoming_microsoft_tts-1.4.0}/MANIFEST.in +0 -0
  15. {wyoming_microsoft_tts-1.3.4 → wyoming_microsoft_tts-1.4.0}/README.md +0 -0
  16. {wyoming_microsoft_tts-1.3.4 → wyoming_microsoft_tts-1.4.0}/setup.cfg +0 -0
  17. {wyoming_microsoft_tts-1.3.4 → wyoming_microsoft_tts-1.4.0}/tests/__init__.py +0 -0
  18. {wyoming_microsoft_tts-1.3.4 → wyoming_microsoft_tts-1.4.0}/tests/test_download.py +0 -0
  19. {wyoming_microsoft_tts-1.3.4 → wyoming_microsoft_tts-1.4.0}/tests/test_voice_parsing.py +0 -0
  20. {wyoming_microsoft_tts-1.3.4 → wyoming_microsoft_tts-1.4.0}/wyoming_microsoft_tts/__init__.py +0 -0
  21. {wyoming_microsoft_tts-1.3.4 → wyoming_microsoft_tts-1.4.0}/wyoming_microsoft_tts/download.py +0 -0
  22. {wyoming_microsoft_tts-1.3.4 → wyoming_microsoft_tts-1.4.0}/wyoming_microsoft_tts/handler.py +0 -0
  23. {wyoming_microsoft_tts-1.3.4 → wyoming_microsoft_tts-1.4.0}/wyoming_microsoft_tts/sentence_boundary.py +0 -0
  24. {wyoming_microsoft_tts-1.3.4 → wyoming_microsoft_tts-1.4.0}/wyoming_microsoft_tts/voices.json +0 -0
  25. {wyoming_microsoft_tts-1.3.4 → wyoming_microsoft_tts-1.4.0}/wyoming_microsoft_tts.egg-info/SOURCES.txt +0 -0
  26. {wyoming_microsoft_tts-1.3.4 → wyoming_microsoft_tts-1.4.0}/wyoming_microsoft_tts.egg-info/dependency_links.txt +0 -0
  27. {wyoming_microsoft_tts-1.3.4 → wyoming_microsoft_tts-1.4.0}/wyoming_microsoft_tts.egg-info/requires.txt +0 -0
  28. {wyoming_microsoft_tts-1.3.4 → wyoming_microsoft_tts-1.4.0}/wyoming_microsoft_tts.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: wyoming-microsoft-tts
3
- Version: 1.3.4
3
+ Version: 1.4.0
4
4
  Summary: Add your description here
5
5
  Home-page: https://github.com/hugobloem/wyoming-microsoft-tts
6
6
  Author: Hugo Bloem
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "wyoming-microsoft-tts"
3
- version = "1.3.4"
3
+ version = "1.4.0"
4
4
  description = "Add your description here"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.13"
@@ -0,0 +1,7 @@
1
+ wyoming==1.8.0
2
+ azure-cognitiveservices-speech==1.47.0
3
+ black>=24,<27
4
+ lxml>=5,<7
5
+ pycountry>=23.12.11
6
+ ruff
7
+ regex
@@ -20,7 +20,7 @@ data_files = [module_dir / "voices.json"]
20
20
 
21
21
  setup(
22
22
  name="wyoming_microsoft_tts",
23
- version="1.3.4",
23
+ version="1.4.0",
24
24
  description="Wyoming Server for Microsoft TTS",
25
25
  url="https://github.com/hugobloem/wyoming-microsoft-tts",
26
26
  author="Hugo Bloem",
@@ -21,6 +21,11 @@ def microsoft_tts(configuration):
21
21
  subscription_key=os.environ.get("SPEECH_KEY"),
22
22
  service_region=os.environ.get("SPEECH_REGION"),
23
23
  download_dir="/tmp/",
24
+ rate=None,
25
+ pitch=None,
26
+ volume=None,
27
+ style=None,
28
+ style_degree=None,
24
29
  **configuration,
25
30
  )
26
31
  return MicrosoftTTS(args)
@@ -0,0 +1,352 @@
1
+ """Tests for the MicrosoftTTS class."""
2
+
3
+ from types import SimpleNamespace
4
+ import os
5
+ import pytest
6
+ from wyoming_microsoft_tts.microsoft_tts import MicrosoftTTS
7
+
8
+
9
+ def test_initialize(microsoft_tts, configuration):
10
+ """Test initialization."""
11
+ assert microsoft_tts.args.voice == configuration["voice"]
12
+ assert microsoft_tts.speech_config is not None
13
+ assert microsoft_tts.output_dir is not None
14
+
15
+
16
+ @pytest.mark.skipif(
17
+ not os.environ.get("SPEECH_KEY") or not os.environ.get("SPEECH_REGION"),
18
+ reason="SPEECH_KEY and SPEECH_REGION environment variables required",
19
+ )
20
+ def test_synthesize(microsoft_tts):
21
+ """Test synthesize."""
22
+ text = "Hello, world!"
23
+ voice = "en-US-JennyNeural"
24
+
25
+ result = microsoft_tts.synthesize(text, voice)
26
+ assert result.endswith(".wav")
27
+
28
+
29
+ # SSML Building Tests
30
+
31
+
32
+ def test_build_ssml_with_rate():
33
+ """Test SSML generation with rate parameter."""
34
+ args = SimpleNamespace(
35
+ subscription_key=os.environ.get("SPEECH_KEY"),
36
+ service_region=os.environ.get("SPEECH_REGION"),
37
+ download_dir="/tmp/",
38
+ voice="en-US-JennyNeural",
39
+ rate="+30%",
40
+ pitch=None,
41
+ volume=None,
42
+ style=None,
43
+ style_degree=None,
44
+ )
45
+ tts = MicrosoftTTS(args)
46
+ ssml = tts._build_ssml("Hello, world!", "en-US-JennyNeural")
47
+
48
+ assert '<?xml version="1.0" encoding="UTF-8"?>' in ssml
49
+ assert '<speak version="1.0"' in ssml
50
+ assert '<prosody rate="+30%">' in ssml
51
+ assert "</prosody>" in ssml
52
+ assert "Hello, world!" in ssml
53
+ assert "xmlns:mstts" not in ssml # No style, so no mstts namespace
54
+
55
+
56
+ def test_build_ssml_with_pitch():
57
+ """Test SSML generation with pitch parameter."""
58
+ args = SimpleNamespace(
59
+ subscription_key=os.environ.get("SPEECH_KEY"),
60
+ service_region=os.environ.get("SPEECH_REGION"),
61
+ download_dir="/tmp/",
62
+ voice="en-US-JennyNeural",
63
+ rate=None,
64
+ pitch="+10%",
65
+ volume=None,
66
+ style=None,
67
+ style_degree=None,
68
+ )
69
+ tts = MicrosoftTTS(args)
70
+ ssml = tts._build_ssml("Testing pitch", "en-US-JennyNeural")
71
+
72
+ assert '<prosody pitch="+10%">' in ssml
73
+ assert "</prosody>" in ssml
74
+ assert "Testing pitch" in ssml
75
+
76
+
77
+ def test_build_ssml_with_volume():
78
+ """Test SSML generation with volume parameter."""
79
+ args = SimpleNamespace(
80
+ subscription_key=os.environ.get("SPEECH_KEY"),
81
+ service_region=os.environ.get("SPEECH_REGION"),
82
+ download_dir="/tmp/",
83
+ voice="en-US-JennyNeural",
84
+ rate=None,
85
+ pitch=None,
86
+ volume="loud",
87
+ style=None,
88
+ style_degree=None,
89
+ )
90
+ tts = MicrosoftTTS(args)
91
+ ssml = tts._build_ssml("Volume test", "en-US-JennyNeural")
92
+
93
+ assert '<prosody volume="loud">' in ssml
94
+ assert "</prosody>" in ssml
95
+ assert "Volume test" in ssml
96
+
97
+
98
+ def test_build_ssml_with_all_prosody():
99
+ """Test SSML generation with all prosody parameters."""
100
+ args = SimpleNamespace(
101
+ subscription_key=os.environ.get("SPEECH_KEY"),
102
+ service_region=os.environ.get("SPEECH_REGION"),
103
+ download_dir="/tmp/",
104
+ voice="en-US-JennyNeural",
105
+ rate="fast",
106
+ pitch="high",
107
+ volume="+20%",
108
+ style=None,
109
+ style_degree=None,
110
+ )
111
+ tts = MicrosoftTTS(args)
112
+ ssml = tts._build_ssml("All prosody", "en-US-JennyNeural")
113
+
114
+ assert '<prosody rate="fast" pitch="high" volume="+20%">' in ssml
115
+ assert "</prosody>" in ssml
116
+ assert "All prosody" in ssml
117
+
118
+
119
+ def test_build_ssml_with_style():
120
+ """Test SSML generation with style parameter."""
121
+ args = SimpleNamespace(
122
+ subscription_key=os.environ.get("SPEECH_KEY"),
123
+ service_region=os.environ.get("SPEECH_REGION"),
124
+ download_dir="/tmp/",
125
+ voice="en-US-JennyNeural",
126
+ rate=None,
127
+ pitch=None,
128
+ volume=None,
129
+ style="cheerful",
130
+ style_degree=None,
131
+ )
132
+ tts = MicrosoftTTS(args)
133
+ ssml = tts._build_ssml("Style test", "en-US-JennyNeural")
134
+
135
+ assert 'xmlns:mstts="https://www.w3.org/2001/mstts"' in ssml
136
+ assert '<mstts:express-as style="cheerful">' in ssml
137
+ assert "</mstts:express-as>" in ssml
138
+ assert "Style test" in ssml
139
+
140
+
141
+ def test_build_ssml_with_style_and_degree():
142
+ """Test SSML generation with style and style_degree parameters."""
143
+ args = SimpleNamespace(
144
+ subscription_key=os.environ.get("SPEECH_KEY"),
145
+ service_region=os.environ.get("SPEECH_REGION"),
146
+ download_dir="/tmp/",
147
+ voice="en-US-JennyNeural",
148
+ rate=None,
149
+ pitch=None,
150
+ volume=None,
151
+ style="sad",
152
+ style_degree=1.5,
153
+ )
154
+ tts = MicrosoftTTS(args)
155
+ ssml = tts._build_ssml("Sad voice", "en-US-JennyNeural")
156
+
157
+ assert 'xmlns:mstts="https://www.w3.org/2001/mstts"' in ssml
158
+ assert '<mstts:express-as style="sad" styledegree="1.5">' in ssml
159
+ assert "</mstts:express-as>" in ssml
160
+ assert "Sad voice" in ssml
161
+
162
+
163
+ def test_build_ssml_with_prosody_and_style():
164
+ """Test SSML generation with both prosody and style parameters."""
165
+ args = SimpleNamespace(
166
+ subscription_key=os.environ.get("SPEECH_KEY"),
167
+ service_region=os.environ.get("SPEECH_REGION"),
168
+ download_dir="/tmp/",
169
+ voice="en-US-JennyNeural",
170
+ rate="slow",
171
+ pitch="low",
172
+ volume="soft",
173
+ style="calm",
174
+ style_degree=0.5,
175
+ )
176
+ tts = MicrosoftTTS(args)
177
+ ssml = tts._build_ssml("Combined test", "en-US-JennyNeural")
178
+
179
+ assert 'xmlns:mstts="https://www.w3.org/2001/mstts"' in ssml
180
+ assert '<mstts:express-as style="calm" styledegree="0.5">' in ssml
181
+ assert '<prosody rate="slow" pitch="low" volume="soft">' in ssml
182
+ assert "</prosody>" in ssml
183
+ assert "</mstts:express-as>" in ssml
184
+ assert "Combined test" in ssml
185
+
186
+
187
+ def test_build_ssml_voice_key_and_lang():
188
+ """Test that SSML uses correct voice key and language."""
189
+ args = SimpleNamespace(
190
+ subscription_key=os.environ.get("SPEECH_KEY"),
191
+ service_region=os.environ.get("SPEECH_REGION"),
192
+ download_dir="/tmp/",
193
+ voice="en-GB-SoniaNeural",
194
+ rate="+10%",
195
+ pitch=None,
196
+ volume=None,
197
+ style=None,
198
+ style_degree=None,
199
+ )
200
+ tts = MicrosoftTTS(args)
201
+ ssml = tts._build_ssml("UK voice", "en-GB-SoniaNeural")
202
+
203
+ # Should contain the voice key from the voices.json
204
+ assert 'xml:lang="en-GB"' in ssml
205
+ assert '<voice name="en-GB-SoniaNeural">' in ssml
206
+
207
+
208
+ # Integration Tests with Synthesize
209
+
210
+
211
+ @pytest.mark.skipif(
212
+ not os.environ.get("SPEECH_KEY") or not os.environ.get("SPEECH_REGION"),
213
+ reason="SPEECH_KEY and SPEECH_REGION environment variables required",
214
+ )
215
+ def test_synthesize_with_rate():
216
+ """Test synthesize with rate parameter."""
217
+ args = SimpleNamespace(
218
+ subscription_key=os.environ.get("SPEECH_KEY"),
219
+ service_region=os.environ.get("SPEECH_REGION"),
220
+ download_dir="/tmp/",
221
+ voice="en-US-JennyNeural",
222
+ rate="+30%",
223
+ pitch=None,
224
+ volume=None,
225
+ style=None,
226
+ style_degree=None,
227
+ )
228
+ tts = MicrosoftTTS(args)
229
+ result = tts.synthesize("Testing rate parameter", "en-US-JennyNeural")
230
+
231
+ assert result is not None
232
+ assert result.endswith(".wav")
233
+
234
+
235
+ @pytest.mark.skipif(
236
+ not os.environ.get("SPEECH_KEY") or not os.environ.get("SPEECH_REGION"),
237
+ reason="SPEECH_KEY and SPEECH_REGION environment variables required",
238
+ )
239
+ def test_synthesize_with_pitch():
240
+ """Test synthesize with pitch parameter."""
241
+ args = SimpleNamespace(
242
+ subscription_key=os.environ.get("SPEECH_KEY"),
243
+ service_region=os.environ.get("SPEECH_REGION"),
244
+ download_dir="/tmp/",
245
+ voice="en-US-JennyNeural",
246
+ rate=None,
247
+ pitch="+5%",
248
+ volume=None,
249
+ style=None,
250
+ style_degree=None,
251
+ )
252
+ tts = MicrosoftTTS(args)
253
+ result = tts.synthesize("Testing pitch parameter", "en-US-JennyNeural")
254
+
255
+ assert result is not None
256
+ assert result.endswith(".wav")
257
+
258
+
259
+ @pytest.mark.skipif(
260
+ not os.environ.get("SPEECH_KEY") or not os.environ.get("SPEECH_REGION"),
261
+ reason="SPEECH_KEY and SPEECH_REGION environment variables required",
262
+ )
263
+ def test_synthesize_with_volume():
264
+ """Test synthesize with volume parameter."""
265
+ args = SimpleNamespace(
266
+ subscription_key=os.environ.get("SPEECH_KEY"),
267
+ service_region=os.environ.get("SPEECH_REGION"),
268
+ download_dir="/tmp/",
269
+ voice="en-US-JennyNeural",
270
+ rate=None,
271
+ pitch=None,
272
+ volume="loud",
273
+ style=None,
274
+ style_degree=None,
275
+ )
276
+ tts = MicrosoftTTS(args)
277
+ result = tts.synthesize("Testing volume parameter", "en-US-JennyNeural")
278
+
279
+ assert result is not None
280
+ assert result.endswith(".wav")
281
+
282
+
283
+ @pytest.mark.skipif(
284
+ not os.environ.get("SPEECH_KEY") or not os.environ.get("SPEECH_REGION"),
285
+ reason="SPEECH_KEY and SPEECH_REGION environment variables required",
286
+ )
287
+ def test_synthesize_with_style():
288
+ """Test synthesize with style parameter."""
289
+ args = SimpleNamespace(
290
+ subscription_key=os.environ.get("SPEECH_KEY"),
291
+ service_region=os.environ.get("SPEECH_REGION"),
292
+ download_dir="/tmp/",
293
+ voice="en-US-JennyNeural",
294
+ rate=None,
295
+ pitch=None,
296
+ volume=None,
297
+ style="cheerful",
298
+ style_degree=None,
299
+ )
300
+ tts = MicrosoftTTS(args)
301
+ result = tts.synthesize("Testing style parameter", "en-US-JennyNeural")
302
+
303
+ assert result is not None
304
+ assert result.endswith(".wav")
305
+
306
+
307
+ @pytest.mark.skipif(
308
+ not os.environ.get("SPEECH_KEY") or not os.environ.get("SPEECH_REGION"),
309
+ reason="SPEECH_KEY and SPEECH_REGION environment variables required",
310
+ )
311
+ def test_synthesize_with_combined_parameters():
312
+ """Test synthesize with multiple parameters combined."""
313
+ args = SimpleNamespace(
314
+ subscription_key=os.environ.get("SPEECH_KEY"),
315
+ service_region=os.environ.get("SPEECH_REGION"),
316
+ download_dir="/tmp/",
317
+ voice="en-US-JennyNeural",
318
+ rate="fast",
319
+ pitch="+10%",
320
+ volume="loud",
321
+ style="excited",
322
+ style_degree=1.2,
323
+ )
324
+ tts = MicrosoftTTS(args)
325
+ result = tts.synthesize("Testing all parameters together", "en-US-JennyNeural")
326
+
327
+ assert result is not None
328
+ assert result.endswith(".wav")
329
+
330
+
331
+ @pytest.mark.skipif(
332
+ not os.environ.get("SPEECH_KEY") or not os.environ.get("SPEECH_REGION"),
333
+ reason="SPEECH_KEY and SPEECH_REGION environment variables required",
334
+ )
335
+ def test_synthesize_without_parameters_still_works():
336
+ """Test that synthesize still works without any new parameters."""
337
+ args = SimpleNamespace(
338
+ subscription_key=os.environ.get("SPEECH_KEY"),
339
+ service_region=os.environ.get("SPEECH_REGION"),
340
+ download_dir="/tmp/",
341
+ voice="en-US-JennyNeural",
342
+ rate=None,
343
+ pitch=None,
344
+ volume=None,
345
+ style=None,
346
+ style_degree=None,
347
+ )
348
+ tts = MicrosoftTTS(args)
349
+ result = tts.synthesize("Testing without parameters", "en-US-JennyNeural")
350
+
351
+ assert result is not None
352
+ assert result.endswith(".wav")
@@ -68,6 +68,32 @@ def parse_arguments():
68
68
  )
69
69
  parser.add_argument("--samples-per-chunk", type=int, default=1024)
70
70
  #
71
+ parser.add_argument(
72
+ "--rate",
73
+ type=str,
74
+ help="Speech rate (e.g., '+30%', '0.5', 'fast', 'slow')",
75
+ )
76
+ parser.add_argument(
77
+ "--pitch",
78
+ type=str,
79
+ help="Speech pitch (e.g., '+10%', 'high', 'low', '+80Hz')",
80
+ )
81
+ parser.add_argument(
82
+ "--volume",
83
+ type=str,
84
+ help="Speech volume (e.g., '+20%', 'loud', 'soft', '75')",
85
+ )
86
+ parser.add_argument(
87
+ "--style",
88
+ type=str,
89
+ help="Speaking style (e.g., 'cheerful', 'sad', 'angry', 'calm')",
90
+ )
91
+ parser.add_argument(
92
+ "--style-degree",
93
+ type=float,
94
+ help="Style intensity from 0.01 to 2 (default: 1)",
95
+ )
96
+ #
71
97
  parser.add_argument(
72
98
  "--update-voices",
73
99
  action="store_true",
@@ -0,0 +1,115 @@
1
+ """Microsoft TTS."""
2
+
3
+ import logging
4
+ import tempfile
5
+ import time
6
+ from pathlib import Path
7
+
8
+ import azure.cognitiveservices.speech as speechsdk
9
+
10
+ from .download import get_voices
11
+
12
+ _LOGGER = logging.getLogger(__name__)
13
+
14
+
15
+ class MicrosoftTTS:
16
+ """Class to handle Microsoft TTS."""
17
+
18
+ def __init__(self, args) -> None:
19
+ """Initialize."""
20
+ _LOGGER.debug("Initialize Microsoft TTS")
21
+ self.args = args
22
+ self.speech_config = speechsdk.SpeechConfig(
23
+ subscription=args.subscription_key, region=args.service_region
24
+ )
25
+
26
+ output_dir = str(tempfile.TemporaryDirectory())
27
+ output_dir = Path(output_dir)
28
+ output_dir.mkdir(parents=True, exist_ok=True)
29
+ self.output_dir = output_dir
30
+
31
+ self.voices = get_voices(args.download_dir)
32
+
33
+ def _build_ssml(self, text, voice):
34
+ """Build SSML with prosody and style parameters."""
35
+ voice_key = self.voices[voice]["key"]
36
+ voice_lang = self.voices[voice]["language"]["code"]
37
+
38
+ ssml_parts = [
39
+ '<?xml version="1.0" encoding="UTF-8"?>',
40
+ '<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis"',
41
+ ]
42
+
43
+ if self.args.style or self.args.style_degree:
44
+ ssml_parts.append(' xmlns:mstts="https://www.w3.org/2001/mstts"')
45
+
46
+ ssml_parts.append(f' xml:lang="{voice_lang}">')
47
+ ssml_parts.append(f'<voice name="{voice_key}">')
48
+
49
+ has_style = self.args.style is not None
50
+ has_prosody = any([self.args.rate, self.args.pitch, self.args.volume])
51
+
52
+ if has_style:
53
+ style_attrs = [f'style="{self.args.style}"']
54
+ if self.args.style_degree is not None:
55
+ style_attrs.append(f'styledegree="{self.args.style_degree}"')
56
+ ssml_parts.append(f'<mstts:express-as {" ".join(style_attrs)}>')
57
+
58
+ if has_prosody:
59
+ prosody_attrs = []
60
+ if self.args.rate:
61
+ prosody_attrs.append(f'rate="{self.args.rate}"')
62
+ if self.args.pitch:
63
+ prosody_attrs.append(f'pitch="{self.args.pitch}"')
64
+ if self.args.volume:
65
+ prosody_attrs.append(f'volume="{self.args.volume}"')
66
+ ssml_parts.append(f'<prosody {" ".join(prosody_attrs)}>')
67
+
68
+ ssml_parts.append(text)
69
+
70
+ if has_prosody:
71
+ ssml_parts.append('</prosody>')
72
+
73
+ if has_style:
74
+ ssml_parts.append('</mstts:express-as>')
75
+
76
+ ssml_parts.append('</voice>')
77
+ ssml_parts.append('</speak>')
78
+
79
+ return ''.join(ssml_parts)
80
+
81
+ def synthesize(self, text, voice=None):
82
+ """Synthesize text to speech."""
83
+ _LOGGER.debug(f"Requested TTS for [{text}]")
84
+ if voice is None:
85
+ voice = self.args.voice
86
+
87
+ # Convert the requested voice to the key microsoft use.
88
+ self.speech_config.speech_synthesis_voice_name = self.voices[voice]["key"]
89
+
90
+ file_name = self.output_dir / f"{time.monotonic_ns()}.wav"
91
+ audio_config = speechsdk.audio.AudioOutputConfig(filename=str(file_name))
92
+
93
+ speech_synthesizer = speechsdk.SpeechSynthesizer(
94
+ speech_config=self.speech_config, audio_config=audio_config
95
+ )
96
+
97
+ if any([self.args.rate, self.args.pitch, self.args.volume, self.args.style, self.args.style_degree]):
98
+ ssml = self._build_ssml(text, voice)
99
+ _LOGGER.debug(f"Using SSML: {ssml}")
100
+ speech_synthesis_result = speech_synthesizer.speak_ssml_async(ssml).get()
101
+ else:
102
+ speech_synthesis_result = speech_synthesizer.speak_text_async(text).get()
103
+
104
+ if (
105
+ speech_synthesis_result.reason
106
+ == speechsdk.ResultReason.SynthesizingAudioCompleted
107
+ ):
108
+ _LOGGER.debug(f"Speech synthesized for text [{text}]")
109
+ return str(file_name)
110
+
111
+ elif speech_synthesis_result.reason == speechsdk.ResultReason.Canceled:
112
+ cancellation_details = speech_synthesis_result.cancellation_details
113
+ _LOGGER.warning(f"Speech synthesis canceled: {cancellation_details.reason}")
114
+ if cancellation_details.reason == speechsdk.CancellationReason.Error:
115
+ _LOGGER.warning(f"Error details: {cancellation_details.error_details}")
@@ -1,3 +1,3 @@
1
1
  """Version information."""
2
2
 
3
- __version__ = "1.3.4"
3
+ __version__ = "1.4.0"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: wyoming-microsoft-tts
3
- Version: 1.3.4
3
+ Version: 1.4.0
4
4
  Summary: Add your description here
5
5
  Home-page: https://github.com/hugobloem/wyoming-microsoft-tts
6
6
  Author: Hugo Bloem
@@ -1,7 +0,0 @@
1
- wyoming==1.7.2
2
- azure-cognitiveservices-speech==1.46.0
3
- black>=24,<26
4
- lxml>=5,<7
5
- pycountry>=23.12.11
6
- ruff
7
- regex
@@ -1,17 +0,0 @@
1
- """Tests for the MicrosoftTTS class."""
2
-
3
-
4
- def test_initialize(microsoft_tts, configuration):
5
- """Test initialization."""
6
- assert microsoft_tts.args.voice == configuration["voice"]
7
- assert microsoft_tts.speech_config is not None
8
- assert microsoft_tts.output_dir is not None
9
-
10
-
11
- def test_synthesize(microsoft_tts):
12
- """Test synthesize."""
13
- text = "Hello, world!"
14
- voice = "en-US-JennyNeural"
15
-
16
- result = microsoft_tts.synthesize(text, voice)
17
- assert result.endswith(".wav")
@@ -1,62 +0,0 @@
1
- """Microsoft TTS."""
2
-
3
- import logging
4
- import tempfile
5
- import time
6
- from pathlib import Path
7
-
8
- import azure.cognitiveservices.speech as speechsdk
9
-
10
- from .download import get_voices
11
-
12
- _LOGGER = logging.getLogger(__name__)
13
-
14
-
15
- class MicrosoftTTS:
16
- """Class to handle Microsoft TTS."""
17
-
18
- def __init__(self, args) -> None:
19
- """Initialize."""
20
- _LOGGER.debug("Initialize Microsoft TTS")
21
- self.args = args
22
- self.speech_config = speechsdk.SpeechConfig(
23
- subscription=args.subscription_key, region=args.service_region
24
- )
25
-
26
- output_dir = str(tempfile.TemporaryDirectory())
27
- output_dir = Path(output_dir)
28
- output_dir.mkdir(parents=True, exist_ok=True)
29
- self.output_dir = output_dir
30
-
31
- self.voices = get_voices(args.download_dir)
32
-
33
- def synthesize(self, text, voice=None):
34
- """Synthesize text to speech."""
35
- _LOGGER.debug(f"Requested TTS for [{text}]")
36
- if voice is None:
37
- voice = self.args.voice
38
-
39
- # Convert the requested voice to the key microsoft use.
40
- self.speech_config.speech_synthesis_voice_name = self.voices[voice]["key"]
41
-
42
- file_name = self.output_dir / f"{time.monotonic_ns()}.wav"
43
- audio_config = speechsdk.audio.AudioOutputConfig(filename=str(file_name))
44
-
45
- speech_synthesizer = speechsdk.SpeechSynthesizer(
46
- speech_config=self.speech_config, audio_config=audio_config
47
- )
48
-
49
- speech_synthesis_result = speech_synthesizer.speak_text_async(text).get()
50
-
51
- if (
52
- speech_synthesis_result.reason
53
- == speechsdk.ResultReason.SynthesizingAudioCompleted
54
- ):
55
- _LOGGER.debug(f"Speech synthesized for text [{text}]")
56
- return str(file_name)
57
-
58
- elif speech_synthesis_result.reason == speechsdk.ResultReason.Canceled:
59
- cancellation_details = speech_synthesis_result.cancellation_details
60
- _LOGGER.warning(f"Speech synthesis canceled: {cancellation_details.reason}")
61
- if cancellation_details.reason == speechsdk.CancellationReason.Error:
62
- _LOGGER.warning(f"Error details: {cancellation_details.error_details}")