wyoming-microsoft-tts 1.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/__init__.py +1 -0
- tests/conftest.py +26 -0
- tests/test_download.py +75 -0
- tests/test_microsoft_tts.py +17 -0
- tests/test_voice_parsing.py +169 -0
- wyoming_microsoft_tts/__init__.py +1 -0
- wyoming_microsoft_tts/__main__.py +208 -0
- wyoming_microsoft_tts/download.py +182 -0
- wyoming_microsoft_tts/handler.py +183 -0
- wyoming_microsoft_tts/microsoft_tts.py +62 -0
- wyoming_microsoft_tts/sentence_boundary.py +63 -0
- wyoming_microsoft_tts/version.py +3 -0
- wyoming_microsoft_tts/voices.json +12419 -0
- wyoming_microsoft_tts-1.3.3.dist-info/METADATA +92 -0
- wyoming_microsoft_tts-1.3.3.dist-info/RECORD +17 -0
- wyoming_microsoft_tts-1.3.3.dist-info/WHEEL +5 -0
- wyoming_microsoft_tts-1.3.3.dist-info/top_level.txt +2 -0
tests/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Tests."""
|
tests/conftest.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""Fixtures for tests."""
|
|
2
|
+
|
|
3
|
+
from types import SimpleNamespace
|
|
4
|
+
import pytest
|
|
5
|
+
from wyoming_microsoft_tts.microsoft_tts import MicrosoftTTS
|
|
6
|
+
import os
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@pytest.fixture
def configuration():
    """Provide the option overrides shared by the tests."""
    defaults = {"voice": "en-GB-SoniaNeural"}
    return defaults
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@pytest.fixture
def microsoft_tts(configuration):
    """Build a MicrosoftTTS instance for the tests.

    The subscription key and region are read from the ``SPEECH_KEY`` and
    ``SPEECH_REGION`` environment variables; every entry of the
    ``configuration`` fixture is forwarded as an additional attribute.
    """
    env = os.environ
    options = {
        "subscription_key": env.get("SPEECH_KEY"),
        "service_region": env.get("SPEECH_REGION"),
        "download_dir": "/tmp/",
    }
    return MicrosoftTTS(SimpleNamespace(**options, **configuration))
|
tests/test_download.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""Tests for download functionality."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import tempfile
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from unittest.mock import patch
|
|
7
|
+
|
|
8
|
+
from wyoming_microsoft_tts.download import get_voices
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def test_get_voices_download_failure_logs_error(caplog):
    """A failing download is logged as an error and voices are still returned."""
    failing_urlopen = patch(
        "wyoming_microsoft_tts.download.urlopen",
        side_effect=Exception("Network error"),
    )
    with tempfile.TemporaryDirectory() as workdir, failing_urlopen:
        # update_voices=True is what triggers the (failing) download attempt.
        with caplog.at_level(logging.ERROR):
            result = get_voices(
                download_dir=workdir,
                update_voices=True,
                region="westus",
                key="fake_key",
            )

        # At least one ERROR record must have been captured.
        assert caplog.records
        errors = [rec for rec in caplog.records if rec.levelno == logging.ERROR]
        assert errors

        # The first error is the one about the failed update.
        assert "Failed to update voices list" in errors[0].message

        # Voices still come back (from the embedded file).
        assert isinstance(result, dict)
        assert result
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def test_get_voices_download_failure_uses_fallback():
    """A failing download falls back to the embedded voices and writes no file."""
    failing_urlopen = patch(
        "wyoming_microsoft_tts.download.urlopen",
        side_effect=Exception("Network error"),
    )
    with tempfile.TemporaryDirectory() as workdir, failing_urlopen:
        # update_voices=True is what triggers the (failing) download attempt.
        result = get_voices(
            download_dir=workdir,
            update_voices=True,
            region="westus",
            key="fake_key",
        )

        # Voices are still returned from the embedded file.
        assert isinstance(result, dict)
        assert result

        # Nothing must have been written into the download directory.
        assert not (Path(workdir) / "voices.json").exists()
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def test_get_voices_without_update_uses_embedded():
    """get_voices returns a non-empty dict when no update is requested."""
    with tempfile.TemporaryDirectory() as workdir:
        # update_voices is left at its default (False), so nothing is downloaded.
        result = get_voices(download_dir=workdir)

        assert isinstance(result, dict)
        assert result
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""Tests for the MicrosoftTTS class."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def test_initialize(microsoft_tts, configuration):
    """Check the attributes of the instance produced by the fixture."""
    expected_voice = configuration["voice"]
    assert microsoft_tts.args.voice == expected_voice
    for attribute in ("speech_config", "output_dir"):
        assert getattr(microsoft_tts, attribute) is not None
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def test_synthesize(microsoft_tts):
    """Synthesizing a short phrase yields a result ending in ``.wav``."""
    output = microsoft_tts.synthesize("Hello, world!", "en-US-JennyNeural")
    assert output.endswith(".wav")
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
"""Tests for voice parsing functionality."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from io import StringIO
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
from wyoming_microsoft_tts.download import transform_voices_files
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def test_voice_parsing_with_script_codes():
    """Voices whose locale carries a script subtag are parsed, not dropped."""

    def neural(short_name, locale, local_name, locale_name):
        # One entry shaped like an item of the Microsoft voices list response.
        return {
            "ShortName": short_name,
            "Locale": locale,
            "LocalName": local_name,
            "LocaleName": locale_name,
            "VoiceType": "Neural",
        }

    # Three locales with a script subtag plus one plain lang-COUNTRY locale.
    api_payload = [
        neural(
            "iu-Cans-CA-SiqiniqNeural",
            "iu-Cans-CA",
            "Siqiniq",
            "Inuktitut (Canadian Aboriginal Syllabics, Canada)",
        ),
        neural(
            "iu-Latn-CA-TaqqiqNeural",
            "iu-Latn-CA",
            "Taqqiq",
            "Inuktitut (Latin, Canada)",
        ),
        neural(
            "sr-Latn-RS-NicholasNeural",
            "sr-Latn-RS",
            "Nicholas",
            "Serbian (Latin, Serbia)",
        ),
        neural(
            "en-US-JennyNeural",
            "en-US",
            "Jenny",
            "English (United States)",
        ),
    ]

    # A StringIO stands in for the HTTP response object.
    parsed = transform_voices_files(StringIO(json.dumps(api_payload)))

    # Every voice must survive the transformation.
    assert len(parsed) == 4, f"Expected 4 voices, got {len(parsed)}"
    for short_name in (
        "iu-Cans-CA-SiqiniqNeural",
        "iu-Latn-CA-TaqqiqNeural",
        "sr-Latn-RS-NicholasNeural",
        "en-US-JennyNeural",
    ):
        assert short_name in parsed

    # Every record has the expected structure with populated region fields.
    for info in parsed.values():
        for field in ("key", "name", "language", "quality"):
            assert field in info

        language = info["language"]
        for field in ("region", "country_english"):
            assert field in language

        assert language["region"] is not None
        assert language["country_english"] is not None
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def test_voice_parsing_with_secondary_locales():
    """Each secondary locale of a voice produces an additional entry."""
    multilingual_voice = {
        "ShortName": "en-US-JennyMultilingualNeural",
        "Locale": "en-US",
        "LocalName": "Jenny",
        "LocaleName": "English (United States)",
        "VoiceType": "Neural",
        "SecondaryLocaleList": ["de-DE", "es-ES"],
    }

    parsed = transform_voices_files(StringIO(json.dumps([multilingual_voice])))

    # One entry for the primary locale plus one per secondary locale.
    assert len(parsed) == 3
    for locale in ("en-US", "de-DE", "es-ES"):
        assert f"{locale}-JennyMultilingualNeural" in parsed
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def test_voice_parsing_with_standard_locales():
    """Plain ``lang-COUNTRY`` locales map to the matching region and country."""
    # (ShortName, Locale, LocalName, LocaleName) for each sample voice.
    samples = [
        ("en-US-JennyNeural", "en-US", "Jenny", "English (United States)"),
        ("fr-FR-DeniseNeural", "fr-FR", "Denise", "French (France)"),
        ("de-DE-KatjaNeural", "de-DE", "Katja", "German (Germany)"),
    ]
    api_payload = [
        {
            "ShortName": short_name,
            "Locale": locale,
            "LocalName": local_name,
            "LocaleName": locale_name,
            "VoiceType": "Neural",
        }
        for short_name, locale, local_name, locale_name in samples
    ]

    parsed = transform_voices_files(StringIO(json.dumps(api_payload)))

    # All three voices are present.
    assert len(parsed) == 3

    # Expected (region, country_english) per voice.
    expected = {
        "en-US-JennyNeural": ("US", "United States"),
        "fr-FR-DeniseNeural": ("FR", "France"),
        "de-DE-KatjaNeural": ("DE", "Germany"),
    }
    for short_name, (region, country) in expected.items():
        language = parsed[short_name]["language"]
        assert language["region"] == region
        assert language["country_english"] == country
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def test_voice_parsing_with_invalid_locales():
    """Locales without a resolvable country get the fallback values."""
    # (ShortName, Locale, LocalName, LocaleName) for each sample voice.
    samples = [
        ("xx-INVALID-TestNeural", "xx-INVALID", "Test", "Test Language"),
        ("yy-ZZ-FAKE-TestNeural", "yy-ZZ-FAKE", "Test2", "Test Language 2"),
    ]
    api_payload = [
        {
            "ShortName": short_name,
            "Locale": locale,
            "LocalName": local_name,
            "LocaleName": locale_name,
            "VoiceType": "Neural",
        }
        for short_name, locale, local_name, locale_name in samples
    ]

    parsed = transform_voices_files(StringIO(json.dumps(api_payload)))

    # Both voices are kept despite the unknown country codes.
    assert len(parsed) == 2

    # The region falls back to the last locale part, the country to "Unknown".
    expected_regions = {
        "xx-INVALID-TestNeural": "INVALID",
        "yy-ZZ-FAKE-TestNeural": "FAKE",
    }
    for short_name, region in expected_regions.items():
        language = parsed[short_name]["language"]
        assert language["region"] == region
        assert language["country_english"] == "Unknown"
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Wyoming server for Microsoft TTS."""
|
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
import argparse # noqa: D100
|
|
2
|
+
import asyncio
|
|
3
|
+
import contextlib
|
|
4
|
+
import logging
|
|
5
|
+
import os
|
|
6
|
+
import signal
|
|
7
|
+
from functools import partial
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from wyoming.info import Attribution, Info, TtsProgram, TtsVoice
|
|
11
|
+
from wyoming.server import AsyncServer
|
|
12
|
+
|
|
13
|
+
from wyoming_microsoft_tts.download import get_voices
|
|
14
|
+
from wyoming_microsoft_tts.handler import MicrosoftEventHandler
|
|
15
|
+
from wyoming_microsoft_tts.version import __version__
|
|
16
|
+
|
|
17
|
+
_LOGGER = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
stop_event = asyncio.Event()
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def handle_stop_signal(*args):
    """Handle a shutdown signal: log it, set the stop event and exit.

    Args:
        *args: Whatever the caller passes (this function is registered with
            ``signal.signal``); the values are ignored.

    Raises:
        SystemExit: Always, with exit status 0.
    """
    _LOGGER.info("Received stop signal. Shutting down...")
    stop_event.set()
    # Raise SystemExit directly instead of calling the ``exit`` builtin: that
    # helper is injected by the ``site`` module for interactive use and is
    # missing when the interpreter runs without it (for example ``python -S``).
    raise SystemExit(0)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def parse_arguments(argv=None):
    """Parse command-line arguments.

    Args:
        argv: Optional list of argument strings to parse. When ``None`` (the
            default) argparse reads the process command line, which is the
            behavior existing callers rely on.

    Returns:
        The ``argparse.Namespace`` holding the parsed options. The service
        region and subscription key default to the ``AZURE_SERVICE_REGION``
        and ``AZURE_SUBSCRIPTION_KEY`` environment variables.
    """
    parser = argparse.ArgumentParser()

    # Azure credentials
    parser.add_argument(
        "--service-region",
        default=os.getenv("AZURE_SERVICE_REGION"),
        help="Microsoft Azure region (e.g., westus2)",
    )
    parser.add_argument(
        "--subscription-key",
        default=os.getenv("AZURE_SUBSCRIPTION_KEY"),
        help="Microsoft Azure subscription key",
    )

    # Voice selection and server endpoint
    parser.add_argument(
        "--voice",
        default="en-GB-SoniaNeural",
        help="Default Microsoft voice to use (e.g., en-GB-SoniaNeural)",
    )
    parser.add_argument(
        "--download-dir",
        default="/tmp/",
        type=str,
        help="Directory to download voices.json into (default: /tmp/)",
    )
    parser.add_argument(
        "--uri", default="tcp://0.0.0.0:10200", help="unix:// or tcp://"
    )
    #
    parser.add_argument(
        "--speaker", type=str, help="Name or id of speaker for default voice"
    )
    #
    parser.add_argument(
        "--auto-punctuation", default=".?!", help="Automatically add punctuation"
    )
    parser.add_argument(
        "--no-streaming",
        action="store_true",
        help="Disable audio streaming on sentence boundaries",
    )
    parser.add_argument("--samples-per-chunk", type=int, default=1024)
    #
    parser.add_argument(
        "--update-voices",
        action="store_true",
        help="Download latest voices.json during startup",
    )
    #
    parser.add_argument("--debug", action="store_true", help="Log DEBUG messages")

    # argv=None makes argparse fall back to sys.argv[1:].
    return parser.parse_args(argv)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def validate_args(args):
    """Ensure both Azure credentials are present.

    Raises:
        ValueError: If ``args.service_region`` or ``args.subscription_key``
            is missing or empty.
    """
    if args.service_region and args.subscription_key:
        return
    raise ValueError(
        "Both --service-region and --subscription-key must be provided either as command-line arguments or environment variables."
    )
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
async def main() -> None:
    """Start the Wyoming Microsoft TTS server.

    Parses and validates the command-line arguments, loads the voice list,
    builds the Wyoming ``Info`` description from it and runs the server.

    Raises:
        ValueError: If the credentials are missing (raised by
            ``validate_args``) or the default voice is not among the loaded
            voices.
    """
    args = parse_arguments()
    validate_args(args)

    # setup logging
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
    _LOGGER.debug("Arguments parsed successfully.")

    # Load voice info
    try:
        _LOGGER.info("Starting voices loading process.")
        voices_info = get_voices(
            args.download_dir,
            update_voices=args.update_voices,
            region=args.service_region,
            key=args.subscription_key,
        )
        _LOGGER.info("Voices loaded successfully.")
    except Exception as e:
        # exception() records the traceback, which a plain error() call drops.
        _LOGGER.exception("Failed to load voices: %s", e)
        return

    # Resolve aliases for backwards compatibility with old voice names
    aliases_info: dict[str, Any] = {}
    for voice_info in voices_info.values():
        for voice_alias in voice_info.get("aliases", []):
            aliases_info[voice_alias] = {"_is_alias": True, **voice_info}

    # Make sure default voice is in the list (checked before the aliases are
    # merged in, so an alias is not accepted as the default voice)
    if args.voice not in voices_info:
        raise ValueError(
            f"Voice {args.voice} not found in voices.json, please look up the correct voice name here"
            + "\nhttps://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts"
        )

    voices_info.update(aliases_info)
    voices = [
        TtsVoice(
            name=voice_name,
            description=get_description(voice_info),
            attribution=Attribution(
                name="Microsoft",
                url="https://github.com/hugobloem/wyoming-microsoft-tts",
            ),
            installed=True,
            version=__version__,
            languages=[
                voice_info.get("language", {}).get(
                    "code",
                    voice_info.get("espeak", {}).get("voice", voice_name.split("_")[0]),
                )
            ],
            #
            # Don't send speakers for now because it overflows StreamReader buffers
            # speakers=[
            #     TtsVoiceSpeaker(name=speaker_name)
            #     for speaker_name in voice_info["speaker_id_map"]
            # ]
            # if voice_info.get("speaker_id_map")
            # else None,
        )
        for voice_name, voice_info in voices_info.items()
        # Alias entries are kept for lookup only and are not advertised.
        if not voice_info.get("_is_alias", False)
    ]

    wyoming_info = Info(
        tts=[
            TtsProgram(
                name="microsoft",
                # NOTE(review): the text below says "local"; confirm this
                # wording is intended for this service.
                description="A fast, local, neural text to speech engine",
                attribution=Attribution(
                    name="Microsoft",
                    url="https://github.com/hugobloem/wyoming-microsoft-tts",
                ),
                installed=True,
                version=__version__,
                voices=sorted(voices, key=lambda v: v.name),
                supports_synthesize_streaming=not args.no_streaming,
            )
        ],
    )

    # Start server
    server = AsyncServer.from_uri(args.uri)

    _LOGGER.info("Ready")
    try:
        await server.run(
            partial(
                MicrosoftEventHandler,
                wyoming_info,
                args,
            )
        )
    except Exception as e:
        # Keep the traceback so server failures can be diagnosed from the log.
        _LOGGER.exception("An error occurred while running the server: %s", e)
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
# -----------------------------------------------------------------------------
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def get_description(voice_info: dict[str, Any]):
    """Build the human readable label ``"<name> (<quality>)"`` for a voice.

    Underscores in the voice name are shown as spaces.
    """
    display_name = voice_info["name"].replace("_", " ")
    return "{} ({})".format(display_name, voice_info["quality"])
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
# -----------------------------------------------------------------------------
|
|
201
|
+
|
|
202
|
+
if __name__ == "__main__":
    # Route both termination signals to the same shutdown handler.
    for _signum in (signal.SIGTERM, signal.SIGINT):
        signal.signal(_signum, handle_stop_signal)

    # A KeyboardInterrupt during the run is treated as a normal stop.
    with contextlib.suppress(KeyboardInterrupt):
        asyncio.run(main())
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
"""Utility for downloading Microsoft voices."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any
|
|
7
|
+
from urllib.parse import quote, urlsplit, urlunsplit
|
|
8
|
+
from urllib.request import Request, urlopen
|
|
9
|
+
|
|
10
|
+
from pycountry import countries
|
|
11
|
+
|
|
12
|
+
URL_FORMAT = "https://{region}.tts.speech.microsoft.com/cognitiveservices/voices/list"
|
|
13
|
+
URL_HEADER = "Ocp-Apim-Subscription-Key"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
_DIR = Path(__file__).parent
|
|
17
|
+
_LOGGER = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
_SKIP_FILES = {"MODEL_CARD"}
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class VoiceNotFoundError(Exception):
    """Error signalling that a requested voice is not available."""
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _quote_url(url: str) -> str:
|
|
29
|
+
"""Quote file part of URL in case it contains UTF-8 characters."""
|
|
30
|
+
parts = list(urlsplit(url))
|
|
31
|
+
parts[2] = quote(parts[2])
|
|
32
|
+
return urlunsplit(parts)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _get_country_from_locale(locale: str):
|
|
36
|
+
"""Extract country information from a locale string.
|
|
37
|
+
|
|
38
|
+
Handles both standard (lang-COUNTRY) and extended (lang-script-COUNTRY) locale formats.
|
|
39
|
+
"""
|
|
40
|
+
parts = locale.split("-")
|
|
41
|
+
|
|
42
|
+
# For extended locales like "iu-Cans-CA", the country code is the last part
|
|
43
|
+
if len(parts) >= 3:
|
|
44
|
+
country_code = parts[-1]
|
|
45
|
+
elif len(parts) == 2:
|
|
46
|
+
country_code = parts[1]
|
|
47
|
+
else:
|
|
48
|
+
return None
|
|
49
|
+
|
|
50
|
+
return countries.get(alpha_2=country_code)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _region_and_country(locale, warning):
    """Return ``(region, country_name)`` for *locale*, with fallback values.

    When no country can be resolved, *warning* (a log format string taking
    the locale) is emitted, the last ``-`` separated part of the locale is
    used as the region and the country name is ``"Unknown"``.
    """
    country = _get_country_from_locale(locale)
    if country is not None:
        return country.alpha_2, country.name

    _LOGGER.warning(warning, locale)
    return locale.split("-")[-1], "Unknown"


def _voice_record(entry, locale, language_name, region, country_name):
    """Build one Wyoming voice record for *entry* under the given locale."""
    return {
        # The key always stays the original ShortName, also for records that
        # are stored under a secondary-locale name.
        "key": entry["ShortName"],
        "name": entry["LocalName"],
        "language": {
            "code": locale,
            "family": locale.split("-")[0],
            "region": region,
            "name_native": language_name,
            "name_english": language_name,
            "country_english": country_name,
        },
        "quality": entry["VoiceType"],
        "num_speakers": 1,
        "speaker_id_map": {},
        "aliases": [],
    }


def transform_voices_files(response):
    """Transform the voices.json file from the Microsoft API to the format used by Wyoming.

    Args:
        response: A readable file-like object containing the JSON voices
            list (anything ``json.load`` accepts).

    Returns:
        A dict mapping voice names to voice records. Each voice is stored
        under its ``ShortName``; for every secondary locale an additional
        record is stored under the ``ShortName`` with the primary locale
        replaced by the secondary one. Entries that are not dicts are
        skipped, and entries that fail to parse are logged and skipped.
    """
    json_response = json.load(response)
    voices = {}
    for entry in json_response:
        if not isinstance(entry, dict):
            continue

        try:
            locale = entry["Locale"]
            region, country_name = _region_and_country(
                locale,
                "Could not find country for locale %s, using fallback values",
            )
            voices[entry["ShortName"]] = _voice_record(
                entry, locale, entry["LocaleName"], region, country_name
            )

            # A missing or null SecondaryLocaleList simply means there are no
            # secondary locales.
            for secondary_locale in entry.get("SecondaryLocaleList") or []:
                secondary_region, secondary_country_name = _region_and_country(
                    secondary_locale,
                    "Could not find country for secondary locale %s, using fallback values",
                )
                secondary_name = entry["ShortName"].replace(locale, secondary_locale)
                # No display name is available for a secondary locale, so the
                # locale code itself is used as the language name.
                voices[secondary_name] = _voice_record(
                    entry,
                    secondary_locale,
                    secondary_locale,
                    secondary_region,
                    secondary_country_name,
                )
        except Exception as e:
            _LOGGER.exception(
                "Failed to parse voice %s", entry.get("ShortName", "Unknown")
            )
            _LOGGER.debug("%s: %s", entry.get("ShortName", "Unknown"), e)
    return voices
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def get_voices(
    download_dir: str | Path,
    update_voices: bool = False,
    region: str = "westus",
    key: str = "",
) -> dict[str, Any]:
    """Load available voices from downloaded or embedded JSON file.

    Args:
        download_dir: Directory holding the downloaded ``voices.json``; it is
            created if it does not exist.
        update_voices: When true, try to download the latest voices list
            first. A failed download is logged and does not raise.
        region: Azure region used to build the voices list URL.
        key: Subscription key sent with the download request.

    Returns:
        The voices dict, taken from the downloaded file when it exists and
        can be loaded, otherwise from the embedded ``voices.json``.
    """
    download_dir = Path(download_dir)
    download_dir.mkdir(parents=True, exist_ok=True)
    voices_download = download_dir / "voices.json"

    if update_voices:
        # Download latest voices.json
        try:
            voices_url = URL_FORMAT.format(region=region)
            voices_hdr = {URL_HEADER: key}
            _LOGGER.debug("Downloading %s to %s", voices_url, voices_download)
            req = Request(_quote_url(voices_url), headers=voices_hdr)

            # Fetch and transform completely before touching the cache file,
            # so a bad response cannot truncate an earlier good download.
            with urlopen(req) as response:
                latest_voices = transform_voices_files(response)

            # An empty result would be cached and then preferred over the
            # embedded list, leaving no voices at all; treat it as a failure.
            if not latest_voices:
                raise ValueError("Downloaded voices list contains no voices")

            # Written as UTF-8 to match the encoding used when reading it back.
            with open(voices_download, "w", encoding="utf-8") as download_file:
                json.dump(latest_voices, download_file, indent=4)
        except Exception:
            _LOGGER.exception("Failed to update voices list")

    # Prefer downloaded file to embedded
    if voices_download.exists():
        try:
            _LOGGER.debug("Loading downloaded file: %s", voices_download)
            with open(voices_download, encoding="utf-8") as voices_file:
                # The downloaded file is stored already transformed.
                return json.load(voices_file)
        except Exception:
            _LOGGER.exception("Failed to load %s", voices_download)

    # Fall back to embedded
    voices_embedded = _DIR / "voices.json"
    _LOGGER.debug("Loading embedded file: %s", voices_embedded)
    with open(voices_embedded, encoding="utf-8") as voices_file:
        # The embedded file is passed through the same transform as a download.
        return transform_voices_files(voices_file)
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def find_voice(name: str, download_dir: str | Path) -> dict[str, Any]:
    """Return the voice info dict for the voice called *name*.

    The voices are loaded with ``get_voices(download_dir)``.

    Raises:
        VoiceNotFoundError: If *name* is not among the loaded voices.
    """
    available = get_voices(download_dir)
    if name not in available:
        raise VoiceNotFoundError(name)

    return available[name]
|