wyoming-microsoft-tts 1.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/__init__.py +1 -0
- tests/conftest.py +26 -0
- tests/test_download.py +75 -0
- tests/test_microsoft_tts.py +17 -0
- tests/test_voice_parsing.py +169 -0
- wyoming_microsoft_tts/__init__.py +1 -0
- wyoming_microsoft_tts/__main__.py +208 -0
- wyoming_microsoft_tts/download.py +182 -0
- wyoming_microsoft_tts/handler.py +183 -0
- wyoming_microsoft_tts/microsoft_tts.py +62 -0
- wyoming_microsoft_tts/sentence_boundary.py +63 -0
- wyoming_microsoft_tts/version.py +3 -0
- wyoming_microsoft_tts/voices.json +12419 -0
- wyoming_microsoft_tts-1.3.3.dist-info/METADATA +92 -0
- wyoming_microsoft_tts-1.3.3.dist-info/RECORD +17 -0
- wyoming_microsoft_tts-1.3.3.dist-info/WHEEL +5 -0
- wyoming_microsoft_tts-1.3.3.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
"""Event handler for clients of the server."""
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import logging
|
|
5
|
+
import math
|
|
6
|
+
import os
|
|
7
|
+
import wave
|
|
8
|
+
|
|
9
|
+
from wyoming.audio import AudioChunk, AudioStart, AudioStop
|
|
10
|
+
from wyoming.error import Error
|
|
11
|
+
from wyoming.event import Event
|
|
12
|
+
from wyoming.info import Describe, Info
|
|
13
|
+
from wyoming.server import AsyncEventHandler
|
|
14
|
+
from wyoming.tts import (
|
|
15
|
+
Synthesize,
|
|
16
|
+
SynthesizeChunk,
|
|
17
|
+
SynthesizeStart,
|
|
18
|
+
SynthesizeStop,
|
|
19
|
+
SynthesizeStopped,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
from .microsoft_tts import MicrosoftTTS
|
|
23
|
+
from .sentence_boundary import SentenceBoundaryDetector, remove_asterisks
|
|
24
|
+
|
|
25
|
+
_LOGGER = logging.getLogger(__name__)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class MicrosoftEventHandler(AsyncEventHandler):
    """Event handler that answers client TTS requests with Microsoft TTS audio.

    A stand-alone ``Synthesize`` event is synthesized immediately.  Unless
    ``cli_args.no_streaming`` is set, a text stream (``SynthesizeStart``,
    ``SynthesizeChunk``..., ``SynthesizeStop``) is split into sentences and
    every completed sentence is synthesized as soon as it is available.
    """

    def __init__(
        self,
        wyoming_info: Info,
        cli_args: argparse.Namespace,
        *args,
        **kwargs,
    ) -> None:
        """Initialize.

        Args:
            wyoming_info: Service description sent in reply to ``Describe``.
            cli_args: Parsed command line options; this class reads
                ``no_streaming``, ``voice``, ``auto_punctuation`` and
                ``samples_per_chunk`` and hands the namespace to ``MicrosoftTTS``.
            *args: Forwarded to ``AsyncEventHandler``.
            **kwargs: Forwarded to ``AsyncEventHandler``.
        """
        super().__init__(*args, **kwargs)

        self.cli_args = cli_args
        self.wyoming_info_event = wyoming_info.event()
        self.microsoft_tts = MicrosoftTTS(cli_args)
        self.sbd = SentenceBoundaryDetector()
        # Set to True by SynthesizeStart.
        # NOTE(review): nothing resets this after SynthesizeStop, so later
        # stand-alone Synthesize events on the same handler are ignored --
        # confirm that this is intended.
        self.is_streaming: bool | None = None
        # Request object reused for every sentence of the current stream.
        self._synthesize: Synthesize | None = None

    async def handle_event(self, event: Event) -> bool:  # noqa: C901
        """Handle an event.

        Returns:
            ``True`` for every event except a stand-alone ``Synthesize`` that
            is processed while streaming is enabled; in that case the result
            of ``_handle_synthesize`` is returned.

        Raises:
            Exception: Anything raised while processing a non-``Describe``
                event is reported to the client as an ``Error`` event and
                then re-raised.
        """
        if Describe.is_type(event.type):
            await self.write_event(self.wyoming_info_event)
            _LOGGER.debug("Sent info")
            return True

        try:
            if Synthesize.is_type(event.type):
                if self.is_streaming:
                    # A stream was started; its text is taken from the chunk
                    # events, so this event is not synthesized.
                    return True

                synthesize = Synthesize.from_event(event)
                synthesize.text = remove_asterisks(synthesize.text)
                # Synthesize exactly once and return right away.  Falling
                # through to the end of this method would synthesize the same
                # request a second time.
                succeeded = await self._handle_synthesize(synthesize)
                # Same return values as before: always True when streaming is
                # disabled, otherwise the outcome of the synthesis.
                return True if self.cli_args.no_streaming else succeeded

            if self.cli_args.no_streaming:
                # Stream events are ignored when streaming is disabled.
                return True

            if SynthesizeStart.is_type(event.type):
                # Start of a stream
                stream_start = SynthesizeStart.from_event(event)
                self.is_streaming = True
                self.sbd = SentenceBoundaryDetector()
                self._synthesize = Synthesize(text="", voice=stream_start.voice)
                _LOGGER.debug("Text stream started: voice=%s", stream_start.voice)
                return True

            if SynthesizeChunk.is_type(event.type):
                assert self._synthesize is not None
                stream_chunk = SynthesizeChunk.from_event(event)
                for sentence in self.sbd.add_chunk(stream_chunk.text):
                    _LOGGER.debug("Synthesizing stream sentence: %s", sentence)
                    self._synthesize.text = sentence
                    await self._handle_synthesize(self._synthesize)

                return True

            if SynthesizeStop.is_type(event.type):
                assert self._synthesize is not None
                self._synthesize.text = self.sbd.finish()
                if self._synthesize.text:
                    # Final audio chunk(s)
                    await self._handle_synthesize(self._synthesize)

                # End of audio
                await self.write_event(SynthesizeStopped().event())

                _LOGGER.debug("Text stream stopped")
                return True

            # Any other event type is ignored.
            return True
        except Exception as err:
            await self.write_event(
                Error(text=str(err), code=err.__class__.__name__).event()
            )
            raise

    async def _handle_synthesize(self, synthesize: Synthesize) -> bool:
        """Synthesize one request and send the audio to the client.

        Writes ``AudioStart``, the ``AudioChunk`` events and ``AudioStop``.
        The temporary WAV file is removed whether or not sending succeeds.

        Returns:
            ``True`` when the audio was sent completely, ``False`` when
            synthesis or sending failed (the failure is logged).
        """
        _LOGGER.debug(synthesize)

        # Join multiple lines
        text = " ".join(synthesize.text.strip().splitlines())

        if synthesize.voice is None:  # Use default voice if not specified
            voice = self.cli_args.voice
        else:
            voice = synthesize.voice.name

        auto_punctuation = self.cli_args.auto_punctuation
        if auto_punctuation and text and text[-1] not in auto_punctuation:
            # Add automatic punctuation (important for some voices)
            text = text + auto_punctuation[0]

        _LOGGER.debug("Synthesizing: %s", text)
        try:
            output_path = self.microsoft_tts.synthesize(text=text, voice=voice)
        except Exception as e:
            _LOGGER.error("Failed to synthesize text: %s", e)
            return False

        if not output_path:
            # synthesize() returns no path when the request did not complete.
            _LOGGER.error("Failed to synthesize text: no audio file was produced")
            return False

        _LOGGER.debug("Synthesized text")
        try:
            await self._send_wav(output_path)
        except Exception as e:
            _LOGGER.error("Failed to send audio: %s", e)
            return False
        finally:
            # Always remove the temporary file, including on the failure path.
            try:
                os.unlink(output_path)
            except OSError as e:
                _LOGGER.warning("Failed to remove %s: %s", output_path, e)

        await self.write_event(AudioStop().event())
        _LOGGER.debug("Completed request")

        return True

    async def _send_wav(self, wav_path: str) -> None:
        """Send the WAV file at ``wav_path`` as AudioStart and AudioChunk events."""
        with wave.open(wav_path, "rb") as wav_file:
            rate = wav_file.getframerate()
            width = wav_file.getsampwidth()
            channels = wav_file.getnchannels()

            await self.write_event(
                AudioStart(
                    rate=rate,
                    width=width,
                    channels=channels,
                ).event(),
            )

            # Audio
            audio_bytes = wav_file.readframes(wav_file.getnframes())

        bytes_per_sample = width * channels
        bytes_per_chunk = bytes_per_sample * self.cli_args.samples_per_chunk
        num_chunks = math.ceil(len(audio_bytes) / bytes_per_chunk)

        # Split into chunks
        for i in range(num_chunks):
            offset = i * bytes_per_chunk
            await self.write_event(
                AudioChunk(
                    audio=audio_bytes[offset : offset + bytes_per_chunk],
                    rate=rate,
                    width=width,
                    channels=channels,
                ).event(),
            )
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""Microsoft TTS."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import tempfile
|
|
5
|
+
import time
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import azure.cognitiveservices.speech as speechsdk
|
|
9
|
+
|
|
10
|
+
from .download import get_voices
|
|
11
|
+
|
|
12
|
+
_LOGGER = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class MicrosoftTTS:
    """Class to handle Microsoft TTS."""

    def __init__(self, args) -> None:
        """Initialize.

        Args:
            args: Options object; ``subscription_key``, ``service_region`` and
                ``download_dir`` are read here, ``voice`` is read by
                ``synthesize`` as the default voice.
        """
        _LOGGER.debug("Initialize Microsoft TTS")
        self.args = args
        self.speech_config = speechsdk.SpeechConfig(
            subscription=args.subscription_key, region=args.service_region
        )

        # str(TemporaryDirectory()) is the object's repr, not its path, and an
        # unreferenced TemporaryDirectory is deleted again as soon as it is
        # garbage collected.  Keep the object alive on the instance and take
        # the real path from ``.name``.
        self._temp_dir = tempfile.TemporaryDirectory()
        self.output_dir = Path(self._temp_dir.name)

        self.voices = get_voices(args.download_dir)

    def synthesize(self, text, voice=None):
        """Synthesize text to speech.

        Args:
            text: Text to speak.
            voice: Key into ``self.voices``; ``self.args.voice`` is used when
                this is ``None``.

        Returns:
            Path (``str``) of the WAV file written into ``self.output_dir``
            when synthesis completed, otherwise ``None``.

        Raises:
            KeyError: ``voice`` is not present in ``self.voices``.
        """
        _LOGGER.debug("Requested TTS for [%s]", text)
        if voice is None:
            voice = self.args.voice

        # Convert the requested voice to the key microsoft use.
        self.speech_config.speech_synthesis_voice_name = self.voices[voice]["key"]

        file_name = self.output_dir / f"{time.monotonic_ns()}.wav"
        audio_config = speechsdk.audio.AudioOutputConfig(filename=str(file_name))

        speech_synthesizer = speechsdk.SpeechSynthesizer(
            speech_config=self.speech_config, audio_config=audio_config
        )

        result = speech_synthesizer.speak_text_async(text).get()

        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            _LOGGER.debug("Speech synthesized for text [%s]", text)
            return str(file_name)

        if result.reason == speechsdk.ResultReason.Canceled:
            cancellation_details = result.cancellation_details
            _LOGGER.warning("Speech synthesis canceled: %s", cancellation_details.reason)
            if cancellation_details.reason == speechsdk.CancellationReason.Error:
                _LOGGER.warning("Error details: %s", cancellation_details.error_details)
        else:
            _LOGGER.warning("Speech synthesis did not complete: %s", result.reason)

        # No usable audio file was produced.
        return None
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""Guess the sentence boundaries in text."""
|
|
2
|
+
|
|
3
|
+
from collections.abc import Iterable
|
|
4
|
+
|
|
5
|
+
import regex as re
|
|
6
|
+
|
|
7
|
+
# Alternation of character classes listing the punctuation marks that are
# treated as the end of a sentence (".", "!", "?", "…", "。", "؟", "।", "॥").
# It is interpolated into SENTENCE_BOUNDARY_RE below.
SENTENCE_END = r"[.!?…]|[。!?]|[؟]|[।॥]"
|
|
8
|
+
# One to three letters followed by "." at the very end of the searched string.
# SentenceBoundaryDetector applies it to the last five characters of a
# candidate sentence to decide whether the sentence is held back.
ABBREVIATION_RE = re.compile(r"\b\p{L}{1,3}\.$", re.UNICODE)
|
|
9
|
+
|
|
10
|
+
# Group 1 lazily captures text up to and including a run of SENTENCE_END
# characters.  The lookahead only accepts the match when it is followed by
# whitespace and a letter of category Lu, Lt or Lo, or by whitespace, digits,
# "." and whitespace (presumably the start of a numbered list item).
# DOTALL lets "." in the pattern match newlines as well.
SENTENCE_BOUNDARY_RE = re.compile(
|
|
11
|
+
rf"(.*?(?:{SENTENCE_END}+))(?=\s+[\p{{Lu}}\p{{Lt}}\p{{Lo}}]|(?:\s+\d+\.\s+))",
|
|
12
|
+
re.DOTALL,
|
|
13
|
+
)
|
|
14
|
+
# A run of non-asterisk characters wrapped in one or more asterisks on each
# side; group 1 is the wrapped text.
WORD_ASTERISKS = re.compile(r"\*+([^\*]+)\*+")
|
|
15
|
+
# Optional whitespace followed by one or more asterisks, located at the start
# of the string or directly after a newline.
LINE_ASTERICKS = re.compile(r"(?<=^|\n)\s*\*+")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class SentenceBoundaryDetector:
    """Split incrementally received text into sentences.

    ``remaining_text`` holds input that has not been matched by
    ``SENTENCE_BOUNDARY_RE`` yet; ``current_sentence`` holds matched text that
    is being held back because it ends in what ``ABBREVIATION_RE`` matches.
    """

    def __init__(self) -> None:
        """Start with nothing buffered."""
        self.current_sentence = ""
        self.remaining_text = ""

    def add_chunk(self, chunk: str) -> Iterable[str]:
        """Buffer ``chunk`` and yield every sentence that is now complete.

        Yielded sentences are stripped of surrounding whitespace and passed
        through ``remove_asterisks``.
        """
        self.remaining_text += chunk

        while self.remaining_text:
            boundary = SENTENCE_BOUNDARY_RE.search(self.remaining_text)
            if boundary is None:
                # No complete sentence yet; wait for more text.
                break

            piece = boundary.group(0)
            pending = self.current_sentence

            if pending and ABBREVIATION_RE.search(pending[-5:]) is None:
                # The held text is a finished sentence: emit it first.
                yield remove_asterisks(pending.strip())
                self.current_sentence = piece
            else:
                # Nothing held, or the held text ends in an abbreviation:
                # the new piece continues it.
                self.current_sentence = pending + piece

            if ABBREVIATION_RE.search(self.current_sentence[-5:]) is None:
                yield remove_asterisks(self.current_sentence.strip())
                self.current_sentence = ""

            self.remaining_text = self.remaining_text[boundary.end() :]

    def finish(self) -> str:
        """Return all buffered text as one cleaned string and reset the buffers."""
        leftover = self.current_sentence + self.remaining_text
        self.current_sentence = ""
        self.remaining_text = ""

        return remove_asterisks(leftover.strip())
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def remove_asterisks(text: str) -> str:
    """Strip asterisk markup from ``text``.

    Asterisks wrapping a run of text (``*word*``, ``**word**``) are dropped
    while the wrapped text is kept; afterwards asterisks at the beginning of a
    line are removed together with any whitespace in front of them.
    """
    unwrapped = WORD_ASTERISKS.sub(r"\1", text)
    return LINE_ASTERICKS.sub("", unwrapped)
|