wyoming-microsoft-tts 1.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,183 @@
1
+ """Event handler for clients of the server."""
2
+
3
+ import argparse
4
+ import logging
5
+ import math
6
+ import os
7
+ import wave
8
+
9
+ from wyoming.audio import AudioChunk, AudioStart, AudioStop
10
+ from wyoming.error import Error
11
+ from wyoming.event import Event
12
+ from wyoming.info import Describe, Info
13
+ from wyoming.server import AsyncEventHandler
14
+ from wyoming.tts import (
15
+ Synthesize,
16
+ SynthesizeChunk,
17
+ SynthesizeStart,
18
+ SynthesizeStop,
19
+ SynthesizeStopped,
20
+ )
21
+
22
+ from .microsoft_tts import MicrosoftTTS
23
+ from .sentence_boundary import SentenceBoundaryDetector, remove_asterisks
24
+
25
+ _LOGGER = logging.getLogger(__name__)
26
+
27
+
28
class MicrosoftEventHandler(AsyncEventHandler):
    """Event handler for clients of the server.

    Replies to ``Describe`` with the service info and turns ``Synthesize``
    requests (one-shot, or streamed as start/chunk/stop events) into
    ``AudioStart`` / ``AudioChunk`` / ``AudioStop`` events.
    """

    def __init__(
        self,
        wyoming_info: Info,
        cli_args: argparse.Namespace,
        *args,
        **kwargs,
    ) -> None:
        """Initialize.

        Args:
            wyoming_info: Service description; its event is sent in reply to
                ``Describe``.
            cli_args: Parsed command-line options. This class reads ``voice``,
                ``auto_punctuation``, ``samples_per_chunk`` and
                ``no_streaming``; the namespace is also handed to
                ``MicrosoftTTS``.
            *args: Forwarded to ``AsyncEventHandler``.
            **kwargs: Forwarded to ``AsyncEventHandler``.
        """
        super().__init__(*args, **kwargs)

        self.cli_args = cli_args
        self.wyoming_info_event = wyoming_info.event()
        self.microsoft_tts = MicrosoftTTS(cli_args)
        self.sbd = SentenceBoundaryDetector()
        # True between SynthesizeStart and SynthesizeStop.
        self.is_streaming: bool | None = None
        # Request object reused for every sentence of the current stream.
        self._synthesize: Synthesize | None = None

    async def handle_event(self, event: Event) -> bool:  # noqa: C901
        """Handle an event.

        Args:
            event: Incoming client event.

        Returns:
            ``True`` for handled or ignored events; the result of
            ``_handle_synthesize`` for a one-shot ``Synthesize`` request.
            NOTE(review): the Wyoming server loop is expected to stop serving
            the client on ``False`` -- confirm against the Wyoming server API.

        Raises:
            Exception: Any error raised while handling a synthesis event is
                reported to the client as an ``Error`` event and re-raised.
        """
        if Describe.is_type(event.type):
            await self.write_event(self.wyoming_info_event)
            _LOGGER.debug("Sent info")
            return True

        try:
            if Synthesize.is_type(event.type):
                if self.is_streaming:
                    # Inside a stream the text arrives as chunks; the plain
                    # Synthesize event would duplicate it, so it is ignored.
                    return True

                synthesize = Synthesize.from_event(event)
                synthesize.text = remove_asterisks(synthesize.text)
                # Return here: falling through used to reach a second
                # _handle_synthesize call and send the audio twice.
                return await self._handle_synthesize(synthesize)

            if self.cli_args.no_streaming:
                # Streaming events are ignored when streaming is disabled.
                return True

            if SynthesizeStart.is_type(event.type):
                # Start of a stream
                stream_start = SynthesizeStart.from_event(event)
                self.is_streaming = True
                self.sbd = SentenceBoundaryDetector()
                self._synthesize = Synthesize(text="", voice=stream_start.voice)
                _LOGGER.debug("Text stream started: voice=%s", stream_start.voice)
                return True

            if SynthesizeChunk.is_type(event.type):
                assert self._synthesize is not None
                stream_chunk = SynthesizeChunk.from_event(event)
                for sentence in self.sbd.add_chunk(stream_chunk.text):
                    _LOGGER.debug("Synthesizing stream sentence: %s", sentence)
                    self._synthesize.text = sentence
                    await self._handle_synthesize(self._synthesize)

                return True

            if SynthesizeStop.is_type(event.type):
                assert self._synthesize is not None
                self._synthesize.text = self.sbd.finish()
                if self._synthesize.text:
                    # Final audio chunk(s)
                    await self._handle_synthesize(self._synthesize)

                # End of audio
                await self.write_event(SynthesizeStopped().event())

                # The stream is over; accept one-shot Synthesize events again.
                self.is_streaming = False

                _LOGGER.debug("Text stream stopped")
                return True

            # Any other event type is not for this handler.
            return True
        except Exception as err:
            await self.write_event(
                Error(text=str(err), code=err.__class__.__name__).event()
            )
            raise

    async def _handle_synthesize(self, synthesize: Synthesize) -> bool:
        """Synthesize one piece of text and send it to the client as audio.

        Args:
            synthesize: Request carrying the text and, optionally, a voice.

        Returns:
            ``True`` once ``AudioStart``, the audio chunks and ``AudioStop``
            have been written; ``False`` when synthesis or sending failed
            (the failure is logged).
        """
        _LOGGER.debug(synthesize)

        # Join multiple lines
        text = " ".join(synthesize.text.strip().splitlines())

        # Use default voice if not specified
        voice = (
            self.cli_args.voice if synthesize.voice is None else synthesize.voice.name
        )

        # Add automatic punctuation (important for some voices)
        punctuation = self.cli_args.auto_punctuation
        if punctuation and text and not text.endswith(tuple(punctuation)):
            text += punctuation[0]

        _LOGGER.debug("Synthesizing: %s", text)
        try:
            output_path = self.microsoft_tts.synthesize(text=text, voice=voice)
        except Exception as e:
            _LOGGER.error("Failed to synthesize text: %s", e)
            return False

        if not output_path:
            # synthesize() yields no path when the service did not complete
            # the request (for example, when it was canceled).
            _LOGGER.error("Failed to synthesize text: %s", "no audio was produced")
            return False

        _LOGGER.debug("Synthesized text")
        try:
            with wave.open(output_path, "rb") as wav_file:
                rate = wav_file.getframerate()
                width = wav_file.getsampwidth()
                channels = wav_file.getnchannels()
                audio_bytes = wav_file.readframes(wav_file.getnframes())

            await self.write_event(
                AudioStart(
                    rate=rate,
                    width=width,
                    channels=channels,
                ).event(),
            )

            # Split into chunks
            bytes_per_chunk = width * channels * self.cli_args.samples_per_chunk
            for offset in range(0, len(audio_bytes), bytes_per_chunk):
                await self.write_event(
                    AudioChunk(
                        audio=audio_bytes[offset : offset + bytes_per_chunk],
                        rate=rate,
                        width=width,
                        channels=channels,
                    ).event(),
                )
        except Exception as e:
            _LOGGER.error("Failed to send audio: %s", e)
            return False
        finally:
            # Always remove the temporary WAV file, including on failure.
            try:
                os.unlink(output_path)
            except OSError as e:
                _LOGGER.warning("Failed to remove %s: %s", output_path, e)

        await self.write_event(AudioStop().event())
        _LOGGER.debug("Completed request")

        return True
@@ -0,0 +1,62 @@
1
+ """Microsoft TTS."""
2
+
3
+ import logging
4
+ import tempfile
5
+ import time
6
+ from pathlib import Path
7
+
8
+ import azure.cognitiveservices.speech as speechsdk
9
+
10
+ from .download import get_voices
11
+
12
+ _LOGGER = logging.getLogger(__name__)
13
+
14
+
15
class MicrosoftTTS:
    """Class to handle Microsoft TTS."""

    def __init__(self, args) -> None:
        """Initialize.

        Args:
            args: Parsed options; ``subscription_key``, ``service_region``,
                ``download_dir`` and ``voice`` are read from it.
        """
        _LOGGER.debug("Initialize Microsoft TTS")
        self.args = args
        self.speech_config = speechsdk.SpeechConfig(
            subscription=args.subscription_key, region=args.service_region
        )

        # Keep the TemporaryDirectory object referenced: the directory is
        # removed as soon as that object is cleaned up.
        self._temp_dir, self.output_dir = self._create_output_dir()

        self.voices = get_voices(args.download_dir)

    @staticmethod
    def _create_output_dir():
        """Create a temporary directory and return it with its path.

        Returns:
            A ``(TemporaryDirectory, Path)`` pair. The path comes from the
            directory's ``name`` attribute; ``str()`` of the object is only
            its repr and must not be used as a path.
        """
        temp_dir = tempfile.TemporaryDirectory()
        return temp_dir, Path(temp_dir.name)

    def synthesize(self, text, voice=None):
        """Synthesize text to speech.

        Args:
            text: Text to speak.
            voice: Key into ``self.voices``; ``self.args.voice`` is used when
                it is ``None``.

        Returns:
            Path (``str``) of the written WAV file when synthesis completed,
            otherwise ``None`` (the reason is logged).

        Raises:
            KeyError: If ``voice`` is not present in ``self.voices``.
        """
        _LOGGER.debug("Requested TTS for [%s]", text)
        if voice is None:
            voice = self.args.voice

        # Convert the requested voice to the key microsoft use.
        self.speech_config.speech_synthesis_voice_name = self.voices[voice]["key"]

        file_name = self.output_dir / f"{time.monotonic_ns()}.wav"
        audio_config = speechsdk.audio.AudioOutputConfig(filename=str(file_name))

        speech_synthesizer = speechsdk.SpeechSynthesizer(
            speech_config=self.speech_config, audio_config=audio_config
        )

        result = speech_synthesizer.speak_text_async(text).get()

        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            _LOGGER.debug("Speech synthesized for text [%s]", text)
            return str(file_name)

        if result.reason == speechsdk.ResultReason.Canceled:
            cancellation_details = result.cancellation_details
            _LOGGER.warning(
                "Speech synthesis canceled: %s", cancellation_details.reason
            )
            if cancellation_details.reason == speechsdk.CancellationReason.Error:
                _LOGGER.warning(
                    "Error details: %s", cancellation_details.error_details
                )

        # No completed audio: make the "no result" outcome explicit.
        return None
@@ -0,0 +1,63 @@
1
+ """Guess the sentence boundaries in text."""
2
+
3
+ from collections.abc import Iterable
4
+
5
+ import regex as re
6
+
7
# Alternation of character classes that end a sentence: ASCII ". ! ?" and the
# ellipsis, the CJK full stop group, the Arabic question mark, and the
# Devanagari danda / double danda.
+ SENTENCE_END = r"[.!?…]|[。!?]|[؟]|[।॥]"
8
# One to three letters followed by a period at the very end of the searched
# text. The detector applies it to the last five characters of a sentence to
# spot abbreviations.
+ ABBREVIATION_RE = re.compile(r"\b\p{L}{1,3}\.$", re.UNICODE)
9
+
10
# Lazily captures text up to a sentence-end character, but only when that is
# followed by whitespace and an uppercase / titlecase / "other" letter, or by
# a numbered-list marker (whitespace, digits, ".", whitespace). DOTALL lets
# "." span newlines.
# NOTE(review): SENTENCE_END is interpolated without its own group, so the
# trailing "+" binds only to the last alternative ([।॥]) -- confirm intended.
+ SENTENCE_BOUNDARY_RE = re.compile(
11
+ rf"(.*?(?:{SENTENCE_END}+))(?=\s+[\p{{Lu}}\p{{Lt}}\p{{Lo}}]|(?:\s+\d+\.\s+))",
12
+ re.DOTALL,
13
+ )
14
# Text wrapped in one or more asterisks on each side; group 1 is the inner text.
+ WORD_ASTERISKS = re.compile(r"\*+([^\*]+)\*+")
15
# Asterisks (with optional leading whitespace) at the start of the text or
# right after a newline. The name's spelling is what remove_asterisks uses.
+ LINE_ASTERICKS = re.compile(r"(?<=^|\n)\s*\*+")
16
+
17
+
18
class SentenceBoundaryDetector:
    """Split incrementally received text into sentences."""

    def __init__(self) -> None:
        """Start with empty buffers."""
        # Text received but not yet matched against a sentence boundary.
        self.remaining_text = ""
        # Matched text held back because it ends in what looks like an
        # abbreviation.
        self.current_sentence = ""

    def add_chunk(self, chunk: str) -> Iterable[str]:
        """Buffer ``chunk`` and yield each sentence that is now complete.

        Yielded sentences are stripped and have asterisk markup removed.
        Text after the last detected boundary stays buffered.
        """
        self.remaining_text += chunk
        while self.remaining_text:
            boundary = SENTENCE_BOUNDARY_RE.search(self.remaining_text)
            if not boundary:
                return

            piece = boundary.group(0)
            pending = self.current_sentence

            if pending and not ABBREVIATION_RE.search(pending[-5:]):
                # The held-back text is a finished sentence: emit it first.
                yield remove_asterisks(pending.strip())
                self.current_sentence = piece
            else:
                # Nothing held back, or it ended in an abbreviation: extend it.
                self.current_sentence = pending + piece

            ends_in_abbreviation = ABBREVIATION_RE.search(self.current_sentence[-5:])
            if not ends_in_abbreviation:
                yield remove_asterisks(self.current_sentence.strip())
                self.current_sentence = ""

            self.remaining_text = self.remaining_text[boundary.end() :]

    def finish(self) -> str:
        """Return all buffered text as one item and reset the detector."""
        leftover = f"{self.current_sentence}{self.remaining_text}".strip()
        self.current_sentence = ""
        self.remaining_text = ""
        return remove_asterisks(leftover)
57
+
58
+
59
def remove_asterisks(text: str) -> str:
    """Strip *asterisk* markup: around **words** first, then at line starts."""
    unwrapped = WORD_ASTERISKS.sub(r"\1", text)
    return LINE_ASTERICKS.sub("", unwrapped)
@@ -0,0 +1,3 @@
1
+ """Version information."""
2
+
3
# Version string of this package release.
+ __version__ = "1.3.3"