wyoming-microsoft-stt 1.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tests/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """Tests."""
tests/conftest.py ADDED
@@ -0,0 +1,15 @@
1
+ """Fixtures for tests."""
2
+
3
+ from wyoming_microsoft_stt import SpeechConfig
4
+ import pytest
5
+ import os
6
+
7
+
8
@pytest.fixture
def microsoft_stt_args():
    """Return a SpeechConfig built from the SPEECH_KEY and SPEECH_REGION environment variables."""
    credentials = {
        "subscription_key": os.environ.get("SPEECH_KEY"),
        "service_region": os.environ.get("SPEECH_REGION"),
    }
    return SpeechConfig(**credentials)
@@ -0,0 +1,19 @@
1
+ """Tests for the MicrosoftSTT class."""
2
+
3
+ from wyoming_microsoft_stt.microsoft_stt import MicrosoftSTT
4
+
5
+
6
def test_initialize(microsoft_stt_args):
    """Check that constructing MicrosoftSTT leaves a speech configuration on the instance."""
    stt = MicrosoftSTT(microsoft_stt_args)
    assert stt.speech_config is not None
10
+
11
+
12
def test_set_profanity(microsoft_stt_args):
    """Check that a profanity level can be applied to an initialized MicrosoftSTT."""
    stt = MicrosoftSTT(microsoft_stt_args)
    assert stt.speech_config is not None

    # There is currently no way to check the set profanity level,
    # so the test only verifies that the call completes.
    stt.set_profanity("masked")
@@ -0,0 +1,109 @@
1
+ """Tests for the Microsoft STT service."""
2
+
3
+ import asyncio
4
+ import re
5
+ import sys
6
+ import os
7
+ import wave
8
+ from asyncio.subprocess import PIPE
9
+ from pathlib import Path
10
+
11
+
12
+ import pytest
13
+ from wyoming.asr import Transcript
14
+ from wyoming.audio import AudioStart, AudioStop, wav_to_chunks
15
+ from wyoming.event import async_read_event, async_write_event
16
+ from wyoming.info import Describe, Info
17
+
18
+ import logging
19
+
20
+ _LOGGER = logging.getLogger(__name__)
21
+
22
+ _DIR = Path(__file__).parent
23
+ _PROGRAM_DIR = _DIR.parent
24
+ _LOCAL_DIR = _PROGRAM_DIR / "local"
25
+ _SAMPLES_PER_CHUNK = 1024
26
+
27
+ # Need to give time for the model to download
28
+ _START_TIMEOUT = 60
29
+ _TRANSCRIBE_TIMEOUT = 60
30
+
31
+
32
@pytest.mark.asyncio
async def test_multilanguage() -> None:
    """Transcribe a Dutch sample through a server started with en-GB and nl-NL.

    Starts ``wyoming_microsoft_stt`` as a subprocess on ``stdio://``, checks the
    ``Info`` reply to a ``Describe`` event, streams ``zet_het_licht_aan.wav`` and
    compares the normalised transcript with the expected sentence.

    Requires the ``SPEECH_REGION`` and ``SPEECH_KEY`` environment variables.
    """
    region = os.environ.get("SPEECH_REGION")
    key = os.environ.get("SPEECH_KEY")
    # Fail with a readable message instead of a TypeError from the subprocess call
    assert region and key, "SPEECH_REGION and SPEECH_KEY must be set"

    proc = await asyncio.create_subprocess_exec(
        sys.executable,
        "-m",
        "wyoming_microsoft_stt",
        "--uri",
        "stdio://",
        "--language",
        "en-GB",
        "nl-NL",
        "--service-region",
        region,
        "--subscription-key",
        key,
        "--debug",
        stdin=PIPE,
        stdout=PIPE,
    )
    try:
        assert proc.stdin is not None
        assert proc.stdout is not None

        # Check info
        await async_write_event(Describe().event(), proc.stdin)
        while True:
            event = await asyncio.wait_for(
                async_read_event(proc.stdout), timeout=_START_TIMEOUT
            )
            assert event is not None

            if not Info.is_type(event.type):
                continue

            info = Info.from_event(event)
            assert len(info.asr) == 1, "Expected one asr service"
            asr = info.asr[0]
            assert len(asr.models) > 0, "Expected at least one model"
            break

        # Test known WAV
        with wave.open(str(_DIR / "zet_het_licht_aan.wav"), "rb") as example_wav:
            await async_write_event(
                AudioStart(
                    rate=example_wav.getframerate(),
                    width=example_wav.getsampwidth(),
                    channels=example_wav.getnchannels(),
                ).event(),
                proc.stdin,
            )
            for chunk in wav_to_chunks(example_wav, _SAMPLES_PER_CHUNK):
                await async_write_event(chunk.event(), proc.stdin)
                _LOGGER.info("Sent bytes of audio data to the server")

        await async_write_event(AudioStop().event(), proc.stdin)
        _LOGGER.info("Sent audio stop event to the server")

        while True:
            event = await asyncio.wait_for(
                async_read_event(proc.stdout), timeout=_TRANSCRIBE_TIMEOUT
            )
            assert event is not None

            if not Transcript.is_type(event.type):
                continue

            transcript = Transcript.from_event(event)
            _LOGGER.info(f"Received transcript: {transcript.text}")
            # Normalise: lowercase and keep only letters and spaces
            text = transcript.text.lower().strip()
            text = re.sub(r"[^a-z ]", "", text)
            assert text == "zet het licht aan"
            break

        # Need to close stdin for graceful termination
        proc.stdin.close()
        await proc.communicate()

        # stderr is not piped, so communicate() yields None for it; report the
        # exit code rather than calling .decode() on None.
        assert proc.returncode == 0, f"Server exited with code {proc.returncode}"
    finally:
        # Do not leave the server running when an assertion or timeout aborts the test
        if proc.returncode is None:
            proc.kill()
            await proc.wait()
@@ -0,0 +1,114 @@
1
+ """Tests for the Microsoft STT service."""
2
+
3
+ import asyncio
4
+ import re
5
+ import sys
6
+ import os
7
+ import wave
8
+ from asyncio.subprocess import PIPE
9
+ from pathlib import Path
10
+
11
+
12
+ import pytest
13
+ from wyoming.asr import Transcript
14
+ from wyoming.audio import AudioStart, AudioStop, wav_to_chunks
15
+ from wyoming.event import async_read_event, async_write_event
16
+ from wyoming.info import Describe, Info
17
+
18
+ import logging
19
+
20
+ _LOGGER = logging.getLogger(__name__)
21
+
22
+ _DIR = Path(__file__).parent
23
+ _PROGRAM_DIR = _DIR.parent
24
+ _LOCAL_DIR = _PROGRAM_DIR / "local"
25
+ _SAMPLES_PER_CHUNK = 1024
26
+
27
+ # Need to give time for the model to download
28
+ _START_TIMEOUT = 60
29
+ _TRANSCRIBE_TIMEOUT = 60
30
+
31
+
32
@pytest.mark.asyncio
async def test_transcribe() -> None:
    """Transcribe a long English sample through a server started with en-GB.

    Starts ``wyoming_microsoft_stt`` as a subprocess on ``stdio://``, checks the
    ``Info`` reply to a ``Describe`` event, streams ``long_text.wav`` and compares
    the normalised transcript with the normalised reference text.

    Requires the ``SPEECH_REGION`` and ``SPEECH_KEY`` environment variables.
    """
    region = os.environ.get("SPEECH_REGION")
    key = os.environ.get("SPEECH_KEY")
    # Fail with a readable message instead of a TypeError from the subprocess call
    assert region and key, "SPEECH_REGION and SPEECH_KEY must be set"

    proc = await asyncio.create_subprocess_exec(
        sys.executable,
        "-m",
        "wyoming_microsoft_stt",
        "--uri",
        "stdio://",
        "--language",
        "en-GB",
        "--service-region",
        region,
        "--subscription-key",
        key,
        "--debug",
        stdin=PIPE,
        stdout=PIPE,
    )
    try:
        assert proc.stdin is not None
        assert proc.stdout is not None

        # Check info
        await async_write_event(Describe().event(), proc.stdin)
        while True:
            event = await asyncio.wait_for(
                async_read_event(proc.stdout), timeout=_START_TIMEOUT
            )
            assert event is not None

            if not Info.is_type(event.type):
                continue

            info = Info.from_event(event)
            assert len(info.asr) == 1, "Expected one asr service"
            asr = info.asr[0]
            assert len(asr.models) > 0, "Expected at least one model"
            break

        # Test known WAV
        with wave.open(str(_DIR / "long_text.wav"), "rb") as example_wav:
            await async_write_event(
                AudioStart(
                    rate=example_wav.getframerate(),
                    width=example_wav.getsampwidth(),
                    channels=example_wav.getnchannels(),
                ).event(),
                proc.stdin,
            )
            for chunk in wav_to_chunks(example_wav, _SAMPLES_PER_CHUNK):
                await async_write_event(chunk.event(), proc.stdin)
                _LOGGER.info("Sent bytes of audio data to the server")

        await async_write_event(AudioStop().event(), proc.stdin)
        _LOGGER.info("Sent audio stop event to the server")

        while True:
            event = await asyncio.wait_for(
                async_read_event(proc.stdout), timeout=_TRANSCRIBE_TIMEOUT
            )
            assert event is not None

            if not Transcript.is_type(event.type):
                continue

            transcript = Transcript.from_event(event)
            # Normalise: lowercase and keep only letters and spaces
            text = transcript.text.lower().strip()
            text = re.sub(r"[^a-z ]", "", text)
            _LOGGER.info(f"Received transcript: {text}")

            original_text = "The Netherlands, informally Holland, is a country in Northwestern Europe with overseas territories in the Caribbean. It is the largest of the four constituent countries of the Kingdom of the Netherlands. The Netherlands consists of 12 provinces. It borders Germany to the east and Belgium to the south, with the North Sea coastline to the north and west. It shares maritime borders with the United Kingdom, Germany, and Belgium."
            # Remove punctuation and convert to lowercase
            original_text = original_text.lower()
            original_text = re.sub(r"[^a-z ]", "", original_text)

            assert text == original_text
            break

        # Need to close stdin for graceful termination
        proc.stdin.close()
        await proc.communicate()

        # stderr is not piped, so communicate() yields None for it; report the
        # exit code rather than calling .decode() on None.
        assert proc.returncode == 0, f"Server exited with code {proc.returncode}"
    finally:
        # Do not leave the server running when an assertion or timeout aborts the test
        if proc.returncode is None:
            proc.kill()
            await proc.wait()
@@ -0,0 +1,159 @@
1
+ """Wyoming server for Microsoft STT."""
2
+
3
+ from typing import Literal
4
+ from pydantic import BaseModel
5
+
6
+
7
class SpeechConfig(BaseModel):
    """Settings used to configure speech recognition.

    Holds the Azure credentials, the profanity handling mode and the list of
    locale codes to recognise. Each entry of ``language`` must be one of the
    locale codes enumerated below.
    """

    subscription_key: str
    service_region: str
    profanity: Literal["off", "masked", "removed"] = "masked"
    # Locale codes are grouped by language prefix, in the original order.
    language: list[
        Literal[
            "af-ZA", "am-ET",
            "ar-AE", "ar-BH", "ar-DZ", "ar-EG", "ar-IL", "ar-IQ", "ar-JO", "ar-KW", "ar-LB",
            "ar-LY", "ar-MA", "ar-OM", "ar-PS", "ar-QA", "ar-SA", "ar-SY", "ar-TN", "ar-YE",
            "az-AZ", "bg-BG", "bn-IN", "bs-BA", "ca-ES", "cs-CZ", "cy-GB", "da-DK",
            "de-AT", "de-CH", "de-DE", "el-GR",
            "en-AU", "en-CA", "en-GB", "en-GH", "en-HK", "en-IE", "en-IN", "en-KE",
            "en-NG", "en-NZ", "en-PH", "en-SG", "en-TZ", "en-US", "en-ZA",
            "es-AR", "es-BO", "es-CL", "es-CO", "es-CR", "es-CU", "es-DO", "es-EC",
            "es-ES", "es-GQ", "es-GT", "es-HN", "es-MX", "es-NI", "es-PA", "es-PE",
            "es-PR", "es-PY", "es-SV", "es-US", "es-UY", "es-VE",
            "et-EE", "eu-ES", "fa-IR", "fi-FI", "fil-PH",
            "fr-BE", "fr-CA", "fr-CH", "fr-FR",
            "ga-IE", "gl-ES", "gu-IN", "he-IL", "hi-IN", "hr-HR", "hu-HU", "hy-AM",
            "id-ID", "is-IS", "it-CH", "it-IT", "ja-JP", "jv-ID",
            "ka-GE", "kk-KZ", "km-KH", "kn-IN", "ko-KR",
            "lo-LA", "lt-LT", "lv-LV",
            "mk-MK", "ml-IN", "mn-MN", "mr-IN", "ms-MY", "mt-MT", "my-MM",
            "nb-NO", "ne-NP", "nl-BE", "nl-NL",
            "pa-IN", "pl-PL", "ps-AF", "pt-BR", "pt-PT",
            "ro-RO", "ru-RU",
            "si-LK", "sk-SK", "sl-SI", "so-SO", "sq-AL", "sr-RS", "sv-SE", "sw-KE", "sw-TZ",
            "ta-IN", "te-IN", "th-TH", "tr-TR",
            "uk-UA", "ur-IN", "uz-UZ", "vi-VN",
            "wuu-CN", "yue-CN",
            "zh-CN", "zh-CN-SHANDONG", "zh-CN-SICHUAN", "zh-HK", "zh-TW", "zu-ZA",
        ]
    ] = ["en-US"]
@@ -0,0 +1,176 @@
1
+ import argparse # noqa: D100
2
+ import asyncio
3
+ import logging
4
+ from functools import partial
5
+ import contextlib
6
+ import os # Import to access environment variables
7
+ import signal
8
+ import re
9
+
10
+ from wyoming.info import AsrModel, AsrProgram, Attribution, Info
11
+ from wyoming.server import AsyncServer
12
+
13
+ from .download import get_languages
14
+ from .microsoft_stt import MicrosoftSTT
15
+ from .handler import MicrosoftEventHandler
16
+ from .version import __version__
17
+ from . import SpeechConfig
18
+
19
+ _LOGGER = logging.getLogger(__name__)
20
+
21
+ stop_event = asyncio.Event()
22
+
23
+
24
def handle_stop_signal(*args):
    """Log the shutdown request and set the module-level ``stop_event``.

    Any positional arguments supplied by the caller are accepted and ignored.
    """
    _LOGGER.info("Received stop signal. Shutting down...")
    stop_event.set()
28
+
29
+
30
def parse_arguments():
    """Build the command-line parser and return the parsed arguments.

    The region and subscription key default to the ``AZURE_SERVICE_REGION`` and
    ``AZURE_SUBSCRIPTION_KEY`` environment variables when not given on the
    command line.
    """
    # (flag, keyword arguments) pairs, registered in this order
    option_specs = (
        (
            "--service-region",
            {
                "default": os.getenv("AZURE_SERVICE_REGION"),
                "help": "Microsoft Azure region (e.g., westus2)",
            },
        ),
        (
            "--subscription-key",
            {
                "default": os.getenv("AZURE_SUBSCRIPTION_KEY"),
                "help": "Microsoft Azure subscription key",
            },
        ),
        ("--uri", {"default": "tcp://0.0.0.0:10300", "help": "unix:// or tcp://"}),
        (
            "--download-dir",
            {
                "default": "/tmp/",
                "help": "Directory to download languages.json into (default: /tmp/)",
            },
        ),
        (
            "--language",
            {
                "nargs": "+",
                "default": ["en-GB"],
                "help": "List of languages to set for transcription (e.g., en-US fr-FR es-ES)",
            },
        ),
        (
            "--update-languages",
            {
                "action": "store_true",
                "help": "Download latest languages.json during startup",
            },
        ),
        (
            "--profanity",
            {
                "default": "masked",
                "choices": ["masked", "removed", "off"],
                "help": "Profanity setting for speech recognition",
            },
        ),
        ("--debug", {"action": "store_true", "help": "Log DEBUG messages"}),
    )

    cli = argparse.ArgumentParser()
    for flag, options in option_specs:
        cli.add_argument(flag, **options)
    return cli.parse_args()
70
+
71
+
72
def validate_args(args):
    """Check that the Azure credentials are present.

    Raises:
        ValueError: if the service region or the subscription key is empty.

    A subscription key that does not match the expected pattern only produces a
    warning; startup continues.
    """
    has_credentials = args.service_region and args.subscription_key
    if not has_credentials:
        raise ValueError(
            "Both --service-region and --subscription-key must be provided either as command-line arguments or environment variables."
        )

    # Reinstate key validation with more flexibility to accommodate complex keys
    key_format_ok = re.match(r"^[A-Za-z0-9\-_]{40,}$", args.subscription_key)
    if key_format_ok is None:
        _LOGGER.warning(
            "The subscription key does not match the expected format but will attempt to initialize."
        )
83
+
84
+
85
async def main() -> None:
    """Start Wyoming Microsoft STT server.

    Parses and validates the command-line arguments, loads the language list,
    creates the ``MicrosoftSTT`` model and runs the Wyoming server until it
    finishes on its own or a stop signal sets ``stop_event``.

    Failures while loading the languages or the model are logged and end the
    function early; errors raised by the running server are logged.
    """
    args = parse_arguments()
    validate_args(args)

    speech_config = SpeechConfig(
        subscription_key=args.subscription_key,
        service_region=args.service_region,
        profanity=args.profanity,
        language=args.language,
    )

    # Set up logging
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
    _LOGGER.debug("Arguments parsed successfully.")

    # Load languages
    try:
        _LOGGER.info("Starting language loading process.")
        languages = get_languages(
            args.download_dir,
            update_languages=args.update_languages,
            region=args.service_region,
            key=args.subscription_key,
        )
        _LOGGER.info("Languages loaded successfully.")
    except Exception as e:
        _LOGGER.error(f"Failed to load languages: {e}")
        return

    wyoming_info = Info(
        asr=[
            AsrProgram(
                name="Microsoft",
                description="Microsoft speech transcription",
                attribution=Attribution(
                    name="Hugo Bloem",
                    url="https://github.com/hugobloem/wyoming-microsoft-stt/",
                ),
                version=__version__,
                installed=True,
                models=[
                    AsrModel(
                        name="Microsoft STT",
                        description="Microsoft speech transcription",
                        attribution=Attribution(
                            name="Hugo Bloem",
                            url="https://github.com/hugobloem/wyoming-microsoft-stt/",
                        ),
                        version=__version__,
                        installed=True,
                        languages=languages,
                    )
                ],
            )
        ],
    )

    # Load Microsoft STT model
    try:
        _LOGGER.debug("Loading Microsoft STT")
        stt_model = MicrosoftSTT(speech_config)
        _LOGGER.info("Microsoft STT model loaded successfully.")
    except Exception as e:
        _LOGGER.error(f"Failed to load Microsoft STT model: {e}")
        return

    # Initialize server and run
    server = AsyncServer.from_uri(args.uri)
    _LOGGER.info("Ready")
    model_lock = asyncio.Lock()

    # Deliver stop signals through the event loop so that setting stop_event
    # wakes the loop. Not every platform/loop supports this, hence the suppress.
    loop = asyncio.get_running_loop()
    for stop_signal in (signal.SIGTERM, signal.SIGINT):
        with contextlib.suppress(NotImplementedError, RuntimeError):
            loop.add_signal_handler(stop_signal, handle_stop_signal)

    server_task = asyncio.create_task(
        server.run(
            partial(
                MicrosoftEventHandler,
                wyoming_info,
                args,
                stt_model,
                model_lock,
            )
        )
    )
    # Without waiting on stop_event the signal handler would have no effect
    stop_task = asyncio.create_task(stop_event.wait())
    try:
        await asyncio.wait(
            {server_task, stop_task}, return_when=asyncio.FIRST_COMPLETED
        )
        if server_task.done():
            # Re-raise a server failure so it is logged below
            server_task.result()
    except Exception as e:
        _LOGGER.error(f"An error occurred while running the server: {e}")
    finally:
        # Whichever task is still pending is cancelled and awaited before returning
        for task in (server_task, stop_task):
            task.cancel()
        await asyncio.gather(server_task, stop_task, return_exceptions=True)
168
+
169
+
170
if __name__ == "__main__":
    # Register the stop handler for both termination signals before the loop starts
    for _stop_signal in (signal.SIGTERM, signal.SIGINT):
        signal.signal(_stop_signal, handle_stop_signal)

    with contextlib.suppress(KeyboardInterrupt):
        asyncio.run(main())
@@ -0,0 +1,92 @@
1
+ """Utility for downloading Microsoft STT languages."""
2
+
3
+ import logging
4
+ from pathlib import Path
5
+ from typing import Any
6
+ from urllib.parse import quote, urlsplit, urlunsplit
7
+ from urllib.request import urlopen, Request
8
+ import json
9
+ import time
10
+ from urllib.error import URLError
11
+
12
+ URL_FORMAT = "https://{region}.cognitiveservices.azure.com/speechtotext/v3.1/transcriptions/locales"
13
+ URL_HEADER = "Ocp-Apim-Subscription-Key"
14
+
15
+ _DIR = Path(__file__).parent
16
+ _LOGGER = logging.getLogger(__name__)
17
+
18
+
19
+ def _quote_url(url: str) -> str:
20
+ """Quote file part of URL in case it contains UTF-8 characters."""
21
+ parts = list(urlsplit(url))
22
+ parts[2] = quote(parts[2])
23
+ return urlunsplit(parts)
24
+
25
+
26
def transform_languages_files(response):
    """Parse the JSON document read from *response* and return it unchanged.

    *response* is any file-like object accepted by ``json.load``.
    """
    return json.load(response)
30
+
31
+
32
def get_languages(
    download_dir: str | Path,
    update_languages: bool = False,
    region: str = "westus",
    key: str = "",
) -> list[str]:
    """Load available languages from downloaded or embedded JSON file.

    Args:
        download_dir: Directory holding the cached ``languages.json``; created
            when missing.
        update_languages: When true, download a fresh ``languages.json`` first
            (up to three attempts on ``URLError``).
        region: Azure region inserted into the download URL.
        key: Subscription key sent in the request header.

    Returns:
        The parsed content of the cached file if it exists and loads, otherwise
        the parsed content of the ``languages.json`` next to this module. An
        empty list is returned when the download fails with an unexpected error
        or when no file can be loaded.
    """
    download_dir = Path(download_dir)
    if not download_dir.exists():
        # exist_ok tolerates another process creating the directory in between
        download_dir.mkdir(parents=True, exist_ok=True)
    languages_download = download_dir.joinpath("languages.json")

    if update_languages:
        # Download latest languages.json with retry mechanism
        MAX_RETRIES = 3
        RETRY_DELAY = 5  # seconds
        languages_url = URL_FORMAT.format(region=region)
        languages_hdr = {URL_HEADER: key}
        for attempt in range(1, MAX_RETRIES + 1):
            try:
                _LOGGER.debug("Downloading %s to %s", languages_url, languages_download)
                req = Request(_quote_url(languages_url), headers=languages_hdr)
                with urlopen(req) as response:
                    languages = transform_languages_files(response)
                # Write only after the payload parsed, so a bad response cannot
                # truncate a previously cached file.
                with open(languages_download, "w", encoding="utf-8") as download_file:
                    json.dump(languages, download_file, indent=4)
                _LOGGER.info("Languages downloaded successfully.")
                break
            except URLError as e:
                _LOGGER.warning(
                    "Failed to download languages.json (attempt %d/%d): %s",
                    attempt,
                    MAX_RETRIES,
                    e,
                )
                # No point in waiting once the last attempt has failed
                if attempt < MAX_RETRIES:
                    time.sleep(RETRY_DELAY)
            except Exception as e:
                _LOGGER.exception("Failed to download languages.json: %s", e)
                _LOGGER.error("Failed to update languages list")
                return []

    # Prefer downloaded file to embedded
    if languages_download.exists():
        try:
            _LOGGER.debug("Loading %s", languages_download)
            with open(languages_download, encoding="utf-8") as languages_file:
                return json.load(languages_file)
        except Exception:
            _LOGGER.exception("Failed to load %s", languages_download)

    # Fall back to embedded
    languages_embedded = _DIR / "languages.json"
    try:
        _LOGGER.debug("Loading %s", languages_embedded)
        with open(languages_embedded, encoding="utf-8") as languages_file:
            return json.load(languages_file)
    except Exception:
        _LOGGER.exception("Failed to load embedded languages.json")
        return []
@@ -0,0 +1,105 @@
1
+ """Event handler for clients of the server."""
2
+
3
+ import argparse
4
+ import asyncio
5
+ import logging
6
+ import time
7
+
8
+ from wyoming.asr import Transcribe, Transcript
9
+ from wyoming.audio import AudioChunk, AudioStart, AudioStop
10
+ from wyoming.event import Event
11
+ from wyoming.info import Describe, Info
12
+ from wyoming.server import AsyncEventHandler
13
+
14
+ from .microsoft_stt import MicrosoftSTT
15
+
16
+ _LOGGER = logging.getLogger(__name__)
17
+
18
+
19
class MicrosoftEventHandler(AsyncEventHandler):
    """Event handler for clients.

    Answers ``Describe`` with the server info, and feeds ``AudioStart`` /
    ``AudioChunk`` / ``AudioStop`` events to the shared ``MicrosoftSTT`` model,
    replying with a ``Transcript`` once the audio has stopped.
    """

    def __init__(
        self,
        wyoming_info: Info,
        cli_args: argparse.Namespace,
        model: MicrosoftSTT,
        model_lock: asyncio.Lock,
        *args,
        **kwargs,
    ) -> None:
        """Initialize.

        Args:
            wyoming_info: Service description sent in reply to ``Describe``.
            cli_args: Parsed command-line arguments; ``language`` is read from it.
            model: Speech-to-text model used for every request.
            model_lock: Lock held around each call into ``model``.
            *args: Forwarded to ``AsyncEventHandler``.
            **kwargs: Forwarded to ``AsyncEventHandler``.
        """
        super().__init__(*args, **kwargs)

        self.cli_args = cli_args
        self.wyoming_info_event = wyoming_info.event()
        self.model = model
        self.model_lock = model_lock

        if len(self.cli_args.language) > 1:
            _LOGGER.warning(
                f"Multiple languages specified, auto-detection will be used for these languages only: {self.cli_args.language}"
            )

        # Default language for a request: the first configured one
        self._language = self.cli_args.language[0]

    async def handle_event(self, event: Event) -> bool:
        """Handle an event.

        Returns:
            ``False`` after a transcript has been sent for ``AudioStop``;
            ``True`` for every other event.
        """
        if Describe.is_type(event.type):
            await self.write_event(self.wyoming_info_event)
            _LOGGER.debug("Sent info")
            return True

        if Transcribe.is_type(event.type):
            transcribe = Transcribe.from_event(event)
            if transcribe.language:
                # Per-request override of the default language
                self._language = transcribe.language
                _LOGGER.debug("Language set to %s", transcribe.language)
            return True

        if AudioStart.is_type(event.type):
            start = AudioStart.from_event(event)
            _LOGGER.debug(
                f"Receiving audio: {start.width * 8}bit {start.rate}Hz {start.channels}ch"
            )

            async with self.model_lock:
                self.model.start_transcribe(
                    bits_per_sample=start.width * 8,
                    samples_per_second=start.rate,
                    channels=start.channels,
                    language=self._language,
                )

            return True

        if AudioChunk.is_type(event.type):
            chunk = AudioChunk.from_event(event)
            async with self.model_lock:
                self.model.push_audio_chunk(chunk.audio)

            return True

        if AudioStop.is_type(event.type):
            _LOGGER.debug("Audio stopped")

            async with self.model_lock:
                try:
                    start_time = time.time()
                    _LOGGER.debug("Starting transcription")
                    text = self.model.transcribe()
                    _LOGGER.info(
                        f"Transcription completed in {time.time() - start_time:.2f} seconds"
                    )
                except Exception as e:
                    _LOGGER.error(f"Failed to transcribe audio: {e}")
                    return True

            _LOGGER.info(text)

            await self.write_event(Transcript(text=text).event())
            _LOGGER.debug("Completed request")

            # Reset to the same default used in __init__: a single language
            # code, not the whole list of configured languages.
            self._language = self.cli_args.language[0]
            return False

        return True
@@ -0,0 +1,145 @@
1
+ [
2
+ "af-ZA",
3
+ "am-ET",
4
+ "ar-AE",
5
+ "ar-BH",
6
+ "ar-DZ",
7
+ "ar-EG",
8
+ "ar-IL",
9
+ "ar-IQ",
10
+ "ar-JO",
11
+ "ar-KW",
12
+ "ar-LB",
13
+ "ar-LY",
14
+ "ar-MA",
15
+ "ar-OM",
16
+ "ar-PS",
17
+ "ar-QA",
18
+ "ar-SA",
19
+ "ar-SY",
20
+ "ar-TN",
21
+ "ar-YE",
22
+ "az-AZ",
23
+ "bg-BG",
24
+ "bn-IN",
25
+ "bs-BA",
26
+ "ca-ES",
27
+ "cs-CZ",
28
+ "cy-GB",
29
+ "da-DK",
30
+ "de-AT",
31
+ "de-CH",
32
+ "de-DE",
33
+ "el-GR",
34
+ "en-AU",
35
+ "en-CA",
36
+ "en-GB",
37
+ "en-GH",
38
+ "en-HK",
39
+ "en-IE",
40
+ "en-IN",
41
+ "en-KE",
42
+ "en-NG",
43
+ "en-NZ",
44
+ "en-PH",
45
+ "en-SG",
46
+ "en-TZ",
47
+ "en-US",
48
+ "en-ZA",
49
+ "es-AR",
50
+ "es-BO",
51
+ "es-CL",
52
+ "es-CO",
53
+ "es-CR",
54
+ "es-CU",
55
+ "es-DO",
56
+ "es-EC",
57
+ "es-ES",
58
+ "es-GQ",
59
+ "es-GT",
60
+ "es-HN",
61
+ "es-MX",
62
+ "es-NI",
63
+ "es-PA",
64
+ "es-PE",
65
+ "es-PR",
66
+ "es-PY",
67
+ "es-SV",
68
+ "es-US",
69
+ "es-UY",
70
+ "es-VE",
71
+ "et-EE",
72
+ "eu-ES",
73
+ "fa-IR",
74
+ "fi-FI",
75
+ "fil-PH",
76
+ "fr-BE",
77
+ "fr-CA",
78
+ "fr-CH",
79
+ "fr-FR",
80
+ "ga-IE",
81
+ "gl-ES",
82
+ "gu-IN",
83
+ "he-IL",
84
+ "hi-IN",
85
+ "hr-HR",
86
+ "hu-HU",
87
+ "hy-AM",
88
+ "id-ID",
89
+ "is-IS",
90
+ "it-CH",
91
+ "it-IT",
92
+ "ja-JP",
93
+ "jv-ID",
94
+ "ka-GE",
95
+ "kk-KZ",
96
+ "km-KH",
97
+ "kn-IN",
98
+ "ko-KR",
99
+ "lo-LA",
100
+ "lt-LT",
101
+ "lv-LV",
102
+ "mk-MK",
103
+ "ml-IN",
104
+ "mn-MN",
105
+ "mr-IN",
106
+ "ms-MY",
107
+ "mt-MT",
108
+ "my-MM",
109
+ "nb-NO",
110
+ "ne-NP",
111
+ "nl-BE",
112
+ "nl-NL",
113
+ "pa-IN",
114
+ "pl-PL",
115
+ "ps-AF",
116
+ "pt-BR",
117
+ "pt-PT",
118
+ "ro-RO",
119
+ "ru-RU",
120
+ "si-LK",
121
+ "sk-SK",
122
+ "sl-SI",
123
+ "so-SO",
124
+ "sq-AL",
125
+ "sr-RS",
126
+ "sv-SE",
127
+ "sw-KE",
128
+ "sw-TZ",
129
+ "ta-IN",
130
+ "te-IN",
131
+ "th-TH",
132
+ "tr-TR",
133
+ "uk-UA",
134
+ "ur-IN",
135
+ "uz-UZ",
136
+ "vi-VN",
137
+ "wuu-CN",
138
+ "yue-CN",
139
+ "zh-CN",
140
+ "zh-CN-SHANDONG",
141
+ "zh-CN-SICHUAN",
142
+ "zh-HK",
143
+ "zh-TW",
144
+ "zu-ZA"
145
+ ]
@@ -0,0 +1,148 @@
1
+ """Microsoft STT module for Wyoming."""
2
+
3
+ import time
4
+ import azure.cognitiveservices.speech as speechsdk # noqa: D100
5
+ import logging
6
+ from . import SpeechConfig
7
+
8
+ _LOGGER = logging.getLogger(__name__)
9
+
10
+
11
class MicrosoftSTT:
    """Class to handle Microsoft STT.

    Wraps an Azure Speech SDK recognizer fed from a push audio stream: call
    ``start_transcribe``, push chunks with ``push_audio_chunk``, then call
    ``transcribe`` to close the stream and collect the recognized text.
    """

    def __init__(self, speechconfig: SpeechConfig) -> None:
        """Initialize.

        Raises:
            Exception: whatever the SDK raises while creating its speech
                configuration; the error is logged and re-raised.
        """
        self.args = speechconfig

        self._stream: speechsdk.audio.PushAudioInputStream | None = None
        self._speech_recognizer: speechsdk.SpeechRecognizer | None = None
        # One entry per `recognized` event of the current session
        self._results: list[speechsdk.SpeechRecognitionResult] = []
        self.recognition_done = False

        try:
            # Initialize the speech configuration with the provided subscription key and region
            self.speech_config = speechsdk.SpeechConfig(
                subscription=self.args.subscription_key, region=self.args.service_region
            )
            _LOGGER.info("Microsoft SpeechConfig initialized successfully.")
        except Exception as e:
            _LOGGER.error(f"Failed to initialize Microsoft SpeechConfig: {e}")
            raise

        self.set_profanity(self.args.profanity)

    def start_transcribe(
        self,
        samples_per_second: int = 16000,
        bits_per_sample: int = 16,
        channels: int = 1,
        language=None,
    ) -> None:
        """Begin a transcription.

        Creates a fresh push stream and recognizer for the given audio format,
        clears results left over from a previous session and starts continuous
        recognition.
        """
        _LOGGER.debug(f"Starting transcription with language: {language}")

        # Configure audio input for speech recognition
        _LOGGER.debug("Configuring audio input stream...")
        self._stream = speechsdk.audio.PushAudioInputStream(
            stream_format=speechsdk.audio.AudioStreamFormat(
                samples_per_second=samples_per_second,
                bits_per_sample=bits_per_sample,
                channels=channels,
            )
        )
        audio_config = speechsdk.audio.AudioConfig(stream=self._stream)
        # Create a speech recognizer with the configured speech and audio settings
        self._speech_recognizer = speechsdk.SpeechRecognizer(
            speech_config=self.speech_config,
            audio_config=audio_config,
            **self.get_language(language),
        )

        # Start every session from a clean state so stale text is never returned
        self._results = []
        self.recognition_done = False

        def session_stopped_cb(evt):
            """Signal to stop continuous recognition upon receiving an event `evt`."""
            _LOGGER.debug(f"SESSION STOPPED: {evt}")
            self.recognition_done = True

        def recognized(event: speechsdk.SpeechRecognitionEventArgs):
            """Keep the result of every recognized segment, in arrival order."""
            _LOGGER.debug(f"{event.result}")
            self._results.append(event.result)

        self._speech_recognizer.recognizing.connect(
            lambda evt: _LOGGER.debug(f"RECOGNIZING: {evt}")
        )
        self._speech_recognizer.recognized.connect(
            lambda evt: _LOGGER.debug(f"RECOGNIZED: {evt}")
        )
        self._speech_recognizer.session_started.connect(
            lambda evt: _LOGGER.debug(f"SESSION STARTED: {evt}")
        )
        self._speech_recognizer.session_stopped.connect(session_stopped_cb)
        self._speech_recognizer.canceled.connect(
            lambda evt: _LOGGER.debug(f"CANCELED {evt}")
        )
        # Connect the collecting handler before recognition starts so that no
        # early result can be missed.
        self._speech_recognizer.recognized.connect(recognized)

        _LOGGER.debug("Starting continuous recognition...")
        self._speech_recognizer.start_continuous_recognition()

    def push_audio_chunk(self, chunk: bytes) -> None:
        """Push an audio chunk to the recognizer."""
        self._stream.write(chunk)

    def stop_audio_chunk(self) -> None:
        """Stop the transcription."""
        _LOGGER.debug("Stopping transcription...")
        self._stream.close()

    def transcribe(self) -> str:
        """Get the results of a transcription.

        Closes the audio stream, waits until the session-stopped callback has
        fired and returns the text of all recognized segments joined by single
        spaces. Returns an empty string when nothing was recognized or when any
        step fails (the error is logged).
        """
        try:
            self.stop_audio_chunk()

            # Wait for the recognition to finish
            while not self.recognition_done:
                time.sleep(0.01)

            self._speech_recognizer.stop_continuous_recognition()

            # Continuous recognition can deliver several results for one audio
            # stream; return all of them, skipping empty segments.
            return " ".join(result.text for result in self._results if result.text)

        except Exception as e:
            _LOGGER.error(f"Failed to transcribe audio: {e}")
            return ""

    def get_language(self, language: str | None) -> dict:
        """Get the language code.

        Returns the keyword arguments for ``SpeechRecognizer``: an auto-detect
        configuration when more than one language is configured, otherwise the
        requested *language* or, if that is empty, the first configured one.
        """
        if len(self.args.language) > 1:
            auto_detect_source_language_config = (
                speechsdk.languageconfig.AutoDetectSourceLanguageConfig(
                    languages=self.args.language
                )
            )
            return {
                "auto_detect_source_language_config": auto_detect_source_language_config
            }

        if language:
            _LOGGER.debug(f"Language set to {language}")
            return {"language": language}

        return {"language": self.args.language[0]}

    def set_profanity(self, profanity: str):
        """Set the profanity filter level.

        Accepts "off", "masked" or "removed"; any other value is logged as an
        error and leaves the configuration unchanged.
        """
        if profanity == "off":
            profanity_level = speechsdk.ProfanityOption.Raw
        elif profanity == "masked":
            profanity_level = speechsdk.ProfanityOption.Masked
        elif profanity == "removed":
            profanity_level = speechsdk.ProfanityOption.Removed
        else:
            _LOGGER.error(f"Invalid profanity level: {profanity}")
            return

        self.speech_config.set_profanity(profanity_level)
        _LOGGER.debug(f"Profanity filter set to {profanity}")
@@ -0,0 +1,3 @@
1
+ """Version information."""
2
+
3
+ __version__ = "1.3.3"
@@ -0,0 +1,94 @@
1
+ Metadata-Version: 2.4
2
+ Name: wyoming-microsoft-stt
3
+ Version: 1.3.3
4
+ Summary: Wyoming protocol server for Microsoft Azure speech-to-text
5
+ Home-page: https://github.com/hugobloem/wyoming-microsoft-stt
6
+ Author: Hugo Bloem
7
+ Author-email:
8
+ Requires-Python: >=3.13
9
+ Description-Content-Type: text/markdown
10
+ Requires-Dist: azure-cognitiveservices-speech>=1.45.0
11
+ Requires-Dist: pydantic>=2.11.7
12
+ Requires-Dist: wyoming>=1.7.2
13
+ Dynamic: author
14
+ Dynamic: home-page
15
+
16
+ # Wyoming Microsoft STT
17
+ Wyoming protocol server for Microsoft Azure speech-to-text.
18
+
19
+ This Python package provides a Wyoming integration for Microsoft Azure speech-to-text and can be directly used with [Home Assistant](https://www.home-assistant.io/) voice and [Rhasspy](https://github.com/rhasspy/rhasspy3).
20
+
21
+ ## Azure Speech Service
22
+ This program uses [Microsoft Azure Speech Service](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/). You can sign up for a free Azure account, which comes with a free tier of 5 audio hours per month; this should be enough for running a voice assistant, as each command is relatively short. Once this amount is exceeded, Azure could charge you for each second used (current pricing is $0.36 per audio hour). I am not responsible for any incurred charges and recommend you set up a spending limit to reduce your exposure. However, for normal usage the free tier could suffice, and the resource should not switch to a paid service automatically.
23
+
24
+ If you have not set up a speech resource, you can follow the instructions below. (You only need to do this once, and it works for both [Speech-to-Text](https://github.com/hugobloem/wyoming-microsoft-stt) and [Text-to-Speech](https://github.com/hugobloem/wyoming-microsoft-tts).)
25
+
26
+ 1. Sign in or create an account on [portal.azure.com](https://portal.azure.com).
27
+ 2. Create a subscription by searching for `subscription` in the search bar. [Consult Microsoft Learn for more information](https://learn.microsoft.com/en-gb/azure/cost-management-billing/manage/create-subscription#create-a-subscription-in-the-azure-portal).
28
+ 3. Create a speech resource by searching for `speech service`.
29
+ 4. Select the subscription you created, pick or create a resource group, select a region, pick an identifiable name, and select the pricing tier (you probably want Free F0)
30
+ 5. Once created, copy one of the keys from the speech service page. You will need this to run this program.
31
+
32
+ ## Usage
33
+ Depending on the installation method, parameters are parsed differently. However, the same options are used for each of the installation methods and can be found in the table below. Your service region and subscription key can be found on the speech service resource page (step 5 of the Azure Speech Service instructions).
34
+
35
+ For the bare-metal Python install the program is run as follows:
36
+ ```sh
37
+ python -m wyoming_microsoft_stt --<key> <value>
38
+ ```
39
+
40
+ | Key | Optional | Description |
41
+ |---|---|---|
42
+ | `service-region` | No | Azure service region e.g., `uksouth` |
43
+ | `subscription-key` | No | Azure subscription key |
44
+ | `language` | Yes | Default language to set for transcription, default: `en-GB`. For auto-detection provide multiple languages. |
45
+ | `uri` | No | URI where the server will be broadcast, e.g., `tcp://0.0.0.0:10300` |
46
+ | `download-dir` | Yes | Directory to download models into (default: ) |
47
+ | `update-languages` | Yes | Download latest languages.json during startup |
48
+ | `debug` | Yes | Log debug messages |
49
+
50
+ ## Multi-language support
51
+ This add-on can also auto-detect the spoken language from a list of pre-defined languages (max. 10). To do this in Home Assistant, provide the languages separated by semicolons, like so:
52
+ <img width="689" alt="Screenshot 2025-05-04 at 11 59 55" src="https://github.com/user-attachments/assets/b3c54fe5-ebf3-404a-a8e8-b0d27efaf76d" />
53
+
54
+ > [!NOTE]
55
+ > Setting multiple languages will override the options set by Home Assistant's Voice configuration! It will prompt you to select a language but the option is ignored when speech is processed.
56
+
57
+
58
+ ## Installation
59
+ Depending on your use case there are different installation options.
60
+
61
+ - **Using pip**
62
+ Clone the repository and install the package using pip. Please note the platform requirements as noted [here](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/quickstarts/setup-platform?tabs=linux%2Cubuntu%2Cdotnetcli%2Cdotnet%2Cjre%2Cmaven%2Cnodejs%2Cmac%2Cpypi&pivots=programming-language-python#platform-requirements).
63
+ ```sh
64
+ pip install .
65
+ ```
66
+
67
+ - **Home Assistant Add-On**
68
+ Add the following repository as an add-on repository to your Home Assistant, or click the button below.
69
+ [https://github.com/hugobloem/homeassistant-addons](https://github.com/hugobloem/homeassistant-addons)
70
+
71
+ [![Open your Home Assistant instance and show the add add-on repository dialog with a specific repository URL pre-filled.](https://my.home-assistant.io/badges/supervisor_add_addon_repository.svg)](https://my.home-assistant.io/redirect/supervisor_add_addon_repository/?repository_url=https%3A%2F%2Fgithub.com%2Fhugobloem%2Fhomeassistant-addons)
72
+
73
+ - **Docker container**
74
+ To run as a Docker container use the following command:
75
+ ```bash
76
+ docker run ghcr.io/hugobloem/wyoming-microsoft-stt-noha:latest --<key> <value>
77
+ ```
78
+ For the relevant keys, please look at [the table above](#usage).
79
+
80
+ - **docker compose**
81
+
82
+ Below is a sample docker compose file. The Azure region and subscription key can be set in environment variables. Everything else needs to be passed via command-line arguments.
83
+
84
+ ```yaml
85
+ wyoming-proxy-azure-stt:
86
+ image: ghcr.io/hugobloem/wyoming-microsoft-stt-noha
87
+ container_name: wyoming-azure-stt
88
+ ports:
89
+ - "10300:10300"
90
+ environment:
91
+ AZURE_SERVICE_REGION: swedencentral
92
+ AZURE_SUBSCRIPTION_KEY: XXX
93
+ command: --language=en-GB,nl-NL --uri=tcp://0.0.0.0:10300
94
+ ```
@@ -0,0 +1,16 @@
1
+ tests/__init__.py,sha256=ZEU8_ARBRGIqaAOTcPRsjXxcfHSojBm-5-krYBN-68g,13
2
+ tests/conftest.py,sha256=iSju8g6PiODcg9eY4_lZaVcubXMlfnx1CpyGVsmTAgY,340
3
+ tests/test_microsoft_stt.py,sha256=CO6xyLHFcE1kqJ_XuGprRu0i28zTN2oHaKtomK_aJIU,594
4
+ tests/test_multilanguage.py,sha256=wRtyWuhWBmA5Yf01JNbmgxqoVKmDcE6XHz9QcfSR700,3100
5
+ tests/test_transcribe.py,sha256=BqT-KLZR6UM9RqNI43RZFz6WLkXFcjk2a2WLszhDgso,3674
6
+ wyoming_microsoft_stt/__init__.py,sha256=92Kms90cxU6Zs5XjNZiYvIGLA3Yd-BizN-G8f7n506I,3365
7
+ wyoming_microsoft_stt/__main__.py,sha256=gA0wM3OCLdEq7ewMRrlWs4TVpXvFlITMaX_4lP3ed7E,5577
8
+ wyoming_microsoft_stt/download.py,sha256=mTWZl9kVaE7KGeO2d0SS7wztDN6tGo9AudXWhN8uKhg,3379
9
+ wyoming_microsoft_stt/handler.py,sha256=dWm9etFANRU33IDU-p85enHmWtMtTtW522I3AJbTAxg,3330
10
+ wyoming_microsoft_stt/languages.json,sha256=eDZuPJLzDjdZlh0I4081OTkLPt6nZdUY8ra45c7oMCc,1881
11
+ wyoming_microsoft_stt/microsoft_stt.py,sha256=2Rb9uoOxGFpQfwXux5p3jwop_3poF-75U_84q5QRmt0,5357
12
+ wyoming_microsoft_stt/version.py,sha256=iG_JqR_Z5wfSTlMqH9H1vmrmlcgyFJdbRBRllY0yDgU,50
13
+ wyoming_microsoft_stt-1.3.3.dist-info/METADATA,sha256=Hh7h4BDS_hLd4t3Y8LOEpAMpRzGph7TwfPEliYQA6mA,5792
14
+ wyoming_microsoft_stt-1.3.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
15
+ wyoming_microsoft_stt-1.3.3.dist-info/top_level.txt,sha256=aURyYXybYBAybkVTs2RFT5ctnPz5pRa_B7Gn9_SKhL8,28
16
+ wyoming_microsoft_stt-1.3.3.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.9.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ tests
2
+ wyoming_microsoft_stt