wyoming-microsoft-stt 1.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/__init__.py +1 -0
- tests/conftest.py +15 -0
- tests/test_microsoft_stt.py +19 -0
- tests/test_multilanguage.py +109 -0
- tests/test_transcribe.py +114 -0
- wyoming_microsoft_stt/__init__.py +159 -0
- wyoming_microsoft_stt/__main__.py +176 -0
- wyoming_microsoft_stt/download.py +92 -0
- wyoming_microsoft_stt/handler.py +105 -0
- wyoming_microsoft_stt/languages.json +145 -0
- wyoming_microsoft_stt/microsoft_stt.py +148 -0
- wyoming_microsoft_stt/version.py +3 -0
- wyoming_microsoft_stt-1.3.3.dist-info/METADATA +94 -0
- wyoming_microsoft_stt-1.3.3.dist-info/RECORD +16 -0
- wyoming_microsoft_stt-1.3.3.dist-info/WHEEL +5 -0
- wyoming_microsoft_stt-1.3.3.dist-info/top_level.txt +2 -0
tests/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Tests."""
|
tests/conftest.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""Fixtures for tests."""
|
|
2
|
+
|
|
3
|
+
from wyoming_microsoft_stt import SpeechConfig
|
|
4
|
+
import pytest
|
|
5
|
+
import os
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@pytest.fixture
def microsoft_stt_args():
    """Return a SpeechConfig built from the test environment.

    The subscription key and service region are read from the
    ``SPEECH_KEY`` and ``SPEECH_REGION`` environment variables.
    """
    return SpeechConfig(
        subscription_key=os.environ.get("SPEECH_KEY"),
        service_region=os.environ.get("SPEECH_REGION"),
    )
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""Tests for the MicrosoftTTS class."""
|
|
2
|
+
|
|
3
|
+
from wyoming_microsoft_stt.microsoft_stt import MicrosoftSTT
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def test_initialize(microsoft_stt_args):
    """Check that constructing MicrosoftSTT produces a speech configuration."""
    stt = MicrosoftSTT(microsoft_stt_args)
    assert stt.speech_config is not None
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def test_set_profanity(microsoft_stt_args):
    """Check that set_profanity can be called on an initialized instance."""
    stt = MicrosoftSTT(microsoft_stt_args)
    assert stt.speech_config is not None

    # There is currently no way to check the set profanity level,
    # so this only verifies that the call itself does not raise.
    stt.set_profanity("masked")
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"""Tests for the Microsoft STT service."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import re
|
|
5
|
+
import sys
|
|
6
|
+
import os
|
|
7
|
+
import wave
|
|
8
|
+
from asyncio.subprocess import PIPE
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
import pytest
|
|
13
|
+
from wyoming.asr import Transcript
|
|
14
|
+
from wyoming.audio import AudioStart, AudioStop, wav_to_chunks
|
|
15
|
+
from wyoming.event import async_read_event, async_write_event
|
|
16
|
+
from wyoming.info import Describe, Info
|
|
17
|
+
|
|
18
|
+
import logging
|
|
19
|
+
|
|
20
|
+
_LOGGER = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
_DIR = Path(__file__).parent
|
|
23
|
+
_PROGRAM_DIR = _DIR.parent
|
|
24
|
+
_LOCAL_DIR = _PROGRAM_DIR / "local"
|
|
25
|
+
_SAMPLES_PER_CHUNK = 1024
|
|
26
|
+
|
|
27
|
+
# Need to give time for the model to download
|
|
28
|
+
_START_TIMEOUT = 60
|
|
29
|
+
_TRANSCRIBE_TIMEOUT = 60
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@pytest.mark.asyncio
async def test_multilanguage() -> None:
    """Transcribe a Dutch sample with the server started for two languages.

    The server is launched as a subprocess speaking the Wyoming protocol over
    stdio with ``--language en-GB nl-NL``. The test checks the ``Info``
    response, streams ``zet_het_licht_aan.wav`` and compares the normalized
    transcript with the expected sentence.
    """
    region = os.environ.get("SPEECH_REGION")
    key = os.environ.get("SPEECH_KEY")
    # Fail with a readable message instead of a TypeError raised while
    # building the subprocess argument list from None values.
    assert region and key, "SPEECH_REGION and SPEECH_KEY must be set"

    proc = await asyncio.create_subprocess_exec(
        sys.executable,
        "-m",
        "wyoming_microsoft_stt",
        "--uri",
        "stdio://",
        "--language",
        "en-GB",
        "nl-NL",
        "--service-region",
        region,
        "--subscription-key",
        key,
        "--debug",
        stdin=PIPE,
        stdout=PIPE,
    )
    assert proc.stdin is not None
    assert proc.stdout is not None

    try:
        # Check info
        await async_write_event(Describe().event(), proc.stdin)
        while True:
            event = await asyncio.wait_for(
                async_read_event(proc.stdout), timeout=_START_TIMEOUT
            )
            assert event is not None

            if not Info.is_type(event.type):
                continue

            info = Info.from_event(event)
            assert len(info.asr) == 1, "Expected one asr service"
            asr = info.asr[0]
            assert len(asr.models) > 0, "Expected at least one model"
            break

        # Test known WAV
        with wave.open(str(_DIR / "zet_het_licht_aan.wav"), "rb") as example_wav:
            await async_write_event(
                AudioStart(
                    rate=example_wav.getframerate(),
                    width=example_wav.getsampwidth(),
                    channels=example_wav.getnchannels(),
                ).event(),
                proc.stdin,
            )
            for chunk in wav_to_chunks(example_wav, _SAMPLES_PER_CHUNK):
                await async_write_event(chunk.event(), proc.stdin)
                _LOGGER.info("Sent bytes of audio data to the server")

        await async_write_event(AudioStop().event(), proc.stdin)
        _LOGGER.info("Sent audio stop event to the server")

        while True:
            event = await asyncio.wait_for(
                async_read_event(proc.stdout), timeout=_TRANSCRIBE_TIMEOUT
            )
            assert event is not None

            if not Transcript.is_type(event.type):
                continue

            transcript = Transcript.from_event(event)
            _LOGGER.info(f"Received transcript: {transcript.text}")
            # Normalize: lowercase, then drop everything but letters and spaces
            text = transcript.text.lower().strip()
            text = re.sub(r"[^a-z ]", "", text)
            assert text == "zet het licht aan"
            break

        # Need to close stdin for graceful termination
        proc.stdin.close()
        _, stderr = await proc.communicate()
    finally:
        # Do not leave the server running when an assertion above failed
        if proc.returncode is None:
            proc.kill()
            await proc.wait()

    # stderr is not piped, so communicate() returns None for it; build the
    # failure message without calling .decode() on None.
    failure_detail = (
        stderr.decode() if stderr else f"server exited with code {proc.returncode}"
    )
    assert proc.returncode == 0, failure_detail
|
tests/test_transcribe.py
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
"""Tests for the Microsoft STT service."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import re
|
|
5
|
+
import sys
|
|
6
|
+
import os
|
|
7
|
+
import wave
|
|
8
|
+
from asyncio.subprocess import PIPE
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
import pytest
|
|
13
|
+
from wyoming.asr import Transcript
|
|
14
|
+
from wyoming.audio import AudioStart, AudioStop, wav_to_chunks
|
|
15
|
+
from wyoming.event import async_read_event, async_write_event
|
|
16
|
+
from wyoming.info import Describe, Info
|
|
17
|
+
|
|
18
|
+
import logging
|
|
19
|
+
|
|
20
|
+
_LOGGER = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
_DIR = Path(__file__).parent
|
|
23
|
+
_PROGRAM_DIR = _DIR.parent
|
|
24
|
+
_LOCAL_DIR = _PROGRAM_DIR / "local"
|
|
25
|
+
_SAMPLES_PER_CHUNK = 1024
|
|
26
|
+
|
|
27
|
+
# Need to give time for the model to download
|
|
28
|
+
_START_TIMEOUT = 60
|
|
29
|
+
_TRANSCRIBE_TIMEOUT = 60
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@pytest.mark.asyncio
async def test_transcribe() -> None:
    """Transcribe a long English sample and compare it with the source text.

    The server is launched as a subprocess speaking the Wyoming protocol over
    stdio with ``--language en-GB``. The test checks the ``Info`` response,
    streams ``long_text.wav`` and compares the normalized transcript with the
    normalized reference paragraph.
    """
    region = os.environ.get("SPEECH_REGION")
    key = os.environ.get("SPEECH_KEY")
    # Fail with a readable message instead of a TypeError raised while
    # building the subprocess argument list from None values.
    assert region and key, "SPEECH_REGION and SPEECH_KEY must be set"

    proc = await asyncio.create_subprocess_exec(
        sys.executable,
        "-m",
        "wyoming_microsoft_stt",
        "--uri",
        "stdio://",
        "--language",
        "en-GB",
        "--service-region",
        region,
        "--subscription-key",
        key,
        "--debug",
        stdin=PIPE,
        stdout=PIPE,
    )
    assert proc.stdin is not None
    assert proc.stdout is not None

    try:
        # Check info
        await async_write_event(Describe().event(), proc.stdin)
        while True:
            event = await asyncio.wait_for(
                async_read_event(proc.stdout), timeout=_START_TIMEOUT
            )
            assert event is not None

            if not Info.is_type(event.type):
                continue

            info = Info.from_event(event)
            assert len(info.asr) == 1, "Expected one asr service"
            asr = info.asr[0]
            assert len(asr.models) > 0, "Expected at least one model"
            break

        # Test known WAV
        with wave.open(str(_DIR / "long_text.wav"), "rb") as example_wav:
            await async_write_event(
                AudioStart(
                    rate=example_wav.getframerate(),
                    width=example_wav.getsampwidth(),
                    channels=example_wav.getnchannels(),
                ).event(),
                proc.stdin,
            )
            for chunk in wav_to_chunks(example_wav, _SAMPLES_PER_CHUNK):
                await async_write_event(chunk.event(), proc.stdin)
                _LOGGER.info("Sent bytes of audio data to the server")

        await async_write_event(AudioStop().event(), proc.stdin)
        _LOGGER.info("Sent audio stop event to the server")

        while True:
            event = await asyncio.wait_for(
                async_read_event(proc.stdout), timeout=_TRANSCRIBE_TIMEOUT
            )
            assert event is not None

            if not Transcript.is_type(event.type):
                continue

            transcript = Transcript.from_event(event)
            text = transcript.text.lower().strip()
            text = re.sub(r"[^a-z ]", "", text)
            _LOGGER.info(f"Received transcript: {text}")

            original_text = "The Netherlands, informally Holland, is a country in Northwestern Europe with overseas territories in the Caribbean. It is the largest of the four constituent countries of the Kingdom of the Netherlands. The Netherlands consists of 12 provinces. It borders Germany to the east and Belgium to the south, with the North Sea coastline to the north and west. It shares maritime borders with the United Kingdom, Germany, and Belgium."
            # Remove punctuation and convert to lowercase
            original_text = original_text.lower()
            original_text = re.sub(r"[^a-z ]", "", original_text)

            assert text == original_text
            break

        # Need to close stdin for graceful termination
        proc.stdin.close()
        _, stderr = await proc.communicate()
    finally:
        # Do not leave the server running when an assertion above failed
        if proc.returncode is None:
            proc.kill()
            await proc.wait()

    # stderr is not piped, so communicate() returns None for it; build the
    # failure message without calling .decode() on None.
    failure_detail = (
        stderr.decode() if stderr else f"server exited with code {proc.returncode}"
    )
    assert proc.returncode == 0, failure_detail
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
"""Wyoming server for Microsoft STT."""
|
|
2
|
+
|
|
3
|
+
from typing import Literal
|
|
4
|
+
from pydantic import BaseModel
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class SpeechConfig(BaseModel):
    """Validated settings for the Microsoft speech service.

    ``subscription_key`` and ``service_region`` are required. ``profanity``
    defaults to ``"masked"`` and ``language`` defaults to ``["en-US"]``.
    Each entry of ``language`` must be one of the locale codes listed below.
    """

    # Credentials for the speech service (both mandatory).
    subscription_key: str
    service_region: str

    # How profanity is treated in the recognized text.
    profanity: Literal["off", "masked", "removed"] = "masked"

    # Locales to use for recognition, grouped here by language prefix.
    language: list[
        Literal[
            "af-ZA", "am-ET",
            "ar-AE", "ar-BH", "ar-DZ", "ar-EG", "ar-IL", "ar-IQ", "ar-JO",
            "ar-KW", "ar-LB", "ar-LY", "ar-MA", "ar-OM", "ar-PS", "ar-QA",
            "ar-SA", "ar-SY", "ar-TN", "ar-YE",
            "az-AZ", "bg-BG", "bn-IN", "bs-BA", "ca-ES", "cs-CZ", "cy-GB",
            "da-DK",
            "de-AT", "de-CH", "de-DE",
            "el-GR",
            "en-AU", "en-CA", "en-GB", "en-GH", "en-HK", "en-IE", "en-IN",
            "en-KE", "en-NG", "en-NZ", "en-PH", "en-SG", "en-TZ", "en-US",
            "en-ZA",
            "es-AR", "es-BO", "es-CL", "es-CO", "es-CR", "es-CU", "es-DO",
            "es-EC", "es-ES", "es-GQ", "es-GT", "es-HN", "es-MX", "es-NI",
            "es-PA", "es-PE", "es-PR", "es-PY", "es-SV", "es-US", "es-UY",
            "es-VE",
            "et-EE", "eu-ES", "fa-IR", "fi-FI", "fil-PH",
            "fr-BE", "fr-CA", "fr-CH", "fr-FR",
            "ga-IE", "gl-ES", "gu-IN", "he-IL", "hi-IN", "hr-HR", "hu-HU",
            "hy-AM", "id-ID", "is-IS",
            "it-CH", "it-IT",
            "ja-JP", "jv-ID", "ka-GE", "kk-KZ", "km-KH", "kn-IN", "ko-KR",
            "lo-LA", "lt-LT", "lv-LV", "mk-MK", "ml-IN", "mn-MN", "mr-IN",
            "ms-MY", "mt-MT", "my-MM", "nb-NO", "ne-NP",
            "nl-BE", "nl-NL",
            "pa-IN", "pl-PL", "ps-AF",
            "pt-BR", "pt-PT",
            "ro-RO", "ru-RU", "si-LK", "sk-SK", "sl-SI", "so-SO", "sq-AL",
            "sr-RS", "sv-SE",
            "sw-KE", "sw-TZ",
            "ta-IN", "te-IN", "th-TH", "tr-TR", "uk-UA", "ur-IN", "uz-UZ",
            "vi-VN", "wuu-CN", "yue-CN",
            "zh-CN", "zh-CN-SHANDONG", "zh-CN-SICHUAN", "zh-HK", "zh-TW",
            "zu-ZA",
        ]
    ] = ["en-US"]
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
import argparse # noqa: D100
|
|
2
|
+
import asyncio
|
|
3
|
+
import logging
|
|
4
|
+
from functools import partial
|
|
5
|
+
import contextlib
|
|
6
|
+
import os # Import to access environment variables
|
|
7
|
+
import signal
|
|
8
|
+
import re
|
|
9
|
+
|
|
10
|
+
from wyoming.info import AsrModel, AsrProgram, Attribution, Info
|
|
11
|
+
from wyoming.server import AsyncServer
|
|
12
|
+
|
|
13
|
+
from .download import get_languages
|
|
14
|
+
from .microsoft_stt import MicrosoftSTT
|
|
15
|
+
from .handler import MicrosoftEventHandler
|
|
16
|
+
from .version import __version__
|
|
17
|
+
from . import SpeechConfig
|
|
18
|
+
|
|
19
|
+
_LOGGER = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
stop_event = asyncio.Event()
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def handle_stop_signal(*_signal_args):
    """Log the shutdown request and set the module-level stop event.

    Any positional arguments (such as those passed to a signal handler)
    are accepted and ignored.
    """
    _LOGGER.info("Received stop signal. Shutting down...")
    stop_event.set()
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def parse_arguments():
    """Build the command-line parser and return the parsed arguments.

    The service region and subscription key default to the
    ``AZURE_SERVICE_REGION`` and ``AZURE_SUBSCRIPTION_KEY`` environment
    variables when the corresponding options are not given.
    """
    cli = argparse.ArgumentParser()
    add = cli.add_argument

    # Credentials
    add(
        "--service-region",
        default=os.getenv("AZURE_SERVICE_REGION"),
        help="Microsoft Azure region (e.g., westus2)",
    )
    add(
        "--subscription-key",
        default=os.getenv("AZURE_SUBSCRIPTION_KEY"),
        help="Microsoft Azure subscription key",
    )

    # Server endpoint
    add("--uri", default="tcp://0.0.0.0:10300", help="unix:// or tcp://")

    # Language list handling
    add(
        "--download-dir",
        default="/tmp/",
        help="Directory to download languages.json into (default: /tmp/)",
    )
    add(
        "--language",
        nargs="+",
        default=["en-GB"],
        help="List of languages to set for transcription (e.g., en-US fr-FR es-ES)",
    )
    add(
        "--update-languages",
        action="store_true",
        help="Download latest languages.json during startup",
    )

    # Recognition behaviour and diagnostics
    add(
        "--profanity",
        default="masked",
        choices=["masked", "removed", "off"],
        help="Profanity setting for speech recognition",
    )
    add("--debug", action="store_true", help="Log DEBUG messages")

    return cli.parse_args()
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def validate_args(args):
    """Check that the required credentials are present.

    Raises:
        ValueError: if the service region or the subscription key is missing
            or empty.

    A subscription key that does not match the expected pattern only
    produces a warning; start-up continues.
    """
    if not (args.service_region and args.subscription_key):
        raise ValueError(
            "Both --service-region and --subscription-key must be provided either as command-line arguments or environment variables."
        )

    # Reinstate key validation with more flexibility to accommodate complex keys
    key_pattern = r"^[A-Za-z0-9\-_]{40,}$"
    if re.match(key_pattern, args.subscription_key):
        return

    _LOGGER.warning(
        "The subscription key does not match the expected format but will attempt to initialize."
    )
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
async def main() -> None:
    """Start Wyoming Microsoft STT server.

    Parses and validates the command line, loads the language list, creates
    the MicrosoftSTT model and then serves clients until either the server
    finishes on its own or a stop signal sets ``stop_event``.

    Returns early (after logging an error) when the languages or the model
    cannot be loaded.
    """
    args = parse_arguments()

    # Configure logging first so that messages emitted during validation
    # already use the requested level and handler.
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
    validate_args(args)

    speech_config = SpeechConfig(
        subscription_key=args.subscription_key,
        service_region=args.service_region,
        profanity=args.profanity,
        language=args.language,
    )
    _LOGGER.debug("Arguments parsed successfully.")

    # Load languages
    try:
        _LOGGER.info("Starting language loading process.")
        languages = get_languages(
            args.download_dir,
            update_languages=args.update_languages,
            region=args.service_region,
            key=args.subscription_key,
        )
        _LOGGER.info("Languages loaded successfully.")
    except Exception as e:
        _LOGGER.error(f"Failed to load languages: {e}")
        return

    wyoming_info = Info(
        asr=[
            AsrProgram(
                name="Microsoft",
                description="Microsoft speech transcription",
                attribution=Attribution(
                    name="Hugo Bloem",
                    url="https://github.com/hugobloem/wyoming-microsoft-stt/",
                ),
                version=__version__,
                installed=True,
                models=[
                    AsrModel(
                        name="Microsoft STT",
                        description="Microsoft speech transcription",
                        attribution=Attribution(
                            name="Hugo Bloem",
                            url="https://github.com/hugobloem/wyoming-microsoft-stt/",
                        ),
                        version=__version__,
                        installed=True,
                        languages=languages,
                    )
                ],
            )
        ],
    )

    # Load Microsoft STT model
    try:
        _LOGGER.debug("Loading Microsoft STT")
        stt_model = MicrosoftSTT(speech_config)
        _LOGGER.info("Microsoft STT model loaded successfully.")
    except Exception as e:
        _LOGGER.error(f"Failed to load Microsoft STT model: {e}")
        return

    # Initialize server and run
    server = AsyncServer.from_uri(args.uri)
    model_lock = asyncio.Lock()

    # Route stop signals through the event loop so that setting the stop
    # event reliably wakes the loop. Platforms without loop signal support
    # keep whatever handlers were installed before main() was called.
    loop = asyncio.get_running_loop()
    for stop_signal in (signal.SIGTERM, signal.SIGINT):
        with contextlib.suppress(NotImplementedError, RuntimeError):
            loop.add_signal_handler(stop_signal, handle_stop_signal)

    _LOGGER.info("Ready")
    server_task = asyncio.create_task(
        server.run(
            partial(
                MicrosoftEventHandler,
                wyoming_info,
                args,
                stt_model,
                model_lock,
            )
        )
    )
    stop_task = asyncio.create_task(stop_event.wait())
    try:
        # Serve until the server ends by itself or a stop signal arrives;
        # previously the stop event was set but never waited on.
        await asyncio.wait(
            {server_task, stop_task}, return_when=asyncio.FIRST_COMPLETED
        )
        if server_task.done():
            # Surface an exception raised by the server, if any
            server_task.result()
    except Exception as e:
        _LOGGER.error(f"An error occurred while running the server: {e}")
    finally:
        # Cancel whichever task is still pending and wait for it to finish
        for task in (server_task, stop_task):
            task.cancel()
        await asyncio.gather(server_task, stop_task, return_exceptions=True)
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
if __name__ == "__main__":
    # Install the same stop handler for termination and interrupt signals
    # so that either one requests a graceful shutdown.
    for _stop_signal in (signal.SIGTERM, signal.SIGINT):
        signal.signal(_stop_signal, handle_stop_signal)

    # A KeyboardInterrupt escaping the event loop is not treated as an error
    with contextlib.suppress(KeyboardInterrupt):
        asyncio.run(main())
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""Utility for downloading Microsoft STT languages."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any
|
|
6
|
+
from urllib.parse import quote, urlsplit, urlunsplit
|
|
7
|
+
from urllib.request import urlopen, Request
|
|
8
|
+
import json
|
|
9
|
+
import time
|
|
10
|
+
from urllib.error import URLError
|
|
11
|
+
|
|
12
|
+
URL_FORMAT = "https://{region}.cognitiveservices.azure.com/speechtotext/v3.1/transcriptions/locales"
|
|
13
|
+
URL_HEADER = "Ocp-Apim-Subscription-Key"
|
|
14
|
+
|
|
15
|
+
_DIR = Path(__file__).parent
|
|
16
|
+
_LOGGER = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _quote_url(url: str) -> str:
|
|
20
|
+
"""Quote file part of URL in case it contains UTF-8 characters."""
|
|
21
|
+
parts = list(urlsplit(url))
|
|
22
|
+
parts[2] = quote(parts[2])
|
|
23
|
+
return urlunsplit(parts)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def transform_languages_files(response):
    """Parse the JSON body of *response* and return the decoded value.

    *response* is any object accepted by ``json.load``. The decoded data is
    returned unchanged.
    """
    return json.load(response)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def get_languages(
    download_dir: str | Path,
    update_languages: bool = False,
    region: str = "westus",
    key: str = "",
) -> dict[str, Any]:
    """Load available languages from downloaded or embedded JSON file.

    Args:
        download_dir: Directory holding the downloaded ``languages.json``;
            created when missing.
        update_languages: When true, try to download a fresh list first.
        region: Region inserted into the download URL.
        key: Subscription key sent in the request header.

    Returns:
        The decoded content of the downloaded file when it exists and can be
        read, otherwise the content of the ``languages.json`` shipped next to
        this module. An empty dict is returned when the download fails with
        an unexpected error or when the embedded file cannot be loaded.
        NOTE(review): the bundled ``languages.json`` is a JSON array, so the
        value returned is typically a list despite the annotation.
    """
    download_dir = Path(download_dir)
    # exist_ok avoids a race between an existence check and the creation
    download_dir.mkdir(parents=True, exist_ok=True)
    languages_download = download_dir / "languages.json"

    if update_languages:
        # Download latest languages.json with retry mechanism
        max_retries = 3
        retry_delay = 5  # seconds
        languages_url = URL_FORMAT.format(region=region)
        languages_hdr = {URL_HEADER: key}
        for attempt in range(1, max_retries + 1):
            try:
                _LOGGER.debug("Downloading %s to %s", languages_url, languages_download)
                req = Request(_quote_url(languages_url), headers=languages_hdr)
                with urlopen(req) as response:
                    languages = transform_languages_files(response)
                # Write only after the response was parsed successfully, so a
                # bad payload never truncates an existing file. The file is
                # written as UTF-8 because it is read back as UTF-8 below.
                with open(languages_download, "w", encoding="utf-8") as download_file:
                    json.dump(languages, download_file, indent=4)
                _LOGGER.info("Languages downloaded successfully.")
                break
            except URLError as e:
                _LOGGER.warning(
                    "Failed to download languages.json (attempt %d/%d): %s",
                    attempt,
                    max_retries,
                    e,
                )
                # No point in waiting after the final attempt
                if attempt < max_retries:
                    time.sleep(retry_delay)
            except Exception as e:
                _LOGGER.exception("Failed to download languages.json: %s", e)
                _LOGGER.error("Failed to update languages list")
                return {}

    # Prefer downloaded file to embedded
    if languages_download.exists():
        try:
            _LOGGER.debug("Loading %s", languages_download)
            with open(languages_download, encoding="utf-8") as languages_file:
                return json.load(languages_file)
        except Exception:
            _LOGGER.exception("Failed to load %s", languages_download)

    # Fall back to embedded
    languages_embedded = _DIR / "languages.json"
    try:
        _LOGGER.debug("Loading %s", languages_embedded)
        with open(languages_embedded, encoding="utf-8") as languages_file:
            return json.load(languages_file)
    except Exception:
        _LOGGER.exception("Failed to load embedded languages.json")
        return {}
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"""Event handler for clients of the server."""
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import asyncio
|
|
5
|
+
import logging
|
|
6
|
+
import time
|
|
7
|
+
|
|
8
|
+
from wyoming.asr import Transcribe, Transcript
|
|
9
|
+
from wyoming.audio import AudioChunk, AudioStart, AudioStop
|
|
10
|
+
from wyoming.event import Event
|
|
11
|
+
from wyoming.info import Describe, Info
|
|
12
|
+
from wyoming.server import AsyncEventHandler
|
|
13
|
+
|
|
14
|
+
from .microsoft_stt import MicrosoftSTT
|
|
15
|
+
|
|
16
|
+
_LOGGER = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class MicrosoftEventHandler(AsyncEventHandler):
    """Event handler for clients.

    One instance receives the events of a client connection: it answers
    ``Describe`` with the server info, forwards audio to the shared
    ``MicrosoftSTT`` model and replies with a ``Transcript`` once the audio
    has stopped.
    """

    def __init__(
        self,
        wyoming_info: Info,
        cli_args: argparse.Namespace,
        model: MicrosoftSTT,
        model_lock: asyncio.Lock,
        *args,
        **kwargs,
    ) -> None:
        """Initialize.

        Args:
            wyoming_info: Service description sent in response to Describe.
            cli_args: Parsed command-line arguments; ``language`` is read.
            model: Speech-to-text model used for transcription.
            model_lock: Lock guarding every call into ``model``.
            *args: Forwarded to ``AsyncEventHandler``.
            **kwargs: Forwarded to ``AsyncEventHandler``.
        """
        super().__init__(*args, **kwargs)

        self.cli_args = cli_args
        self.wyoming_info_event = wyoming_info.event()
        self.model = model
        self.model_lock = model_lock

        if len(self.cli_args.language) > 1:
            _LOGGER.warning(
                f"Multiple languages specified, auto-detection will be used for these languages only: {self.cli_args.language}"
            )

        # Default language: the first configured one
        self._language = self.cli_args.language[0]

    async def handle_event(self, event: Event) -> bool:
        """Handle an event.

        Returns:
            ``False`` after a transcript has been sent, ``True`` for every
            other event (including a failed transcription).
        """
        if Describe.is_type(event.type):
            await self.write_event(self.wyoming_info_event)
            _LOGGER.debug("Sent info")
            return True

        if Transcribe.is_type(event.type):
            transcribe = Transcribe.from_event(event)
            if transcribe.language:
                # Language requested by the client overrides the default
                self._language = transcribe.language
                _LOGGER.debug("Language set to %s", transcribe.language)
            return True

        if AudioStart.is_type(event.type):
            start = AudioStart.from_event(event)
            _LOGGER.debug(
                f"Receiving audio: {start.width * 8}bit {start.rate}Hz {start.channels}ch"
            )

            async with self.model_lock:
                self.model.start_transcribe(
                    bits_per_sample=start.width * 8,
                    samples_per_second=start.rate,
                    channels=start.channels,
                    language=self._language,
                )
            return True

        if AudioChunk.is_type(event.type):
            chunk = AudioChunk.from_event(event)
            async with self.model_lock:
                self.model.push_audio_chunk(chunk.audio)

            return True

        if AudioStop.is_type(event.type):
            _LOGGER.debug("Audio stopped")

            async with self.model_lock:
                try:
                    start_time = time.time()
                    _LOGGER.debug("Starting transcription")
                    text = self.model.transcribe()
                    _LOGGER.info(
                        f"Transcription completed in {time.time() - start_time:.2f} seconds"
                    )
                except Exception as e:
                    _LOGGER.error(f"Failed to transcribe audio: {e}")
                    return True

            _LOGGER.info(text)

            await self.write_event(Transcript(text=text).event())
            _LOGGER.debug("Completed request")

            # Reset to the same default used in __init__ (a single language
            # code, not the whole list of configured languages)
            self._language = self.cli_args.language[0]
            return False

        return True
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
[
|
|
2
|
+
"af-ZA",
|
|
3
|
+
"am-ET",
|
|
4
|
+
"ar-AE",
|
|
5
|
+
"ar-BH",
|
|
6
|
+
"ar-DZ",
|
|
7
|
+
"ar-EG",
|
|
8
|
+
"ar-IL",
|
|
9
|
+
"ar-IQ",
|
|
10
|
+
"ar-JO",
|
|
11
|
+
"ar-KW",
|
|
12
|
+
"ar-LB",
|
|
13
|
+
"ar-LY",
|
|
14
|
+
"ar-MA",
|
|
15
|
+
"ar-OM",
|
|
16
|
+
"ar-PS",
|
|
17
|
+
"ar-QA",
|
|
18
|
+
"ar-SA",
|
|
19
|
+
"ar-SY",
|
|
20
|
+
"ar-TN",
|
|
21
|
+
"ar-YE",
|
|
22
|
+
"az-AZ",
|
|
23
|
+
"bg-BG",
|
|
24
|
+
"bn-IN",
|
|
25
|
+
"bs-BA",
|
|
26
|
+
"ca-ES",
|
|
27
|
+
"cs-CZ",
|
|
28
|
+
"cy-GB",
|
|
29
|
+
"da-DK",
|
|
30
|
+
"de-AT",
|
|
31
|
+
"de-CH",
|
|
32
|
+
"de-DE",
|
|
33
|
+
"el-GR",
|
|
34
|
+
"en-AU",
|
|
35
|
+
"en-CA",
|
|
36
|
+
"en-GB",
|
|
37
|
+
"en-GH",
|
|
38
|
+
"en-HK",
|
|
39
|
+
"en-IE",
|
|
40
|
+
"en-IN",
|
|
41
|
+
"en-KE",
|
|
42
|
+
"en-NG",
|
|
43
|
+
"en-NZ",
|
|
44
|
+
"en-PH",
|
|
45
|
+
"en-SG",
|
|
46
|
+
"en-TZ",
|
|
47
|
+
"en-US",
|
|
48
|
+
"en-ZA",
|
|
49
|
+
"es-AR",
|
|
50
|
+
"es-BO",
|
|
51
|
+
"es-CL",
|
|
52
|
+
"es-CO",
|
|
53
|
+
"es-CR",
|
|
54
|
+
"es-CU",
|
|
55
|
+
"es-DO",
|
|
56
|
+
"es-EC",
|
|
57
|
+
"es-ES",
|
|
58
|
+
"es-GQ",
|
|
59
|
+
"es-GT",
|
|
60
|
+
"es-HN",
|
|
61
|
+
"es-MX",
|
|
62
|
+
"es-NI",
|
|
63
|
+
"es-PA",
|
|
64
|
+
"es-PE",
|
|
65
|
+
"es-PR",
|
|
66
|
+
"es-PY",
|
|
67
|
+
"es-SV",
|
|
68
|
+
"es-US",
|
|
69
|
+
"es-UY",
|
|
70
|
+
"es-VE",
|
|
71
|
+
"et-EE",
|
|
72
|
+
"eu-ES",
|
|
73
|
+
"fa-IR",
|
|
74
|
+
"fi-FI",
|
|
75
|
+
"fil-PH",
|
|
76
|
+
"fr-BE",
|
|
77
|
+
"fr-CA",
|
|
78
|
+
"fr-CH",
|
|
79
|
+
"fr-FR",
|
|
80
|
+
"ga-IE",
|
|
81
|
+
"gl-ES",
|
|
82
|
+
"gu-IN",
|
|
83
|
+
"he-IL",
|
|
84
|
+
"hi-IN",
|
|
85
|
+
"hr-HR",
|
|
86
|
+
"hu-HU",
|
|
87
|
+
"hy-AM",
|
|
88
|
+
"id-ID",
|
|
89
|
+
"is-IS",
|
|
90
|
+
"it-CH",
|
|
91
|
+
"it-IT",
|
|
92
|
+
"ja-JP",
|
|
93
|
+
"jv-ID",
|
|
94
|
+
"ka-GE",
|
|
95
|
+
"kk-KZ",
|
|
96
|
+
"km-KH",
|
|
97
|
+
"kn-IN",
|
|
98
|
+
"ko-KR",
|
|
99
|
+
"lo-LA",
|
|
100
|
+
"lt-LT",
|
|
101
|
+
"lv-LV",
|
|
102
|
+
"mk-MK",
|
|
103
|
+
"ml-IN",
|
|
104
|
+
"mn-MN",
|
|
105
|
+
"mr-IN",
|
|
106
|
+
"ms-MY",
|
|
107
|
+
"mt-MT",
|
|
108
|
+
"my-MM",
|
|
109
|
+
"nb-NO",
|
|
110
|
+
"ne-NP",
|
|
111
|
+
"nl-BE",
|
|
112
|
+
"nl-NL",
|
|
113
|
+
"pa-IN",
|
|
114
|
+
"pl-PL",
|
|
115
|
+
"ps-AF",
|
|
116
|
+
"pt-BR",
|
|
117
|
+
"pt-PT",
|
|
118
|
+
"ro-RO",
|
|
119
|
+
"ru-RU",
|
|
120
|
+
"si-LK",
|
|
121
|
+
"sk-SK",
|
|
122
|
+
"sl-SI",
|
|
123
|
+
"so-SO",
|
|
124
|
+
"sq-AL",
|
|
125
|
+
"sr-RS",
|
|
126
|
+
"sv-SE",
|
|
127
|
+
"sw-KE",
|
|
128
|
+
"sw-TZ",
|
|
129
|
+
"ta-IN",
|
|
130
|
+
"te-IN",
|
|
131
|
+
"th-TH",
|
|
132
|
+
"tr-TR",
|
|
133
|
+
"uk-UA",
|
|
134
|
+
"ur-IN",
|
|
135
|
+
"uz-UZ",
|
|
136
|
+
"vi-VN",
|
|
137
|
+
"wuu-CN",
|
|
138
|
+
"yue-CN",
|
|
139
|
+
"zh-CN",
|
|
140
|
+
"zh-CN-SHANDONG",
|
|
141
|
+
"zh-CN-SICHUAN",
|
|
142
|
+
"zh-HK",
|
|
143
|
+
"zh-TW",
|
|
144
|
+
"zu-ZA"
|
|
145
|
+
]
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
"""Microsoft STT module for Wyoming."""
|
|
2
|
+
|
|
3
|
+
import time
|
|
4
|
+
import azure.cognitiveservices.speech as speechsdk # noqa: D100
|
|
5
|
+
import logging
|
|
6
|
+
from . import SpeechConfig
|
|
7
|
+
|
|
8
|
+
_LOGGER = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class MicrosoftSTT:
|
|
12
|
+
"""Class to handle Microsoft STT."""
|
|
13
|
+
|
|
14
|
+
def __init__(self, speechconfig: SpeechConfig) -> None:
    """Store the configuration and create the SDK speech configuration.

    Any exception raised while creating the SDK ``SpeechConfig`` is logged
    and re-raised. The configured profanity option is applied afterwards.
    """
    self.args = speechconfig

    # State that is populated once a transcription has been started
    self._stream: speechsdk.audio.PushAudioInputStream | None = None
    self._speech_recognizer: speechsdk.SpeechRecognizer | None = None
    self._results: list[speechsdk.SpeechRecognitionResult] = []

    try:
        # Initialize the speech configuration with the provided subscription key and region
        sdk_config = speechsdk.SpeechConfig(
            subscription=self.args.subscription_key,
            region=self.args.service_region,
        )
        self.speech_config = sdk_config
        _LOGGER.info("Microsoft SpeechConfig initialized successfully.")
    except Exception as err:
        _LOGGER.error(f"Failed to initialize Microsoft SpeechConfig: {err}")
        raise

    self.set_profanity(self.args.profanity)
|
|
33
|
+
|
|
34
|
+
def start_transcribe(
|
|
35
|
+
self,
|
|
36
|
+
samples_per_second: int = 16000,
|
|
37
|
+
bits_per_sample: int = 16,
|
|
38
|
+
channels: int = 1,
|
|
39
|
+
language=None,
|
|
40
|
+
) -> None:
|
|
41
|
+
"""Begin a transcription."""
|
|
42
|
+
_LOGGER.debug(f"Starting transcription with language: {language}")
|
|
43
|
+
|
|
44
|
+
# Configure audio input for speech recognition
|
|
45
|
+
_LOGGER.debug("Configuring audio input stream...")
|
|
46
|
+
self._stream = speechsdk.audio.PushAudioInputStream(
|
|
47
|
+
stream_format=speechsdk.audio.AudioStreamFormat(
|
|
48
|
+
samples_per_second=samples_per_second,
|
|
49
|
+
bits_per_sample=bits_per_sample,
|
|
50
|
+
channels=channels,
|
|
51
|
+
)
|
|
52
|
+
)
|
|
53
|
+
audio_config = speechsdk.audio.AudioConfig(stream=self._stream)
|
|
54
|
+
# Create a speech recognizer with the configured speech and audio settings
|
|
55
|
+
self._speech_recognizer = speechsdk.SpeechRecognizer(
|
|
56
|
+
speech_config=self.speech_config,
|
|
57
|
+
audio_config=audio_config,
|
|
58
|
+
**self.get_language(language),
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
self.recognition_done = False
|
|
62
|
+
|
|
63
|
+
def session_stopped_cb(evt):
|
|
64
|
+
"""Signal to stop continuous recognition upon receiving an event `evt`."""
|
|
65
|
+
_LOGGER.debug(f"SESSION STOPPED: {evt}")
|
|
66
|
+
self.recognition_done = True
|
|
67
|
+
|
|
68
|
+
self._speech_recognizer.recognizing.connect(
|
|
69
|
+
lambda evt: _LOGGER.debug(f"RECOGNIZING: {evt}")
|
|
70
|
+
)
|
|
71
|
+
self._speech_recognizer.recognized.connect(
|
|
72
|
+
lambda evt: _LOGGER.debug(f"RECOGNIZED: {evt}")
|
|
73
|
+
)
|
|
74
|
+
self._speech_recognizer.session_started.connect(
|
|
75
|
+
lambda evt: _LOGGER.debug(f"SESSION STARTED: {evt}")
|
|
76
|
+
)
|
|
77
|
+
self._speech_recognizer.session_stopped.connect(session_stopped_cb)
|
|
78
|
+
self._speech_recognizer.canceled.connect(
|
|
79
|
+
lambda evt: _LOGGER.debug(f"CANCELED {evt}")
|
|
80
|
+
)
|
|
81
|
+
|
|
82
|
+
_LOGGER.debug("Starting continuous recognition...")
|
|
83
|
+
|
|
84
|
+
def recognized(event: speechsdk.SpeechRecognitionEventArgs):
|
|
85
|
+
_LOGGER.debug(f"{event.result}")
|
|
86
|
+
self._results = event.result
|
|
87
|
+
|
|
88
|
+
self._speech_recognizer.start_continuous_recognition()
|
|
89
|
+
self._speech_recognizer.recognized.connect(recognized)
|
|
90
|
+
|
|
91
|
+
def push_audio_chunk(self, chunk: bytes) -> None:
|
|
92
|
+
"""Push an audio chunk to the recognizer."""
|
|
93
|
+
self._stream.write(chunk)
|
|
94
|
+
|
|
95
|
+
def stop_audio_chunk(self) -> None:
|
|
96
|
+
"""Stop the transcription."""
|
|
97
|
+
_LOGGER.debug("Stopping transcription...")
|
|
98
|
+
self._stream.close()
|
|
99
|
+
|
|
100
|
+
def transcribe(self):
|
|
101
|
+
"""Get the results of a transcription."""
|
|
102
|
+
try:
|
|
103
|
+
self.stop_audio_chunk()
|
|
104
|
+
|
|
105
|
+
# Wait for the recognition to finish
|
|
106
|
+
while not self.recognition_done:
|
|
107
|
+
time.sleep(0.01)
|
|
108
|
+
|
|
109
|
+
self._speech_recognizer.stop_continuous_recognition()
|
|
110
|
+
|
|
111
|
+
return self._results.text
|
|
112
|
+
|
|
113
|
+
except Exception as e:
|
|
114
|
+
_LOGGER.error(f"Failed to transcribe audio: {e}")
|
|
115
|
+
return ""
|
|
116
|
+
|
|
117
|
+
def get_language(self, language: str) -> dict:
|
|
118
|
+
"""Get the language code."""
|
|
119
|
+
if len(self.args.language) > 1:
|
|
120
|
+
auto_detect_source_language_config = (
|
|
121
|
+
speechsdk.languageconfig.AutoDetectSourceLanguageConfig(
|
|
122
|
+
languages=self.args.language
|
|
123
|
+
)
|
|
124
|
+
)
|
|
125
|
+
return {
|
|
126
|
+
"auto_detect_source_language_config": auto_detect_source_language_config
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
if language:
|
|
130
|
+
_LOGGER.debug(f"Language set to {language}")
|
|
131
|
+
return {"language": language}
|
|
132
|
+
|
|
133
|
+
return {"language": self.args.language[0]}
|
|
134
|
+
|
|
135
|
+
def set_profanity(self, profanity: str):
|
|
136
|
+
"""Set the profanity filter level."""
|
|
137
|
+
if profanity == "off":
|
|
138
|
+
profanity_level = speechsdk.ProfanityOption.Raw
|
|
139
|
+
elif profanity == "masked":
|
|
140
|
+
profanity_level = speechsdk.ProfanityOption.Masked
|
|
141
|
+
elif profanity == "removed":
|
|
142
|
+
profanity_level = speechsdk.ProfanityOption.Removed
|
|
143
|
+
else:
|
|
144
|
+
_LOGGER.error(f"Invalid profanity level: {profanity}")
|
|
145
|
+
return
|
|
146
|
+
|
|
147
|
+
self.speech_config.set_profanity(profanity_level)
|
|
148
|
+
_LOGGER.debug(f"Profanity filter set to {profanity}")
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: wyoming-microsoft-stt
|
|
3
|
+
Version: 1.3.3
|
|
4
|
+
Summary: Wyoming protocol server for Microsoft Azure speech-to-text
|
|
5
|
+
Home-page: https://github.com/hugobloem/wyoming-microsoft-stt
|
|
6
|
+
Author: Hugo Bloem
|
|
7
|
+
Author-email:
|
|
8
|
+
Requires-Python: >=3.13
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
Requires-Dist: azure-cognitiveservices-speech>=1.45.0
|
|
11
|
+
Requires-Dist: pydantic>=2.11.7
|
|
12
|
+
Requires-Dist: wyoming>=1.7.2
|
|
13
|
+
Dynamic: author
|
|
14
|
+
Dynamic: home-page
|
|
15
|
+
|
|
16
|
+
# Wyoming Microsoft STT
|
|
17
|
+
Wyoming protocol server for Microsoft Azure speech-to-text.
|
|
18
|
+
|
|
19
|
+
This Python package provides a Wyoming integration for Microsoft Azure speech-to-text and can be directly used with [Home Assistant](https://www.home-assistant.io/) voice and [Rhasspy](https://github.com/rhasspy/rhasspy3).
|
|
20
|
+
|
|
21
|
+
## Azure Speech Service
|
|
22
|
+
This program uses [Microsoft Azure Speech Service](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/). You can sign up for a free Azure account, which comes with a free tier of 5 audio hours per month; this should be enough for running a voice assistant, as each command is relatively short. Once this amount is exceeded, Azure could charge you for each second used (current pricing is $0.36 per audio hour). I am not responsible for any incurred charges and recommend you set up a spending limit to reduce your exposure. However, for normal usage the free tier could suffice, and the resource should not switch to a paid service automatically.
|
|
23
|
+
|
|
24
|
+
If you have not set up a speech resource, you can follow the instructions below. (You only need to do this once, and the same resource works for both [Speech-to-Text](https://github.com/hugobloem/wyoming-microsoft-stt) and [Text-to-Speech](https://github.com/hugobloem/wyoming-microsoft-tts).)
|
|
25
|
+
|
|
26
|
+
1. Sign in or create an account on [portal.azure.com](https://portal.azure.com).
|
|
27
|
+
2. Create a subscription by searching for `subscription` in the search bar. [Consult Microsoft Learn for more information](https://learn.microsoft.com/en-gb/azure/cost-management-billing/manage/create-subscription#create-a-subscription-in-the-azure-portal).
|
|
28
|
+
3. Create a speech resource by searching for `speech service`.
|
|
29
|
+
4. Select the subscription you created, pick or create a resource group, select a region, pick an identifiable name, and select the pricing tier (you probably want Free F0)
|
|
30
|
+
5. Once created, copy one of the keys from the speech service page. You will need this to run this program.
|
|
31
|
+
|
|
32
|
+
## Usage
|
|
33
|
+
Depending on the installation method, parameters are parsed differently. However, the same options are used for each of the installation methods and can be found in the table below. Your service region and subscription key can be found on the speech service resource page (step 5 of the Azure Speech Service instructions).
|
|
34
|
+
|
|
35
|
+
For the bare-metal Python install the program is run as follows:
|
|
36
|
+
```sh
|
|
37
|
+
python -m wyoming_microsoft_stt --<key> <value>
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
| Key | Optional | Description |
|
|
41
|
+
|---|---|---|
|
|
42
|
+
| `service-region` | No | Azure service region e.g., `uksouth` |
|
|
43
|
+
| `subscription-key` | No | Azure subscription key |
|
|
44
|
+
| `language` | Yes | Default language to set for transcription, default: `en-GB`. For auto-detection provide multiple languages. |
|
|
45
|
+
| `uri` | No | URI where the server will be broadcast, e.g., `tcp://0.0.0.0:10300` |
|
|
46
|
+
| `download-dir` | Yes | Directory to download models into (default: ) |
|
|
47
|
+
| `update-languages` | Yes | Download latest languages.json during startup |
|
|
48
|
+
| `debug` | Yes | Log debug messages |
|
|
49
|
+
|
|
50
|
+
## Multi-language support
|
|
51
|
+
This add-on can also auto-detect the spoken language from a list of pre-defined languages (max. 10). To do this in Home Assistant provide the languages separated by semi-colons like so:
|
|
52
|
+
<img width="689" alt="Screenshot 2025-05-04 at 11 59 55" src="https://github.com/user-attachments/assets/b3c54fe5-ebf3-404a-a8e8-b0d27efaf76d" />
|
|
53
|
+
|
|
54
|
+
> [!NOTE]
|
|
55
|
+
> Setting multiple languages will override the options set by Home Assistant's Voice configuration! It will prompt you to select a language but the option is ignored when speech is processed.
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
## Installation
|
|
59
|
+
Depending on your use case there are different installation options.
|
|
60
|
+
|
|
61
|
+
- **Using pip**
|
|
62
|
+
Clone the repository and install the package using pip. Please note the platform requirements as noted [here](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/quickstarts/setup-platform?tabs=linux%2Cubuntu%2Cdotnetcli%2Cdotnet%2Cjre%2Cmaven%2Cnodejs%2Cmac%2Cpypi&pivots=programming-language-python#platform-requirements).
|
|
63
|
+
```sh
|
|
64
|
+
pip install .
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
- **Home Assistant Add-On**
|
|
68
|
+
Add the following repository as an add-on repository to your Home Assistant, or click the button below.
|
|
69
|
+
[https://github.com/hugobloem/homeassistant-addons](https://github.com/hugobloem/homeassistant-addons)
|
|
70
|
+
|
|
71
|
+
[](https://my.home-assistant.io/redirect/supervisor_add_addon_repository/?repository_url=https%3A%2F%2Fgithub.com%2Fhugobloem%2Fhomeassistant-addons)
|
|
72
|
+
|
|
73
|
+
- **Docker container**
|
|
74
|
+
To run as a Docker container use the following command:
|
|
75
|
+
```bash
|
|
76
|
+
docker run ghcr.io/hugobloem/wyoming-microsoft-stt-noha:latest --<key> <value>
|
|
77
|
+
```
|
|
78
|
+
For the relevant keys please look at [the table above](#usage).
|
|
79
|
+
|
|
80
|
+
- **docker compose**
|
|
81
|
+
|
|
82
|
+
Below is a sample docker compose file. The Azure region and subscription key can be set in environment variables. Everything else needs to be passed via command line arguments.
|
|
83
|
+
|
|
84
|
+
```yaml
|
|
85
|
+
wyoming-proxy-azure-stt:
|
|
86
|
+
image: ghcr.io/hugobloem/wyoming-microsoft-stt-noha
|
|
87
|
+
container_name: wyoming-azure-stt
|
|
88
|
+
ports:
|
|
89
|
+
- "10300:10300"
|
|
90
|
+
environment:
|
|
91
|
+
AZURE_SERVICE_REGION: swedencentral
|
|
92
|
+
AZURE_SUBSCRIPTION_KEY: XXX
|
|
93
|
+
command: --language=en-GB,nl-NL --uri=tcp://0.0.0.0:10300
|
|
94
|
+
```
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
tests/__init__.py,sha256=ZEU8_ARBRGIqaAOTcPRsjXxcfHSojBm-5-krYBN-68g,13
|
|
2
|
+
tests/conftest.py,sha256=iSju8g6PiODcg9eY4_lZaVcubXMlfnx1CpyGVsmTAgY,340
|
|
3
|
+
tests/test_microsoft_stt.py,sha256=CO6xyLHFcE1kqJ_XuGprRu0i28zTN2oHaKtomK_aJIU,594
|
|
4
|
+
tests/test_multilanguage.py,sha256=wRtyWuhWBmA5Yf01JNbmgxqoVKmDcE6XHz9QcfSR700,3100
|
|
5
|
+
tests/test_transcribe.py,sha256=BqT-KLZR6UM9RqNI43RZFz6WLkXFcjk2a2WLszhDgso,3674
|
|
6
|
+
wyoming_microsoft_stt/__init__.py,sha256=92Kms90cxU6Zs5XjNZiYvIGLA3Yd-BizN-G8f7n506I,3365
|
|
7
|
+
wyoming_microsoft_stt/__main__.py,sha256=gA0wM3OCLdEq7ewMRrlWs4TVpXvFlITMaX_4lP3ed7E,5577
|
|
8
|
+
wyoming_microsoft_stt/download.py,sha256=mTWZl9kVaE7KGeO2d0SS7wztDN6tGo9AudXWhN8uKhg,3379
|
|
9
|
+
wyoming_microsoft_stt/handler.py,sha256=dWm9etFANRU33IDU-p85enHmWtMtTtW522I3AJbTAxg,3330
|
|
10
|
+
wyoming_microsoft_stt/languages.json,sha256=eDZuPJLzDjdZlh0I4081OTkLPt6nZdUY8ra45c7oMCc,1881
|
|
11
|
+
wyoming_microsoft_stt/microsoft_stt.py,sha256=2Rb9uoOxGFpQfwXux5p3jwop_3poF-75U_84q5QRmt0,5357
|
|
12
|
+
wyoming_microsoft_stt/version.py,sha256=iG_JqR_Z5wfSTlMqH9H1vmrmlcgyFJdbRBRllY0yDgU,50
|
|
13
|
+
wyoming_microsoft_stt-1.3.3.dist-info/METADATA,sha256=Hh7h4BDS_hLd4t3Y8LOEpAMpRzGph7TwfPEliYQA6mA,5792
|
|
14
|
+
wyoming_microsoft_stt-1.3.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
15
|
+
wyoming_microsoft_stt-1.3.3.dist-info/top_level.txt,sha256=aURyYXybYBAybkVTs2RFT5ctnPz5pRa_B7Gn9_SKhL8,28
|
|
16
|
+
wyoming_microsoft_stt-1.3.3.dist-info/RECORD,,
|