livekit-plugins-azure 0.2.1__tar.gz → 0.3.0.dev1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {livekit_plugins_azure-0.2.1 → livekit_plugins_azure-0.3.0.dev1}/PKG-INFO +1 -1
- {livekit_plugins_azure-0.2.1 → livekit_plugins_azure-0.3.0.dev1}/livekit/plugins/azure/__init__.py +1 -6
- livekit_plugins_azure-0.3.0.dev1/livekit/plugins/azure/py.typed +0 -0
- {livekit_plugins_azure-0.2.1 → livekit_plugins_azure-0.3.0.dev1}/livekit/plugins/azure/stt.py +29 -61
- {livekit_plugins_azure-0.2.1 → livekit_plugins_azure-0.3.0.dev1}/livekit/plugins/azure/tts.py +46 -72
- {livekit_plugins_azure-0.2.1 → livekit_plugins_azure-0.3.0.dev1}/livekit/plugins/azure/version.py +3 -1
- {livekit_plugins_azure-0.2.1 → livekit_plugins_azure-0.3.0.dev1}/livekit_plugins_azure.egg-info/PKG-INFO +1 -1
- {livekit_plugins_azure-0.2.1 → livekit_plugins_azure-0.3.0.dev1}/livekit_plugins_azure.egg-info/SOURCES.txt +1 -0
- {livekit_plugins_azure-0.2.1 → livekit_plugins_azure-0.3.0.dev1}/README.md +0 -0
- {livekit_plugins_azure-0.2.1 → livekit_plugins_azure-0.3.0.dev1}/livekit/plugins/azure/log.py +0 -0
- {livekit_plugins_azure-0.2.1 → livekit_plugins_azure-0.3.0.dev1}/livekit_plugins_azure.egg-info/dependency_links.txt +0 -0
- {livekit_plugins_azure-0.2.1 → livekit_plugins_azure-0.3.0.dev1}/livekit_plugins_azure.egg-info/requires.txt +0 -0
- {livekit_plugins_azure-0.2.1 → livekit_plugins_azure-0.3.0.dev1}/livekit_plugins_azure.egg-info/top_level.txt +0 -0
- {livekit_plugins_azure-0.2.1 → livekit_plugins_azure-0.3.0.dev1}/pyproject.toml +0 -0
- {livekit_plugins_azure-0.2.1 → livekit_plugins_azure-0.3.0.dev1}/setup.cfg +0 -0
- {livekit_plugins_azure-0.2.1 → livekit_plugins_azure-0.3.0.dev1}/setup.py +0 -0
{livekit_plugins_azure-0.2.1 → livekit_plugins_azure-0.3.0.dev1}/livekit/plugins/azure/__init__.py
RENAMED
|
@@ -14,12 +14,7 @@ from .stt import STT, SpeechStream
|
|
|
14
14
|
from .tts import TTS
|
|
15
15
|
from .version import __version__
|
|
16
16
|
|
|
17
|
-
__all__ = [
|
|
18
|
-
"STT",
|
|
19
|
-
"SpeechStream",
|
|
20
|
-
"TTS",
|
|
21
|
-
"__version__",
|
|
22
|
-
]
|
|
17
|
+
__all__ = ["STT", "SpeechStream", "TTS", "__version__"]
|
|
23
18
|
|
|
24
19
|
from livekit.agents import Plugin
|
|
25
20
|
|
|
File without changes
|
{livekit_plugins_azure-0.2.1 → livekit_plugins_azure-0.3.0.dev1}/livekit/plugins/azure/stt.py
RENAMED
|
@@ -15,13 +15,10 @@ from __future__ import annotations
|
|
|
15
15
|
import asyncio
|
|
16
16
|
import os
|
|
17
17
|
from dataclasses import dataclass
|
|
18
|
-
from typing import Optional
|
|
19
18
|
|
|
20
|
-
from livekit import
|
|
21
|
-
from livekit.agents import stt
|
|
22
|
-
from livekit.agents.utils import AudioBuffer
|
|
19
|
+
from livekit.agents import stt, utils
|
|
23
20
|
|
|
24
|
-
import azure.cognitiveservices.speech as speechsdk
|
|
21
|
+
import azure.cognitiveservices.speech as speechsdk # type: ignore
|
|
25
22
|
|
|
26
23
|
from .log import logger
|
|
27
24
|
|
|
@@ -47,7 +44,9 @@ class STT(stt.STT):
|
|
|
47
44
|
num_channels: int = 1,
|
|
48
45
|
languages: list[str] = [], # when empty, auto-detect the language
|
|
49
46
|
):
|
|
50
|
-
super().__init__(
|
|
47
|
+
super().__init__(
|
|
48
|
+
capabilities=stt.STTCapabilities(streaming=True, interim_results=True)
|
|
49
|
+
)
|
|
51
50
|
|
|
52
51
|
speech_key = speech_key or os.environ.get("AZURE_SPEECH_KEY")
|
|
53
52
|
if not speech_key:
|
|
@@ -66,18 +65,11 @@ class STT(stt.STT):
|
|
|
66
65
|
)
|
|
67
66
|
|
|
68
67
|
async def recognize(
|
|
69
|
-
self,
|
|
70
|
-
*,
|
|
71
|
-
buffer: AudioBuffer,
|
|
72
|
-
language: str | None = None,
|
|
68
|
+
self, buffer: utils.AudioBuffer, *, language: str | None = None
|
|
73
69
|
) -> stt.SpeechEvent:
|
|
74
70
|
raise NotImplementedError("Azure STT does not support single frame recognition")
|
|
75
71
|
|
|
76
|
-
def stream(
|
|
77
|
-
self,
|
|
78
|
-
*,
|
|
79
|
-
language: str | None = None,
|
|
80
|
-
) -> "SpeechStream":
|
|
72
|
+
def stream(self, *, language: str | None = None) -> "SpeechStream":
|
|
81
73
|
return SpeechStream(self._config)
|
|
82
74
|
|
|
83
75
|
|
|
@@ -85,8 +77,6 @@ class SpeechStream(stt.SpeechStream):
|
|
|
85
77
|
def __init__(self, opts: STTOptions) -> None:
|
|
86
78
|
super().__init__()
|
|
87
79
|
self._opts = opts
|
|
88
|
-
self._event_queue = asyncio.Queue[Optional[stt.SpeechEvent]]()
|
|
89
|
-
self._closed = False
|
|
90
80
|
self._speaking = False
|
|
91
81
|
|
|
92
82
|
self._stream = speechsdk.audio.PushAudioInputStream(
|
|
@@ -108,26 +98,21 @@ class SpeechStream(stt.SpeechStream):
|
|
|
108
98
|
self._done_event = asyncio.Event()
|
|
109
99
|
self._loop = asyncio.get_running_loop()
|
|
110
100
|
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
101
|
+
@utils.log_exceptions(logger=logger)
|
|
102
|
+
async def _main_task(self) -> None:
|
|
103
|
+
try:
|
|
104
|
+
async for input in self._input_ch:
|
|
105
|
+
self._stream.write(input.data.tobytes())
|
|
116
106
|
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
self._closed = True
|
|
122
|
-
self._stream.close()
|
|
123
|
-
|
|
124
|
-
await self._done_event.wait()
|
|
107
|
+
self._stream.close()
|
|
108
|
+
await self._done_event.wait()
|
|
109
|
+
finally:
|
|
125
110
|
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
111
|
+
def _cleanup():
|
|
112
|
+
self._recognizer.stop_continuous_recognition()
|
|
113
|
+
del self._recognizer
|
|
129
114
|
|
|
130
|
-
|
|
115
|
+
await asyncio.to_thread(_cleanup)
|
|
131
116
|
|
|
132
117
|
def _on_recognized(self, evt: speechsdk.SpeechRecognitionEventArgs):
|
|
133
118
|
detected_lg = speechsdk.AutoDetectSourceLanguageResult(evt.result).language
|
|
@@ -136,15 +121,12 @@ class SpeechStream(stt.SpeechStream):
|
|
|
136
121
|
return
|
|
137
122
|
|
|
138
123
|
final_data = stt.SpeechData(
|
|
139
|
-
language=detected_lg,
|
|
140
|
-
confidence=1.0,
|
|
141
|
-
text=evt.result.text,
|
|
124
|
+
language=detected_lg, confidence=1.0, text=evt.result.text
|
|
142
125
|
)
|
|
143
126
|
|
|
144
|
-
self._threadsafe_put(
|
|
127
|
+
self._threadsafe_send(
|
|
145
128
|
stt.SpeechEvent(
|
|
146
|
-
type=stt.SpeechEventType.FINAL_TRANSCRIPT,
|
|
147
|
-
alternatives=[final_data],
|
|
129
|
+
type=stt.SpeechEventType.FINAL_TRANSCRIPT, alternatives=[final_data]
|
|
148
130
|
)
|
|
149
131
|
)
|
|
150
132
|
|
|
@@ -155,15 +137,12 @@ class SpeechStream(stt.SpeechStream):
|
|
|
155
137
|
return
|
|
156
138
|
|
|
157
139
|
interim_data = stt.SpeechData(
|
|
158
|
-
language=detected_lg,
|
|
159
|
-
confidence=0.0,
|
|
160
|
-
text=evt.result.text,
|
|
140
|
+
language=detected_lg, confidence=0.0, text=evt.result.text
|
|
161
141
|
)
|
|
162
142
|
|
|
163
|
-
self._threadsafe_put(
|
|
143
|
+
self._threadsafe_send(
|
|
164
144
|
stt.SpeechEvent(
|
|
165
|
-
type=stt.SpeechEventType.INTERIM_TRANSCRIPT,
|
|
166
|
-
alternatives=[interim_data],
|
|
145
|
+
type=stt.SpeechEventType.INTERIM_TRANSCRIPT, alternatives=[interim_data]
|
|
167
146
|
)
|
|
168
147
|
)
|
|
169
148
|
|
|
@@ -172,31 +151,20 @@ class SpeechStream(stt.SpeechStream):
|
|
|
172
151
|
return
|
|
173
152
|
|
|
174
153
|
self._speaking = True
|
|
175
|
-
self._threadsafe_put(stt.SpeechEvent(type=stt.SpeechEventType.START_OF_SPEECH))
|
|
154
|
+
self._threadsafe_send(stt.SpeechEvent(type=stt.SpeechEventType.START_OF_SPEECH))
|
|
176
155
|
|
|
177
156
|
def _on_speech_end(self, evt: speechsdk.SpeechRecognitionEventArgs):
|
|
178
157
|
if not self._speaking:
|
|
179
158
|
return
|
|
180
159
|
|
|
181
160
|
self._speaking = False
|
|
182
|
-
self._threadsafe_put(stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH))
|
|
161
|
+
self._threadsafe_send(stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH))
|
|
183
162
|
|
|
184
163
|
def _on_session_stopped(self, evt: speechsdk.SpeechRecognitionEventArgs):
|
|
185
|
-
if not self._closed:
|
|
186
|
-
logger.error("session stopped unexpectedly")
|
|
187
|
-
|
|
188
164
|
self._loop.call_soon_threadsafe(self._done_event.set)
|
|
189
|
-
self._threadsafe_put(None)
|
|
190
|
-
|
|
191
|
-
def _threadsafe_put(self, evt: stt.SpeechEvent | None):
|
|
192
|
-
self._loop.call_soon_threadsafe(self._event_queue.put_nowait, evt)
|
|
193
|
-
|
|
194
|
-
async def __anext__(self) -> stt.SpeechEvent:
|
|
195
|
-
evt = await self._event_queue.get()
|
|
196
|
-
if evt is None:
|
|
197
|
-
raise StopAsyncIteration
|
|
198
165
|
|
|
199
|
-
|
|
166
|
+
def _threadsafe_send(self, evt: stt.SpeechEvent | None):
|
|
167
|
+
self._loop.call_soon_threadsafe(self._event_ch.send_nowait, evt)
|
|
200
168
|
|
|
201
169
|
|
|
202
170
|
def _create_speech_recognizer(
|
{livekit_plugins_azure-0.2.1 → livekit_plugins_azure-0.3.0.dev1}/livekit/plugins/azure/tts.py
RENAMED
|
@@ -13,17 +13,13 @@
|
|
|
13
13
|
from __future__ import annotations
|
|
14
14
|
|
|
15
15
|
import asyncio
|
|
16
|
-
import contextlib
|
|
17
16
|
import os
|
|
18
17
|
from dataclasses import dataclass
|
|
19
|
-
from typing import Optional
|
|
20
18
|
|
|
21
19
|
from livekit import rtc
|
|
22
|
-
from livekit.agents import tts
|
|
20
|
+
from livekit.agents import tts, utils
|
|
23
21
|
|
|
24
|
-
import azure.cognitiveservices.speech as speechsdk
|
|
25
|
-
|
|
26
|
-
from .log import logger
|
|
22
|
+
import azure.cognitiveservices.speech as speechsdk # type: ignore
|
|
27
23
|
|
|
28
24
|
AZURE_SAMPLE_RATE: int = 16000
|
|
29
25
|
AZURE_BITS_PER_SAMPLE: int = 16
|
|
@@ -47,7 +43,9 @@ class TTS(tts.TTS):
|
|
|
47
43
|
voice: str | None = None,
|
|
48
44
|
) -> None:
|
|
49
45
|
super().__init__(
|
|
50
|
-
|
|
46
|
+
capabilities=tts.TTSCapabilities(
|
|
47
|
+
streaming=False,
|
|
48
|
+
),
|
|
51
49
|
sample_rate=AZURE_SAMPLE_RATE,
|
|
52
50
|
num_channels=AZURE_NUM_CHANNELS,
|
|
53
51
|
)
|
|
@@ -61,43 +59,38 @@ class TTS(tts.TTS):
|
|
|
61
59
|
raise ValueError("AZURE_SPEECH_REGION must be set")
|
|
62
60
|
|
|
63
61
|
self._opts = _TTSOptions(
|
|
64
|
-
speech_key=speech_key,
|
|
65
|
-
speech_region=speech_region,
|
|
66
|
-
voice=voice,
|
|
62
|
+
speech_key=speech_key, speech_region=speech_region, voice=voice
|
|
67
63
|
)
|
|
68
64
|
|
|
69
|
-
def synthesize(
|
|
70
|
-
self,
|
|
71
|
-
text: str,
|
|
72
|
-
) -> "ChunkedStream":
|
|
65
|
+
def synthesize(self, text: str) -> "ChunkedStream":
|
|
73
66
|
return ChunkedStream(text, self._opts)
|
|
74
67
|
|
|
75
68
|
|
|
76
69
|
class ChunkedStream(tts.ChunkedStream):
|
|
77
70
|
def __init__(self, text: str, opts: _TTSOptions) -> None:
|
|
78
|
-
|
|
79
|
-
self._text = text
|
|
80
|
-
self._main_task: asyncio.Task | None = None
|
|
81
|
-
self._queue = asyncio.Queue[Optional[tts.SynthesizedAudio]]()
|
|
71
|
+
super().__init__()
|
|
72
|
+
self._text, self._opts = text, opts
|
|
82
73
|
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
74
|
+
@utils.log_exceptions()
|
|
75
|
+
async def _main_task(self):
|
|
76
|
+
stream_callback = _PushAudioOutputStreamCallback(
|
|
77
|
+
asyncio.get_running_loop(), self._event_ch
|
|
78
|
+
)
|
|
79
|
+
synthesizer = _create_speech_synthesizer(
|
|
80
|
+
config=self._opts,
|
|
81
|
+
stream=speechsdk.audio.PushAudioOutputStream(stream_callback),
|
|
82
|
+
)
|
|
92
83
|
|
|
93
|
-
|
|
94
|
-
|
|
84
|
+
def _synthesize() -> speechsdk.SpeechSynthesisResult:
|
|
85
|
+
return synthesizer.speak_text_async(self._text).get() # type: ignore
|
|
95
86
|
|
|
87
|
+
try:
|
|
96
88
|
result = await asyncio.to_thread(_synthesize)
|
|
97
89
|
if result.reason != speechsdk.ResultReason.SynthesizingAudioCompleted:
|
|
98
90
|
raise ValueError(
|
|
99
91
|
f"failed to synthesize audio: {result.reason} {result.cancellation_details}"
|
|
100
92
|
)
|
|
93
|
+
finally:
|
|
101
94
|
|
|
102
95
|
def _cleanup() -> None:
|
|
103
96
|
nonlocal synthesizer, result
|
|
@@ -106,28 +99,32 @@ class ChunkedStream(tts.ChunkedStream):
|
|
|
106
99
|
|
|
107
100
|
await asyncio.to_thread(_cleanup)
|
|
108
101
|
|
|
109
|
-
except Exception:
|
|
110
|
-
logger.exception("failed to synthesize")
|
|
111
|
-
finally:
|
|
112
|
-
self._queue.put_nowait(None)
|
|
113
|
-
|
|
114
|
-
async def __anext__(self) -> tts.SynthesizedAudio:
|
|
115
|
-
if not self._main_task:
|
|
116
|
-
self._main_task = asyncio.create_task(self._run())
|
|
117
102
|
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
103
|
+
class _PushAudioOutputStreamCallback(speechsdk.audio.PushAudioOutputStreamCallback):
|
|
104
|
+
def __init__(
|
|
105
|
+
self,
|
|
106
|
+
loop: asyncio.AbstractEventLoop,
|
|
107
|
+
event_ch: utils.aio.ChanSender[tts.SynthesizedAudio],
|
|
108
|
+
):
|
|
109
|
+
super().__init__()
|
|
110
|
+
self._event_ch = event_ch
|
|
111
|
+
self._loop = loop
|
|
112
|
+
self._request_id = utils.shortuuid()
|
|
113
|
+
self._segment_id = utils.shortuuid()
|
|
127
114
|
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
115
|
+
def write(self, audio_buffer: memoryview) -> int:
|
|
116
|
+
audio = tts.SynthesizedAudio(
|
|
117
|
+
request_id=self._request_id,
|
|
118
|
+
segment_id=self._segment_id,
|
|
119
|
+
frame=rtc.AudioFrame(
|
|
120
|
+
data=audio_buffer,
|
|
121
|
+
sample_rate=AZURE_SAMPLE_RATE,
|
|
122
|
+
num_channels=AZURE_NUM_CHANNELS,
|
|
123
|
+
samples_per_channel=audio_buffer.nbytes // 2,
|
|
124
|
+
),
|
|
125
|
+
)
|
|
126
|
+
self._loop.call_soon_threadsafe(self._event_ch.send_nowait, audio)
|
|
127
|
+
return audio_buffer.nbytes
|
|
131
128
|
|
|
132
129
|
|
|
133
130
|
def _create_speech_synthesizer(
|
|
@@ -143,26 +140,3 @@ def _create_speech_synthesizer(
|
|
|
143
140
|
return speechsdk.SpeechSynthesizer(
|
|
144
141
|
speech_config=speech_config, audio_config=stream_config
|
|
145
142
|
)
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
class _PushAudioOutputStreamCallback(speechsdk.audio.PushAudioOutputStreamCallback):
|
|
149
|
-
def __init__(
|
|
150
|
-
self,
|
|
151
|
-
loop: asyncio.AbstractEventLoop,
|
|
152
|
-
event_queue: asyncio.Queue[tts.SynthesizedAudio | None],
|
|
153
|
-
):
|
|
154
|
-
super().__init__()
|
|
155
|
-
self._event_queue = event_queue
|
|
156
|
-
self._loop = loop
|
|
157
|
-
|
|
158
|
-
def write(self, audio_buffer: memoryview) -> int:
|
|
159
|
-
audio_frame = rtc.AudioFrame(
|
|
160
|
-
data=audio_buffer,
|
|
161
|
-
sample_rate=AZURE_SAMPLE_RATE,
|
|
162
|
-
num_channels=AZURE_NUM_CHANNELS,
|
|
163
|
-
samples_per_channel=audio_buffer.nbytes // 2,
|
|
164
|
-
)
|
|
165
|
-
|
|
166
|
-
audio = tts.SynthesizedAudio(text="", data=audio_frame)
|
|
167
|
-
self._loop.call_soon_threadsafe(self._event_queue.put_nowait, audio)
|
|
168
|
-
return audio_buffer.nbytes
|
{livekit_plugins_azure-0.2.1 → livekit_plugins_azure-0.3.0.dev1}/livekit/plugins/azure/version.py
RENAMED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
# Copyright 2024 LiveKit, Inc.
|
|
2
|
+
#
|
|
1
3
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
2
4
|
# you may not use this file except in compliance with the License.
|
|
3
5
|
# You may obtain a copy of the License at
|
|
@@ -10,4 +12,4 @@
|
|
|
10
12
|
# See the License for the specific language governing permissions and
|
|
11
13
|
# limitations under the License.
|
|
12
14
|
|
|
13
|
-
__version__ = "0.2.1"
|
|
15
|
+
__version__ = "0.3.0-dev.1"
|
|
File without changes
|
{livekit_plugins_azure-0.2.1 → livekit_plugins_azure-0.3.0.dev1}/livekit/plugins/azure/log.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|