livekit-plugins-azure 0.2.1__tar.gz → 0.3.0.dev1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (16)
  1. {livekit_plugins_azure-0.2.1 → livekit_plugins_azure-0.3.0.dev1}/PKG-INFO +1 -1
  2. {livekit_plugins_azure-0.2.1 → livekit_plugins_azure-0.3.0.dev1}/livekit/plugins/azure/__init__.py +1 -6
  3. livekit_plugins_azure-0.3.0.dev1/livekit/plugins/azure/py.typed +0 -0
  4. {livekit_plugins_azure-0.2.1 → livekit_plugins_azure-0.3.0.dev1}/livekit/plugins/azure/stt.py +29 -61
  5. {livekit_plugins_azure-0.2.1 → livekit_plugins_azure-0.3.0.dev1}/livekit/plugins/azure/tts.py +46 -72
  6. {livekit_plugins_azure-0.2.1 → livekit_plugins_azure-0.3.0.dev1}/livekit/plugins/azure/version.py +3 -1
  7. {livekit_plugins_azure-0.2.1 → livekit_plugins_azure-0.3.0.dev1}/livekit_plugins_azure.egg-info/PKG-INFO +1 -1
  8. {livekit_plugins_azure-0.2.1 → livekit_plugins_azure-0.3.0.dev1}/livekit_plugins_azure.egg-info/SOURCES.txt +1 -0
  9. {livekit_plugins_azure-0.2.1 → livekit_plugins_azure-0.3.0.dev1}/README.md +0 -0
  10. {livekit_plugins_azure-0.2.1 → livekit_plugins_azure-0.3.0.dev1}/livekit/plugins/azure/log.py +0 -0
  11. {livekit_plugins_azure-0.2.1 → livekit_plugins_azure-0.3.0.dev1}/livekit_plugins_azure.egg-info/dependency_links.txt +0 -0
  12. {livekit_plugins_azure-0.2.1 → livekit_plugins_azure-0.3.0.dev1}/livekit_plugins_azure.egg-info/requires.txt +0 -0
  13. {livekit_plugins_azure-0.2.1 → livekit_plugins_azure-0.3.0.dev1}/livekit_plugins_azure.egg-info/top_level.txt +0 -0
  14. {livekit_plugins_azure-0.2.1 → livekit_plugins_azure-0.3.0.dev1}/pyproject.toml +0 -0
  15. {livekit_plugins_azure-0.2.1 → livekit_plugins_azure-0.3.0.dev1}/setup.cfg +0 -0
  16. {livekit_plugins_azure-0.2.1 → livekit_plugins_azure-0.3.0.dev1}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: livekit-plugins-azure
3
- Version: 0.2.1
3
+ Version: 0.3.0.dev1
4
4
  Summary: Agent Framework plugin for services from Azure
5
5
  Home-page: https://github.com/livekit/agents
6
6
  License: Apache-2.0
@@ -14,12 +14,7 @@ from .stt import STT, SpeechStream
14
14
  from .tts import TTS
15
15
  from .version import __version__
16
16
 
17
- __all__ = [
18
- "STT",
19
- "SpeechStream",
20
- "TTS",
21
- "__version__",
22
- ]
17
+ __all__ = ["STT", "SpeechStream", "TTS", "__version__"]
23
18
 
24
19
  from livekit.agents import Plugin
25
20
 
@@ -15,13 +15,10 @@ from __future__ import annotations
15
15
  import asyncio
16
16
  import os
17
17
  from dataclasses import dataclass
18
- from typing import Optional
19
18
 
20
- from livekit import rtc
21
- from livekit.agents import stt
22
- from livekit.agents.utils import AudioBuffer
19
+ from livekit.agents import stt, utils
23
20
 
24
- import azure.cognitiveservices.speech as speechsdk
21
+ import azure.cognitiveservices.speech as speechsdk # type: ignore
25
22
 
26
23
  from .log import logger
27
24
 
@@ -47,7 +44,9 @@ class STT(stt.STT):
47
44
  num_channels: int = 1,
48
45
  languages: list[str] = [], # when empty, auto-detect the language
49
46
  ):
50
- super().__init__(streaming_supported=True)
47
+ super().__init__(
48
+ capabilities=stt.STTCapabilities(streaming=True, interim_results=True)
49
+ )
51
50
 
52
51
  speech_key = speech_key or os.environ.get("AZURE_SPEECH_KEY")
53
52
  if not speech_key:
@@ -66,18 +65,11 @@ class STT(stt.STT):
66
65
  )
67
66
 
68
67
  async def recognize(
69
- self,
70
- *,
71
- buffer: AudioBuffer,
72
- language: str | None = None,
68
+ self, buffer: utils.AudioBuffer, *, language: str | None = None
73
69
  ) -> stt.SpeechEvent:
74
70
  raise NotImplementedError("Azure STT does not support single frame recognition")
75
71
 
76
- def stream(
77
- self,
78
- *,
79
- language: str | None = None,
80
- ) -> "SpeechStream":
72
+ def stream(self, *, language: str | None = None) -> "SpeechStream":
81
73
  return SpeechStream(self._config)
82
74
 
83
75
 
@@ -85,8 +77,6 @@ class SpeechStream(stt.SpeechStream):
85
77
  def __init__(self, opts: STTOptions) -> None:
86
78
  super().__init__()
87
79
  self._opts = opts
88
- self._event_queue = asyncio.Queue[Optional[stt.SpeechEvent]]()
89
- self._closed = False
90
80
  self._speaking = False
91
81
 
92
82
  self._stream = speechsdk.audio.PushAudioInputStream(
@@ -108,26 +98,21 @@ class SpeechStream(stt.SpeechStream):
108
98
  self._done_event = asyncio.Event()
109
99
  self._loop = asyncio.get_running_loop()
110
100
 
111
- def push_frame(self, frame: rtc.AudioFrame) -> None:
112
- if self._closed:
113
- raise ValueError("cannot push frame to closed stream")
114
-
115
- self._stream.write(frame.data.tobytes())
101
+ @utils.log_exceptions(logger=logger)
102
+ async def _main_task(self) -> None:
103
+ try:
104
+ async for input in self._input_ch:
105
+ self._stream.write(input.data.tobytes())
116
106
 
117
- async def aclose(self, *, wait: bool = True) -> None:
118
- if self._closed:
119
- return
120
-
121
- self._closed = True
122
- self._stream.close()
123
-
124
- await self._done_event.wait()
107
+ self._stream.close()
108
+ await self._done_event.wait()
109
+ finally:
125
110
 
126
- def _cleanup():
127
- self._recognizer.stop_continuous_recognition()
128
- del self._recognizer
111
+ def _cleanup():
112
+ self._recognizer.stop_continuous_recognition()
113
+ del self._recognizer
129
114
 
130
- await asyncio.to_thread(_cleanup)
115
+ await asyncio.to_thread(_cleanup)
131
116
 
132
117
  def _on_recognized(self, evt: speechsdk.SpeechRecognitionEventArgs):
133
118
  detected_lg = speechsdk.AutoDetectSourceLanguageResult(evt.result).language
@@ -136,15 +121,12 @@ class SpeechStream(stt.SpeechStream):
136
121
  return
137
122
 
138
123
  final_data = stt.SpeechData(
139
- language=detected_lg,
140
- confidence=1.0,
141
- text=evt.result.text,
124
+ language=detected_lg, confidence=1.0, text=evt.result.text
142
125
  )
143
126
 
144
- self._threadsafe_put(
127
+ self._threadsafe_send(
145
128
  stt.SpeechEvent(
146
- type=stt.SpeechEventType.FINAL_TRANSCRIPT,
147
- alternatives=[final_data],
129
+ type=stt.SpeechEventType.FINAL_TRANSCRIPT, alternatives=[final_data]
148
130
  )
149
131
  )
150
132
 
@@ -155,15 +137,12 @@ class SpeechStream(stt.SpeechStream):
155
137
  return
156
138
 
157
139
  interim_data = stt.SpeechData(
158
- language=detected_lg,
159
- confidence=0.0,
160
- text=evt.result.text,
140
+ language=detected_lg, confidence=0.0, text=evt.result.text
161
141
  )
162
142
 
163
- self._threadsafe_put(
143
+ self._threadsafe_send(
164
144
  stt.SpeechEvent(
165
- type=stt.SpeechEventType.INTERIM_TRANSCRIPT,
166
- alternatives=[interim_data],
145
+ type=stt.SpeechEventType.INTERIM_TRANSCRIPT, alternatives=[interim_data]
167
146
  )
168
147
  )
169
148
 
@@ -172,31 +151,20 @@ class SpeechStream(stt.SpeechStream):
172
151
  return
173
152
 
174
153
  self._speaking = True
175
- self._threadsafe_put(stt.SpeechEvent(type=stt.SpeechEventType.START_OF_SPEECH))
154
+ self._threadsafe_send(stt.SpeechEvent(type=stt.SpeechEventType.START_OF_SPEECH))
176
155
 
177
156
  def _on_speech_end(self, evt: speechsdk.SpeechRecognitionEventArgs):
178
157
  if not self._speaking:
179
158
  return
180
159
 
181
160
  self._speaking = False
182
- self._threadsafe_put(stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH))
161
+ self._threadsafe_send(stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH))
183
162
 
184
163
  def _on_session_stopped(self, evt: speechsdk.SpeechRecognitionEventArgs):
185
- if not self._closed:
186
- logger.error("session stopped unexpectedly")
187
-
188
164
  self._loop.call_soon_threadsafe(self._done_event.set)
189
- self._threadsafe_put(None)
190
-
191
- def _threadsafe_put(self, evt: stt.SpeechEvent | None):
192
- self._loop.call_soon_threadsafe(self._event_queue.put_nowait, evt)
193
-
194
- async def __anext__(self) -> stt.SpeechEvent:
195
- evt = await self._event_queue.get()
196
- if evt is None:
197
- raise StopAsyncIteration
198
165
 
199
- return evt
166
+ def _threadsafe_send(self, evt: stt.SpeechEvent | None):
167
+ self._loop.call_soon_threadsafe(self._event_ch.send_nowait, evt)
200
168
 
201
169
 
202
170
  def _create_speech_recognizer(
@@ -13,17 +13,13 @@
13
13
  from __future__ import annotations
14
14
 
15
15
  import asyncio
16
- import contextlib
17
16
  import os
18
17
  from dataclasses import dataclass
19
- from typing import Optional
20
18
 
21
19
  from livekit import rtc
22
- from livekit.agents import tts
20
+ from livekit.agents import tts, utils
23
21
 
24
- import azure.cognitiveservices.speech as speechsdk
25
-
26
- from .log import logger
22
+ import azure.cognitiveservices.speech as speechsdk # type: ignore
27
23
 
28
24
  AZURE_SAMPLE_RATE: int = 16000
29
25
  AZURE_BITS_PER_SAMPLE: int = 16
@@ -47,7 +43,9 @@ class TTS(tts.TTS):
47
43
  voice: str | None = None,
48
44
  ) -> None:
49
45
  super().__init__(
50
- streaming_supported=False,
46
+ capabilities=tts.TTSCapabilities(
47
+ streaming=False,
48
+ ),
51
49
  sample_rate=AZURE_SAMPLE_RATE,
52
50
  num_channels=AZURE_NUM_CHANNELS,
53
51
  )
@@ -61,43 +59,38 @@ class TTS(tts.TTS):
61
59
  raise ValueError("AZURE_SPEECH_REGION must be set")
62
60
 
63
61
  self._opts = _TTSOptions(
64
- speech_key=speech_key,
65
- speech_region=speech_region,
66
- voice=voice,
62
+ speech_key=speech_key, speech_region=speech_region, voice=voice
67
63
  )
68
64
 
69
- def synthesize(
70
- self,
71
- text: str,
72
- ) -> "ChunkedStream":
65
+ def synthesize(self, text: str) -> "ChunkedStream":
73
66
  return ChunkedStream(text, self._opts)
74
67
 
75
68
 
76
69
  class ChunkedStream(tts.ChunkedStream):
77
70
  def __init__(self, text: str, opts: _TTSOptions) -> None:
78
- self._opts = opts
79
- self._text = text
80
- self._main_task: asyncio.Task | None = None
81
- self._queue = asyncio.Queue[Optional[tts.SynthesizedAudio]]()
71
+ super().__init__()
72
+ self._text, self._opts = text, opts
82
73
 
83
- async def _run(self):
84
- try:
85
- stream_callback = _PushAudioOutputStreamCallback(
86
- asyncio.get_running_loop(), self._queue
87
- )
88
- push_stream = speechsdk.audio.PushAudioOutputStream(stream_callback)
89
- synthesizer = _create_speech_synthesizer(
90
- config=self._opts, stream=push_stream
91
- )
74
+ @utils.log_exceptions()
75
+ async def _main_task(self):
76
+ stream_callback = _PushAudioOutputStreamCallback(
77
+ asyncio.get_running_loop(), self._event_ch
78
+ )
79
+ synthesizer = _create_speech_synthesizer(
80
+ config=self._opts,
81
+ stream=speechsdk.audio.PushAudioOutputStream(stream_callback),
82
+ )
92
83
 
93
- def _synthesize() -> speechsdk.SpeechSynthesisResult:
94
- return synthesizer.speak_text_async(self._text).get() # type: ignore
84
+ def _synthesize() -> speechsdk.SpeechSynthesisResult:
85
+ return synthesizer.speak_text_async(self._text).get() # type: ignore
95
86
 
87
+ try:
96
88
  result = await asyncio.to_thread(_synthesize)
97
89
  if result.reason != speechsdk.ResultReason.SynthesizingAudioCompleted:
98
90
  raise ValueError(
99
91
  f"failed to synthesize audio: {result.reason} {result.cancellation_details}"
100
92
  )
93
+ finally:
101
94
 
102
95
  def _cleanup() -> None:
103
96
  nonlocal synthesizer, result
@@ -106,28 +99,32 @@ class ChunkedStream(tts.ChunkedStream):
106
99
 
107
100
  await asyncio.to_thread(_cleanup)
108
101
 
109
- except Exception:
110
- logger.exception("failed to synthesize")
111
- finally:
112
- self._queue.put_nowait(None)
113
-
114
- async def __anext__(self) -> tts.SynthesizedAudio:
115
- if not self._main_task:
116
- self._main_task = asyncio.create_task(self._run())
117
102
 
118
- frame = await self._queue.get()
119
- if frame is None:
120
- raise StopAsyncIteration
121
-
122
- return frame
123
-
124
- async def aclose(self) -> None:
125
- if not self._main_task:
126
- return
103
+ class _PushAudioOutputStreamCallback(speechsdk.audio.PushAudioOutputStreamCallback):
104
+ def __init__(
105
+ self,
106
+ loop: asyncio.AbstractEventLoop,
107
+ event_ch: utils.aio.ChanSender[tts.SynthesizedAudio],
108
+ ):
109
+ super().__init__()
110
+ self._event_ch = event_ch
111
+ self._loop = loop
112
+ self._request_id = utils.shortuuid()
113
+ self._segment_id = utils.shortuuid()
127
114
 
128
- self._main_task.cancel()
129
- with contextlib.suppress(asyncio.CancelledError):
130
- await self._main_task
115
+ def write(self, audio_buffer: memoryview) -> int:
116
+ audio = tts.SynthesizedAudio(
117
+ request_id=self._request_id,
118
+ segment_id=self._segment_id,
119
+ frame=rtc.AudioFrame(
120
+ data=audio_buffer,
121
+ sample_rate=AZURE_SAMPLE_RATE,
122
+ num_channels=AZURE_NUM_CHANNELS,
123
+ samples_per_channel=audio_buffer.nbytes // 2,
124
+ ),
125
+ )
126
+ self._loop.call_soon_threadsafe(self._event_ch.send_nowait, audio)
127
+ return audio_buffer.nbytes
131
128
 
132
129
 
133
130
  def _create_speech_synthesizer(
@@ -143,26 +140,3 @@ def _create_speech_synthesizer(
143
140
  return speechsdk.SpeechSynthesizer(
144
141
  speech_config=speech_config, audio_config=stream_config
145
142
  )
146
-
147
-
148
- class _PushAudioOutputStreamCallback(speechsdk.audio.PushAudioOutputStreamCallback):
149
- def __init__(
150
- self,
151
- loop: asyncio.AbstractEventLoop,
152
- event_queue: asyncio.Queue[tts.SynthesizedAudio | None],
153
- ):
154
- super().__init__()
155
- self._event_queue = event_queue
156
- self._loop = loop
157
-
158
- def write(self, audio_buffer: memoryview) -> int:
159
- audio_frame = rtc.AudioFrame(
160
- data=audio_buffer,
161
- sample_rate=AZURE_SAMPLE_RATE,
162
- num_channels=AZURE_NUM_CHANNELS,
163
- samples_per_channel=audio_buffer.nbytes // 2,
164
- )
165
-
166
- audio = tts.SynthesizedAudio(text="", data=audio_frame)
167
- self._loop.call_soon_threadsafe(self._event_queue.put_nowait, audio)
168
- return audio_buffer.nbytes
@@ -1,3 +1,5 @@
1
+ # Copyright 2024 LiveKit, Inc.
2
+ #
1
3
  # Licensed under the Apache License, Version 2.0 (the "License");
2
4
  # you may not use this file except in compliance with the License.
3
5
  # You may obtain a copy of the License at
@@ -10,4 +12,4 @@
10
12
  # See the License for the specific language governing permissions and
11
13
  # limitations under the License.
12
14
 
13
- __version__ = "0.2.1"
15
+ __version__ = "0.3.0-dev.1"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: livekit-plugins-azure
3
- Version: 0.2.1
3
+ Version: 0.3.0.dev1
4
4
  Summary: Agent Framework plugin for services from Azure
5
5
  Home-page: https://github.com/livekit/agents
6
6
  License: Apache-2.0
@@ -3,6 +3,7 @@ pyproject.toml
3
3
  setup.py
4
4
  livekit/plugins/azure/__init__.py
5
5
  livekit/plugins/azure/log.py
6
+ livekit/plugins/azure/py.typed
6
7
  livekit/plugins/azure/stt.py
7
8
  livekit/plugins/azure/tts.py
8
9
  livekit/plugins/azure/version.py