PyPI - livekit-plugins-azure - Versions diffs - 0.2.1__tar.gz → 0.3.0.dev1__tar.gz - Mend

livekit-plugins-azure 0.2.1.tar.gz → 0.3.0.dev1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16)

{livekit_plugins_azure-0.2.1 → livekit_plugins_azure-0.3.0.dev1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: livekit-plugins-azure
-Version: 0.2.1
+Version: 0.3.0.dev1
 Summary: Agent Framework plugin for services from Azure
 Home-page: https://github.com/livekit/agents
 License: Apache-2.0

{livekit_plugins_azure-0.2.1 → livekit_plugins_azure-0.3.0.dev1}/livekit/plugins/azure/__init__.py RENAMED Viewed

@@ -14,12 +14,7 @@ from .stt import STT, SpeechStream
 from .tts import TTS
 from .version import __version__
-__all__ = [
-    "STT",
-    "SpeechStream",
-    "TTS",
-    "__version__",
-]
+__all__ = ["STT", "SpeechStream", "TTS", "__version__"]
 from livekit.agents import Plugin

livekit_plugins_azure-0.3.0.dev1/livekit/plugins/azure/py.typed ADDED Viewed

File without changes

{livekit_plugins_azure-0.2.1 → livekit_plugins_azure-0.3.0.dev1}/livekit/plugins/azure/stt.py RENAMED Viewed

@@ -15,13 +15,10 @@ from __future__ import annotations
 import asyncio
 import os
 from dataclasses import dataclass
-from typing import Optional
-from livekit import rtc
-from livekit.agents import stt
-from livekit.agents.utils import AudioBuffer
+from livekit.agents import stt, utils
-import azure.cognitiveservices.speech as speechsdk
+import azure.cognitiveservices.speech as speechsdk  # type: ignore
 from .log import logger
@@ -47,7 +44,9 @@ class STT(stt.STT):
         num_channels: int = 1,
         languages: list[str] = [],  # when empty, auto-detect the language
     ):
-        super().__init__(streaming_supported=True)
+        super().__init__(
+            capabilities=stt.STTCapabilities(streaming=True, interim_results=True)
+        )
         speech_key = speech_key or os.environ.get("AZURE_SPEECH_KEY")
         if not speech_key:
@@ -66,18 +65,11 @@ class STT(stt.STT):
         )
     async def recognize(
-        self,
-        *,
-        buffer: AudioBuffer,
-        language: str | None = None,
+        self, buffer: utils.AudioBuffer, *, language: str | None = None
     ) -> stt.SpeechEvent:
         raise NotImplementedError("Azure STT does not support single frame recognition")
-    def stream(
-        self,
-        *,
-        language: str | None = None,
-    ) -> "SpeechStream":
+    def stream(self, *, language: str | None = None) -> "SpeechStream":
         return SpeechStream(self._config)
@@ -85,8 +77,6 @@ class SpeechStream(stt.SpeechStream):
     def __init__(self, opts: STTOptions) -> None:
         super().__init__()
         self._opts = opts
-        self._event_queue = asyncio.Queue[Optional[stt.SpeechEvent]]()
-        self._closed = False
         self._speaking = False
         self._stream = speechsdk.audio.PushAudioInputStream(
@@ -108,26 +98,21 @@ class SpeechStream(stt.SpeechStream):
         self._done_event = asyncio.Event()
         self._loop = asyncio.get_running_loop()
-    def push_frame(self, frame: rtc.AudioFrame) -> None:
-        if self._closed:
-            raise ValueError("cannot push frame to closed stream")
-        self._stream.write(frame.data.tobytes())
+    @utils.log_exceptions(logger=logger)
+    async def _main_task(self) -> None:
+        try:
+            async for input in self._input_ch:
+                self._stream.write(input.data.tobytes())
-    async def aclose(self, *, wait: bool = True) -> None:
-        if self._closed:
-            return
-        self._closed = True
-        self._stream.close()
-        await self._done_event.wait()
+            self._stream.close()
+            await self._done_event.wait()
+        finally:
-        def _cleanup():
-            self._recognizer.stop_continuous_recognition()
-            del self._recognizer
+            def _cleanup():
+                self._recognizer.stop_continuous_recognition()
+                del self._recognizer
-        await asyncio.to_thread(_cleanup)
+            await asyncio.to_thread(_cleanup)
     def _on_recognized(self, evt: speechsdk.SpeechRecognitionEventArgs):
         detected_lg = speechsdk.AutoDetectSourceLanguageResult(evt.result).language
@@ -136,15 +121,12 @@ class SpeechStream(stt.SpeechStream):
             return
         final_data = stt.SpeechData(
-            language=detected_lg,
-            confidence=1.0,
-            text=evt.result.text,
+            language=detected_lg, confidence=1.0, text=evt.result.text
         )
-        self._threadsafe_put(
+        self._threadsafe_send(
             stt.SpeechEvent(
-                type=stt.SpeechEventType.FINAL_TRANSCRIPT,
-                alternatives=[final_data],
+                type=stt.SpeechEventType.FINAL_TRANSCRIPT, alternatives=[final_data]
             )
         )
@@ -155,15 +137,12 @@ class SpeechStream(stt.SpeechStream):
             return
         interim_data = stt.SpeechData(
-            language=detected_lg,
-            confidence=0.0,
-            text=evt.result.text,
+            language=detected_lg, confidence=0.0, text=evt.result.text
         )
-        self._threadsafe_put(
+        self._threadsafe_send(
             stt.SpeechEvent(
-                type=stt.SpeechEventType.INTERIM_TRANSCRIPT,
-                alternatives=[interim_data],
+                type=stt.SpeechEventType.INTERIM_TRANSCRIPT, alternatives=[interim_data]
             )
         )
@@ -172,31 +151,20 @@ class SpeechStream(stt.SpeechStream):
             return
         self._speaking = True
-        self._threadsafe_put(stt.SpeechEvent(type=stt.SpeechEventType.START_OF_SPEECH))
+        self._threadsafe_send(stt.SpeechEvent(type=stt.SpeechEventType.START_OF_SPEECH))
     def _on_speech_end(self, evt: speechsdk.SpeechRecognitionEventArgs):
         if not self._speaking:
             return
         self._speaking = False
-        self._threadsafe_put(stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH))
+        self._threadsafe_send(stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH))
     def _on_session_stopped(self, evt: speechsdk.SpeechRecognitionEventArgs):
-        if not self._closed:
-            logger.error("session stopped unexpectedly")
         self._loop.call_soon_threadsafe(self._done_event.set)
-        self._threadsafe_put(None)
-    def _threadsafe_put(self, evt: stt.SpeechEvent | None):
-        self._loop.call_soon_threadsafe(self._event_queue.put_nowait, evt)
-    async def __anext__(self) -> stt.SpeechEvent:
-        evt = await self._event_queue.get()
-        if evt is None:
-            raise StopAsyncIteration
-        return evt
+    def _threadsafe_send(self, evt: stt.SpeechEvent | None):
+        self._loop.call_soon_threadsafe(self._event_ch.send_nowait, evt)
 def _create_speech_recognizer(

{livekit_plugins_azure-0.2.1 → livekit_plugins_azure-0.3.0.dev1}/livekit/plugins/azure/tts.py RENAMED Viewed

@@ -13,17 +13,13 @@
 from __future__ import annotations
 import asyncio
-import contextlib
 import os
 from dataclasses import dataclass
-from typing import Optional
 from livekit import rtc
-from livekit.agents import tts
+from livekit.agents import tts, utils
-import azure.cognitiveservices.speech as speechsdk
-from .log import logger
+import azure.cognitiveservices.speech as speechsdk  # type: ignore
 AZURE_SAMPLE_RATE: int = 16000
 AZURE_BITS_PER_SAMPLE: int = 16
@@ -47,7 +43,9 @@ class TTS(tts.TTS):
         voice: str | None = None,
     ) -> None:
         super().__init__(
-            streaming_supported=False,
+            capabilities=tts.TTSCapabilities(
+                streaming=False,
+            ),
             sample_rate=AZURE_SAMPLE_RATE,
             num_channels=AZURE_NUM_CHANNELS,
         )
@@ -61,43 +59,38 @@ class TTS(tts.TTS):
             raise ValueError("AZURE_SPEECH_REGION must be set")
         self._opts = _TTSOptions(
-            speech_key=speech_key,
-            speech_region=speech_region,
-            voice=voice,
+            speech_key=speech_key, speech_region=speech_region, voice=voice
         )
-    def synthesize(
-        self,
-        text: str,
-    ) -> "ChunkedStream":
+    def synthesize(self, text: str) -> "ChunkedStream":
         return ChunkedStream(text, self._opts)
 class ChunkedStream(tts.ChunkedStream):
     def __init__(self, text: str, opts: _TTSOptions) -> None:
-        self._opts = opts
-        self._text = text
-        self._main_task: asyncio.Task | None = None
-        self._queue = asyncio.Queue[Optional[tts.SynthesizedAudio]]()
+        super().__init__()
+        self._text, self._opts = text, opts
-    async def _run(self):
-        try:
-            stream_callback = _PushAudioOutputStreamCallback(
-                asyncio.get_running_loop(), self._queue
-            )
-            push_stream = speechsdk.audio.PushAudioOutputStream(stream_callback)
-            synthesizer = _create_speech_synthesizer(
-                config=self._opts, stream=push_stream
-            )
+    @utils.log_exceptions()
+    async def _main_task(self):
+        stream_callback = _PushAudioOutputStreamCallback(
+            asyncio.get_running_loop(), self._event_ch
+        )
+        synthesizer = _create_speech_synthesizer(
+            config=self._opts,
+            stream=speechsdk.audio.PushAudioOutputStream(stream_callback),
+        )
-            def _synthesize() -> speechsdk.SpeechSynthesisResult:
-                return synthesizer.speak_text_async(self._text).get()  # type: ignore
+        def _synthesize() -> speechsdk.SpeechSynthesisResult:
+            return synthesizer.speak_text_async(self._text).get()  # type: ignore
+        try:
             result = await asyncio.to_thread(_synthesize)
             if result.reason != speechsdk.ResultReason.SynthesizingAudioCompleted:
                 raise ValueError(
                     f"failed to synthesize audio: {result.reason} {result.cancellation_details}"
                 )
+        finally:
             def _cleanup() -> None:
                 nonlocal synthesizer, result
@@ -106,28 +99,32 @@ class ChunkedStream(tts.ChunkedStream):
             await asyncio.to_thread(_cleanup)
-        except Exception:
-            logger.exception("failed to synthesize")
-        finally:
-            self._queue.put_nowait(None)
-    async def __anext__(self) -> tts.SynthesizedAudio:
-        if not self._main_task:
-            self._main_task = asyncio.create_task(self._run())
-        frame = await self._queue.get()
-        if frame is None:
-            raise StopAsyncIteration
-        return frame
-    async def aclose(self) -> None:
-        if not self._main_task:
-            return
+class _PushAudioOutputStreamCallback(speechsdk.audio.PushAudioOutputStreamCallback):
+    def __init__(
+        self,
+        loop: asyncio.AbstractEventLoop,
+        event_ch: utils.aio.ChanSender[tts.SynthesizedAudio],
+    ):
+        super().__init__()
+        self._event_ch = event_ch
+        self._loop = loop
+        self._request_id = utils.shortuuid()
+        self._segment_id = utils.shortuuid()
-        self._main_task.cancel()
-        with contextlib.suppress(asyncio.CancelledError):
-            await self._main_task
+    def write(self, audio_buffer: memoryview) -> int:
+        audio = tts.SynthesizedAudio(
+            request_id=self._request_id,
+            segment_id=self._segment_id,
+            frame=rtc.AudioFrame(
+                data=audio_buffer,
+                sample_rate=AZURE_SAMPLE_RATE,
+                num_channels=AZURE_NUM_CHANNELS,
+                samples_per_channel=audio_buffer.nbytes // 2,
+            ),
+        )
+        self._loop.call_soon_threadsafe(self._event_ch.send_nowait, audio)
+        return audio_buffer.nbytes
 def _create_speech_synthesizer(
@@ -143,26 +140,3 @@ def _create_speech_synthesizer(
     return speechsdk.SpeechSynthesizer(
         speech_config=speech_config, audio_config=stream_config
     )
-class _PushAudioOutputStreamCallback(speechsdk.audio.PushAudioOutputStreamCallback):
-    def __init__(
-        self,
-        loop: asyncio.AbstractEventLoop,
-        event_queue: asyncio.Queue[tts.SynthesizedAudio | None],
-    ):
-        super().__init__()
-        self._event_queue = event_queue
-        self._loop = loop
-    def write(self, audio_buffer: memoryview) -> int:
-        audio_frame = rtc.AudioFrame(
-            data=audio_buffer,
-            sample_rate=AZURE_SAMPLE_RATE,
-            num_channels=AZURE_NUM_CHANNELS,
-            samples_per_channel=audio_buffer.nbytes // 2,
-        )
-        audio = tts.SynthesizedAudio(text="", data=audio_frame)
-        self._loop.call_soon_threadsafe(self._event_queue.put_nowait, audio)
-        return audio_buffer.nbytes

{livekit_plugins_azure-0.2.1 → livekit_plugins_azure-0.3.0.dev1}/livekit/plugins/azure/version.py RENAMED Viewed

@@ -1,3 +1,5 @@
+# Copyright 2024 LiveKit, Inc.
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -10,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-__version__ = "0.2.1"
+__version__ = "0.3.0-dev.1"

{livekit_plugins_azure-0.2.1 → livekit_plugins_azure-0.3.0.dev1}/livekit_plugins_azure.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: livekit-plugins-azure
-Version: 0.2.1
+Version: 0.3.0.dev1
 Summary: Agent Framework plugin for services from Azure
 Home-page: https://github.com/livekit/agents
 License: Apache-2.0

{livekit_plugins_azure-0.2.1 → livekit_plugins_azure-0.3.0.dev1}/livekit_plugins_azure.egg-info/SOURCES.txt RENAMED Viewed

@@ -3,6 +3,7 @@ pyproject.toml
 setup.py
 livekit/plugins/azure/__init__.py
 livekit/plugins/azure/log.py
+livekit/plugins/azure/py.typed
 livekit/plugins/azure/stt.py
 livekit/plugins/azure/tts.py
 livekit/plugins/azure/version.py