PyPI - livekit-plugins-azure - Versions diffs - 0.2.0__tar.gz - Mend

livekit-plugins-azure 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

livekit_plugins_azure-0.2.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,38 @@
+Metadata-Version: 2.1
+Name: livekit-plugins-azure
+Version: 0.2.0
+Summary: Agent Framework plugin for services from Azure
+Home-page: https://github.com/livekit/agents
+License: Apache-2.0
+Project-URL: Documentation, https://docs.livekit.io
+Project-URL: Website, https://livekit.io/
+Project-URL: Source, https://github.com/livekit/agents
+Keywords: webrtc,realtime,audio,video,livekit
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Topic :: Multimedia :: Sound/Audio
+Classifier: Topic :: Multimedia :: Video
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3 :: Only
+Requires-Python: >=3.9.0
+Description-Content-Type: text/markdown
+Requires-Dist: livekit>=0.9.0
+Requires-Dist: livekit-agents>=0.3.0
+Requires-Dist: azure-cognitiveservices-speech>=1.35.0
+# LiveKit Plugins Azure
+Agent Framework plugin for services from Azure Cognitive Services. Currently supports STT and TTS.
+## Installation
+```bash
+pip install livekit-plugins-azure
+```
+## Pre-requisites
+You'll need to specify an Azure Speech Key and a Deployment Region. They can be set as environment variables: `AZURE_SPEECH_KEY` and `AZURE_SPEECH_REGION`, respectively.

livekit_plugins_azure-0.2.0/README.md ADDED Viewed

@@ -0,0 +1,13 @@
+# LiveKit Plugins Azure
+Agent Framework plugin for services from Azure Cognitive Services. Currently supports STT and TTS.
+## Installation
+```bash
+pip install livekit-plugins-azure
+```
+## Pre-requisites
+You'll need to specify an Azure Speech Key and a Deployment Region. They can be set as environment variables: `AZURE_SPEECH_KEY` and `AZURE_SPEECH_REGION`, respectively.

livekit_plugins_azure-0.2.0/livekit/plugins/azure/__init__.py ADDED Viewed

@@ -0,0 +1,35 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .stt import STT, SpeechStream
+from .tts import TTS
+from .version import __version__
+__all__ = [
+    "STT",
+    "SpeechStream",
+    "TTS",
+    "__version__",
+]
+from livekit.agents import Plugin
+class AzurePlugin(Plugin):
+    """Plugin descriptor for the Azure integration."""
+    def __init__(self):
+        # Describe the plugin by this module's name, the package version and
+        # the package name.
+        super().__init__(__name__, __version__, __package__)
+    def download_files(self):
+        """Do nothing; this plugin's implementation of the hook is empty."""
+        pass
+# Module-level side effect: importing this package registers the plugin.
+Plugin.register_plugin(AzurePlugin())

livekit_plugins_azure-0.2.0/livekit/plugins/azure/log.py ADDED Viewed

@@ -0,0 +1,3 @@
+import logging
+logger = logging.getLogger("livekit.plugins.azure")

livekit_plugins_azure-0.2.0/livekit/plugins/azure/stt.py ADDED Viewed

@@ -0,0 +1,224 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import annotations
+import asyncio
+import os
+from dataclasses import dataclass
+from typing import Optional
+from livekit import rtc
+from livekit.agents import stt
+from livekit.agents.utils import AudioBuffer
+import azure.cognitiveservices.speech as speechsdk
+from .log import logger
+@dataclass
+class STTOptions:
+    """Settings handed from ``STT`` to ``SpeechStream`` and the recognizer factory."""
+    # Subscription key passed to ``speechsdk.SpeechConfig``.
+    speech_key: str
+    # Region passed to ``speechsdk.SpeechConfig``.
+    speech_region: str
+    # Samples per second declared for the push audio input stream.
+    sample_rate: int
+    # Channel count declared for the push audio input stream.
+    num_channels: int
+    # Candidate languages; used to build the auto-detect configuration when non-empty.
+    languages: list[
+        str
+    ]  # see https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=stt
+class STT(stt.STT):
+    """Streaming speech-to-text client for the Azure Speech service.
+
+    Credentials are taken from the constructor arguments or, when omitted,
+    from the ``AZURE_SPEECH_KEY`` / ``AZURE_SPEECH_REGION`` environment
+    variables.
+    """
+    def __init__(
+        self,
+        *,
+        speech_key: str | None = None,
+        speech_region: str | None = None,
+        sample_rate: int = 48000,
+        num_channels: int = 1,
+        languages: list[str] | None = None,
+    ):
+        """Resolve credentials and store the recognition settings.
+
+        Args:
+            speech_key: Azure Speech key; falls back to ``AZURE_SPEECH_KEY``.
+            speech_region: Azure region; falls back to ``AZURE_SPEECH_REGION``.
+            sample_rate: Sample rate of the audio that will be pushed.
+            num_channels: Channel count of the audio that will be pushed.
+            languages: Candidate languages for source-language detection.
+                When empty or ``None`` no auto-detect configuration is built
+                by ``_create_speech_recognizer``, so the SDK's default
+                language applies (presumably en-US; verify against the
+                Azure documentation).
+
+        Raises:
+            ValueError: If the key or the region cannot be resolved.
+        """
+        super().__init__(streaming_supported=True)
+        speech_key = speech_key or os.environ.get("AZURE_SPEECH_KEY")
+        if not speech_key:
+            raise ValueError("AZURE_SPEECH_KEY must be set")
+        speech_region = speech_region or os.environ.get("AZURE_SPEECH_REGION")
+        if not speech_region:
+            raise ValueError("AZURE_SPEECH_REGION must be set")
+        self._config = STTOptions(
+            speech_key=speech_key,
+            speech_region=speech_region,
+            # Copy so the options never alias a caller-owned (or shared
+            # default) list; the previous ``= []`` default was one list
+            # object shared by every instance.
+            languages=list(languages) if languages else [],
+            sample_rate=sample_rate,
+            num_channels=num_channels,
+        )
+    async def recognize(
+        self,
+        *,
+        buffer: AudioBuffer,
+        language: str | None = None,
+    ) -> stt.SpeechEvent:
+        """Always raise: only streaming recognition is implemented.
+
+        Raises:
+            NotImplementedError: On every call.
+        """
+        raise NotImplementedError("Azure STT does not support single frame recognition")
+    def stream(
+        self,
+        *,
+        language: str | None = None,
+    ) -> "SpeechStream":
+        """Open a new recognition stream using the stored settings.
+
+        ``language`` is accepted but not used; the stream is configured with
+        the ``languages`` given to the constructor.
+        """
+        return SpeechStream(self._config)
+class SpeechStream(stt.SpeechStream):
+    """Async iterator of speech events produced by a continuous Azure recognition.
+
+    Audio is fed with ``push_frame``; recognizer callbacks convert SDK events
+    into ``stt.SpeechEvent`` objects and hand them to the asyncio loop that
+    created the stream. Must be constructed while an event loop is running.
+    """
+    def __init__(self, opts: STTOptions) -> None:
+        """Create the push stream and recognizer and start continuous recognition.
+
+        Raises:
+            RuntimeError: From ``asyncio.get_running_loop`` when no loop is running.
+        """
+        super().__init__()
+        self._opts = opts
+        # Bind the loop and completion event BEFORE any handler is connected
+        # or recognition is started. The handlers use ``self._loop`` and
+        # ``self._done_event``; they hand work over with
+        # ``call_soon_threadsafe``, so they are presumably invoked off the
+        # event-loop thread and could otherwise run before these attributes
+        # exist.
+        self._loop = asyncio.get_running_loop()
+        self._done_event = asyncio.Event()
+        self._event_queue = asyncio.Queue[Optional[stt.SpeechEvent]]()
+        self._closed = False
+        self._speaking = False
+        self._stream = speechsdk.audio.PushAudioInputStream(
+            stream_format=speechsdk.audio.AudioStreamFormat(
+                samples_per_second=self._opts.sample_rate,
+                bits_per_sample=16,
+                channels=self._opts.num_channels,
+            )
+        )
+        self._recognizer = _create_speech_recognizer(
+            config=self._opts, stream=self._stream
+        )
+        self._recognizer.recognizing.connect(self._on_recognizing)
+        self._recognizer.recognized.connect(self._on_recognized)
+        self._recognizer.speech_start_detected.connect(self._on_speech_start)
+        self._recognizer.speech_end_detected.connect(self._on_speech_end)
+        self._recognizer.session_stopped.connect(self._on_session_stopped)
+        self._recognizer.start_continuous_recognition()
+    def push_frame(self, frame: rtc.AudioFrame) -> None:
+        """Write one audio frame's bytes to the recognizer's input stream.
+
+        Raises:
+            ValueError: If the stream has already been closed.
+        """
+        if self._closed:
+            raise ValueError("cannot push frame to closed stream")
+        self._stream.write(frame.data.tobytes())
+    async def aclose(self, *, wait: bool = True) -> None:
+        """Close the input stream, wait for the session to stop, then release the recognizer.
+
+        ``wait`` is accepted but not used: the call always waits for the
+        session-stopped callback. Calling it again after the first call is a no-op.
+        """
+        if self._closed:
+            return
+        self._closed = True
+        self._stream.close()
+        await self._done_event.wait()
+        def _cleanup():
+            self._recognizer.stop_continuous_recognition()
+            del self._recognizer
+        # Stop and drop the recognizer in a worker thread so these SDK calls
+        # do not run on the event loop.
+        await asyncio.to_thread(_cleanup)
+    def _put_transcript(
+        self,
+        evt: speechsdk.SpeechRecognitionEventArgs,
+        event_type: stt.SpeechEventType,
+        confidence: float,
+    ) -> None:
+        """Queue a transcript event for ``evt`` unless its text is blank."""
+        detected_lg = speechsdk.AutoDetectSourceLanguageResult(evt.result).language
+        if not evt.result.text.strip():
+            return
+        # The emitted text is the SDK's text as-is; stripping is only used to
+        # decide whether there is anything to report.
+        data = stt.SpeechData(
+            language=detected_lg,
+            confidence=confidence,
+            text=evt.result.text,
+        )
+        self._threadsafe_put(stt.SpeechEvent(type=event_type, alternatives=[data]))
+    def _on_recognized(self, evt: speechsdk.SpeechRecognitionEventArgs):
+        """Emit a FINAL_TRANSCRIPT event (confidence fixed at 1.0)."""
+        self._put_transcript(evt, stt.SpeechEventType.FINAL_TRANSCRIPT, 1.0)
+    def _on_recognizing(self, evt: speechsdk.SpeechRecognitionEventArgs):
+        """Emit an INTERIM_TRANSCRIPT event (confidence fixed at 0.0)."""
+        self._put_transcript(evt, stt.SpeechEventType.INTERIM_TRANSCRIPT, 0.0)
+    def _on_speech_start(self, evt: speechsdk.SpeechRecognitionEventArgs):
+        """Emit START_OF_SPEECH once per utterance; repeated starts are ignored."""
+        if self._speaking:
+            return
+        self._speaking = True
+        self._threadsafe_put(stt.SpeechEvent(type=stt.SpeechEventType.START_OF_SPEECH))
+    def _on_speech_end(self, evt: speechsdk.SpeechRecognitionEventArgs):
+        """Emit END_OF_SPEECH if a start was previously reported."""
+        if not self._speaking:
+            return
+        self._speaking = False
+        self._threadsafe_put(stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH))
+    def _on_session_stopped(self, evt: speechsdk.SpeechRecognitionEventArgs):
+        """Unblock ``aclose`` and terminate iteration when the session ends."""
+        if not self._closed:
+            logger.error("session stopped unexpectedly")
+        self._loop.call_soon_threadsafe(self._done_event.set)
+        # ``None`` is the end-of-stream sentinel consumed by ``__anext__``.
+        self._threadsafe_put(None)
+    def _threadsafe_put(self, evt: stt.SpeechEvent | None):
+        """Schedule ``evt`` onto the event queue from any thread."""
+        self._loop.call_soon_threadsafe(self._event_queue.put_nowait, evt)
+    async def __anext__(self) -> stt.SpeechEvent:
+        """Return the next queued event.
+
+        Raises:
+            StopAsyncIteration: When the end-of-stream sentinel is received.
+        """
+        evt = await self._event_queue.get()
+        if evt is None:
+            raise StopAsyncIteration
+        return evt
+def _create_speech_recognizer(
+    *, config: STTOptions, stream: speechsdk.audio.AudioInputStream
+) -> speechsdk.SpeechRecognizer:
+    """Build a ``SpeechRecognizer`` that reads audio from ``stream``.
+
+    An auto-detect language configuration is attached only when
+    ``config.languages`` is non-empty; otherwise ``None`` is passed.
+    """
+    credentials = speechsdk.SpeechConfig(
+        subscription=config.speech_key, region=config.speech_region
+    )
+    language_detection = (
+        speechsdk.languageconfig.AutoDetectSourceLanguageConfig(
+            languages=config.languages
+        )
+        if config.languages
+        else None
+    )
+    return speechsdk.SpeechRecognizer(
+        speech_config=credentials,
+        audio_config=speechsdk.audio.AudioConfig(stream=stream),
+        auto_detect_source_language_config=language_detection,  # type: ignore
+    )

livekit_plugins_azure-0.2.0/livekit/plugins/azure/tts.py ADDED Viewed

@@ -0,0 +1,168 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import annotations
+import asyncio
+import contextlib
+import os
+from dataclasses import dataclass
+from typing import Optional
+from livekit import rtc
+from livekit.agents import tts
+import azure.cognitiveservices.speech as speechsdk
+from .log import logger
+AZURE_SAMPLE_RATE: int = 16000
+AZURE_BITS_PER_SAMPLE: int = 16
+AZURE_NUM_CHANNELS: int = 1
+@dataclass
+class _TTSOptions:
+    """Settings handed from ``TTS`` to ``ChunkedStream`` and the synthesizer factory."""
+    # Subscription key passed to ``speechsdk.SpeechConfig``.
+    speech_key: str | None = None
+    # Region passed to ``speechsdk.SpeechConfig``.
+    speech_region: str | None = None
+    # Voice name applied to the speech config when not ``None``.
+    # see https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts
+    voice: str | None = None
+class TTS(tts.TTS):
+    """Non-streaming text-to-speech client for the Azure Speech service."""
+    def __init__(
+        self,
+        *,
+        speech_key: str | None = None,
+        speech_region: str | None = None,
+        voice: str | None = None,
+    ) -> None:
+        """Resolve credentials and remember the synthesis settings.
+
+        Explicit arguments win; otherwise ``AZURE_SPEECH_KEY`` and
+        ``AZURE_SPEECH_REGION`` are read from the environment.
+
+        Raises:
+            ValueError: If the key or the region cannot be resolved.
+        """
+        super().__init__(
+            streaming_supported=False,
+            sample_rate=AZURE_SAMPLE_RATE,
+            num_channels=AZURE_NUM_CHANNELS,
+        )
+        resolved_key = speech_key or os.environ.get("AZURE_SPEECH_KEY")
+        if not resolved_key:
+            raise ValueError("AZURE_SPEECH_KEY must be set")
+        resolved_region = speech_region or os.environ.get("AZURE_SPEECH_REGION")
+        if not resolved_region:
+            raise ValueError("AZURE_SPEECH_REGION must be set")
+        self._opts = _TTSOptions(
+            speech_key=resolved_key,
+            speech_region=resolved_region,
+            voice=voice,
+        )
+    def synthesize(
+        self,
+        text: str,
+    ) -> "ChunkedStream":
+        """Return a ``ChunkedStream`` for ``text``; synthesis starts on first iteration."""
+        return ChunkedStream(text, self._opts)
+class ChunkedStream(tts.ChunkedStream):
+    """Async iterator over the audio chunks synthesized for one piece of text."""
+    def __init__(self, text: str, opts: _TTSOptions) -> None:
+        # NOTE(review): the base-class initializer is not invoked here; confirm
+        # that tts.ChunkedStream needs none.
+        self._opts = opts
+        self._text = text
+        # Created lazily by the first ``__anext__`` call.
+        self._main_task: asyncio.Task | None = None
+        # Synthesized chunks; ``None`` marks the end of the stream.
+        self._queue = asyncio.Queue[Optional[tts.SynthesizedAudio]]()
+    async def _run(self):
+        """Synthesize ``self._text`` and always finish by queueing the ``None`` sentinel.
+
+        Failures are logged rather than raised, so a failed synthesis looks
+        like an early end of stream to the consumer.
+        """
+        try:
+            # The callback receives audio from the SDK and forwards it to
+            # ``self._queue`` on the running loop.
+            stream_callback = _PushAudioOutputStreamCallback(
+                asyncio.get_running_loop(), self._queue
+            )
+            push_stream = speechsdk.audio.PushAudioOutputStream(stream_callback)
+            synthesizer = _create_speech_synthesizer(
+                config=self._opts, stream=push_stream
+            )
+            def _synthesize() -> speechsdk.SpeechSynthesisResult:
+                return synthesizer.speak_text_async(self._text).get()  # type: ignore
+            # ``.get()`` blocks, so it runs in a worker thread.
+            result = await asyncio.to_thread(_synthesize)
+            if result.reason != speechsdk.ResultReason.SynthesizingAudioCompleted:
+                raise ValueError(
+                    f"failed to synthesize audio: {result.reason} {result.cancellation_details}"
+                )
+            def _cleanup() -> None:
+                # Drop this frame's references from inside the worker thread;
+                # presumably so the SDK objects are released off the event
+                # loop (verify).
+                nonlocal synthesizer, result
+                del synthesizer
+                del result
+            await asyncio.to_thread(_cleanup)
+        except Exception:
+            logger.exception("failed to synthesize")
+        finally:
+            # Runs on success, failure and cancellation alike.
+            self._queue.put_nowait(None)
+    async def __anext__(self) -> tts.SynthesizedAudio:
+        """Return the next chunk, starting the synthesis task on first use.
+
+        Raises StopAsyncIteration when the ``None`` sentinel is received.
+        """
+        if not self._main_task:
+            self._main_task = asyncio.create_task(self._run())
+        frame = await self._queue.get()
+        if frame is None:
+            raise StopAsyncIteration
+        return frame
+    async def aclose(self) -> None:
+        """Cancel the synthesis task, if one was started, and wait for it to finish."""
+        if not self._main_task:
+            return
+        self._main_task.cancel()
+        with contextlib.suppress(asyncio.CancelledError):
+            await self._main_task
+def _create_speech_synthesizer(
+    *, config: _TTSOptions, stream: speechsdk.audio.AudioOutputStream
+) -> speechsdk.SpeechSynthesizer:
+    """Build a ``SpeechSynthesizer`` whose audio output goes to ``stream``.
+
+    The voice name is set on the speech config only when ``config.voice``
+    is not ``None``.
+    """
+    synthesis_config = speechsdk.SpeechConfig(
+        subscription=config.speech_key, region=config.speech_region
+    )
+    output_config = speechsdk.audio.AudioOutputConfig(stream=stream)
+    voice = config.voice
+    if voice is not None:
+        synthesis_config.speech_synthesis_voice_name = voice
+    synthesizer = speechsdk.SpeechSynthesizer(
+        speech_config=synthesis_config, audio_config=output_config
+    )
+    return synthesizer
+class _PushAudioOutputStreamCallback(speechsdk.audio.PushAudioOutputStreamCallback):
+    """Output-stream callback that forwards written audio to an asyncio queue."""
+    def __init__(
+        self,
+        loop: asyncio.AbstractEventLoop,
+        event_queue: asyncio.Queue[tts.SynthesizedAudio | None],
+    ):
+        """Remember the target loop and the queue that receives the audio."""
+        super().__init__()
+        self._loop = loop
+        self._event_queue = event_queue
+    def write(self, audio_buffer: memoryview) -> int:
+        """Wrap ``audio_buffer`` in an audio frame, queue it, and return its byte count."""
+        byte_count = audio_buffer.nbytes
+        frame = rtc.AudioFrame(
+            data=audio_buffer,
+            sample_rate=AZURE_SAMPLE_RATE,
+            num_channels=AZURE_NUM_CHANNELS,
+            # Two bytes per sample; this equals the per-channel count only
+            # because the output is declared as a single channel.
+            samples_per_channel=byte_count // 2,
+        )
+        chunk = tts.SynthesizedAudio(text="", data=frame)
+        # Hand the chunk to the loop with the thread-safe scheduling call.
+        self._loop.call_soon_threadsafe(self._event_queue.put_nowait, chunk)
+        return byte_count

livekit_plugins_azure-0.2.0/livekit/plugins/azure/version.py ADDED Viewed

@@ -0,0 +1,13 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+__version__ = "0.2.0"

livekit_plugins_azure-0.2.0/livekit_plugins_azure.egg-info/PKG-INFO ADDED Viewed

@@ -0,0 +1,38 @@
+Metadata-Version: 2.1
+Name: livekit-plugins-azure
+Version: 0.2.0
+Summary: Agent Framework plugin for services from Azure
+Home-page: https://github.com/livekit/agents
+License: Apache-2.0
+Project-URL: Documentation, https://docs.livekit.io
+Project-URL: Website, https://livekit.io/
+Project-URL: Source, https://github.com/livekit/agents
+Keywords: webrtc,realtime,audio,video,livekit
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Topic :: Multimedia :: Sound/Audio
+Classifier: Topic :: Multimedia :: Video
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3 :: Only
+Requires-Python: >=3.9.0
+Description-Content-Type: text/markdown
+Requires-Dist: livekit>=0.9.0
+Requires-Dist: livekit-agents>=0.3.0
+Requires-Dist: azure-cognitiveservices-speech>=1.35.0
+# LiveKit Plugins Azure
+Agent Framework plugin for services from Azure Cognitive Services. Currently supports STT and TTS.
+## Installation
+```bash
+pip install livekit-plugins-azure
+```
+## Pre-requisites
+You'll need to specify an Azure Speech Key and a Deployment Region. They can be set as environment variables: `AZURE_SPEECH_KEY` and `AZURE_SPEECH_REGION`, respectively.

livekit_plugins_azure-0.2.0/livekit_plugins_azure.egg-info/SOURCES.txt ADDED Viewed

@@ -0,0 +1,13 @@
+README.md
+pyproject.toml
+setup.py
+livekit/plugins/azure/__init__.py
+livekit/plugins/azure/log.py
+livekit/plugins/azure/stt.py
+livekit/plugins/azure/tts.py
+livekit/plugins/azure/version.py
+livekit_plugins_azure.egg-info/PKG-INFO
+livekit_plugins_azure.egg-info/SOURCES.txt
+livekit_plugins_azure.egg-info/dependency_links.txt
+livekit_plugins_azure.egg-info/requires.txt
+livekit_plugins_azure.egg-info/top_level.txt

livekit_plugins_azure-0.2.0/livekit_plugins_azure.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+

livekit_plugins_azure-0.2.0/livekit_plugins_azure.egg-info/requires.txt ADDED Viewed

@@ -0,0 +1,3 @@
+livekit>=0.9.0
+livekit-agents>=0.3.0
+azure-cognitiveservices-speech>=1.35.0

livekit_plugins_azure-0.2.0/livekit_plugins_azure.egg-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ livekit

livekit_plugins_azure-0.2.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,3 @@
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"

livekit_plugins_azure-0.2.0/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

livekit_plugins_azure-0.2.0/setup.py ADDED Viewed

@@ -0,0 +1,59 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import pathlib
+import setuptools
+import setuptools.command.build_py
+# Directory containing this setup script.
+here = pathlib.Path(__file__).parent.resolve()
+# Namespace populated by executing version.py; provides ``__version__``
+# without importing the package.
+about = {}
+_version_file = here / "livekit" / "plugins" / "azure" / "version.py"
+exec(_version_file.read_text(), about)
+_classifiers = [
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: Apache Software License",
+    "Topic :: Multimedia :: Sound/Audio",
+    "Topic :: Multimedia :: Video",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3 :: Only",
+]
+_install_requires = [
+    "livekit >= 0.9.0",
+    "livekit-agents >= 0.3.0",
+    "azure-cognitiveservices-speech >= 1.35.0",
+]
+_project_urls = {
+    "Documentation": "https://docs.livekit.io",
+    "Website": "https://livekit.io/",
+    "Source": "https://github.com/livekit/agents",
+}
+setuptools.setup(
+    name="livekit-plugins-azure",
+    version=about["__version__"],
+    description="Agent Framework plugin for services from Azure",
+    long_description=(here / "README.md").read_text(encoding="utf-8"),
+    long_description_content_type="text/markdown",
+    url="https://github.com/livekit/agents",
+    cmdclass={},
+    classifiers=_classifiers,
+    keywords=["webrtc", "realtime", "audio", "video", "livekit"],
+    license="Apache-2.0",
+    packages=setuptools.find_namespace_packages(include=["livekit.*"]),
+    python_requires=">=3.9.0",
+    install_requires=_install_requires,
+    package_data={},
+    project_urls=_project_urls,
+)