livekit-plugins-aws 0.1.0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Potentially problematic release: this version of livekit-plugins-aws might be problematic.
- livekit/plugins/aws/llm.py +160 -239
- livekit/plugins/aws/models.py +1 -1
- livekit/plugins/aws/stt.py +114 -98
- livekit/plugins/aws/tts.py +72 -79
- livekit/plugins/aws/utils.py +144 -0
- livekit/plugins/aws/version.py +1 -1
- {livekit_plugins_aws-0.1.0.dist-info → livekit_plugins_aws-1.0.0.dist-info}/METADATA +14 -24
- livekit_plugins_aws-1.0.0.dist-info/RECORD +12 -0
- {livekit_plugins_aws-0.1.0.dist-info → livekit_plugins_aws-1.0.0.dist-info}/WHEEL +1 -2
- livekit/plugins/aws/_utils.py +0 -216
- livekit_plugins_aws-0.1.0.dist-info/RECORD +0 -13
- livekit_plugins_aws-0.1.0.dist-info/top_level.txt +0 -1
livekit/plugins/aws/stt.py
CHANGED
@@ -14,70 +14,72 @@ from __future__ import annotations
 
 import asyncio
 from dataclasses import dataclass
-from typing import Optional
 
+import aioboto3
+from amazon_transcribe.auth import StaticCredentialResolver
 from amazon_transcribe.client import TranscribeStreamingClient
 from amazon_transcribe.model import Result, TranscriptEvent
+
 from livekit import rtc
-from livekit.agents import (
-    DEFAULT_API_CONNECT_OPTIONS,
-    APIConnectOptions,
-    stt,
-    utils,
-)
-
-from ._utils import _get_aws_credentials
+from livekit.agents import DEFAULT_API_CONNECT_OPTIONS, APIConnectOptions, stt, utils
+from livekit.agents.types import NOT_GIVEN, NotGivenOr
+from livekit.agents.utils import is_given
+
 from .log import logger
+from .utils import DEFAULT_REGION, get_aws_async_session
+
+REFRESH_INTERVAL = 1800
 
 
 @dataclass
 class STTOptions:
-    speech_region: str
     sample_rate: int
     language: str
     encoding: str
-    vocabulary_name: Optional[str]
-    session_id: Optional[str]
-    vocab_filter_method: Optional[str]
-    vocab_filter_name: Optional[str]
-    show_speaker_label: Optional[bool]
-    enable_channel_identification: Optional[bool]
-    number_of_channels: Optional[int]
-    enable_partial_results_stabilization: Optional[bool]
-    partial_results_stability: Optional[str]
-    language_model_name: Optional[str]
+    vocabulary_name: NotGivenOr[str]
+    session_id: NotGivenOr[str]
+    vocab_filter_method: NotGivenOr[str]
+    vocab_filter_name: NotGivenOr[str]
+    show_speaker_label: NotGivenOr[bool]
+    enable_channel_identification: NotGivenOr[bool]
+    number_of_channels: NotGivenOr[int]
+    enable_partial_results_stabilization: NotGivenOr[bool]
+    partial_results_stability: NotGivenOr[str]
+    language_model_name: NotGivenOr[str]
 
 
 class STT(stt.STT):
     def __init__(
         self,
         *,
-        speech_region: str = "us-east-1",
-        api_key: str | None = None,
-        api_secret: str | None = None,
+        region: NotGivenOr[str] = NOT_GIVEN,
+        api_key: NotGivenOr[str] = NOT_GIVEN,
+        api_secret: NotGivenOr[str] = NOT_GIVEN,
         sample_rate: int = 48000,
         language: str = "en-US",
         encoding: str = "pcm",
-        vocabulary_name: Optional[str] = None,
-        session_id: Optional[str] = None,
-        vocab_filter_method: Optional[str] = None,
-        vocab_filter_name: Optional[str] = None,
-        show_speaker_label: Optional[bool] = None,
-        enable_channel_identification: Optional[bool] = None,
-        number_of_channels: Optional[int] = None,
-        enable_partial_results_stabilization: Optional[bool] = None,
-        partial_results_stability: Optional[str] = None,
-        language_model_name: Optional[str] = None,
+        vocabulary_name: NotGivenOr[str] = NOT_GIVEN,
+        session_id: NotGivenOr[str] = NOT_GIVEN,
+        vocab_filter_method: NotGivenOr[str] = NOT_GIVEN,
+        vocab_filter_name: NotGivenOr[str] = NOT_GIVEN,
+        show_speaker_label: NotGivenOr[bool] = NOT_GIVEN,
+        enable_channel_identification: NotGivenOr[bool] = NOT_GIVEN,
+        number_of_channels: NotGivenOr[int] = NOT_GIVEN,
+        enable_partial_results_stabilization: NotGivenOr[bool] = NOT_GIVEN,
+        partial_results_stability: NotGivenOr[str] = NOT_GIVEN,
+        language_model_name: NotGivenOr[str] = NOT_GIVEN,
+        session: aioboto3.Session | None = None,
+        refresh_interval: NotGivenOr[int] = NOT_GIVEN,
     ):
-        super().__init__(
-            capabilities=stt.STTCapabilities(streaming=True, interim_results=True)
+        super().__init__(capabilities=stt.STTCapabilities(streaming=True, interim_results=True))
+        self._region = region if is_given(region) else DEFAULT_REGION
+        self._session = session or get_aws_async_session(
+            api_key=api_key if is_given(api_key) else None,
+            api_secret=api_secret if is_given(api_secret) else None,
+            region=self._region,
         )
 
-        self._api_key, self._api_secret = _get_aws_credentials(
-            api_key, api_secret, speech_region
-        )
         self._config = STTOptions(
-            speech_region=speech_region,
             language=language,
             sample_rate=sample_rate,
             encoding=encoding,
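The hunk above replaces the old `Optional[...] = None` parameters with the `NOT_GIVEN` sentinel (`NotGivenOr[T]`) from livekit.agents.types, so an explicit `None` can no longer be confused with an omitted argument; `is_given()` performs the check. Below is a minimal, self-contained sketch of that pattern. The real `NOT_GIVEN`, `NotGivenOr`, and `is_given` ship with livekit.agents, so the re-implementations here are purely illustrative.

from typing import TypeVar, Union


class NotGiven:
    """Illustrative sentinel type: marks an argument the caller never supplied."""


NOT_GIVEN = NotGiven()
T = TypeVar("T")
NotGivenOr = Union[T, NotGiven]


def is_given(value: object) -> bool:
    return not isinstance(value, NotGiven)


def resolve_region(region: NotGivenOr[str] = NOT_GIVEN) -> str:
    # Mirrors `region if is_given(region) else DEFAULT_REGION` from the diff.
    return region if is_given(region) else "us-east-1"


assert resolve_region() == "us-east-1"
assert resolve_region("eu-west-1") == "eu-west-1"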
@@ -92,26 +94,47 @@ class STT(stt.STT):
             partial_results_stability=partial_results_stability,
             language_model_name=language_model_name,
         )
+        self._pool = utils.ConnectionPool[TranscribeStreamingClient](
+            connect_cb=self._create_client,
+            max_session_duration=refresh_interval
+            if is_given(refresh_interval)
+            else REFRESH_INTERVAL,
+        )
+
+    async def _create_client(self) -> TranscribeStreamingClient:
+        creds = await self._session.get_credentials()
+        frozen_credentials = await creds.get_frozen_credentials()
+        return TranscribeStreamingClient(
+            region=self._region,
+            credential_resolver=StaticCredentialResolver(
+                access_key_id=frozen_credentials.access_key,
+                secret_access_key=frozen_credentials.secret_key,
+                session_token=frozen_credentials.token,
+            ),
+        )
+
+    async def aclose(self) -> None:
+        await self._pool.aclose()
+        await super().aclose()
 
     async def _recognize_impl(
         self,
         buffer: utils.AudioBuffer,
         *,
-        language: str | None = None,
+        language: NotGivenOr[str] = NOT_GIVEN,
         conn_options: APIConnectOptions,
     ) -> stt.SpeechEvent:
-        raise NotImplementedError(
-            "Amazon Transcribe does not support single frame recognition"
-        )
+        raise NotImplementedError("Amazon Transcribe does not support single frame recognition")
 
     def stream(
         self,
         *,
-        language: str | None = None,
+        language: NotGivenOr[str] = NOT_GIVEN,
         conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
-    ) -> "SpeechStream":
+    ) -> SpeechStream:
         return SpeechStream(
             stt=self,
+            pool=self._pool,
             conn_options=conn_options,
             opts=self._config,
         )
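In the hunk above, `_create_client` snapshots the session's credentials into a `StaticCredentialResolver`, so a pooled `TranscribeStreamingClient` never sees later credential rotation; that is why the `ConnectionPool` recycles clients after `max_session_duration` (the new `refresh_interval` argument, defaulting to `REFRESH_INTERVAL` = 1800 seconds). A standalone sketch of the same pattern, assuming aioboto3 and amazon-transcribe are installed and default AWS credentials are configured:

import asyncio

import aioboto3
from amazon_transcribe.auth import StaticCredentialResolver
from amazon_transcribe.client import TranscribeStreamingClient


async def build_client(session: aioboto3.Session, region: str) -> TranscribeStreamingClient:
    # Frozen credentials are a point-in-time snapshot; temporary (STS) credentials
    # expire, so callers should rebuild the client periodically, as the pool does.
    creds = await session.get_credentials()
    frozen = await creds.get_frozen_credentials()
    return TranscribeStreamingClient(
        region=region,
        credential_resolver=StaticCredentialResolver(
            access_key_id=frozen.access_key,
            secret_access_key=frozen.secret_key,
            session_token=frozen.token,
        ),
    )


async def main() -> None:
    client = await build_client(aioboto3.Session(), "us-east-1")
    print(type(client).__name__)  # TranscribeStreamingClient


asyncio.run(main())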
@@ -122,54 +145,54 @@ class SpeechStream(stt.SpeechStream):
         self,
         stt: STT,
         opts: STTOptions,
+        pool: utils.ConnectionPool[TranscribeStreamingClient],
         conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
     ) -> None:
-        super().__init__(
-            stt=stt, conn_options=conn_options, sample_rate=opts.sample_rate
-        )
+        super().__init__(stt=stt, conn_options=conn_options, sample_rate=opts.sample_rate)
         self._opts = opts
-        self.  [line truncated in this diff view]
+        self._pool = pool
 
     async def _run(self) -> None:
-        [old lines 134-172: previous _run implementation, not captured in this diff view]
+        async with self._pool.connection() as client:
+            live_config = {
+                "language_code": self._opts.language,
+                "media_sample_rate_hz": self._opts.sample_rate,
+                "media_encoding": self._opts.encoding,
+                "vocabulary_name": self._opts.vocabulary_name,
+                "session_id": self._opts.session_id,
+                "vocab_filter_method": self._opts.vocab_filter_method,
+                "vocab_filter_name": self._opts.vocab_filter_name,
+                "show_speaker_label": self._opts.show_speaker_label,
+                "enable_channel_identification": self._opts.enable_channel_identification,
+                "number_of_channels": self._opts.number_of_channels,
+                "enable_partial_results_stabilization": self._opts.enable_partial_results_stabilization,  # noqa: E501
+                "partial_results_stability": self._opts.partial_results_stability,
+                "language_model_name": self._opts.language_model_name,
+            }
+            filtered_config = {k: v for k, v in live_config.items() if v and is_given(v)}
+            stream = await client.start_stream_transcription(**filtered_config)
+
+            @utils.log_exceptions(logger=logger)
+            async def input_generator():
+                async for frame in self._input_ch:
+                    if isinstance(frame, rtc.AudioFrame):
+                        await stream.input_stream.send_audio_event(audio_chunk=frame.data.tobytes())
+                await stream.input_stream.end_stream()
+
+            @utils.log_exceptions(logger=logger)
+            async def handle_transcript_events():
+                async for event in stream.output_stream:
+                    if isinstance(event, TranscriptEvent):
+                        self._process_transcript_event(event)
+
+            tasks = [
+                asyncio.create_task(input_generator()),
+                asyncio.create_task(handle_transcript_events()),
+            ]
+            try:
+                await asyncio.gather(*tasks)
+            finally:
+                await utils.aio.gracefully_cancel(*tasks)
 
     def _process_transcript_event(self, transcript_event: TranscriptEvent):
         stream = transcript_event.transcript.results
@@ -184,9 +207,7 @@ class SpeechStream(stt.SpeechStream):
                 self._event_ch.send_nowait(
                     stt.SpeechEvent(
                         type=stt.SpeechEventType.INTERIM_TRANSCRIPT,
-                        alternatives=[
-                            _streaming_recognize_response_to_speech_data(resp)
-                        ],
+                        alternatives=[_streaming_recognize_response_to_speech_data(resp)],
                     )
                 )
 
@@ -194,16 +215,12 @@ class SpeechStream(stt.SpeechStream):
                 self._event_ch.send_nowait(
                     stt.SpeechEvent(
                         type=stt.SpeechEventType.FINAL_TRANSCRIPT,
-                        alternatives=[
-                            _streaming_recognize_response_to_speech_data(resp)
-                        ],
+                        alternatives=[_streaming_recognize_response_to_speech_data(resp)],
                     )
                 )
 
                 if not resp.is_partial:
-                    self._event_ch.send_nowait(
-                        stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH)
-                    )
+                    self._event_ch.send_nowait(stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH))
 
 
 def _streaming_recognize_response_to_speech_data(resp: Result) -> stt.SpeechData:
@@ -211,7 +228,6 @@ def _streaming_recognize_response_to_speech_data(resp: Result) -> stt.SpeechData
         language="en-US",
         start_time=resp.start_time if resp.start_time else 0.0,
         end_time=resp.end_time if resp.end_time else 0.0,
-        confidence=0.0,
         text=resp.alternatives[0].transcript if resp.alternatives else "",
     )
 
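Putting the stt.py changes together, the 1.0.0 constructor might be used as below. This is a hypothetical sketch based only on the signatures shown in this diff; the parameter values and the `livekit.plugins.aws` import path are illustrative.

import aioboto3

from livekit.plugins import aws

stt = aws.STT(
    region="us-east-1",          # falls back to DEFAULT_REGION when omitted
    session=aioboto3.Session(),  # optional: reuse an existing credential chain
    refresh_interval=900,        # recycle pooled Transcribe clients every 15 minutes
    language="en-US",
    sample_rate=48000,
)
stream = stt.stream()  # SpeechStream backed by the shared connection pool

Single-frame recognition remains unsupported: `_recognize_impl` still raises NotImplementedError, so only the streaming path is usable.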
livekit/plugins/aws/tts.py
CHANGED
@@ -14,11 +14,10 @@ from __future__ import annotations
 
 import asyncio
 from dataclasses import dataclass
-from typing import Any, Callable, Optional
 
+import aioboto3
 import aiohttp
-
-from livekit import rtc
+
 from livekit.agents import (
     APIConnectionError,
     APIConnectOptions,
@@ -27,14 +26,18 @@ from livekit.agents import (
     tts,
     utils,
 )
+from livekit.agents.types import (
+    DEFAULT_API_CONNECT_OPTIONS,
+    NOT_GIVEN,
+    NotGivenOr,
+)
+from livekit.agents.utils import is_given
 
-from ._utils import _get_aws_credentials, get_session
-from .models import TTS_LANGUAGE, TTS_OUTPUT_FORMAT, TTS_SPEECH_ENGINE
+from .models import TTS_LANGUAGE, TTS_SPEECH_ENGINE
+from .utils import _strip_nones, get_aws_async_session
 
 TTS_NUM_CHANNELS: int = 1
-DEFAULT_OUTPUT_FORMAT: TTS_OUTPUT_FORMAT = "pcm"
 DEFAULT_SPEECH_ENGINE: TTS_SPEECH_ENGINE = "generative"
-DEFAULT_SPEECH_REGION = "us-east-1"
 DEFAULT_VOICE = "Ruth"
 DEFAULT_SAMPLE_RATE = 16000
 
@@ -42,27 +45,25 @@ DEFAULT_SAMPLE_RATE = 16000
 @dataclass
 class _TTSOptions:
     # https://docs.aws.amazon.com/polly/latest/dg/API_SynthesizeSpeech.html
-    voice: str
-    output_format: TTS_OUTPUT_FORMAT
-    speech_engine: TTS_SPEECH_ENGINE
-    speech_region: str
+    voice: NotGivenOr[str]
+    speech_engine: NotGivenOr[TTS_SPEECH_ENGINE]
+    region: str
     sample_rate: int
-    language: TTS_LANGUAGE | str
+    language: NotGivenOr[TTS_LANGUAGE | str]
 
 
 class TTS(tts.TTS):
     def __init__(
         self,
         *,
-        voice: str = DEFAULT_VOICE,
-        language: TTS_LANGUAGE | str | None = None,
-        output_format: TTS_OUTPUT_FORMAT = DEFAULT_OUTPUT_FORMAT,
-        speech_engine: TTS_SPEECH_ENGINE = DEFAULT_SPEECH_ENGINE,
+        voice: NotGivenOr[str] = NOT_GIVEN,
+        language: NotGivenOr[TTS_LANGUAGE | str] = NOT_GIVEN,
+        speech_engine: NotGivenOr[TTS_SPEECH_ENGINE] = NOT_GIVEN,
         sample_rate: int = DEFAULT_SAMPLE_RATE,
-        speech_region: str = DEFAULT_SPEECH_REGION,
-        api_key: str | None = None,
-        api_secret: str | None = None,
-        session: Any | None = None,
+        region: NotGivenOr[str] = NOT_GIVEN,
+        api_key: NotGivenOr[str] = NOT_GIVEN,
+        api_secret: NotGivenOr[str] = NOT_GIVEN,
+        session: aioboto3.Session | None = None,
     ) -> None:
         """
         Create a new instance of AWS Polly TTS.
@@ -75,13 +76,13 @@ class TTS(tts.TTS):
         Args:
             Voice (TTSModels, optional): Voice ID to use for the synthesis. Defaults to "Ruth".
             language (TTS_LANGUAGE, optional): language code for the Synthesize Speech request. This is only necessary if using a bilingual voice, such as Aditi, which can be used for either Indian English (en-IN) or Hindi (hi-IN).
-            output_format(TTS_OUTPUT_FORMAT, optional): The format in which the returned output will be encoded. Defaults to "pcm".
             sample_rate(int, optional): The audio frequency specified in Hz. Defaults to 16000.
             speech_engine(TTS_SPEECH_ENGINE, optional): The engine to use for the synthesis. Defaults to "generative".
-            speech_region(str, optional): The region to use for the synthesis. Defaults to "us-east-1".
+            region(str, optional): The region to use for the synthesis. Defaults to "us-east-1".
            api_key(str, optional): AWS access key id.
             api_secret(str, optional): AWS secret access key.
-        """
+            session(aioboto3.Session, optional): Optional aioboto3 session to use.
+        """ # noqa: E501
         super().__init__(
             capabilities=tts.TTSCapabilities(
                 streaming=False,
@@ -89,41 +90,31 @@ class TTS(tts.TTS):
             sample_rate=sample_rate,
             num_channels=TTS_NUM_CHANNELS,
         )
-
-        self._api_key, self._api_secret = _get_aws_credentials(
-            api_key, api_secret, speech_region
+        self._session = session or get_aws_async_session(
+            api_key=api_key if is_given(api_key) else None,
+            api_secret=api_secret if is_given(api_secret) else None,
+            region=region if is_given(region) else None,
         )
-
         self._opts = _TTSOptions(
             voice=voice,
-            output_format=output_format,
             speech_engine=speech_engine,
-            speech_region=speech_region,
+            region=region,
             language=language,
             sample_rate=sample_rate,
         )
-        self._session = session or get_session()
-
-    def _get_client(self):
-        return self._session.create_client(
-            "polly",
-            region_name=self._opts.speech_region,
-            aws_access_key_id=self._api_key,
-            aws_secret_access_key=self._api_secret,
-        )
 
     def synthesize(
         self,
         text: str,
         *,
-        conn_options: Optional[APIConnectOptions] = None,
-    ) -> "ChunkedStream":
+        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
+    ) -> ChunkedStream:
         return ChunkedStream(
             tts=self,
             text=text,
             conn_options=conn_options,
+            session=self._session,
             opts=self._opts,
-            get_client=self._get_client,
         )
 
 
@@ -133,57 +124,63 @@ class ChunkedStream(tts.ChunkedStream):
         *,
         tts: TTS,
         text: str,
-        conn_options: Optional[APIConnectOptions] = None,
+        session: aioboto3.Session,
+        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
         opts: _TTSOptions,
-        get_client: Callable[[], Any],
     ) -> None:
         super().__init__(tts=tts, input_text=text, conn_options=conn_options)
         self._opts = opts
-        self._get_client = get_client
         self._segment_id = utils.shortuuid()
+        self._session = session
 
     async def _run(self):
         request_id = utils.shortuuid()
 
         try:
-            async with self._get_client() as client:
+            async with self._session.client("polly") as client:
                 params = {
                     "Text": self._input_text,
-                    "OutputFormat": self._opts.output_format,
-                    "Engine": self._opts.speech_engine,
-                    "VoiceId": self._opts.voice,
+                    "OutputFormat": "mp3",
+                    "Engine": self._opts.speech_engine
+                    if is_given(self._opts.speech_engine)
+                    else DEFAULT_SPEECH_ENGINE,
+                    "VoiceId": self._opts.voice if is_given(self._opts.voice) else DEFAULT_VOICE,
                     "TextType": "text",
                     "SampleRate": str(self._opts.sample_rate),
-                    "LanguageCode": self._opts.language,
+                    "LanguageCode": self._opts.language if is_given(self._opts.language) else None,
                 }
                 response = await client.synthesize_speech(**_strip_nones(params))
                 if "AudioStream" in response:
-                    decoder = utils.codecs.  [line truncated in this diff view]
-                    [old lines 162-186: remainder of the previous decoding implementation, not captured in this diff view]
+                    decoder = utils.codecs.AudioStreamDecoder(
+                        sample_rate=self._opts.sample_rate,
+                        num_channels=1,
+                    )
+
+                    # Create a task to push data to the decoder
+                    async def push_data():
+                        try:
+                            async with response["AudioStream"] as resp:
+                                async for data, _ in resp.content.iter_chunks():
+                                    decoder.push(data)
+                        finally:
+                            decoder.end_input()
+
+                    # Start pushing data to the decoder
+                    push_task = asyncio.create_task(push_data())
+
+                    try:
+                        # Create emitter and process decoded frames
+                        emitter = tts.SynthesizedAudioEmitter(
+                            event_ch=self._event_ch,
+                            request_id=request_id,
+                            segment_id=self._segment_id,
+                        )
+                        async for frame in decoder:
+                            emitter.push(frame)
+                        emitter.flush()
+                        await push_task
+                    finally:
+                        await utils.aio.gracefully_cancel(push_task)
 
         except asyncio.TimeoutError as e:
             raise APITimeoutError() from e
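The new `_run` above always requests mp3 from Polly ("OutputFormat": "mp3") and converts it back to PCM with `utils.codecs.AudioStreamDecoder`: a producer task pushes encoded bytes into the decoder while the coroutine consumes decoded frames. Reduced to its essentials, the shape is the sketch below; the decoder calls mirror the diff, and the in-memory byte list is a stand-in for Polly's streaming AudioStream.

import asyncio

from livekit.agents import utils


async def decode_mp3_chunks(chunks: list[bytes], sample_rate: int = 16000) -> int:
    decoder = utils.codecs.AudioStreamDecoder(sample_rate=sample_rate, num_channels=1)

    async def push() -> None:
        try:
            for chunk in chunks:  # stand-in for Polly's streaming AudioStream body
                decoder.push(chunk)
        finally:
            decoder.end_input()  # always close the input so the consumer can finish

    push_task = asyncio.create_task(push())
    frames = 0
    try:
        async for _frame in decoder:  # decoded PCM frames, ready for an emitter
            frames += 1
        await push_task
    finally:
        await utils.aio.gracefully_cancel(push_task)
    return frames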
@@ -196,7 +193,3 @@ class ChunkedStream(tts.ChunkedStream):
             ) from e
         except Exception as e:
             raise APIConnectionError() from e
-
-
-def _strip_nones(d: dict[str, Any]) -> dict[str, Any]:
-    return {k: v for k, v in d.items() if v is not None}