livekit-plugins-aws 1.0.0rc6__py3-none-any.whl → 1.3.9__py3-none-any.whl
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- livekit/plugins/aws/__init__.py +47 -7
- livekit/plugins/aws/experimental/realtime/__init__.py +11 -0
- livekit/plugins/aws/experimental/realtime/events.py +545 -0
- livekit/plugins/aws/experimental/realtime/pretty_printer.py +49 -0
- livekit/plugins/aws/experimental/realtime/realtime_model.py +2106 -0
- livekit/plugins/aws/experimental/realtime/turn_tracker.py +171 -0
- livekit/plugins/aws/experimental/realtime/types.py +38 -0
- livekit/plugins/aws/llm.py +109 -71
- livekit/plugins/aws/log.py +4 -0
- livekit/plugins/aws/models.py +4 -3
- livekit/plugins/aws/stt.py +214 -71
- livekit/plugins/aws/tts.py +96 -116
- livekit/plugins/aws/utils.py +29 -125
- livekit/plugins/aws/version.py +1 -1
- livekit_plugins_aws-1.3.9.dist-info/METADATA +385 -0
- livekit_plugins_aws-1.3.9.dist-info/RECORD +18 -0
- {livekit_plugins_aws-1.0.0rc6.dist-info → livekit_plugins_aws-1.3.9.dist-info}/WHEEL +1 -1
- livekit_plugins_aws-1.0.0rc6.dist-info/METADATA +0 -43
- livekit_plugins_aws-1.0.0rc6.dist-info/RECORD +0 -12
livekit/plugins/aws/stt.py
CHANGED
```diff
@@ -13,23 +13,59 @@
 from __future__ import annotations
 
 import asyncio
+import concurrent.futures
+import contextlib
+import os
 from dataclasses import dataclass
-
-from amazon_transcribe.client import TranscribeStreamingClient
-from amazon_transcribe.model import Result, TranscriptEvent
+from typing import Any
 
 from livekit import rtc
-from livekit.agents import
+from livekit.agents import (
+    DEFAULT_API_CONNECT_OPTIONS,
+    APIConnectOptions,
+    stt,
+    utils,
+)
 from livekit.agents.types import NOT_GIVEN, NotGivenOr
 from livekit.agents.utils import is_given
+from livekit.agents.voice.io import TimedString
 
 from .log import logger
-from .utils import
+from .utils import DEFAULT_REGION
+
+try:
+    from aws_sdk_transcribe_streaming.client import TranscribeStreamingClient  # type: ignore
+    from aws_sdk_transcribe_streaming.config import Config  # type: ignore
+    from aws_sdk_transcribe_streaming.models import (  # type: ignore
+        AudioEvent,
+        AudioStream,
+        AudioStreamAudioEvent,
+        BadRequestException,
+        Result,
+        StartStreamTranscriptionInput,
+        TranscriptEvent,
+        TranscriptResultStream,
+    )
+    from smithy_aws_core.identity.environment import EnvironmentCredentialsResolver
+    from smithy_core.aio.interfaces.eventstream import (
+        EventPublisher,
+        EventReceiver,
+    )
+
+    _AWS_SDK_AVAILABLE = True
+except ImportError:
+    _AWS_SDK_AVAILABLE = False
+
+
+@dataclass
+class Credentials:
+    access_key_id: str
+    secret_access_key: str
+    session_token: str | None = None
 
 
 @dataclass
 class STTOptions:
-    speech_region: str
     sample_rate: int
     language: str
     encoding: str
@@ -43,16 +79,15 @@ class STTOptions:
     enable_partial_results_stabilization: NotGivenOr[bool]
     partial_results_stability: NotGivenOr[str]
     language_model_name: NotGivenOr[str]
+    region: str
 
 
 class STT(stt.STT):
     def __init__(
         self,
         *,
-        # [2 removed lines not captured in the source diff view]
-        api_secret: NotGivenOr[str] = NOT_GIVEN,
-        sample_rate: int = 48000,
+        region: NotGivenOr[str] = NOT_GIVEN,
+        sample_rate: int = 24000,
         language: str = "en-US",
         encoding: str = "pcm",
         vocabulary_name: NotGivenOr[str] = NOT_GIVEN,
@@ -65,14 +100,24 @@ class STT(stt.STT):
         enable_partial_results_stabilization: NotGivenOr[bool] = NOT_GIVEN,
         partial_results_stability: NotGivenOr[str] = NOT_GIVEN,
         language_model_name: NotGivenOr[str] = NOT_GIVEN,
+        credentials: NotGivenOr[Credentials] = NOT_GIVEN,
     ):
-        super().__init__(
-        # [3 removed lines not captured in the source diff view]
+        super().__init__(
+            capabilities=stt.STTCapabilities(
+                streaming=True, interim_results=True, aligned_transcript="word"
+            )
         )
+
+        if not _AWS_SDK_AVAILABLE:
+            raise ImportError(
+                "The 'aws_sdk_transcribe_streaming' package is not installed. "
+                "This implementation requires Python 3.12+ and the 'aws_sdk_transcribe_streaming' dependency."
+            )
+
+        if not is_given(region):
+            region = os.getenv("AWS_REGION") or DEFAULT_REGION
+
         self._config = STTOptions(
-            speech_region=self._speech_region,
             language=language,
             sample_rate=sample_rate,
             encoding=encoding,
@@ -86,8 +131,26 @@ class STT(stt.STT):
             enable_partial_results_stabilization=enable_partial_results_stabilization,
             partial_results_stability=partial_results_stability,
             language_model_name=language_model_name,
+            region=region,
+        )
+
+        self._credentials = credentials if is_given(credentials) else None
+
+    @property
+    def model(self) -> str:
+        return (
+            self._config.language_model_name
+            if is_given(self._config.language_model_name)
+            else "unknown"
         )
 
+    @property
+    def provider(self) -> str:
+        return "Amazon Transcribe"
+
+    async def aclose(self) -> None:
+        await super().aclose()
+
     async def _recognize_impl(
         self,
         buffer: utils.AudioBuffer,
```
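Taken together, the constructor hunks above replace the old `api_secret`/`speech_region` keywords with a single `region` argument plus an optional `Credentials` dataclass, and raise `ImportError` when the new `aws_sdk_transcribe_streaming` dependency is missing. A minimal usage sketch based on the new signature; the import path follows the file layout and is not itself confirmed by the diff:

```python
# Hypothetical usage sketch inferred from the new STT constructor above.
import os

from livekit.plugins.aws.stt import STT, Credentials

stt_engine = STT(
    language="en-US",
    sample_rate=24000,  # the new default shown in the diff
    # region falls back to AWS_REGION, then DEFAULT_REGION, when omitted
    credentials=Credentials(
        access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
        secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
    ),
)
```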
```diff
@@ -107,6 +170,7 @@ class STT(stt.STT):
             stt=self,
             conn_options=conn_options,
             opts=self._config,
+            credentials=self._credentials,
         )
 
 
@@ -116,66 +180,132 @@ class SpeechStream(stt.SpeechStream):
         stt: STT,
         opts: STTOptions,
         conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
+        credentials: Credentials | None = None,
     ) -> None:
         super().__init__(stt=stt, conn_options=conn_options, sample_rate=opts.sample_rate)
         self._opts = opts
-        self.
+        self._credentials = credentials
 
     async def _run(self) -> None:
-        # [old _run body (41 removed lines) not captured in the source diff view]
+        while True:
+            config_kwargs: dict[str, Any] = {"region": self._opts.region}
+            if self._credentials:
+                config_kwargs["aws_access_key_id"] = self._credentials.access_key_id
+                config_kwargs["aws_secret_access_key"] = self._credentials.secret_access_key
+                config_kwargs["aws_session_token"] = self._credentials.session_token
+            else:
+                config_kwargs["aws_credentials_identity_resolver"] = (
+                    EnvironmentCredentialsResolver()
+                )
+
+            client: TranscribeStreamingClient = TranscribeStreamingClient(
+                config=Config(**config_kwargs)
+            )
+
+            live_config = {
+                "language_code": self._opts.language,
+                "media_sample_rate_hertz": self._opts.sample_rate,
+                "media_encoding": self._opts.encoding,
+                "vocabulary_name": self._opts.vocabulary_name,
+                "session_id": self._opts.session_id,
+                "vocab_filter_method": self._opts.vocab_filter_method,
+                "vocab_filter_name": self._opts.vocab_filter_name,
+                "show_speaker_label": self._opts.show_speaker_label,
+                "enable_channel_identification": self._opts.enable_channel_identification,
+                "number_of_channels": self._opts.number_of_channels,
+                "enable_partial_results_stabilization": self._opts.enable_partial_results_stabilization,
+                "partial_results_stability": self._opts.partial_results_stability,
+                "language_model_name": self._opts.language_model_name,
+            }
+            filtered_config = {k: v for k, v in live_config.items() if v and is_given(v)}
+
+            try:
+                stream = await client.start_stream_transcription(
+                    input=StartStreamTranscriptionInput(**filtered_config)
+                )
+
+                # Get the output stream
+                _, output_stream = await stream.await_output()
+
+                async def input_generator(
+                    audio_stream: EventPublisher[AudioStream],
+                ) -> None:
+                    try:
+                        async for frame in self._input_ch:
+                            if isinstance(frame, rtc.AudioFrame):
+                                await audio_stream.send(
+                                    AudioStreamAudioEvent(
+                                        value=AudioEvent(audio_chunk=frame.data.tobytes())
+                                    )
+                                )
+                        # Send empty frame to close
+                        await audio_stream.send(
+                            AudioStreamAudioEvent(value=AudioEvent(audio_chunk=b""))
+                        )
+                    finally:
+                        with contextlib.suppress(Exception):
+                            await audio_stream.close()
+
+                async def handle_transcript_events(
+                    output_stream: EventReceiver[TranscriptResultStream],
+                ) -> None:
+                    try:
+                        async for event in output_stream:
+                            if isinstance(event.value, TranscriptEvent):
+                                self._process_transcript_event(event.value)
+                    except concurrent.futures.InvalidStateError:
+                        logger.warning(
+                            "AWS Transcribe stream closed unexpectedly (InvalidStateError)"
+                        )
+                        pass
+
+                tasks = [
+                    asyncio.create_task(input_generator(stream.input_stream)),
+                    asyncio.create_task(handle_transcript_events(output_stream)),
+                ]
+                gather_future = asyncio.gather(*tasks)
+
+                await asyncio.shield(gather_future)
+            except BadRequestException as e:
+                if e.message and e.message.startswith("Your request timed out"):
+                    # AWS times out after 15s of inactivity, this tends to happen
+                    # at the end of the session, when the input is gone, we'll ignore it and
+                    # just treat it as a silent retry
+                    logger.info("restarting transcribe session")
+                    continue
+                else:
+                    raise e
+            finally:
+                # Close input stream first
+                await utils.aio.gracefully_cancel(tasks[0])
+
+                # Wait for output stream to close cleanly
+                try:
+                    await asyncio.wait_for(tasks[1], timeout=3.0)
+                except (asyncio.TimeoutError, asyncio.CancelledError):
+                    await utils.aio.gracefully_cancel(tasks[1])
+
+                # Ensure gather future is retrieved to avoid "exception never retrieved"
+                with contextlib.suppress(Exception):
+                    await gather_future
+
+    def _process_transcript_event(self, transcript_event: TranscriptEvent) -> None:
+        if not transcript_event.transcript or not transcript_event.transcript.results:
+            return
+
         stream = transcript_event.transcript.results
         for resp in stream:
-            if resp.start_time and resp.start_time == 0.0:
+            if resp.start_time is not None and resp.start_time == 0.0:
                 self._event_ch.send_nowait(
                     stt.SpeechEvent(type=stt.SpeechEventType.START_OF_SPEECH)
                 )
 
-            if resp.end_time and resp.end_time > 0.0:
+            if resp.end_time is not None and resp.end_time > 0.0:
                 if resp.is_partial:
                     self._event_ch.send_nowait(
                         stt.SpeechEvent(
                             type=stt.SpeechEventType.INTERIM_TRANSCRIPT,
-                            alternatives=[_streaming_recognize_response_to_speech_data(resp)],
+                            alternatives=[self._streaming_recognize_response_to_speech_data(resp)],
                         )
                     )
 
```
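The rewritten `_run` above wraps each Transcribe session in a `while True` loop and treats AWS's roughly 15-second inactivity timeout, surfaced as a `BadRequestException` whose message starts with "Your request timed out", as a cue to reconnect silently. A stripped-down sketch of that reconnect shape, with the session body reduced to a hypothetical helper:

```python
# Simplified sketch of the reconnect loop in _run above. run_session() is a
# hypothetical stand-in for the stream setup and task management it performs;
# the real loop's exit conditions differ slightly.
async def run_with_reconnect() -> None:
    while True:
        try:
            await run_session()  # hypothetical helper
        except BadRequestException as e:
            if e.message and e.message.startswith("Your request timed out"):
                # AWS closes idle streams after ~15s of silence;
                # reconnect instead of surfacing an error
                continue
            raise
        break  # normal completion
```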
```diff
@@ -183,21 +313,34 @@ class SpeechStream(stt.SpeechStream):
                 self._event_ch.send_nowait(
                     stt.SpeechEvent(
                         type=stt.SpeechEventType.FINAL_TRANSCRIPT,
-                        alternatives=[_streaming_recognize_response_to_speech_data(resp)],
+                        alternatives=[self._streaming_recognize_response_to_speech_data(resp)],
                     )
                 )
 
             if not resp.is_partial:
                 self._event_ch.send_nowait(stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH))
 
+    def _streaming_recognize_response_to_speech_data(self, resp: Result) -> stt.SpeechData:
+        confidence = 0.0
+        if resp.alternatives and (items := resp.alternatives[0].items):
+            confidence = items[0].confidence or 0.0
 
-# [old module-level helper (10 removed lines) not captured in the source diff view]
+        return stt.SpeechData(
+            language=resp.language_code or self._opts.language,
+            start_time=(resp.start_time or 0.0) + self.start_time_offset,
+            end_time=(resp.end_time or 0.0) + self.start_time_offset,
+            text=resp.alternatives[0].transcript if resp.alternatives else "",
+            confidence=confidence,
+            words=[
+                TimedString(
+                    text=item.content,
+                    start_time=item.start_time + self.start_time_offset,
+                    end_time=item.end_time + self.start_time_offset,
+                    start_time_offset=self.start_time_offset,
+                    confidence=item.confidence or 0.0,
+                )
+                for item in resp.alternatives[0].items
+            ]
+            if resp.alternatives and resp.alternatives[0].items
+            else None,
+        )
```
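The new `_streaming_recognize_response_to_speech_data` helper attaches per-word `TimedString` entries, which backs the `aligned_transcript="word"` capability declared in the constructor. A hypothetical consumer sketch; `speech_stream` stands in for a `SpeechStream` instance, and printing a `TimedString` directly assumes it behaves like a string (its timing attributes are assumed from how it is constructed above):

```python
# Hypothetical consumer: iterate final transcripts and print word timings.
async for event in speech_stream:
    if event.type == stt.SpeechEventType.FINAL_TRANSCRIPT:
        data = event.alternatives[0]
        for word in data.words or []:
            print(f"{word.start_time:6.2f}s-{word.end_time:6.2f}s  {word}")
```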
livekit/plugins/aws/tts.py
CHANGED
```diff
@@ -12,20 +12,19 @@
 
 from __future__ import annotations
 
-import
-from
-from typing import Any, Callable
+from dataclasses import dataclass, replace
+from typing import cast
 
-import
-# [1 removed line not captured in the source diff view]
+import aioboto3  # type: ignore
+import botocore  # type: ignore
+import botocore.exceptions  # type: ignore
+from aiobotocore.config import AioConfig  # type: ignore
 
 from livekit.agents import (
     APIConnectionError,
     APIConnectOptions,
-    APIStatusError,
     APITimeoutError,
     tts,
-    utils,
 )
 from livekit.agents.types import (
     DEFAULT_API_CONNECT_OPTIONS,
@@ -34,38 +33,38 @@ from livekit.agents.types import (
 )
 from livekit.agents.utils import is_given
 
-from .models import
-from .utils import _strip_nones
+from .models import TTSLanguages, TTSSpeechEngine, TTSTextType
+from .utils import _strip_nones
 
-
-DEFAULT_SPEECH_ENGINE: TTS_SPEECH_ENGINE = "generative"
-DEFAULT_SPEECH_REGION = "us-east-1"
+DEFAULT_SPEECH_ENGINE: TTSSpeechEngine = "generative"
 DEFAULT_VOICE = "Ruth"
-
+DEFAULT_TEXT_TYPE: TTSTextType = "text"
 
 
 @dataclass
 class _TTSOptions:
     # https://docs.aws.amazon.com/polly/latest/dg/API_SynthesizeSpeech.html
-    voice:
-    speech_engine:
-
+    voice: str
+    speech_engine: TTSSpeechEngine
+    region: str | None
     sample_rate: int
-    language:
+    language: TTSLanguages | str | None
+    text_type: TTSTextType
 
 
 class TTS(tts.TTS):
     def __init__(
         self,
         *,
-        voice:
-        language: NotGivenOr[
-        speech_engine:
-        # [5 removed lines not captured in the source diff view]
+        voice: str = "Ruth",
+        language: NotGivenOr[TTSLanguages | str] = NOT_GIVEN,
+        speech_engine: TTSSpeechEngine = "generative",
+        text_type: TTSTextType = "text",
+        sample_rate: int = 16000,
+        region: str | None = None,
+        api_key: str | None = None,
+        api_secret: str | None = None,
+        session: aioboto3.Session | None = None,
     ) -> None:
         """
         Create a new instance of AWS Polly TTS.
@@ -76,130 +75,111 @@ class TTS(tts.TTS):
         See https://docs.aws.amazon.com/polly/latest/dg/API_SynthesizeSpeech.html for more details on the the AWS Polly TTS.
 
         Args:
-
-            language (
+            voice (TTSModels, optional): Voice ID to use for the synthesis. Defaults to "Ruth".
+            language (TTSLanguages, optional): language code for the Synthesize Speech request. This is only necessary if using a bilingual voice, such as Aditi, which can be used for either Indian English (en-IN) or Hindi (hi-IN).
+            speech_engine(TTSSpeechEngine, optional): The engine to use for the synthesis. Defaults to "generative".
+            text_type(TTSTextType, optional): Type of text to synthesize. Use "ssml" for SSML-enhanced text. Defaults to "text".
             sample_rate(int, optional): The audio frequency specified in Hz. Defaults to 16000.
-
-            speech_region(str, optional): The region to use for the synthesis. Defaults to "us-east-1".
+            region(str, optional): The region to use for the synthesis. Defaults to "us-east-1".
             api_key(str, optional): AWS access key id.
             api_secret(str, optional): AWS secret access key.
+            session(aioboto3.Session, optional): Optional aioboto3 session to use.
         """  # noqa: E501
         super().__init__(
            capabilities=tts.TTSCapabilities(
                streaming=False,
            ),
            sample_rate=sample_rate,
-            num_channels=
+            num_channels=1,
         )
-        # [3 removed lines not captured in the source diff view]
+        self._session = session or aioboto3.Session(
+            aws_access_key_id=api_key if is_given(api_key) else None,
+            aws_secret_access_key=api_secret if is_given(api_secret) else None,
+            region_name=region if is_given(region) else None,
         )
 
         self._opts = _TTSOptions(
             voice=voice,
             speech_engine=speech_engine,
-            # [2 removed lines not captured in the source diff view]
+            text_type=text_type,
+            region=region or None,
+            language=language or None,
             sample_rate=sample_rate,
         )
-
-
-    def
-        return self.
-        # [4 removed lines not captured in the source diff view]
-        )
+
+    @property
+    def model(self) -> str:
+        return self._opts.speech_engine
+
+    @property
+    def provider(self) -> str:
+        return "Amazon Polly"
 
     def synthesize(
+        self, text: str, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
+    ) -> ChunkedStream:
+        return ChunkedStream(tts=self, text=text, conn_options=conn_options)
+
+    def update_options(
         self,
-        text: str,
         *,
-        # [6 removed lines not captured in the source diff view]
+        voice: NotGivenOr[str] = NOT_GIVEN,
+        language: NotGivenOr[str] = NOT_GIVEN,
+        speech_engine: NotGivenOr[TTSSpeechEngine] = NOT_GIVEN,
+        text_type: NotGivenOr[TTSTextType] = NOT_GIVEN,
+    ) -> None:
+        if is_given(voice):
+            self._opts.voice = voice
+        if is_given(language):
+            self._opts.language = language
+        if is_given(speech_engine):
+            self._opts.speech_engine = cast(TTSSpeechEngine, speech_engine)
+        if is_given(text_type):
+            self._opts.text_type = cast(TTSTextType, text_type)
 
 
 class ChunkedStream(tts.ChunkedStream):
     def __init__(
-        self,
-        *,
-        tts: TTS,
-        text: str,
-        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
-        opts: _TTSOptions,
-        get_client: Callable[[], Any],
+        self, *, tts: TTS, text: str, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
    ) -> None:
        super().__init__(tts=tts, input_text=text, conn_options=conn_options)
-        self.
-        self.
-        self._segment_id = utils.shortuuid()
-
-    async def _run(self):
-        request_id = utils.shortuuid()
+        self._tts = tts
+        self._opts = replace(tts._opts)
 
+    async def _run(self, output_emitter: tts.AudioEmitter) -> None:
         try:
-            # [old synthesis body (13 removed lines) not captured in the source diff view]
+            config = AioConfig(
+                connect_timeout=self._conn_options.timeout,
+                read_timeout=10,
+                retries={"mode": "standard", "total_max_attempts": 1},
+            )
+            async with self._tts._session.client("polly", config=config) as client:  # type: ignore
+                response = await client.synthesize_speech(
+                    **_strip_nones(
+                        {
+                            "Text": self._input_text,
+                            "OutputFormat": "mp3",
+                            "Engine": self._opts.speech_engine,
+                            "VoiceId": self._opts.voice,
+                            "TextType": self._opts.text_type,
+                            "SampleRate": str(self._opts.sample_rate),
+                            "LanguageCode": self._opts.language,
+                        }
+                    )
+                )
+
                 if "AudioStream" in response:
-
+                    output_emitter.initialize(
+                        request_id=response["ResponseMetadata"]["RequestId"],
                         sample_rate=self._opts.sample_rate,
                         num_channels=1,
+                        mime_type="audio/mp3",
                     )
 
-                    # [old lines 169-173 not captured in the source diff view]
-                            decoder.push(data)
-                    finally:
-                        decoder.end_input()
-
-                # Start pushing data to the decoder
-                push_task = asyncio.create_task(push_data())
-
-                try:
-                    # Create emitter and process decoded frames
-                    emitter = tts.SynthesizedAudioEmitter(
-                        event_ch=self._event_ch,
-                        request_id=request_id,
-                        segment_id=self._segment_id,
-                    )
-                    async for frame in decoder:
-                        emitter.push(frame)
-                    emitter.flush()
-                    await push_task
-                finally:
-                    await utils.aio.gracefully_cancel(push_task)
-
-        except asyncio.TimeoutError as e:
-            raise APITimeoutError() from e
-        except aiohttp.ClientResponseError as e:
-            raise APIStatusError(
-                message=e.message,
-                status_code=e.status,
-                request_id=request_id,
-                body=None,
-            ) from e
+                    async with response["AudioStream"] as resp:
+                        async for data, _ in resp.content.iter_chunks():
+                            output_emitter.push(data)
+        except botocore.exceptions.ConnectTimeoutError:
+            raise APITimeoutError() from None
         except Exception as e:
             raise APIConnectionError() from e
```