PyPI - rasa-pro - Versions diffs - 3.11.0a4.dev2__py3-none-any.whl → 3.11.0rc1__py3-none-any.whl - Mend

rasa-pro 3.11.0a4.dev2__py3-none-any.whl → 3.11.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of rasa-pro might be problematic. Click here for more details.

Files changed (163) hide show

rasa/core/channels/voice_stream/tts/cartesia.py CHANGED Viewed

@@ -3,13 +3,13 @@ from typing import AsyncIterator, Dict, Optional
 import os
 import aiohttp
 import structlog
-from aiohttp import ClientConnectorError
+from aiohttp import ClientConnectorError, ClientTimeout
 from rasa.core.channels.voice_stream.tts.tts_engine import (
     TTSEngineConfig,
 )
-from rasa.core.channels.voice_stream.audio_bytes import RasaAudioBytes
+from rasa.core.channels.voice_stream.audio_bytes import HERTZ, RasaAudioBytes
 from rasa.core.channels.voice_stream.tts.tts_engine import TTSEngine, TTSError
 from rasa.shared.exceptions import ConnectionException
@@ -29,10 +29,11 @@ class CartesiaTTS(TTSEngine[CartesiaTTSConfig]):
     def __init__(self, config: Optional[CartesiaTTSConfig] = None):
         super().__init__(config)
+        timeout = ClientTimeout(total=self.config.timeout)
         # Have to create this class-shared session lazily at run time otherwise
         # the async event loop doesn't work
         if self.__class__.session is None or self.__class__.session.closed:
-            self.__class__.session = aiohttp.ClientSession()
+            self.__class__.session = aiohttp.ClientSession(timeout=timeout)
     @staticmethod
     def get_tts_endpoint() -> str:
@@ -55,13 +56,13 @@ class CartesiaTTS(TTSEngine[CartesiaTTSConfig]):
             "output_format": {
                 "container": "raw",
                 "encoding": "pcm_mulaw",
-                "sample_rate": 8000,
+                "sample_rate": HERTZ,
             },
         }
     @staticmethod
     def get_request_headers(config: CartesiaTTSConfig) -> dict[str, str]:
-        cartesia_api_key = os.environ.get(CARTESIA_API_KEY)
+        cartesia_api_key = os.environ[CARTESIA_API_KEY]
         return {
             "Cartesia-Version": str(config.version),
             "Content-Type": "application/json",
@@ -88,13 +89,15 @@ class CartesiaTTS(TTSEngine[CartesiaTTSConfig]):
                     return
                 else:
                     structlogger.error(
-                        "azure.synthesize.rest.failed",
+                        "cartesia.synthesize.rest.failed",
                         status_code=response.status,
                         msg=response.text(),
                     )
                     raise TTSError(f"TTS failed: {response.text()}")
         except ClientConnectorError as e:
             raise TTSError(e)
+        except TimeoutError as e:
+            raise TTSError(e)
     def engine_bytes_to_rasa_audio_bytes(self, chunk: bytes) -> RasaAudioBytes:
         """Convert the generated tts audio bytes into rasa audio bytes."""
@@ -105,6 +108,7 @@ class CartesiaTTS(TTSEngine[CartesiaTTSConfig]):
         return CartesiaTTSConfig(
             language="en",
             voice="248be419-c632-4f23-adf1-5324ed7dbf1d",
+            timeout=10,
             model_id="sonic-english",
             version="2024-06-10",
         )

rasa/core/channels/voice_stream/tts/tts_engine.py CHANGED Viewed

@@ -18,6 +18,7 @@ E = TypeVar("E", bound="TTSEngine")
 class TTSEngineConfig(MergeableConfig):
     language: Optional[str] = None
     voice: Optional[str] = None
+    timeout: Optional[int] = None
 class TTSEngine(Generic[T]):

rasa/core/channels/voice_stream/twilio_media_streams.py CHANGED Viewed

@@ -1,15 +1,17 @@
 import base64
 import json
-import structlog
-from typing import Any, Awaitable, Callable, Dict, List, Optional, Text
 import uuid
+import structlog
+from typing import Any, Awaitable, Callable, Dict, Optional, Text, Tuple
 from sanic import Blueprint, HTTPResponse, Request, response
 from sanic import Websocket  # type: ignore
 from rasa.core.channels import UserMessage
 from rasa.core.channels.voice_ready.utils import CallParameters
+from rasa.core.channels.voice_stream.call_state import call_state
 from rasa.core.channels.voice_stream.tts.tts_engine import TTSEngine
 from rasa.core.channels.voice_stream.audio_bytes import RasaAudioBytes
 from rasa.core.channels.voice_stream.voice_channel import (
@@ -21,7 +23,7 @@ from rasa.core.channels.voice_stream.voice_channel import (
     VoiceOutputChannel,
 )
-structlogger = structlog.get_logger()
+logger = structlog.get_logger(__name__)
 def map_call_params(data: Dict[Text, Any]) -> CallParameters:
@@ -47,10 +49,18 @@ class TwilioMediaStreamsOutputChannel(VoiceOutputChannel):
     ) -> bytes:
         return base64.b64encode(rasa_audio_bytes)
-    def channel_bytes_to_messages(
-        self, recipient_id: str, channel_bytes: bytes
-    ) -> List[Any]:
+    def create_marker_message(self, recipient_id: str) -> Tuple[str, str]:
         message_id = uuid.uuid4().hex
+        mark_message = json.dumps(
+            {
+                "event": "mark",
+                "streamSid": recipient_id,
+                "mark": {"name": message_id},
+            }
+        )
+        return mark_message, message_id
+    def channel_bytes_to_message(self, recipient_id: str, channel_bytes: bytes) -> str:
         media_message = json.dumps(
             {
                 "event": "media",
@@ -60,15 +70,7 @@ class TwilioMediaStreamsOutputChannel(VoiceOutputChannel):
                 },
             }
         )
-        mark_message = json.dumps(
-            {
-                "event": "mark",
-                "streamSid": recipient_id,
-                "mark": {"name": message_id},
-            }
-        )
-        self.latest_message_id = message_id
-        return [media_message, mark_message]
+        return media_message
 class TwilioMediaStreamsInputChannel(VoiceInputChannel):
@@ -103,9 +105,16 @@ class TwilioMediaStreamsInputChannel(VoiceInputChannel):
         elif data["event"] == "stop":
             return EndConversationAction()
         elif data["event"] == "mark":
-            if data["mark"]["name"] == self.hangup_after:
-                structlogger.debug("twilio_streams.hangup", marker=self.hangup_after)
-                return EndConversationAction()
+            if data["mark"]["name"] == call_state.latest_bot_audio_id:
+                # Just finished streaming last audio bytes
+                call_state.is_bot_speaking = False  # type: ignore[attr-defined]
+                if call_state.should_hangup:
+                    logger.debug(
+                        "twilio_streams.hangup", marker=call_state.latest_bot_audio_id
+                    )
+                    return EndConversationAction()
+            else:
+                call_state.is_bot_speaking = True  # type: ignore[attr-defined]
         return ContinueConversationAction()
     def create_output_channel(

rasa/core/channels/voice_stream/util.py CHANGED Viewed

@@ -5,7 +5,7 @@ from typing import Optional, Type, TypeVar
 import structlog
-from rasa.core.channels.voice_stream.audio_bytes import RasaAudioBytes
+from rasa.core.channels.voice_stream.audio_bytes import HERTZ, RasaAudioBytes
 from rasa.shared.exceptions import RasaException
 structlogger = structlog.get_logger()
@@ -23,16 +23,16 @@ def read_wav_to_rasa_audio_bytes(file_name: str) -> Optional[RasaAudioBytes]:
         wave_data = audioop.lin2lin(wave_data, wave_object.getsampwidth(), 1)
         # 8 bit is unsigned
         # wave_data = audioop.bias(wave_data, 1, 128)
-    if wave_object.getframerate() != 8000:
+    if wave_object.getframerate() != HERTZ:
         wave_data, _ = audioop.ratecv(
-            wave_data, 1, 1, wave_object.getframerate(), 8000, None
+            wave_data, 1, 1, wave_object.getframerate(), HERTZ, None
         )
     wave_data = audioop.lin2ulaw(wave_data, 1)
     return RasaAudioBytes(wave_data)
 def generate_silence(length_in_seconds: float = 1.0) -> RasaAudioBytes:
-    return RasaAudioBytes(b"\00" * int(length_in_seconds * 8000))
+    return RasaAudioBytes(b"\00" * int(length_in_seconds * HERTZ))
 T = TypeVar("T", bound="MergeableConfig")

rasa/core/channels/voice_stream/voice_channel.py CHANGED Viewed

@@ -1,25 +1,47 @@
 import asyncio
-import logging
+import structlog
 import copy
 from dataclasses import asdict, dataclass
-from typing import Any, Awaitable, Callable, Dict, List, Optional
+from typing import Any, AsyncIterator, Awaitable, Callable, Dict, Optional, Tuple
+from rasa.core.channels.voice_stream.util import generate_silence
+from rasa.shared.core.constants import (
+    SILENCE_TIMEOUT_DEFAULT_VALUE,
+    SLOT_SILENCE_TIMEOUT,
+)
+from rasa.shared.utils.common import (
+    class_from_module_path,
+    mark_as_beta_feature,
+)
+from rasa.shared.utils.cli import print_error_and_exit
 from sanic.exceptions import ServerError, WebsocketClosed
 from rasa.core.channels import InputChannel, OutputChannel, UserMessage
 from rasa.core.channels.voice_ready.utils import CallParameters
+from rasa.core.channels.voice_ready.utils import validate_voice_license_scope
 from rasa.core.channels.voice_stream.asr.asr_engine import ASREngine
-from rasa.core.channels.voice_stream.asr.asr_event import ASREvent, NewTranscript
+from rasa.core.channels.voice_stream.asr.asr_event import (
+    ASREvent,
+    NewTranscript,
+    UserStartedSpeaking,
+)
 from sanic import Websocket  # type: ignore
 from rasa.core.channels.voice_stream.asr.deepgram import DeepgramASR
-from rasa.core.channels.voice_stream.audio_bytes import RasaAudioBytes
+from rasa.core.channels.voice_stream.asr.azure import AzureASR
+from rasa.core.channels.voice_stream.audio_bytes import HERTZ, RasaAudioBytes
+from rasa.core.channels.voice_stream.call_state import (
+    CallState,
+    _call_state,
+    call_state,
+)
 from rasa.core.channels.voice_stream.tts.azure import AzureTTS
 from rasa.core.channels.voice_stream.tts.tts_engine import TTSEngine, TTSError
 from rasa.core.channels.voice_stream.tts.cartesia import CartesiaTTS
 from rasa.core.channels.voice_stream.tts.tts_cache import TTSCache
-logger = logging.getLogger(__name__)
+logger = structlog.get_logger(__name__)
 @dataclass
@@ -43,25 +65,55 @@ class ContinueConversationAction(VoiceChannelAction):
 def asr_engine_from_config(asr_config: Dict) -> ASREngine:
-    name = str(asr_config["name"]).lower()
+    name = str(asr_config["name"])
     asr_config = copy.copy(asr_config)
     asr_config.pop("name")
-    if name == "deepgram":
+    if name.lower() == "deepgram":
         return DeepgramASR.from_config_dict(asr_config)
+    if name == "azure":
+        return AzureASR.from_config_dict(asr_config)
     else:
-        raise NotImplementedError
+        mark_as_beta_feature("Custom ASR Engine")
+        try:
+            asr_engine_class = class_from_module_path(name)
+            return asr_engine_class.from_config_dict(asr_config)
+        except NameError:
+            print_error_and_exit(
+                f"Failed to initialize ASR Engine with type '{name}'. "
+                f"Please make sure the method `from_config_dict`is implemented."
+            )
+        except TypeError as e:
+            print_error_and_exit(
+                f"Failed to initialize ASR Engine with type '{name}'. "
+                f"Invalid configuration provided. "
+                f"Error: {e}"
+            )
 def tts_engine_from_config(tts_config: Dict) -> TTSEngine:
-    name = str(tts_config["name"]).lower()
+    name = str(tts_config["name"])
     tts_config = copy.copy(tts_config)
     tts_config.pop("name")
-    if name == "azure":
+    if name.lower() == "azure":
         return AzureTTS.from_config_dict(tts_config)
-    elif name == "cartesia":
+    elif name.lower() == "cartesia":
         return CartesiaTTS.from_config_dict(tts_config)
     else:
-        raise NotImplementedError(f"TTS engine {name} is not implemented")
+        mark_as_beta_feature("Custom TTS Engine")
+        try:
+            tts_engine_class = class_from_module_path(name)
+            return tts_engine_class.from_config_dict(tts_config)
+        except NameError:
+            print_error_and_exit(
+                f"Failed to initialize TTS Engine with type '{name}'. "
+                f"Please make sure the method `from_config_dict`is implemented."
+            )
+        except TypeError as e:
+            print_error_and_exit(
+                f"Failed to initialize ASR Engine with type '{name}'. "
+                f"Invalid configuration provided. "
+                f"Error: {e}"
+            )
 class VoiceOutputChannel(OutputChannel):
@@ -71,70 +123,131 @@ class VoiceOutputChannel(OutputChannel):
         tts_engine: TTSEngine,
         tts_cache: TTSCache,
     ):
+        super().__init__()
         self.voice_websocket = voice_websocket
         self.tts_engine = tts_engine
         self.tts_cache = tts_cache
-        self.should_hangup = False
         self.latest_message_id: Optional[str] = None
     def rasa_audio_bytes_to_channel_bytes(
         self, rasa_audio_bytes: RasaAudioBytes
     ) -> bytes:
+        """Turn rasa's audio byte format into the format for the channel."""
+        raise NotImplementedError
+    def channel_bytes_to_message(self, recipient_id: str, channel_bytes: bytes) -> str:
+        """Wrap the bytes for the channel in the proper format."""
         raise NotImplementedError
-    def channel_bytes_to_messages(
-        self, recipient_id: str, channel_bytes: bytes
-    ) -> List[Any]:
+    def create_marker_message(self, recipient_id: str) -> Tuple[str, str]:
+        """Create a marker message for a specific channel."""
         raise NotImplementedError
+    async def send_marker_message(self, recipient_id: str) -> None:
+        """Send a message that marks positions in the audio stream."""
+        marker_message, mark_id = self.create_marker_message(recipient_id)
+        await self.voice_websocket.send(marker_message)
+        self.latest_message_id = mark_id
+    def update_silence_timeout(self) -> None:
+        """Updates the silence timeout for the session."""
+        if self.tracker_state:
+            call_state.silence_timeout = (  # type: ignore[attr-defined]
+                self.tracker_state["slots"][SLOT_SILENCE_TIMEOUT]
+            )
     async def send_text_message(
         self, recipient_id: str, text: str, **kwargs: Any
     ) -> None:
+        self.update_silence_timeout()
         cached_audio_bytes = self.tts_cache.get(text)
-        if cached_audio_bytes:
-            await self.send_audio_bytes(recipient_id, cached_audio_bytes)
-            return
         collected_audio_bytes = RasaAudioBytes(b"")
-        # Todo: make kwargs compatible with engine config
-        synth_config = self.tts_engine.config.__class__.from_dict({})
-        try:
-            audio_stream = self.tts_engine.synthesize(text, synth_config)
-        except TTSError:
-            # TODO: add message that works without tts, e.g. loading from disc
-            pass
+        seconds_marker = -1
+        if cached_audio_bytes:
+            audio_stream = self.chunk_audio(cached_audio_bytes)
+        else:
+            # Todo: make kwargs compatible with engine config
+            synth_config = self.tts_engine.config.__class__.from_dict({})
+            try:
+                audio_stream = self.tts_engine.synthesize(text, synth_config)
+            except TTSError:
+                # TODO: add message that works without tts, e.g. loading from disc
+                audio_stream = self.chunk_audio(generate_silence())
         async for audio_bytes in audio_stream:
             try:
                 await self.send_audio_bytes(recipient_id, audio_bytes)
+                full_seconds_of_audio = len(collected_audio_bytes) // HERTZ
+                if full_seconds_of_audio > seconds_marker:
+                    await self.send_marker_message(recipient_id)
+                    seconds_marker = full_seconds_of_audio
             except (WebsocketClosed, ServerError):
                 # ignore sending error, and keep collecting and caching audio bytes
-                self.should_hangup = True
+                call_state.connection_failed = True  # type: ignore[attr-defined]
             collected_audio_bytes = RasaAudioBytes(collected_audio_bytes + audio_bytes)
+        try:
+            await self.send_marker_message(recipient_id)
+        except (WebsocketClosed, ServerError):
+            # ignore sending error
+            pass
+        call_state.latest_bot_audio_id = self.latest_message_id  # type: ignore[attr-defined]
-        self.tts_cache.put(text, collected_audio_bytes)
+        if not cached_audio_bytes:
+            self.tts_cache.put(text, collected_audio_bytes)
     async def send_audio_bytes(
         self, recipient_id: str, audio_bytes: RasaAudioBytes
     ) -> None:
         channel_bytes = self.rasa_audio_bytes_to_channel_bytes(audio_bytes)
-        for message in self.channel_bytes_to_messages(recipient_id, channel_bytes):
-            await self.voice_websocket.send(message)
+        message = self.channel_bytes_to_message(recipient_id, channel_bytes)
+        await self.voice_websocket.send(message)
+    async def chunk_audio(
+        self, audio_bytes: RasaAudioBytes, chunk_size: int = 2048
+    ) -> AsyncIterator[RasaAudioBytes]:
+        """Generate chunks from cached audio bytes."""
+        offset = 0
+        while offset < len(audio_bytes):
+            chunk = audio_bytes[offset : offset + chunk_size]
+            if len(chunk):
+                yield RasaAudioBytes(chunk)
+            offset += chunk_size
+        return
     async def hangup(self, recipient_id: str, **kwargs: Any) -> None:
-        self.should_hangup = True
+        call_state.should_hangup = True  # type: ignore[attr-defined]
 class VoiceInputChannel(InputChannel):
     def __init__(self, server_url: str, asr_config: Dict, tts_config: Dict):
+        validate_voice_license_scope()
         self.server_url = server_url
         self.asr_config = asr_config
         self.tts_config = tts_config
         self.tts_cache = TTSCache(tts_config.get("cache_size", 1000))
-        # if set to a value, call will be hungup after marker is reached
-        self.hangup_after: Optional[str] = None
+    async def handle_silence_timeout(
+        self,
+        voice_websocket: Websocket,
+        on_new_message: Callable[[UserMessage], Awaitable[Any]],
+        tts_engine: TTSEngine,
+        call_parameters: CallParameters,
+    ) -> None:
+        timeout = call_state.silence_timeout or SILENCE_TIMEOUT_DEFAULT_VALUE
+        logger.info("voice_channel.silence_timeout_watch_started", timeout=timeout)
+        await asyncio.sleep(timeout)
+        logger.info("voice_channel.silence_timeout_tripped")
+        output_channel = self.create_output_channel(voice_websocket, tts_engine)
+        message = UserMessage(
+            "/silence_timeout",
+            output_channel,
+            call_parameters.stream_id,
+            input_channel=self.name(),
+            metadata=asdict(call_parameters),
+        )
+        await on_new_message(message)
     @classmethod
     def from_credentials(cls, credentials: Optional[Dict[str, Any]]) -> InputChannel:
@@ -179,6 +292,7 @@ class VoiceInputChannel(InputChannel):
         channel_websocket: Websocket,
     ) -> None:
         """Pipe input audio to ASR and consume ASR events simultaneously."""
+        _call_state.set(CallState())
         asr_engine = asr_engine_from_config(self.asr_config)
         tts_engine = tts_engine_from_config(self.tts_config)
         await asr_engine.connect()
@@ -192,7 +306,26 @@ class VoiceInputChannel(InputChannel):
         async def consume_audio_bytes() -> None:
             async for message in channel_websocket:
+                is_bot_speaking_before = call_state.is_bot_speaking
                 channel_action = self.map_input_message(message)
+                is_bot_speaking_after = call_state.is_bot_speaking
+                if not is_bot_speaking_before and is_bot_speaking_after:
+                    logger.info("voice_channel.bot_started_speaking")
+                # we just stopped speaking, starting a watcher for silence timeout
+                if is_bot_speaking_before and not is_bot_speaking_after:
+                    logger.info("voice_channel.bot_stopped_speaking")
+                    call_state.silence_timeout_watcher = (  # type: ignore[attr-defined]
+                        asyncio.create_task(
+                            self.handle_silence_timeout(
+                                channel_websocket,
+                                on_new_message,
+                                tts_engine,
+                                call_parameters,
+                            )
+                        )
+                    )
                 if isinstance(channel_action, NewAudioAction):
                     await asr_engine.send_audio_chunks(channel_action.audio_bytes)
                 elif isinstance(channel_action, EndConversationAction):
@@ -232,7 +365,10 @@ class VoiceInputChannel(InputChannel):
     ) -> None:
         """Handle a new event from the ASR system."""
         if isinstance(e, NewTranscript) and e.text:
-            logger.info(f"New transcript: {e.text}")
+            logger.info(
+                "VoiceInputChannel.handle_asr_event.new_transcript", transcript=e.text
+            )
+            call_state.is_user_speaking = False  # type: ignore[attr-defined]
             output_channel = self.create_output_channel(voice_websocket, tts_engine)
             message = UserMessage(
                 e.text,
@@ -242,6 +378,8 @@ class VoiceInputChannel(InputChannel):
                 metadata=asdict(call_parameters),
             )
             await on_new_message(message)
-            if output_channel.should_hangup:
-                self.hangup_after = output_channel.latest_message_id
+        elif isinstance(e, UserStartedSpeaking):
+            if call_state.silence_timeout_watcher:
+                call_state.silence_timeout_watcher.cancel()
+                call_state.silence_timeout_watcher = None  # type: ignore[attr-defined]
+            call_state.is_user_speaking = True  # type: ignore[attr-defined]

rasa/core/featurizers/single_state_featurizer.py CHANGED Viewed

@@ -1,7 +1,8 @@
 import logging
+from typing import List, Optional, Dict, Text, Set, Any
 import numpy as np
 import scipy.sparse
-from typing import List, Optional, Dict, Text, Set, Any
 from rasa.core.featurizers.precomputation import MessageContainerForCoreFeaturization
 from rasa.nlu.extractors.extractor import EntityTagSpec
@@ -360,6 +361,26 @@ class SingleStateFeaturizer:
             for action in domain.action_names_or_texts
         ]
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            "action_texts": self.action_texts,
+            "entity_tag_specs": self.entity_tag_specs,
+            "feature_states": self._default_feature_states,
+        }
+    @classmethod
+    def create_from_dict(
+        cls, data: Dict[str, Any]
+    ) -> Optional["SingleStateFeaturizer"]:
+        if not data:
+            return None
+        featurizer = SingleStateFeaturizer()
+        featurizer.action_texts = data["action_texts"]
+        featurizer._default_feature_states = data["feature_states"]
+        featurizer.entity_tag_specs = data["entity_tag_specs"]
+        return featurizer
 class IntentTokenizerSingleStateFeaturizer(SingleStateFeaturizer):
     """A SingleStateFeaturizer for use with policies that predict intent labels."""

rasa-pro 3.11.0a4.dev2__py3-none-any.whl → 3.11.0rc1__py3-none-any.whl

Potentially problematic release.

rasa-pro 3.11.0a4.dev2__py3-none-any.whl → 3.11.0rc1__py3-none-any.whl