solana-agent 31.2.6__py3-none-any.whl → 31.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -102,16 +102,30 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
  ]
  model = self.options.model or "gpt-realtime"
  uri = f"{self.url}?model={model}"
- logger.info(
- "Realtime WS connecting: uri=%s, input=%s@%sHz, output=%s@%sHz, voice=%s, vad=%s",
- uri,
- self.options.input_mime,
- self.options.input_rate_hz,
- self.options.output_mime,
- self.options.output_rate_hz,
- self.options.voice,
- self.options.vad_enabled,
- )
+
+ # Determine if audio output should be configured for logging
+ modalities = self.options.output_modalities or ["audio", "text"]
+ should_configure_audio_output = "audio" in modalities
+
+ if should_configure_audio_output:
+ logger.info(
+ "Realtime WS connecting: uri=%s, input=%s@%sHz, output=%s@%sHz, voice=%s, vad=%s",
+ uri,
+ self.options.input_mime,
+ self.options.input_rate_hz,
+ self.options.output_mime,
+ self.options.output_rate_hz,
+ self.options.voice,
+ self.options.vad_enabled,
+ )
+ else:
+ logger.info(
+ "Realtime WS connecting: uri=%s, input=%s@%sHz, text-only output, vad=%s",
+ uri,
+ self.options.input_mime,
+ self.options.input_rate_hz,
+ self.options.vad_enabled,
+ )
  self._ws = await websockets.connect(
  uri, additional_headers=headers, max_size=None
  )
@@ -165,11 +179,16 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
  cleaned.append(t)
  return cleaned

+ # Determine if audio output should be configured
+ modalities = self.options.output_modalities or ["audio", "text"]
+ should_configure_audio_output = "audio" in modalities
+
+ # Build session.update per docs (nested audio object)
  session_payload: Dict[str, Any] = {
  "type": "session.update",
  "session": {
  "type": "realtime",
- "output_modalities": ["audio"],
+ "output_modalities": modalities,
  "audio": {
  "input": {
  "format": {
@@ -178,16 +197,22 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
  },
  "turn_detection": td_input,
  },
- "output": {
- "format": {
- "type": self.options.output_mime or "audio/pcm",
- "rate": int(self.options.output_rate_hz or 24000),
- },
- "voice": self.options.voice,
- "speed": float(
- getattr(self.options, "voice_speed", 1.0) or 1.0
- ),
- },
+ **(
+ {
+ "output": {
+ "format": {
+ "type": self.options.output_mime or "audio/pcm",
+ "rate": int(self.options.output_rate_hz or 24000),
+ },
+ "voice": self.options.voice,
+ "speed": float(
+ getattr(self.options, "voice_speed", 1.0) or 1.0
+ ),
+ }
+ }
+ if should_configure_audio_output
+ else {}
+ ),
  },
  # Note: no top-level turn_detection; nested under audio.input
  **({"prompt": prompt_block} if prompt_block else {}),
@@ -204,13 +229,45 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
  ),
  },
  }
- logger.info(
- "Realtime WS: sending session.update (voice=%s, vad=%s, output=%s@%s)",
- self.options.voice,
- self.options.vad_enabled,
- (self.options.output_mime or "audio/pcm"),
- int(self.options.output_rate_hz or 24000),
- )
+ # Optional realtime transcription configuration
+ try:
+ tr_model = getattr(self.options, "transcription_model", None)
+ if tr_model:
+ audio_obj = session_payload["session"].setdefault("audio", {})
+ # Attach input transcription config per GA schema
+ transcription_cfg: Dict[str, Any] = {"model": tr_model}
+ lang = getattr(self.options, "transcription_language", None)
+ if lang:
+ transcription_cfg["language"] = lang
+ prompt_txt = getattr(self.options, "transcription_prompt", None)
+ if prompt_txt is not None:
+ transcription_cfg["prompt"] = prompt_txt
+ if getattr(self.options, "transcription_include_logprobs", False):
+ session_payload["session"].setdefault("include", []).append(
+ "item.input_audio_transcription.logprobs"
+ )
+ nr = getattr(self.options, "transcription_noise_reduction", None)
+ if nr is not None:
+ audio_obj["noise_reduction"] = bool(nr)
+ # Place under audio.input.transcription per current server conventions
+ audio_obj.setdefault("input", {}).setdefault(
+ "transcription", transcription_cfg
+ )
+ except Exception:
+ logger.exception("Failed to attach transcription config to session.update")
+ if should_configure_audio_output:
+ logger.info(
+ "Realtime WS: sending session.update (voice=%s, vad=%s, output=%s@%s)",
+ self.options.voice,
+ self.options.vad_enabled,
+ (self.options.output_mime or "audio/pcm"),
+ int(self.options.output_rate_hz or 24000),
+ )
+ else:
+ logger.info(
+ "Realtime WS: sending session.update (text-only, vad=%s)",
+ self.options.vad_enabled,
+ )
  # Log exact session.update payload and mark awaiting session.updated
  try:
  logger.info(
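
When `transcription_model` is set on the session options, the branch above nests a transcription block under `audio.input` and can add a logprobs entry to `include`. An illustrative sketch of the resulting `session` object for a text-only session (the model name, language, and prompt below are placeholders, not values from the package):

    session = {
        "type": "realtime",
        "output_modalities": ["text"],
        "audio": {
            "input": {
                "format": {"type": "audio/pcm", "rate": 24000},
                "transcription": {
                    "model": "whisper-1",   # placeholder model name
                    "language": "en",
                    "prompt": "Project-specific vocabulary",
                },
            },
            "noise_reduction": True,  # derived from transcription_noise_reduction
        },
        "include": ["item.input_audio_transcription.logprobs"],
    }
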
@@ -231,7 +288,7 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
  logger.warning(
  "Realtime WS: instructions missing/empty in session.update"
  )
- if not voice:
+ if not voice and should_configure_audio_output:
  logger.warning("Realtime WS: voice missing in session.update")
  except Exception:
  pass
@@ -632,6 +689,20 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
  len(final),
  )
  self._out_text_buffers.pop(rid, None)
+ # Always terminate the output transcript stream for this response when text-only.
+ try:
+ # Only enqueue sentinel when no audio modality is configured
+ modalities = (
+ getattr(self.options, "output_modalities", None)
+ or []
+ )
+ if "audio" not in modalities:
+ self._out_tr_queue.put_nowait(None)
+ logger.debug(
+ "Enqueued transcript termination sentinel (text-only response)"
+ )
+ except Exception:
+ pass
  except Exception:
  pass
  elif (
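
The `None` pushed into `_out_tr_queue` acts as an end-of-stream sentinel so text-only responses still terminate the output-transcript stream. A generic consumer of such a queue (a standard asyncio pattern, not code from the package) stops on the sentinel:

    import asyncio

    async def drain_transcript(queue: asyncio.Queue) -> str:
        parts = []
        while True:
            item = await queue.get()
            if item is None:  # termination sentinel for this response
                break
            parts.append(item)
        return "".join(parts)
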
@@ -1033,6 +1104,47 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
  else:
  patch[k] = raw[k]

+ # --- Inject realtime transcription config if options were updated after initial connect ---
+ try:
+ tr_model = getattr(self.options, "transcription_model", None)
+ if tr_model and isinstance(patch, dict):
+ # Ensure audio/input containers exist without overwriting caller provided fields
+ aud = patch.setdefault("audio", {})
+ inp = aud.setdefault("input", {})
+ # Only add if not explicitly provided in this patch
+ if "transcription" not in inp:
+ transcription_cfg: Dict[str, Any] = {"model": tr_model}
+ lang = getattr(self.options, "transcription_language", None)
+ if lang:
+ transcription_cfg["language"] = lang
+ prompt_txt = getattr(self.options, "transcription_prompt", None)
+ if prompt_txt is not None:
+ transcription_cfg["prompt"] = prompt_txt
+ nr = getattr(self.options, "transcription_noise_reduction", None)
+ if nr is not None:
+ aud["noise_reduction"] = bool(nr)
+ if getattr(self.options, "transcription_include_logprobs", False):
+ patch.setdefault("include", [])
+ if (
+ "item.input_audio_transcription.logprobs"
+ not in patch["include"]
+ ):
+ patch["include"].append(
+ "item.input_audio_transcription.logprobs"
+ )
+ inp["transcription"] = transcription_cfg
+ try:
+ logger.debug(
+ "Realtime WS: update_session injected transcription config model=%s",
+ tr_model,
+ )
+ except Exception:
+ pass
+ except Exception:
+ logger.exception(
+ "Realtime WS: failed injecting transcription config in update_session"
+ )
+
  # Ensure tools are cleaned even if provided only under audio or elsewhere
  if "tools" in patch:
  patch["tools"] = _strip_tool_strict(patch["tools"]) # idempotent
@@ -1040,9 +1152,12 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
  # Per server requirements, always include session.type and output_modalities
  try:
  patch["type"] = "realtime"
- # Preserve caller-provided output_modalities if present, otherwise default to audio
+ # Preserve caller-provided output_modalities if present, otherwise default to configured modalities
  if "output_modalities" not in patch:
- patch["output_modalities"] = ["audio"]
+ patch["output_modalities"] = self.options.output_modalities or [
+ "audio",
+ "text",
+ ]
  except Exception:
  pass

@@ -1148,6 +1263,13 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
  except Exception:
  pass

+ async def create_conversation_item(
+ self, item: Dict[str, Any]
+ ) -> None: # pragma: no cover
+ """Create a conversation item (e.g., for text input)."""
+ payload = {"type": "conversation.item.create", "item": item}
+ await self._send_tracked(payload, label="conversation.item.create")
+
  async def create_response(
  self, response_patch: Optional[Dict[str, Any]] = None
  ) -> None: # pragma: no cover
@@ -1639,6 +1761,13 @@ class OpenAITranscriptionWebSocketSession(BaseRealtimeSession):
  async def clear_input(self) -> None: # pragma: no cover
  await self._send({"type": "input_audio_buffer.clear"})

+ async def create_conversation_item(
+ self, item: Dict[str, Any]
+ ) -> None: # pragma: no cover
+ """Create a conversation item (e.g., for text input)."""
+ payload = {"type": "conversation.item.create", "item": item}
+ await self._send_tracked(payload, label="conversation.item.create")
+
  async def create_response(
  self, response_patch: Optional[Dict[str, Any]] = None
  ) -> None: # pragma: no cover
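
Both the realtime and transcription session classes gain `create_conversation_item`, a thin wrapper over the Realtime `conversation.item.create` event. A hedged usage sketch, assuming the standard Realtime message-item shape for text input (the item payload is not defined by this package):

    # `session` is an already-connected OpenAIRealtimeWebSocketSession.
    await session.create_conversation_item(
        {
            "type": "message",
            "role": "user",
            "content": [{"type": "input_text", "text": "What is my wallet balance?"}],
        }
    )
    await session.create_response()  # ask the model to respond to the new item
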
@@ -16,6 +16,7 @@ from solana_agent.interfaces.client.client import SolanaAgent as SolanaAgentInte
  from solana_agent.interfaces.plugins.plugins import Tool
  from solana_agent.services.knowledge_base import KnowledgeBaseService
  from solana_agent.interfaces.services.routing import RoutingService as RoutingInterface
+ from solana_agent.interfaces.providers.realtime import RealtimeChunk


  class SolanaAgent(SolanaAgentInterface):
@@ -57,6 +58,7 @@ class SolanaAgent(SolanaAgentInterface):
  vad: Optional[bool] = False,
  rt_encode_input: bool = False,
  rt_encode_output: bool = False,
+ rt_output_modalities: Optional[List[Literal["audio", "text"]]] = None,
  rt_voice: Literal[
  "alloy",
  "ash",
@@ -90,7 +92,9 @@ class SolanaAgent(SolanaAgentInterface):
  router: Optional[RoutingInterface] = None,
  images: Optional[List[Union[str, bytes]]] = None,
  output_model: Optional[Type[BaseModel]] = None,
- ) -> AsyncGenerator[Union[str, bytes, BaseModel], None]: # pragma: no cover
+ ) -> AsyncGenerator[
+ Union[str, bytes, BaseModel, RealtimeChunk], None
+ ]: # pragma: no cover
  """Process a user message (text or audio) and optional images, returning the response stream.

  Args:
@@ -104,6 +108,7 @@ class SolanaAgent(SolanaAgentInterface):
  vad: Whether to use voice activity detection (for audio input)
  rt_encode_input: Whether to re-encode input audio for compatibility
  rt_encode_output: Whether to re-encode output audio for compatibility
+ rt_output_modalities: Modalities to return in realtime (default both if None)
  rt_voice: Voice to use for realtime audio output
  audio_voice: Voice to use for audio output
  audio_output_format: Audio output format
@@ -124,6 +129,7 @@ class SolanaAgent(SolanaAgentInterface):
  vad=vad,
  rt_encode_input=rt_encode_input,
  rt_encode_output=rt_encode_output,
+ rt_output_modalities=rt_output_modalities,
  rt_voice=rt_voice,
  audio_voice=audio_voice,
  audio_output_format=audio_output_format,
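
Because `rt_output_modalities` is now forwarded to the query service, a realtime stream may yield `RealtimeChunk` objects in addition to plain `str`/`bytes`. A consumption sketch that relies only on attributes defined later in this diff (`stream` stands for whatever `process(...)` returns):

    from solana_agent.interfaces.providers.realtime import RealtimeChunk

    async def handle(stream) -> bytes:
        audio = bytearray()
        async for item in stream:
            if isinstance(item, RealtimeChunk):
                if item.is_text and item.text_data:
                    print(item.text_data, end="", flush=True)
                elif item.is_audio and item.audio_data:
                    audio.extend(item.audio_data)
            elif isinstance(item, bytes):
                audio.extend(item)
            else:
                print(item, end="", flush=True)
        return bytes(audio)
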
@@ -4,6 +4,7 @@ from typing import AsyncGenerator, Dict, Any, List, Literal, Optional, Type, Uni
  from pydantic import BaseModel
  from solana_agent.interfaces.plugins.plugins import Tool
  from solana_agent.interfaces.services.routing import RoutingService as RoutingInterface
+ from solana_agent.interfaces.providers.realtime import RealtimeChunk


  class SolanaAgent(ABC):
@@ -22,6 +23,7 @@ class SolanaAgent(ABC):
  vad: bool = False,
  rt_encode_input: bool = False,
  rt_encode_output: bool = False,
+ rt_output_modalities: Optional[List[Literal["audio", "text"]]] = None,
  rt_voice: Literal[
  "alloy",
  "ash",
@@ -55,7 +57,7 @@ class SolanaAgent(ABC):
  router: Optional[RoutingInterface] = None,
  images: Optional[List[Union[str, bytes]]] = None,
  output_model: Optional[Type[BaseModel]] = None,
- ) -> AsyncGenerator[Union[str, bytes, BaseModel], None]:
+ ) -> AsyncGenerator[Union[str, bytes, BaseModel, RealtimeChunk], None]:
  """Process a user message and return the response stream."""
  pass

@@ -1,7 +1,17 @@
  from __future__ import annotations
  from abc import ABC, abstractmethod
  from dataclasses import dataclass
- from typing import Any, AsyncGenerator, Dict, Literal, Optional, Awaitable, Callable
+ from typing import (
+ Any,
+ AsyncGenerator,
+ Dict,
+ Literal,
+ Optional,
+ Awaitable,
+ Callable,
+ List,
+ Union,
+ )


  @dataclass
@@ -24,6 +34,7 @@ class RealtimeSessionOptions:
  output_rate_hz: int = 24000
  input_mime: str = "audio/pcm" # 16-bit PCM
  output_mime: str = "audio/pcm" # 16-bit PCM
+ output_modalities: List[Literal["audio", "text"]] = None # None means auto-detect
  instructions: Optional[str] = None
  # Optional: tools payload compatible with OpenAI Realtime session.update
  tools: Optional[list[dict[str, Any]]] = None
@@ -34,6 +45,107 @@ class RealtimeSessionOptions:
  # Optional guard: if a tool takes longer than this to complete, skip sending
  # function_call_output to avoid stale/expired call_id issues. Set to None to always send.
  tool_result_max_age_s: Optional[float] = None
+ # --- Realtime transcription configuration (optional) ---
+ # When transcription_model is set, QueryService should skip the HTTP STT path and rely on
+ # realtime websocket transcription events. Other fields customize that behavior.
+ transcription_model: Optional[str] = None
+ transcription_language: Optional[str] = None # e.g. 'en'
+ transcription_prompt: Optional[str] = None
+ transcription_noise_reduction: Optional[bool] = None
+ transcription_include_logprobs: bool = False
+
+
+ @dataclass
+ class RealtimeChunk:
+ """Represents a chunk of data from a realtime session with its modality type."""
+
+ modality: Literal["audio", "text"]
+ data: Union[str, bytes]
+ timestamp: Optional[float] = None # Optional timestamp for ordering
+ metadata: Optional[Dict[str, Any]] = None # Optional additional metadata
+
+ @property
+ def is_audio(self) -> bool:
+ """Check if this is an audio chunk."""
+ return self.modality == "audio"
+
+ @property
+ def is_text(self) -> bool:
+ """Check if this is a text chunk."""
+ return self.modality == "text"
+
+ @property
+ def text_data(self) -> Optional[str]:
+ """Get text data if this is a text chunk."""
+ return self.data if isinstance(self.data, str) else None
+
+ @property
+ def audio_data(self) -> Optional[bytes]:
+ """Get audio data if this is an audio chunk."""
+ return self.data if isinstance(self.data, bytes) else None
+
+
+ async def separate_audio_chunks(
+ chunks: AsyncGenerator[RealtimeChunk, None],
+ ) -> AsyncGenerator[bytes, None]:
+ """Extract only audio chunks from a stream of RealtimeChunk objects.
+
+ Args:
+ chunks: Stream of RealtimeChunk objects
+
+ Yields:
+ Audio data bytes from audio chunks only
+ """
+ async for chunk in chunks:
+ if chunk.is_audio and chunk.audio_data:
+ yield chunk.audio_data
+
+
+ async def separate_text_chunks(
+ chunks: AsyncGenerator[RealtimeChunk, None],
+ ) -> AsyncGenerator[str, None]:
+ """Extract only text chunks from a stream of RealtimeChunk objects.
+
+ Args:
+ chunks: Stream of RealtimeChunk objects
+
+ Yields:
+ Text data from text chunks only
+ """
+ async for chunk in chunks:
+ if chunk.is_text and chunk.text_data:
+ yield chunk.text_data
+
+
+ async def demux_realtime_chunks(
+ chunks: AsyncGenerator[RealtimeChunk, None],
+ ) -> tuple[AsyncGenerator[bytes, None], AsyncGenerator[str, None]]:
+ """Demux a stream of RealtimeChunk objects into separate audio and text streams.
+
+ Note: This function consumes the input generator, so each output stream can only be consumed once.
+
+ Args:
+ chunks: Stream of RealtimeChunk objects
+
+ Returns:
+ Tuple of (audio_stream, text_stream) async generators
+ """
+ # Collect all chunks first since we can't consume the generator twice
+ collected_chunks = []
+ async for chunk in chunks:
+ collected_chunks.append(chunk)
+
+ async def audio_stream():
+ for chunk in collected_chunks:
+ if chunk.is_audio and chunk.audio_data:
+ yield chunk.audio_data
+
+ async def text_stream():
+ for chunk in collected_chunks:
+ if chunk.is_text and chunk.text_data:
+ yield chunk.text_data
+
+ return audio_stream(), text_stream()


  class BaseRealtimeSession(ABC):
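
The new helpers above split a `RealtimeChunk` stream by modality. A small usage sketch, assuming the helpers are importable from `solana_agent.interfaces.providers.realtime` alongside `RealtimeChunk` as the surrounding hunks suggest (the chunk values are made up):

    import asyncio
    from solana_agent.interfaces.providers.realtime import (
        RealtimeChunk,
        demux_realtime_chunks,
    )

    async def example() -> None:
        async def chunks():
            yield RealtimeChunk(modality="text", data="hello ")
            yield RealtimeChunk(modality="audio", data=b"\x00\x01")
            yield RealtimeChunk(modality="text", data="world")

        # demux buffers the whole stream before returning the two generators
        audio_stream, text_stream = await demux_realtime_chunks(chunks())
        print("".join([t async for t in text_stream]))
        print(len(b"".join([a async for a in audio_stream])))

    asyncio.run(example())
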
@@ -4,6 +4,7 @@ from typing import Any, AsyncGenerator, Dict, List, Literal, Optional, Type, Uni
  from pydantic import BaseModel

  from solana_agent.interfaces.services.routing import RoutingService as RoutingInterface
+ from solana_agent.interfaces.providers.realtime import RealtimeChunk


  class QueryService(ABC):
@@ -15,6 +16,7 @@ class QueryService(ABC):
  user_id: str,
  query: Union[str, bytes],
  output_format: Literal["text", "audio"] = "text",
+ rt_output_modalities: Optional[List[Literal["audio", "text"]]] = None,
  rt_voice: Literal[
  "alloy",
  "ash",
@@ -51,7 +53,7 @@ class QueryService(ABC):
  output_model: Optional[Type[BaseModel]] = None,
  capture_schema: Optional[Dict[str, Any]] = None,
  capture_name: Optional[str] = None,
- ) -> AsyncGenerator[Union[str, bytes, BaseModel], None]:
+ ) -> AsyncGenerator[Union[str, bytes, BaseModel, RealtimeChunk], None]:
  """Process the user request and generate a response."""
  pass