solana-agent 31.2.6-py3-none-any.whl → 31.3.0-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the public registry.
- solana_agent/adapters/openai_realtime_ws.py +160 -31
- solana_agent/client/solana_agent.py +7 -1
- solana_agent/interfaces/client/client.py +3 -1
- solana_agent/interfaces/providers/__init__.py +0 -0
- solana_agent/interfaces/providers/realtime.py +113 -1
- solana_agent/interfaces/services/query.py +3 -1
- solana_agent/services/query.py +422 -107
- solana_agent/services/realtime.py +123 -17
- {solana_agent-31.2.6.dist-info → solana_agent-31.3.0.dist-info}/METADATA +115 -9
- {solana_agent-31.2.6.dist-info → solana_agent-31.3.0.dist-info}/RECORD +13 -12
- {solana_agent-31.2.6.dist-info → solana_agent-31.3.0.dist-info}/LICENSE +0 -0
- {solana_agent-31.2.6.dist-info → solana_agent-31.3.0.dist-info}/WHEEL +0 -0
- {solana_agent-31.2.6.dist-info → solana_agent-31.3.0.dist-info}/entry_points.txt +0 -0
solana_agent/services/query.py
CHANGED
@@ -37,6 +37,8 @@ from solana_agent.interfaces.services.knowledge_base import (
 )
 from solana_agent.interfaces.guardrails.guardrails import InputGuardrail
 
+from solana_agent.interfaces.providers.realtime import RealtimeSessionOptions
+
 from solana_agent.services.agent import AgentService
 from solana_agent.services.routing import RoutingService
 
@@ -94,6 +96,7 @@ class QueryService(QueryServiceInterface):
         encode_out: bool,
         audio_input_format: str,
         audio_output_format: str,
+        rt_output_modalities: Optional[List[Literal["audio", "text"]]] = None,
     ) -> Any:
         """Get a free (or new) realtime session for this user. Marks it busy via an internal lock.
 
@@ -148,6 +151,7 @@ class QueryService(QueryServiceInterface):
             output_rate_hz=24000,
             input_mime="audio/pcm",
             output_mime="audio/pcm",
+            output_modalities=rt_output_modalities,
             tools=initial_tools or None,
             tool_choice="auto",
         )
@@ -187,9 +191,7 @@ class QueryService(QueryServiceInterface):
     ) -> None:
         self._sticky_sessions[user_id] = {
             "agent": agent_name,
-            "started_at":
-                "started_at", time.time()
-            ),
+            "started_at": time.time(),
             "last_updated": time.time(),
             "required_complete": required_complete,
         }
@@ -201,6 +203,13 @@ class QueryService(QueryServiceInterface):
             self._sticky_sessions[user_id]["required_complete"] = required_complete
             self._sticky_sessions[user_id]["last_updated"] = time.time()
 
+    def _clear_sticky_agent(self, user_id: str) -> None:
+        if user_id in self._sticky_sessions:
+            try:
+                del self._sticky_sessions[user_id]
+            except Exception:
+                pass
+
     async def _build_combined_context(
         self,
         user_id: str,
@@ -514,6 +523,7 @@ class QueryService(QueryServiceInterface):
         vad: Optional[bool] = None,
         rt_encode_input: bool = False,
         rt_encode_output: bool = False,
+        rt_output_modalities: Optional[List[Literal["audio", "text"]]] = None,
         rt_voice: Literal[
             "alloy",
             "ash",
@@ -526,6 +536,12 @@ class QueryService(QueryServiceInterface):
             "shimmer",
             "verse",
         ] = "marin",
+        # Realtime transcription configuration (new)
+        rt_transcription_model: Optional[str] = None,
+        rt_transcription_language: Optional[str] = None,
+        rt_transcription_prompt: Optional[str] = None,
+        rt_transcription_noise_reduction: Optional[bool] = None,
+        rt_transcription_include_logprobs: bool = False,
         audio_voice: Literal[
             "alloy",
             "ash",
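The new rt_transcription_* keyword arguments configure the realtime session's built-in transcription, replacing the HTTP STT pass removed later in this diff. A minimal caller sketch; the `query_service` object and the `process` method name are assumptions (the calling entry point is not shown in this diff), while the rt_* parameter names come from the hunk above:

```python
# Hypothetical usage sketch; only the rt_* parameter names are taken from the diff.
async def run_text_only_turn(query_service, audio_bytes: bytes) -> None:
    async for chunk in query_service.process(  # method name assumed
        user_id="user-1",
        query=audio_bytes,                      # raw compressed or PCM audio
        realtime=True,
        rt_output_modalities=["text"],          # text-only realtime turn
        rt_transcription_model="gpt-4o-mini-transcribe",
        rt_transcription_language="en",
        rt_transcription_noise_reduction=True,
    ):
        print(chunk)                            # incremental text chunks
```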
@@ -554,31 +570,13 @@ class QueryService(QueryServiceInterface):
         try:
             # Realtime request: HTTP STT for user + single WS for assistant audio
             if realtime:
-                # 1)
+                # 1) Determine if input is audio bytes. We now ALWAYS skip HTTP STT in realtime mode.
+                # The realtime websocket session (optionally with built-in transcription) is authoritative.
                 is_audio_bytes = isinstance(query, (bytes, bytearray))
-                user_text = ""
-
-                if is_audio_bytes:
-
-                    async def _stt_consume():
-                        txt = ""
-                        try:
-                            logger.info(
-                                f"Realtime(HTTP STT): transcribing format: {audio_input_format}"
-                            )
-                            async for (
-                                t
-                            ) in self.agent_service.llm_provider.transcribe_audio(  # type: ignore[attr-defined]
-                                query, audio_input_format
-                            ):
-                                txt += t
-                        except Exception as e:
-                            logger.error(f"HTTP STT error: {e}")
-                        return txt
-
-                    stt_task = asyncio.create_task(_stt_consume())
-                else:
-                    user_text = str(query)
+                user_text = "" if is_audio_bytes else str(query)
+                # Provide a sensible default realtime transcription model when audio supplied
+                if is_audio_bytes and not rt_transcription_model:
+                    rt_transcription_model = "gpt-4o-mini-transcribe"
 
                 # 2) Single agent selection (no multi-agent routing in realtime path)
                 agent_name = self._get_sticky_agent(user_id)
@@ -693,6 +691,17 @@ class QueryService(QueryServiceInterface):
                 encode_out = bool(
                     rt_encode_output or (audio_output_format.lower() != "pcm")
                 )
+                # If caller explicitly requests text-only realtime, disable output encoding entirely
+                if (
+                    rt_output_modalities is not None
+                    and "audio" not in rt_output_modalities
+                ):
+                    if encode_out:
+                        logger.debug(
+                            "Realtime(QueryService): forcing encode_out False for text-only modalities=%s",
+                            rt_output_modalities,
+                        )
+                    encode_out = False
                 # Choose input transcoding when compressed input is provided (or explicitly requested)
                 is_audio_bytes = isinstance(query, (bytes, bytearray))
                 encode_in = bool(
@@ -700,7 +709,8 @@ class QueryService(QueryServiceInterface):
                     or (is_audio_bytes and audio_input_format.lower() != "pcm")
                 )
 
-                # Allocate or reuse a realtime session for this specific request/user
+                # Allocate or reuse a realtime session for this specific request/user.
+                # (Transcription options may be applied below; if they change after allocate we will reconfigure.)
                 rt = await self._alloc_realtime_session(
                     user_id,
                     api_key=api_key,
@@ -711,9 +721,46 @@ class QueryService(QueryServiceInterface):
                     encode_out=encode_out,
                     audio_input_format=audio_input_format,
                     audio_output_format=audio_output_format,
+                    rt_output_modalities=rt_output_modalities,
                 )
                 # Ensure lock is released no matter what
                 try:
+                    # --- Apply realtime transcription config BEFORE connecting (new) ---
+                    if rt_transcription_model and hasattr(rt, "_options"):
+                        try:
+                            setattr(
+                                rt._options,
+                                "transcription_model",
+                                rt_transcription_model,
+                            )
+                            if rt_transcription_language is not None:
+                                setattr(
+                                    rt._options,
+                                    "transcription_language",
+                                    rt_transcription_language,
+                                )
+                            if rt_transcription_prompt is not None:
+                                setattr(
+                                    rt._options,
+                                    "transcription_prompt",
+                                    rt_transcription_prompt,
+                                )
+                            if rt_transcription_noise_reduction is not None:
+                                setattr(
+                                    rt._options,
+                                    "transcription_noise_reduction",
+                                    rt_transcription_noise_reduction,
+                                )
+                            if rt_transcription_include_logprobs:
+                                setattr(
+                                    rt._options, "transcription_include_logprobs", True
+                                )
+                        except Exception:
+                            logger.debug(
+                                "Failed pre-connect transcription option assignment",
+                                exc_info=True,
+                            )
+
                     # Tool executor
                     async def _exec(
                         tool_name: str, args: Dict[str, Any]
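The block above assigns the transcription fields onto the session's private `_options` object before the websocket connects, so they ride along in the initial session configuration. A self-contained sketch of that pattern; the field names come from this diff, but the dataclass shape of `RealtimeSessionOptions` and the helper are assumptions:

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class RealtimeSessionOptions:  # shape assumed; field names from this diff
    transcription_model: Optional[str] = None
    transcription_language: Optional[str] = None
    transcription_prompt: Optional[str] = None
    transcription_noise_reduction: Optional[bool] = None
    transcription_include_logprobs: bool = False

def apply_transcription_options(opts: RealtimeSessionOptions, **overrides) -> None:
    # Assign only the fields the caller actually supplied (None means "keep default").
    for name, value in overrides.items():
        if value is not None and hasattr(opts, name):
            setattr(opts, name, value)

opts = RealtimeSessionOptions()
apply_transcription_options(
    opts,
    transcription_model="gpt-4o-mini-transcribe",
    transcription_language="en",
)
assert opts.transcription_model == "gpt-4o-mini-transcribe"
assert opts.transcription_noise_reduction is None  # untouched
```

Mutating options before connect avoids a second session.update round-trip after the socket opens.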
@@ -755,16 +802,47 @@ class QueryService(QueryServiceInterface):
                     except Exception:
                         pass
 
-                    #
+                    # Begin streaming turn (defer user transcript persistence until final to avoid duplicates)
                     turn_id = await self.realtime_begin_turn(user_id)
-
-
-
-
-
+                    # We'll buffer the full user transcript (text input or realtime audio transcription) and persist exactly once.
+                    # Initialize empty; we'll build it strictly from realtime transcript segments to avoid
+                    # accidental duplication with pre-supplied user_text or prior buffers.
+                    final_user_tr: str = ""
+                    user_persisted = False
+
+                    # Feed audio into WS if audio bytes provided and audio modality requested; else treat as text
+                    wants_audio = (
+                        (
+                            getattr(rt, "_options", None)
+                            and getattr(rt, "_options").output_modalities
+                        )
+                        and "audio" in getattr(rt, "_options").output_modalities  # type: ignore[attr-defined]
+                    ) or (
+                        rt_output_modalities is None
+                        or (rt_output_modalities and "audio" in rt_output_modalities)
+                    )
+                    # Determine if realtime transcription should be enabled (always skip HTTP STT regardless)
+                    # realtime_transcription_enabled now implicit (options set before connect)
 
-
-
+                    if is_audio_bytes and not wants_audio:
+                        # Feed audio solely for transcription (no audio output requested)
+                        bq = bytes(query)
+                        logger.info(
+                            "Realtime: appending input audio for transcription only, len=%d, fmt=%s",
+                            len(bq),
+                            audio_input_format,
+                        )
+                        await rt.append_audio(bq)
+                        vad_enabled_value = bool(vad) if vad is not None else False
+                        if not vad_enabled_value:
+                            await rt.commit_input()
+                            # Request only text response
+                            await rt.create_response({"modalities": ["text"]})
+                        else:
+                            logger.debug(
+                                "Realtime: VAD enabled (text-only output) — skipping manual response.create"
+                            )
+                    if is_audio_bytes and wants_audio:
                         bq = bytes(query)
                         logger.info(
                             "Realtime: appending input audio to WS via FFmpeg, len=%d, fmt=%s",
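The `wants_audio` expression above resolves the output modality from two sources: the session's configured `output_modalities` and the per-call `rt_output_modalities` override, defaulting to audio when neither excludes it. A reduced sketch of the same truth table; the standalone function is hypothetical, the precedence mirrors the diff:

```python
from typing import List, Optional

def wants_audio(session_modalities: Optional[List[str]],
                override: Optional[List[str]]) -> bool:
    # Session options naming "audio" win outright, mirroring the diff's logic.
    if session_modalities and "audio" in session_modalities:
        return True
    # No per-call override: realtime defaults to audio output.
    if override is None:
        return True
    return "audio" in override

assert wants_audio(None, None) is True                    # default is audio
assert wants_audio(None, ["text"]) is False               # explicit text-only
assert wants_audio(["audio", "text"], ["text"]) is True   # session config dominates
```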
@@ -782,64 +860,329 @@ class QueryService(QueryServiceInterface):
                            logger.debug(
                                "Realtime: VAD enabled — skipping manual response.create"
                            )
-                    else:
-                        #
-                        await rt.
+                    else:  # Text-only path OR caller excluded audio modality
+                        # For text input, create conversation item first, then response
+                        await rt.create_conversation_item(
                             {
-                                "
-                                "
+                                "type": "message",
+                                "role": "user",
+                                "content": [
                                     {"type": "input_text", "text": user_text or ""}
                                 ],
                             }
                         )
+                        # Determine effective modalities (fall back to provided override or text only)
+                        if rt_output_modalities is not None:
+                            modalities = rt_output_modalities or ["text"]
+                        else:
+                            mo = getattr(
+                                rt, "_options", RealtimeSessionOptions()
+                            ).output_modalities
+                            modalities = mo if mo else ["audio"]
+                        if "audio" not in modalities:
+                            # Ensure we do not accidentally request audio generation
+                            modalities = [m for m in modalities if m == "text"] or [
+                                "text"
+                            ]
+                        await rt.create_response(
+                            {
+                                "modalities": modalities,
+                            }
+                        )
 
                     # Collect audio and transcripts
-                    user_tr = ""
+                    user_tr = ""  # Accumulates realtime input transcript segments (audio path)
                     asst_tr = ""
 
+                    input_segments: List[str] = []
+
                     async def _drain_in_tr():
+                        """Accumulate realtime input transcript segments, de-duplicating cumulative repeats.
+
+                        Some realtime providers emit growing cumulative transcripts (e.g. "Hel", "Hello") or
+                        may occasionally resend the full final transcript. Previous logic naively concatenated
+                        every segment which could yield duplicated text ("HelloHello") if cumulative or repeated
+                        finals were received. This routine keeps a canonical buffer (user_tr) and only appends
+                        the non-overlapping suffix of each new segment.
+                        """
                         nonlocal user_tr
                         async for t in rt.iter_input_transcript():
-                            if t:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                            if not t:
+                                continue
+                            # Track raw segment for optional debugging
+                            input_segments.append(t)
+                            if not user_tr:
+                                user_tr = t
+                                continue
+                            if t == user_tr:
+                                # Exact duplicate of current buffer; skip
+                                continue
+                            if t.startswith(user_tr):
+                                # Cumulative growth; append only the new suffix
+                                user_tr += t[len(user_tr) :]
+                                continue
+                            # General case: find largest overlap between end of user_tr and start of t
+                            # to avoid duplicated middle content (e.g., user_tr="My name is", t="name is John")
+                            overlap = 0
+                            max_check = min(len(user_tr), len(t))
+                            for k in range(max_check, 0, -1):
+                                if user_tr.endswith(t[:k]):
+                                    overlap = k
+                                    break
+                            user_tr += t[overlap:]
+
+                    # Check if we need both audio and text modalities
+                    modalities = getattr(
+                        rt, "_options", RealtimeSessionOptions()
+                    ).output_modalities or ["audio"]
+                    use_combined_stream = "audio" in modalities and "text" in modalities
+
+                    if use_combined_stream and wants_audio:
+                        # Use combined stream for both modalities
+                        async def _drain_out_tr():
+                            nonlocal asst_tr
+                            async for t in rt.iter_output_transcript():
+                                if t:
+                                    asst_tr += t
+
+                        in_task = asyncio.create_task(_drain_in_tr())
+                        out_task = asyncio.create_task(_drain_out_tr())
+                        try:
+                            # Check if the service has iter_output_combined method
+                            if hasattr(rt, "iter_output_combined"):
+                                async for chunk in rt.iter_output_combined():
+                                    # Adapt output based on caller's requested output_format
+                                    if output_format == "text":
+                                        # Only yield text modalities as plain strings
+                                        if getattr(chunk, "modality", None) == "text":
+                                            yield chunk.data  # type: ignore[attr-defined]
+                                        continue
+                                    # Audio streaming path
+                                    if getattr(chunk, "modality", None) == "audio":
+                                        # Yield raw bytes if data present
+                                        yield getattr(chunk, "data", b"")
+                                    elif (
+                                        getattr(chunk, "modality", None) == "text"
+                                        and output_format == "audio"
+                                    ):
+                                        # Optionally ignore or log text while audio requested
+                                        continue
+                                    else:
+                                        # Fallback: ignore unknown modalities for now
+                                        continue
+                            else:
+                                # Fallback: yield audio chunks as RealtimeChunk objects
+                                async for audio_chunk in rt.iter_output_audio_encoded():
+                                    if output_format == "text":
+                                        # Ignore audio when text requested
+                                        continue
+                                    # output_format audio: provide raw bytes
+                                    if hasattr(audio_chunk, "modality"):
+                                        if (
+                                            getattr(audio_chunk, "modality", None)
+                                            == "audio"
+                                        ):
+                                            yield getattr(audio_chunk, "data", b"")
+                                    else:
+                                        yield audio_chunk
+                        finally:
+                            # Allow transcript drain tasks to finish to capture user/asst text before persistence
+                            try:
+                                await asyncio.wait_for(in_task, timeout=0.05)
+                            except Exception:
+                                in_task.cancel()
+                            try:
+                                await asyncio.wait_for(out_task, timeout=0.05)
+                            except Exception:
+                                out_task.cancel()
+                        # HTTP STT path removed: realtime audio input transcript (if any) is authoritative
+                        # Persist transcripts after combined streaming completes
+                        if turn_id:
                             try:
-
-
+                                effective_user_tr = user_tr or ("".join(input_segments))
+                                try:
+                                    setattr(
+                                        self,
+                                        "_last_realtime_user_transcript",
+                                        effective_user_tr,
+                                    )
+                                except Exception:
+                                    pass
+                                if effective_user_tr:
+                                    final_user_tr = effective_user_tr
+                                elif (
+                                    isinstance(query, str)
+                                    and query
+                                    and not input_segments
+                                    and not user_tr
+                                ):
+                                    final_user_tr = query
+                                if asst_tr:
+                                    await self.realtime_update_assistant(
+                                        user_id, turn_id, asst_tr
+                                    )
+                            except Exception:
+                                pass
+                            if final_user_tr and not user_persisted:
+                                try:
+                                    await self.realtime_update_user(
+                                        user_id, turn_id, final_user_tr
+                                    )
+                                    user_persisted = True
+                                except Exception:
+                                    pass
+                            try:
+                                await self.realtime_finalize_turn(user_id, turn_id)
                             except Exception:
                                 pass
+                        if final_user_tr and not user_persisted:
+                            try:
+                                await self.realtime_update_user(
+                                    user_id, turn_id, final_user_tr
+                                )
+                                user_persisted = True
+                            except Exception:
+                                pass
+                    elif wants_audio:
+                        # Use separate streams (legacy behavior)
+                        async def _drain_out_tr():
+                            nonlocal asst_tr
+                            async for t in rt.iter_output_transcript():
+                                if t:
+                                    asst_tr += t
+
+                        in_task = asyncio.create_task(_drain_in_tr())
+                        out_task = asyncio.create_task(_drain_out_tr())
+                        try:
+                            async for audio_chunk in rt.iter_output_audio_encoded():
+                                if output_format == "text":
+                                    # Skip audio when caller wants text only
+                                    continue
+                                # output_format audio: yield raw bytes
+                                if hasattr(audio_chunk, "modality"):
+                                    if (
+                                        getattr(audio_chunk, "modality", None)
+                                        == "audio"
+                                    ):
+                                        yield getattr(audio_chunk, "data", b"")
+                                else:
+                                    yield audio_chunk
+                        finally:
+                            try:
+                                await asyncio.wait_for(in_task, timeout=0.05)
+                            except Exception:
+                                in_task.cancel()
+                            try:
+                                await asyncio.wait_for(out_task, timeout=0.05)
+                            except Exception:
+                                out_task.cancel()
+                        # HTTP STT path removed
+                        # Persist transcripts after audio-only streaming
                         if turn_id:
                             try:
-
+                                effective_user_tr = user_tr or ("".join(input_segments))
+                                try:
+                                    setattr(
+                                        self,
+                                        "_last_realtime_user_transcript",
+                                        effective_user_tr,
+                                    )
+                                except Exception:
+                                    pass
+                                # Buffer final transcript for single persistence
+                                if effective_user_tr:
+                                    final_user_tr = effective_user_tr
+                                elif (
+                                    isinstance(query, str)
+                                    and query
+                                    and not input_segments
+                                    and not user_tr
+                                ):
+                                    final_user_tr = query
+                                if asst_tr:
+                                    await self.realtime_update_assistant(
+                                        user_id, turn_id, asst_tr
+                                    )
+                            except Exception:
+                                pass
+                            if final_user_tr and not user_persisted:
+                                try:
                                     await self.realtime_update_user(
-                                        user_id, turn_id,
+                                        user_id, turn_id, final_user_tr
                                     )
+                                    user_persisted = True
+                                except Exception:
+                                    pass
+                            try:
+                                await self.realtime_finalize_turn(user_id, turn_id)
+                            except Exception:
+                                pass
+                        # If no WS input transcript was captured, fall back to HTTP STT result
+                    else:
+                        # Text-only: just stream assistant transcript if available (no audio iteration)
+                        # If original input was audio bytes but caller only wants text output (no audio modality),
+                        # we still need to drain the input transcript stream to build user_tr.
+                        in_task_audio_only = None
+                        if is_audio_bytes:
+                            in_task_audio_only = asyncio.create_task(_drain_in_tr())
+
+                        async def _drain_out_tr_text():
+                            nonlocal asst_tr
+                            async for t in rt.iter_output_transcript():
+                                if t:
+                                    asst_tr += t
+                                    yield t  # Yield incremental text chunks directly
+
+                        async for t in _drain_out_tr_text():
+                            # Provide plain text to caller
+                            yield t
+                        # Wait for input transcript (if any) before persistence
+                        if "in_task_audio_only" in locals() and in_task_audio_only:
+                            try:
+                                await asyncio.wait_for(in_task_audio_only, timeout=0.1)
+                            except Exception:
+                                in_task_audio_only.cancel()
+                        # No HTTP STT fallback
+                        if turn_id:
+                            try:
+                                effective_user_tr = user_tr or ("".join(input_segments))
+                                try:
+                                    setattr(
+                                        self,
+                                        "_last_realtime_user_transcript",
+                                        effective_user_tr,
+                                    )
+                                except Exception:
+                                    pass
+                                # For text-only modality but audio-origin (cumulative segments captured), persist user transcript
+                                if effective_user_tr:
+                                    final_user_tr = effective_user_tr
+                                elif (
+                                    isinstance(query, str)
+                                    and query
+                                    and not input_segments
+                                    and not user_tr
+                                ):
+                                    final_user_tr = query
                                 if asst_tr:
                                     await self.realtime_update_assistant(
                                         user_id, turn_id, asst_tr
                                     )
                             except Exception:
                                 pass
+                            if final_user_tr and not user_persisted:
+                                try:
+                                    await self.realtime_update_user(
+                                        user_id, turn_id, final_user_tr
+                                    )
+                                    user_persisted = True
+                                except Exception:
+                                    pass
                             try:
                                 await self.realtime_finalize_turn(user_id, turn_id)
                             except Exception:
                                 pass
+                        # Input transcript task already awaited above
                     # Clear input buffer for next turn reuse
                     try:
                         await rt.clear_input()
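The `_drain_in_tr` rewrite in this hunk is the core fix for duplicated user transcripts: instead of blind concatenation, each incoming segment contributes only its non-overlapping suffix. The same merge extracted as a standalone function for clarity; this is a sketch, and `merge_segment` is not part of the package:

```python
def merge_segment(buffer: str, segment: str) -> str:
    # Mirror of the de-duplication logic in _drain_in_tr above.
    if not segment or segment == buffer:
        return buffer                            # empty or exact repeat: nothing new
    if not buffer:
        return segment
    if segment.startswith(buffer):
        return buffer + segment[len(buffer):]    # cumulative growth: take the tail
    # General case: longest suffix of buffer that prefixes the new segment.
    for k in range(min(len(buffer), len(segment)), 0, -1):
        if buffer.endswith(segment[:k]):
            return buffer + segment[k:]
    return buffer + segment

buf = ""
for seg in ["Hel", "Hello", "Hello", "lo there"]:  # cumulative + repeated segments
    buf = merge_segment(buf, seg)
assert buf == "Hello there"
```

The linear scan over candidate overlaps is quadratic in the worst case, but transcript segments are short, so this stays cheap in practice.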
@@ -855,58 +1198,30 @@ class QueryService(QueryServiceInterface):
                    pass
                return
 
-            # 1)
+            # 1) Acquire user_text (transcribe audio or direct text) for non-realtime path
             user_text = ""
             if not isinstance(query, str):
-
-
-
-
-
-
-
-
-
-
+                try:
+                    logger.info(
+                        f"Received audio input, transcribing format: {audio_input_format}"
+                    )
+                    async for tpart in self.agent_service.llm_provider.transcribe_audio(  # type: ignore[attr-defined]
+                        query, audio_input_format
+                    ):
+                        user_text += tpart
+                except Exception:
+                    user_text = ""
             else:
                 user_text = query
-                logger.info(f"Received text input length: {len(user_text)}")
 
             # 2) Input guardrails
-            original_text = user_text
             for guardrail in self.input_guardrails:
                 try:
                     user_text = await guardrail.process(user_text)
                 except Exception as e:
                     logger.debug(f"Guardrail error: {e}")
-            if user_text != original_text:
-                logger.info(
-                    f"Input guardrails modified user text. Original length: {len(original_text)}, New length: {len(user_text)}"
-                )
-
-            # 3) Greetings shortcut
-            if not images and user_text.strip().lower() in {
-                "hi",
-                "hello",
-                "hey",
-                "ping",
-                "test",
-            }:
-                greeting = "Hello! How can I help you today?"
-                if output_format == "audio":
-                    async for chunk in self.agent_service.llm_provider.tts(
-                        text=greeting,
-                        voice=audio_voice,
-                        response_format=audio_output_format,
-                    ):
-                        yield chunk
-                else:
-                    yield greeting
-                if self.memory_provider:
-                    await self._store_conversation(user_id, original_text, greeting)
-                return
 
-            #
+            # 3) Memory context (conversation history)
             memory_context = ""
             if self.memory_provider:
                 try:
@@ -914,7 +1229,7 @@ class QueryService(QueryServiceInterface):
             except Exception:
                 memory_context = ""
 
-            #
+            # 4) Knowledge base context
             kb_context = ""
             if self.knowledge_base:
                 try:
@@ -934,7 +1249,7 @@ class QueryService(QueryServiceInterface):
             except Exception:
                 kb_context = ""
 
-            #
+            # 5) Determine agent (sticky session aware; allow explicit switch/new conversation)
             agent_name = "default"
             prev_assistant = ""
             routing_input = user_text