solana-agent 31.2.2__tar.gz → 31.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. {solana_agent-31.2.2 → solana_agent-31.2.4}/PKG-INFO +29 -17
  2. {solana_agent-31.2.2 → solana_agent-31.2.4}/README.md +28 -16
  3. {solana_agent-31.2.2 → solana_agent-31.2.4}/pyproject.toml +1 -1
  4. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/adapters/ffmpeg_transcoder.py +101 -8
  5. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/adapters/openai_realtime_ws.py +47 -2
  6. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/services/query.py +243 -154
  7. {solana_agent-31.2.2 → solana_agent-31.2.4}/LICENSE +0 -0
  8. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/__init__.py +0 -0
  9. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/adapters/__init__.py +0 -0
  10. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/adapters/mongodb_adapter.py +0 -0
  11. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/adapters/openai_adapter.py +0 -0
  12. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/adapters/pinecone_adapter.py +0 -0
  13. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/cli.py +0 -0
  14. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/client/__init__.py +0 -0
  15. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/client/solana_agent.py +0 -0
  16. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/domains/__init__.py +0 -0
  17. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/domains/agent.py +0 -0
  18. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/domains/routing.py +0 -0
  19. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/factories/__init__.py +0 -0
  20. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/factories/agent_factory.py +0 -0
  21. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/guardrails/pii.py +0 -0
  22. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/interfaces/__init__.py +0 -0
  23. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/interfaces/client/client.py +0 -0
  24. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/interfaces/guardrails/guardrails.py +0 -0
  25. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/interfaces/plugins/plugins.py +0 -0
  26. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/interfaces/providers/audio.py +0 -0
  27. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/interfaces/providers/data_storage.py +0 -0
  28. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/interfaces/providers/llm.py +0 -0
  29. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/interfaces/providers/memory.py +0 -0
  30. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/interfaces/providers/realtime.py +0 -0
  31. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/interfaces/providers/vector_storage.py +0 -0
  32. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/interfaces/services/agent.py +0 -0
  33. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/interfaces/services/knowledge_base.py +0 -0
  34. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/interfaces/services/query.py +0 -0
  35. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/interfaces/services/routing.py +0 -0
  36. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/plugins/__init__.py +0 -0
  37. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/plugins/manager.py +0 -0
  38. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/plugins/registry.py +0 -0
  39. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/plugins/tools/__init__.py +0 -0
  40. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/plugins/tools/auto_tool.py +0 -0
  41. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/repositories/__init__.py +0 -0
  42. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/repositories/memory.py +0 -0
  43. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/services/__init__.py +0 -0
  44. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/services/agent.py +0 -0
  45. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/services/knowledge_base.py +0 -0
  46. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/services/realtime.py +0 -0
  47. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/services/routing.py +0 -0
--- solana_agent-31.2.2/PKG-INFO
+++ solana_agent-31.2.4/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: solana-agent
-Version: 31.2.2
+Version: 31.2.4
 Summary: AI Agents for Solana
 License: MIT
 Keywords: solana,solana ai,solana agent,ai,ai agent,ai agents
@@ -317,27 +317,39 @@ Due to the overhead of the router (API call) - realtime only supports a single a
 
 Realtime uses MongoDB for memory so Zep is not needed.
 
+This example will work using expo-audio on Android and iOS.
+
 ```python
 from solana_agent import SolanaAgent
 
 solana_agent = SolanaAgent(config=config)
 
-# Example: mobile sends MP4/AAC; server encodes output to AAC
-audio_content = await audio_file.read() # bytes
-async for audio_chunk in solana_agent.process(
-    "user123", # required
-    audio_content, # required
-    realtime=True, # optional (default False)
-    output_format="audio", # required
-    vad=True, # enable VAD (optional)
-    rt_encode_input=True, # accept compressed input (optional)
-    rt_encode_output=True, # encode output for client (optional)
-    rt_voice="marin" # the voice to use for interactions (optional)
-    audio_input_format="mp4", # client transport (optional)
-    audio_output_format="aac" # client transport (optional)
-):
-    handle_audio(audio_chunk)
-```
+audio_content = await audio_file.read()
+
+async def generate():
+    async for chunk in solana_agent.process(
+        user_id=user_id,
+        message=audio_content,
+        realtime=True,
+        rt_encode_input=True,
+        rt_encode_output=True,
+        rt_voice="marin",
+        output_format="audio",
+        audio_output_format="mp3",
+        audio_input_format="mp4",
+    ):
+        yield chunk
+
+return StreamingResponse(
+    content=generate(),
+    media_type="audio/mp3",
+    headers={
+        "Cache-Control": "no-store",
+        "Pragma": "no-cache",
+        "Content-Disposition": "inline; filename=stream.mp3",
+        "X-Accel-Buffering": "no",
+    },
+)
 
 ### Image/Text Streaming
 
--- solana_agent-31.2.2/README.md
+++ solana_agent-31.2.4/README.md
@@ -281,27 +281,39 @@ Due to the overhead of the router (API call) - realtime only supports a single a
 
 Realtime uses MongoDB for memory so Zep is not needed.
 
+This example will work using expo-audio on Android and iOS.
+
 ```python
 from solana_agent import SolanaAgent
 
 solana_agent = SolanaAgent(config=config)
 
-# Example: mobile sends MP4/AAC; server encodes output to AAC
-audio_content = await audio_file.read() # bytes
-async for audio_chunk in solana_agent.process(
-    "user123", # required
-    audio_content, # required
-    realtime=True, # optional (default False)
-    output_format="audio", # required
-    vad=True, # enable VAD (optional)
-    rt_encode_input=True, # accept compressed input (optional)
-    rt_encode_output=True, # encode output for client (optional)
-    rt_voice="marin" # the voice to use for interactions (optional)
-    audio_input_format="mp4", # client transport (optional)
-    audio_output_format="aac" # client transport (optional)
-):
-    handle_audio(audio_chunk)
-```
+audio_content = await audio_file.read()
+
+async def generate():
+    async for chunk in solana_agent.process(
+        user_id=user_id,
+        message=audio_content,
+        realtime=True,
+        rt_encode_input=True,
+        rt_encode_output=True,
+        rt_voice="marin",
+        output_format="audio",
+        audio_output_format="mp3",
+        audio_input_format="mp4",
+    ):
+        yield chunk
+
+return StreamingResponse(
+    content=generate(),
+    media_type="audio/mp3",
+    headers={
+        "Cache-Control": "no-store",
+        "Pragma": "no-cache",
+        "Content-Disposition": "inline; filename=stream.mp3",
+        "X-Accel-Buffering": "no",
+    },
+)
 
 ### Image/Text Streaming
 
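Note: the new README example reads an `UploadFile` and returns a `StreamingResponse` (a Starlette/FastAPI type), but the imports and route definition around it are not shown. A minimal FastAPI wrapper might look like the sketch below; the route path, `config` contents, and parameter names are illustrative assumptions, while the `process(...)` call mirrors the snippet above.

```python
# A minimal FastAPI wrapper around the README snippet above -- a sketch, not
# part of the package. Route path and parameters are hypothetical.
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import StreamingResponse

from solana_agent import SolanaAgent

app = FastAPI()
config = {}  # fill in per the configuration sections earlier in the README
solana_agent = SolanaAgent(config=config)

@app.post("/realtime/audio")
async def realtime_audio(user_id: str, audio_file: UploadFile = File(...)):
    audio_content = await audio_file.read()

    async def generate():
        # Same call shape as the README example above
        async for chunk in solana_agent.process(
            user_id=user_id,
            message=audio_content,
            realtime=True,
            rt_encode_input=True,
            rt_encode_output=True,
            rt_voice="marin",
            output_format="audio",
            audio_output_format="mp3",
            audio_input_format="mp4",
        ):
            yield chunk

    return StreamingResponse(
        content=generate(),
        media_type="audio/mp3",
        headers={
            "Cache-Control": "no-store",
            "X-Accel-Buffering": "no",  # stop nginx from buffering chunks
        },
    )
```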
--- solana_agent-31.2.2/pyproject.toml
+++ solana_agent-31.2.4/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "solana-agent"
-version = "31.2.2"
+version = "31.2.4"
 description = "AI Agents for Solana"
 authors = ["Bevan Hunt <bevan@bevanhunt.com>"]
 license = "MIT"
--- solana_agent-31.2.2/solana_agent/adapters/ffmpeg_transcoder.py
+++ solana_agent-31.2.4/solana_agent/adapters/ffmpeg_transcoder.py
@@ -4,6 +4,8 @@ import asyncio
 import contextlib
 import logging
 from typing import List, AsyncGenerator
+import tempfile
+import os
 
 from solana_agent.interfaces.providers.audio import AudioTranscoder
 
@@ -49,11 +51,45 @@ class FFmpegTranscoder(AudioTranscoder):
             rate_hz,
             len(audio_bytes),
         )
-        # Prefer to hint format for common containers/codecs; ffmpeg can still autodetect if hint is wrong.
-        hinted_format = None
+        # iOS-recorded MP4/M4A often requires a seekable input for reliable demuxing.
+        # Decode from a temporary file instead of stdin for MP4/M4A.
         if input_mime in ("audio/mp4", "audio/m4a"):
-            hinted_format = "mp4"
-        elif input_mime in ("audio/aac",):
+            suffix = ".m4a" if input_mime == "audio/m4a" else ".mp4"
+            tmp_path = None
+            try:
+                with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tf:
+                    tmp_path = tf.name
+                    tf.write(audio_bytes)
+                args = [
+                    "-hide_banner",
+                    "-loglevel",
+                    "error",
+                    "-i",
+                    tmp_path,
+                    "-vn",  # ignore any video tracks
+                    "-acodec",
+                    "pcm_s16le",
+                    "-ac",
+                    "1",
+                    "-ar",
+                    str(rate_hz),
+                    "-f",
+                    "s16le",
+                    "pipe:1",
+                ]
+                out = await self._run_ffmpeg(args, b"")
+                logger.info(
+                    "Transcoded (MP4/M4A temp-file) to PCM16: output_len=%d", len(out)
+                )
+                return out
+            finally:
+                if tmp_path:
+                    with contextlib.suppress(Exception):
+                        os.remove(tmp_path)
+
+        # For other formats, prefer a format hint when helpful and decode from stdin.
+        hinted_format = None
+        if input_mime in ("audio/aac",):
             # Raw AAC is typically in ADTS stream format
             hinted_format = "adts"
         elif input_mime in ("audio/ogg", "audio/webm"):
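The rewritten decode path above works around a real MP4 constraint: the `moov` atom describing the track layout often sits at the end of iOS recordings, and a stdin pipe cannot seek back to it. A standalone sketch of the same temp-file pattern (not the package's `_run_ffmpeg` helper; assumes an `ffmpeg` binary on PATH):

```python
# Standalone sketch of the temp-file decode pattern used in the diff above.
import asyncio
import os
import tempfile

async def decode_mp4_to_pcm16(audio_bytes: bytes, rate_hz: int = 24000) -> bytes:
    tmp_path = None
    try:
        # MP4/M4A demuxing may need a seekable input: the moov atom can sit at
        # the end of the file, which a stdin pipe cannot seek back to.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tf:
            tmp_path = tf.name
            tf.write(audio_bytes)
        proc = await asyncio.create_subprocess_exec(
            "ffmpeg", "-hide_banner", "-loglevel", "error",
            "-i", tmp_path,
            "-vn",                    # drop any video tracks
            "-acodec", "pcm_s16le",   # 16-bit little-endian PCM
            "-ac", "1",               # mono
            "-ar", str(rate_hz),      # resample
            "-f", "s16le", "pipe:1",  # raw PCM to stdout
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        out, err = await proc.communicate()
        if proc.returncode != 0:
            raise RuntimeError(err.decode(errors="ignore"))
        return out
    finally:
        if tmp_path and os.path.exists(tmp_path):
            os.remove(tmp_path)
```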
@@ -88,13 +124,14 @@ class FFmpegTranscoder(AudioTranscoder):
     async def from_pcm16(  # pragma: no cover
         self, pcm16_bytes: bytes, output_mime: str, rate_hz: int
     ) -> bytes:
-        """Encode PCM16LE to desired format (currently AAC ADTS for mobile streaming)."""
+        """Encode PCM16LE to desired format (AAC ADTS, fragmented MP4, or MP3)."""
        logger.info(
            "Encode from PCM16: output_mime=%s, rate_hz=%d, input_len=%d",
            output_mime,
            rate_hz,
            len(pcm16_bytes),
        )
+
         if output_mime in ("audio/mpeg", "audio/mp3"):
             # Encode to MP3 (often better streaming compatibility on mobile)
             args = [
@@ -122,8 +159,9 @@ class FFmpegTranscoder(AudioTranscoder):
                 "Encoded from PCM16 to %s: output_len=%d", output_mime, len(out)
             )
             return out
-        if output_mime in ("audio/aac", "audio/mp4", "audio/m4a"):
-            # Encode to AAC in ADTS stream; clients can play it as AAC.
+
+        if output_mime in ("audio/aac",):
+            # Encode to AAC in ADTS stream; good for streaming over sockets/HTTP chunked
             args = [
                 "-hide_banner",
                 "-loglevel",
@@ -149,6 +187,38 @@ class FFmpegTranscoder(AudioTranscoder):
                 "Encoded from PCM16 to %s: output_len=%d", output_mime, len(out)
             )
             return out
+
+        if output_mime in ("audio/mp4", "audio/m4a"):
+            # Encode to fragmented MP4 (fMP4) with AAC for better iOS compatibility
+            # For streaming, write an initial moov and fragment over stdout.
+            args = [
+                "-hide_banner",
+                "-loglevel",
+                "error",
+                "-f",
+                "s16le",
+                "-ac",
+                "1",
+                "-ar",
+                str(rate_hz),
+                "-i",
+                "pipe:0",
+                "-c:a",
+                "aac",
+                "-b:a",
+                "96k",
+                "-movflags",
+                "+frag_keyframe+empty_moov",
+                "-f",
+                "mp4",
+                "pipe:1",
+            ]
+            out = await self._run_ffmpeg(args, pcm16_bytes)
+            logger.info(
+                "Encoded from PCM16 to %s (fMP4): output_len=%d", output_mime, len(out)
+            )
+            return out
+
         # Default: passthrough
         logger.info("Encode passthrough (no change), output_len=%d", len(pcm16_bytes))
         return pcm16_bytes
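The `-movflags +frag_keyframe+empty_moov` flags are what make this MP4 output streamable: ffmpeg writes an empty `moov` header up front and then emits self-contained `moof`/`mdat` fragments, instead of seeking back to finalize the file (impossible on `pipe:1`). A quick structural check, offered as a debugging sketch (the `stream.mp4` capture file is hypothetical):

```python
# Walks top-level MP4 boxes of a captured stream and prints their order.
# A streamable fMP4 looks like: ftyp, moov (empty), then moof/mdat pairs.
import struct

def mp4_box_types(data: bytes) -> list:
    types, offset = [], 0
    while offset + 8 <= len(data):
        size = struct.unpack(">I", data[offset:offset + 4])[0]
        types.append(data[offset + 4:offset + 8].decode("latin-1"))
        if size < 8:  # size 0 (to-EOF) or 1 (64-bit) -- stop for this sketch
            break
        offset += size
    return types

with open("stream.mp4", "rb") as fh:  # e.g. captured output of from_pcm16
    print(mp4_box_types(fh.read()))   # ['ftyp', 'moov', 'moof', 'mdat', ...]
```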
@@ -187,7 +257,7 @@ class FFmpegTranscoder(AudioTranscoder):
                 "mp3",
                 "pipe:1",
             ]
-        elif output_mime in ("audio/aac", "audio/mp4", "audio/m4a"):
+        elif output_mime in ("audio/aac",):
             args = [
                 "-hide_banner",
                 "-loglevel",
@@ -208,6 +278,29 @@ class FFmpegTranscoder(AudioTranscoder):
                 "adts",
                 "pipe:1",
             ]
+        elif output_mime in ("audio/mp4", "audio/m4a"):
+            args = [
+                "-hide_banner",
+                "-loglevel",
+                "error",
+                "-f",
+                "s16le",
+                "-ac",
+                "1",
+                "-ar",
+                str(rate_hz),
+                "-i",
+                "pipe:0",
+                "-c:a",
+                "aac",
+                "-b:a",
+                "96k",
+                "-movflags",
+                "+frag_keyframe+empty_moov",
+                "-f",
+                "mp4",
+                "pipe:1",
+            ]
         else:
             # Passthrough streaming: just yield input
             async for chunk in pcm_iter:
--- solana_agent-31.2.2/solana_agent/adapters/openai_realtime_ws.py
+++ solana_agent-31.2.4/solana_agent/adapters/openai_realtime_ws.py
@@ -325,7 +325,26 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
                try:
                    chunk = base64.b64decode(b64)
                    self._audio_queue.put_nowait(chunk)
-                    logger.info("Audio delta bytes=%d", len(chunk))
+                    # Ownership/response tagging for diagnostics
+                    try:
+                        owner = getattr(self, "_owner_user_id", None)
+                    except Exception:
+                        owner = None
+                    try:
+                        rid = getattr(self, "_active_response_id", None)
+                    except Exception:
+                        rid = None
+                    try:
+                        gen = int(getattr(self, "_response_generation", 0))
+                    except Exception:
+                        gen = None
+                    logger.info(
+                        "Audio delta bytes=%d owner=%s rid=%s gen=%s",
+                        len(chunk),
+                        owner,
+                        rid,
+                        gen,
+                    )
                    try:
                        # New response detected if we were previously inactive
                        if not getattr(self, "_response_active", False):
@@ -492,8 +511,25 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
                        "response.audio.done",
                    ):
                        # End of audio stream for the response; stop audio iterator but keep WS open for transcripts
+                        try:
+                            owner = getattr(self, "_owner_user_id", None)
+                        except Exception:
+                            owner = None
+                        try:
+                            rid = (data.get("response") or {}).get("id") or getattr(
+                                self, "_active_response_id", None
+                            )
+                        except Exception:
+                            rid = None
+                        try:
+                            gen = int(getattr(self, "_response_generation", 0))
+                        except Exception:
+                            gen = None
                        logger.info(
-                            "Realtime WS: output audio done; ending audio stream"
+                            "Realtime WS: output audio done; owner=%s rid=%s gen=%s",
+                            owner,
+                            rid,
+                            gen,
                        )
                        # If we have a buffered transcript for this response, flush it now
                        try:
@@ -1001,6 +1037,15 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
        if "tools" in patch:
            patch["tools"] = _strip_tool_strict(patch["tools"])  # idempotent
 
+        # Per server requirements, always include session.type and output_modalities
+        try:
+            patch["type"] = "realtime"
+            # Preserve caller-provided output_modalities if present, otherwise default to audio
+            if "output_modalities" not in patch:
+                patch["output_modalities"] = ["audio"]
+        except Exception:
+            pass
+
        payload = {"type": "session.update", "session": patch}
        # Mark awaiting updated and store last patch
        self._last_session_patch = patch or {}
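With the change above, every `session.update` sent by the adapter now carries `type: "realtime"` and, when the caller did not set one, a default `output_modalities`. An illustrative wire payload (values are examples, not captured traffic):

```python
# Shape of the session.update payload after the patch above is applied.
payload = {
    "type": "session.update",
    "session": {
        "type": "realtime",              # always injected now
        "output_modalities": ["audio"],  # defaulted only when caller omits it
        "voice": "marin",                # example caller-provided fields
        "instructions": "...",
    },
}
```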
--- solana_agent-31.2.2/solana_agent/services/query.py
+++ solana_agent-31.2.4/solana_agent/services/query.py
@@ -67,10 +67,117 @@ class QueryService(QueryServiceInterface):
        self._sticky_sessions: Dict[str, Dict[str, Any]] = {}
        # Optional realtime service attached by factory (populated in factory)
        self.realtime = None  # type: ignore[attr-defined]
-        # Persistent realtime WS per user for push-to-talk reuse
-        self._rt_services = {}
+        # Persistent realtime WS pool per user for reuse across turns/devices
+        # { user_id: [RealtimeService, ...] }
+        self._rt_services: Dict[str, List[Any]] = {}
+        # Global lock for creating/finding per-user sessions
        self._rt_lock = asyncio.Lock()
 
+    async def _try_acquire_lock(self, lock: asyncio.Lock) -> bool:
+        try:
+            await asyncio.wait_for(lock.acquire(), timeout=0)
+            return True
+        except asyncio.TimeoutError:
+            return False
+        except Exception:
+            return False
+
+    async def _alloc_realtime_session(
+        self,
+        user_id: str,
+        *,
+        api_key: str,
+        rt_voice: str,
+        final_instructions: str,
+        initial_tools: Optional[List[Dict[str, Any]]],
+        encode_in: bool,
+        encode_out: bool,
+        audio_input_format: str,
+        audio_output_format: str,
+    ) -> Any:
+        """Get a free (or new) realtime session for this user. Marks it busy via an internal lock.
+
+        Returns the RealtimeService with an acquired _in_use_lock that MUST be released by caller.
+        """
+        from solana_agent.interfaces.providers.realtime import (
+            RealtimeSessionOptions,
+        )
+        from solana_agent.adapters.openai_realtime_ws import (
+            OpenAIRealtimeWebSocketSession,
+        )
+        from solana_agent.adapters.ffmpeg_transcoder import FFmpegTranscoder
+
+        def _mime_from(fmt: str) -> str:
+            f = (fmt or "").lower()
+            return {
+                "aac": "audio/aac",
+                "mp3": "audio/mpeg",
+                "mp4": "audio/mp4",
+                "m4a": "audio/mp4",
+                "mpeg": "audio/mpeg",
+                "mpga": "audio/mpeg",
+                "wav": "audio/wav",
+                "flac": "audio/flac",
+                "opus": "audio/opus",
+                "ogg": "audio/ogg",
+                "webm": "audio/webm",
+                "pcm": "audio/pcm",
+            }.get(f, "audio/pcm")
+
+        async with self._rt_lock:
+            pool = self._rt_services.get(user_id) or []
+            # Try to reuse an idle session strictly owned by this user
+            for rt in pool:
+                # Extra safety: never reuse a session from another user
+                owner = getattr(rt, "_owner_user_id", None)
+                if owner is not None and owner != user_id:
+                    continue
+                lock = getattr(rt, "_in_use_lock", None)
+                if lock is None:
+                    lock = asyncio.Lock()
+                    setattr(rt, "_in_use_lock", lock)
+                if not lock.locked():
+                    if await self._try_acquire_lock(lock):
+                        return rt
+            # None free: create a new session
+            opts = RealtimeSessionOptions(
+                model="gpt-realtime",
+                voice=rt_voice,
+                vad_enabled=False,
+                input_rate_hz=24000,
+                output_rate_hz=24000,
+                input_mime="audio/pcm",
+                output_mime="audio/pcm",
+                tools=initial_tools or None,
+                tool_choice="auto",
+            )
+            try:
+                opts.instructions = final_instructions
+                opts.voice = rt_voice
+            except Exception:
+                pass
+            conv_session = OpenAIRealtimeWebSocketSession(api_key=api_key, options=opts)
+            transcoder = FFmpegTranscoder() if (encode_in or encode_out) else None
+            from solana_agent.services.realtime import RealtimeService
+
+            rt = RealtimeService(
+                session=conv_session,
+                options=opts,
+                transcoder=transcoder,
+                accept_compressed_input=encode_in,
+                client_input_mime=_mime_from(audio_input_format),
+                encode_output=encode_out,
+                client_output_mime=_mime_from(audio_output_format),
+            )
+            # Tag ownership to prevent any cross-user reuse
+            setattr(rt, "_owner_user_id", user_id)
+            setattr(rt, "_in_use_lock", asyncio.Lock())
+            # Mark busy
+            await getattr(rt, "_in_use_lock").acquire()
+            pool.append(rt)
+            self._rt_services[user_id] = pool
+            return rt
+
    def _get_sticky_agent(self, user_id: str) -> Optional[str]:
        sess = self._sticky_sessions.get(user_id)
        return sess.get("agent") if isinstance(sess, dict) else None
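The allocator holds the global `_rt_lock` only while scanning or creating, then hands back a session whose per-session `_in_use_lock` is already acquired, so two concurrent turns for one user get two distinct sessions. A self-contained model of that acquire/release discipline (a sketch mirroring the diff's names, not the package's classes):

```python
# Miniature model of the per-user session pool's locking pattern.
import asyncio

class Pool:
    def __init__(self):
        self._pool = {}               # user_id -> list of sessions
        self._lock = asyncio.Lock()   # held only while scanning/creating

    async def acquire(self, user_id: str):
        async with self._lock:
            pool = self._pool.setdefault(user_id, [])
            for s in pool:
                if not s["busy"].locked():          # idle session: reuse it
                    await s["busy"].acquire()
                    return s
            s = {"id": len(pool), "busy": asyncio.Lock()}  # none free: new one
            await s["busy"].acquire()               # mark busy before returning
            pool.append(s)
            return s

async def main():
    p = Pool()
    a, b = await asyncio.gather(p.acquire("u1"), p.acquire("u1"))
    print(a["id"], b["id"])  # two concurrent turns -> two distinct sessions
    a["busy"].release()
    c = await p.acquire("u1")
    print(c["id"])           # released session is reused

asyncio.run(main())
```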
@@ -554,14 +661,7 @@
            final_instructions = "\n\n".join([p for p in parts if p])
 
            # 4) Open a single WS session for assistant audio
-            from solana_agent.adapters.openai_realtime_ws import (
-                OpenAIRealtimeWebSocketSession,
-            )
-            from solana_agent.interfaces.providers.realtime import (
-                RealtimeSessionOptions,
-            )
-            from solana_agent.services.realtime import RealtimeService
-            from solana_agent.adapters.ffmpeg_transcoder import FFmpegTranscoder
+            # Realtime imports handled inside allocator helper
 
            api_key = None
            try:
@@ -600,171 +700,160 @@
                or (is_audio_bytes and audio_input_format.lower() != "pcm")
            )
 
-            async with self._rt_lock:
-                rt = self._rt_services.get(user_id)
-                if not rt or not isinstance(rt, RealtimeService):
-                    opts = RealtimeSessionOptions(
-                        model="gpt-realtime",
-                        voice=rt_voice,
-                        vad_enabled=False,  # no input audio
-                        input_rate_hz=24000,
-                        output_rate_hz=24000,
-                        input_mime="audio/pcm",
-                        output_mime="audio/pcm",
-                        tools=initial_tools or None,
-                        tool_choice="auto",
-                    )
-                    # Ensure initial session.update carries instructions/voice
+            # Allocate or reuse a realtime session for this specific request/user
+            rt = await self._alloc_realtime_session(
+                user_id,
+                api_key=api_key,
+                rt_voice=rt_voice,
+                final_instructions=final_instructions,
+                initial_tools=initial_tools,
+                encode_in=encode_in,
+                encode_out=encode_out,
+                audio_input_format=audio_input_format,
+                audio_output_format=audio_output_format,
+            )
+            # Ensure lock is released no matter what
+            try:
+                # Tool executor
+                async def _exec(
+                    tool_name: str, args: Dict[str, Any]
+                ) -> Dict[str, Any]:
                    try:
-                        opts.instructions = final_instructions
-                        opts.voice = rt_voice
-                    except Exception:
-                        pass
-                    conv_session = OpenAIRealtimeWebSocketSession(
-                        api_key=api_key, options=opts
-                    )
-                    transcoder = (
-                        FFmpegTranscoder() if (encode_in or encode_out) else None
-                    )
-                    rt = RealtimeService(
-                        session=conv_session,
-                        options=opts,
-                        transcoder=transcoder,
-                        accept_compressed_input=encode_in,
-                        client_input_mime=_mime_from(audio_input_format),
-                        encode_output=encode_out,
-                        client_output_mime=_mime_from(audio_output_format),
-                    )
-                    self._rt_services[user_id] = rt
+                        return await self.agent_service.execute_tool(
+                            agent_name, tool_name, args or {}
+                        )
+                    except Exception as e:
+                        return {"status": "error", "message": str(e)}
 
-            # Tool executor
-            async def _exec(tool_name: str, args: Dict[str, Any]) -> Dict[str, Any]:
+                # If possible, set on underlying session
                try:
-                    return await self.agent_service.execute_tool(
-                        agent_name, tool_name, args or {}
-                    )
-                except Exception as e:
-                    return {"status": "error", "message": str(e)}
-
-            # If possible, set on underlying session
-            try:
-                if hasattr(rt, "_session"):
-                    getattr(rt, "_session").set_tool_executor(_exec)  # type: ignore[attr-defined]
-            except Exception:
-                pass
-
-            # Connect/configure
-            if not getattr(rt, "_connected", False):
-                await rt.start()
-            await rt.configure(
-                voice=rt_voice,
-                vad_enabled=bool(vad) if vad is not None else False,
-                instructions=final_instructions,
-                tools=initial_tools or None,
-                tool_choice="auto",
-            )
+                    if hasattr(rt, "_session"):
+                        getattr(rt, "_session").set_tool_executor(_exec)  # type: ignore[attr-defined]
+                except Exception:
+                    pass
 
-            # Ensure clean input buffers for this turn
-            try:
-                await rt.clear_input()
-            except Exception:
-                pass
-            # Also reset any leftover output audio so new turn doesn't replay old chunks
-            try:
-                if hasattr(rt, "reset_output_stream"):
-                    rt.reset_output_stream()
-            except Exception:
-                pass
+                # Connect/configure
+                if not getattr(rt, "_connected", False):
+                    await rt.start()
+                await rt.configure(
+                    voice=rt_voice,
+                    vad_enabled=bool(vad) if vad is not None else False,
+                    instructions=final_instructions,
+                    tools=initial_tools or None,
+                    tool_choice="auto",
+                )
 
-            # Persist once per turn
-            turn_id = await self.realtime_begin_turn(user_id)
-            if turn_id and user_text:
+                # Ensure clean input buffers for this turn
+                try:
+                    await rt.clear_input()
+                except Exception:
+                    pass
+                # Also reset any leftover output audio so new turn doesn't replay old chunks
                try:
-                    await self.realtime_update_user(user_id, turn_id, user_text)
+                    if hasattr(rt, "reset_output_stream"):
+                        rt.reset_output_stream()
                except Exception:
                    pass
 
-            # Feed audio into WS if audio bytes provided; else use input_text
-            if is_audio_bytes:
-                bq = bytes(query)
-                logger.info(
-                    "Realtime: appending input audio to WS via FFmpeg, len=%d, fmt=%s",
-                    len(bq),
-                    audio_input_format,
-                )
-                await rt.append_audio(bq)
-                vad_enabled_value = bool(vad) if vad is not None else False
-                if not vad_enabled_value:
-                    await rt.commit_input()
-                    # Manually trigger response when VAD is disabled
-                    await rt.create_response({})
+                # Persist once per turn
+                turn_id = await self.realtime_begin_turn(user_id)
+                if turn_id and user_text:
+                    try:
+                        await self.realtime_update_user(user_id, turn_id, user_text)
+                    except Exception:
+                        pass
+
+                # Feed audio into WS if audio bytes provided; else use input_text
+                if is_audio_bytes:
+                    bq = bytes(query)
+                    logger.info(
+                        "Realtime: appending input audio to WS via FFmpeg, len=%d, fmt=%s",
+                        len(bq),
+                        audio_input_format,
+                    )
+                    await rt.append_audio(bq)
+                    vad_enabled_value = bool(vad) if vad is not None else False
+                    if not vad_enabled_value:
+                        await rt.commit_input()
+                        # Manually trigger response when VAD is disabled
+                        await rt.create_response({})
+                    else:
+                        # With server VAD enabled, the model will auto-create a response at end of speech
+                        logger.debug(
+                            "Realtime: VAD enabled — skipping manual response.create"
+                        )
                else:
-                    # With server VAD enabled, the model will auto-create a response at end of speech
-                    logger.debug(
-                        "Realtime: VAD enabled — skipping manual response.create"
+                    # Rely on configured session voice; attach input_text only
+                    await rt.create_response(
+                        {
+                            "modalities": ["audio"],
+                            "input": [
+                                {"type": "input_text", "text": user_text or ""}
+                            ],
+                        }
                    )
-            else:
-                # Rely on configured session voice; attach input_text only
-                await rt.create_response(
-                    {
-                        "modalities": ["audio"],
-                        "input": [{"type": "input_text", "text": user_text or ""}],
-                    }
-                )
 
-            # Collect audio and transcripts
-            user_tr = ""
-            asst_tr = ""
+                # Collect audio and transcripts
+                user_tr = ""
+                asst_tr = ""
 
-            async def _drain_in_tr():
-                nonlocal user_tr
-                async for t in rt.iter_input_transcript():
-                    if t:
-                        user_tr += t
+                async def _drain_in_tr():
+                    nonlocal user_tr
+                    async for t in rt.iter_input_transcript():
+                        if t:
+                            user_tr += t
 
-            async def _drain_out_tr():
-                nonlocal asst_tr
-                async for t in rt.iter_output_transcript():
-                    if t:
-                        asst_tr += t
+                async def _drain_out_tr():
+                    nonlocal asst_tr
+                    async for t in rt.iter_output_transcript():
+                        if t:
+                            asst_tr += t
 
-            in_task = asyncio.create_task(_drain_in_tr())
-            out_task = asyncio.create_task(_drain_out_tr())
-            try:
-                async for audio_chunk in rt.iter_output_audio_encoded():
-                    yield audio_chunk
-            finally:
-                in_task.cancel()
-                out_task.cancel()
-                # If no WS input transcript was captured, fall back to HTTP STT result
-                if not user_tr:
-                    try:
-                        if "stt_task" in locals() and stt_task is not None:
-                            user_tr = await stt_task
-                    except Exception:
-                        pass
-                if turn_id:
-                    try:
-                        if user_tr:
-                            await self.realtime_update_user(
-                                user_id, turn_id, user_tr
-                            )
-                        if asst_tr:
-                            await self.realtime_update_assistant(
-                                user_id, turn_id, asst_tr
-                            )
-                    except Exception:
-                        pass
+                in_task = asyncio.create_task(_drain_in_tr())
+                out_task = asyncio.create_task(_drain_out_tr())
+                try:
+                    async for audio_chunk in rt.iter_output_audio_encoded():
+                        yield audio_chunk
+                finally:
+                    in_task.cancel()
+                    out_task.cancel()
+                    # If no WS input transcript was captured, fall back to HTTP STT result
+                    if not user_tr:
+                        try:
+                            if "stt_task" in locals() and stt_task is not None:
+                                user_tr = await stt_task
+                        except Exception:
+                            pass
+                    if turn_id:
+                        try:
+                            if user_tr:
+                                await self.realtime_update_user(
+                                    user_id, turn_id, user_tr
+                                )
+                            if asst_tr:
+                                await self.realtime_update_assistant(
+                                    user_id, turn_id, asst_tr
+                                )
+                        except Exception:
+                            pass
+                        try:
+                            await self.realtime_finalize_turn(user_id, turn_id)
+                        except Exception:
+                            pass
+                    # Clear input buffer for next turn reuse
                    try:
-                        await self.realtime_finalize_turn(user_id, turn_id)
+                        await rt.clear_input()
                    except Exception:
                        pass
-                # Clear input buffer for next turn reuse
+            finally:
+                # Always release the session for reuse by other concurrent requests/devices
                try:
-                    await rt.clear_input()
+                    lock = getattr(rt, "_in_use_lock", None)
+                    if lock and lock.locked():
+                        lock.release()
                except Exception:
                    pass
-                return
+            return
 
            # 1) Transcribe audio or accept text
            user_text = ""
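Net effect of this restructuring: `_rt_lock` is no longer held for the whole turn, and the outer try/finally guarantees the pooled session's `_in_use_lock` is released even if streaming fails, so requests for the same user can proceed in parallel. A caller-side sketch of the concurrency this enables (payload variables are hypothetical; the `process(...)` call follows the README example):

```python
# Two devices streaming turns for one user no longer serialize on a global
# lock: each turn gets its own pooled session and releases it when done.
import asyncio

async def one_turn(agent, user_id: str, audio: bytes) -> bytes:
    out = bytearray()
    async for chunk in agent.process(
        user_id=user_id,
        message=audio,
        realtime=True,
        rt_encode_input=True,
        rt_encode_output=True,
        output_format="audio",
        audio_input_format="mp4",
        audio_output_format="mp3",
    ):
        out.extend(chunk)
    return bytes(out)

# await asyncio.gather(
#     one_turn(solana_agent, "user123", device_a_bytes),
#     one_turn(solana_agent, "user123", device_b_bytes),
# )
```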