solana-agent 31.2.2__tar.gz → 31.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. {solana_agent-31.2.2 → solana_agent-31.2.4}/PKG-INFO +29 -17
  2. {solana_agent-31.2.2 → solana_agent-31.2.4}/README.md +28 -16
  3. {solana_agent-31.2.2 → solana_agent-31.2.4}/pyproject.toml +1 -1
  4. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/adapters/ffmpeg_transcoder.py +101 -8
  5. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/adapters/openai_realtime_ws.py +47 -2
  6. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/services/query.py +243 -154
  7. {solana_agent-31.2.2 → solana_agent-31.2.4}/LICENSE +0 -0
  8. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/__init__.py +0 -0
  9. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/adapters/__init__.py +0 -0
  10. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/adapters/mongodb_adapter.py +0 -0
  11. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/adapters/openai_adapter.py +0 -0
  12. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/adapters/pinecone_adapter.py +0 -0
  13. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/cli.py +0 -0
  14. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/client/__init__.py +0 -0
  15. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/client/solana_agent.py +0 -0
  16. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/domains/__init__.py +0 -0
  17. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/domains/agent.py +0 -0
  18. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/domains/routing.py +0 -0
  19. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/factories/__init__.py +0 -0
  20. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/factories/agent_factory.py +0 -0
  21. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/guardrails/pii.py +0 -0
  22. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/interfaces/__init__.py +0 -0
  23. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/interfaces/client/client.py +0 -0
  24. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/interfaces/guardrails/guardrails.py +0 -0
  25. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/interfaces/plugins/plugins.py +0 -0
  26. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/interfaces/providers/audio.py +0 -0
  27. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/interfaces/providers/data_storage.py +0 -0
  28. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/interfaces/providers/llm.py +0 -0
  29. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/interfaces/providers/memory.py +0 -0
  30. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/interfaces/providers/realtime.py +0 -0
  31. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/interfaces/providers/vector_storage.py +0 -0
  32. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/interfaces/services/agent.py +0 -0
  33. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/interfaces/services/knowledge_base.py +0 -0
  34. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/interfaces/services/query.py +0 -0
  35. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/interfaces/services/routing.py +0 -0
  36. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/plugins/__init__.py +0 -0
  37. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/plugins/manager.py +0 -0
  38. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/plugins/registry.py +0 -0
  39. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/plugins/tools/__init__.py +0 -0
  40. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/plugins/tools/auto_tool.py +0 -0
  41. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/repositories/__init__.py +0 -0
  42. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/repositories/memory.py +0 -0
  43. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/services/__init__.py +0 -0
  44. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/services/agent.py +0 -0
  45. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/services/knowledge_base.py +0 -0
  46. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/services/realtime.py +0 -0
  47. {solana_agent-31.2.2 → solana_agent-31.2.4}/solana_agent/services/routing.py +0 -0
--- solana_agent-31.2.2/PKG-INFO
+++ solana_agent-31.2.4/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: solana-agent
-Version: 31.2.2
+Version: 31.2.4
 Summary: AI Agents for Solana
 License: MIT
 Keywords: solana,solana ai,solana agent,ai,ai agent,ai agents
@@ -317,27 +317,39 @@ Due to the overhead of the router (API call) - realtime only supports a single a
 
 Realtime uses MongoDB for memory so Zep is not needed.
 
+This example will work using expo-audio on Android and iOS.
+
 ```python
 from solana_agent import SolanaAgent
 
 solana_agent = SolanaAgent(config=config)
 
-# Example: mobile sends MP4/AAC; server encodes output to AAC
-audio_content = await audio_file.read() # bytes
-async for audio_chunk in solana_agent.process(
-    "user123", # required
-    audio_content, # required
-    realtime=True, # optional (default False)
-    output_format="audio", # required
-    vad=True, # enable VAD (optional)
-    rt_encode_input=True, # accept compressed input (optional)
-    rt_encode_output=True, # encode output for client (optional)
-    rt_voice="marin" # the voice to use for interactions (optional)
-    audio_input_format="mp4", # client transport (optional)
-    audio_output_format="aac" # client transport (optional)
-):
-    handle_audio(audio_chunk)
-```
+audio_content = await audio_file.read()
+
+async def generate():
+    async for chunk in solana_agent.process(
+        user_id=user_id,
+        message=audio_content,
+        realtime=True,
+        rt_encode_input=True,
+        rt_encode_output=True,
+        rt_voice="marin",
+        output_format="audio",
+        audio_output_format="mp3",
+        audio_input_format="mp4",
+    ):
+        yield chunk
+
+return StreamingResponse(
+    content=generate(),
+    media_type="audio/mp3",
+    headers={
+        "Cache-Control": "no-store",
+        "Pragma": "no-cache",
+        "Content-Disposition": "inline; filename=stream.mp3",
+        "X-Accel-Buffering": "no",
+    },
+)
 
 ### Image/Text Streaming
 
--- solana_agent-31.2.2/README.md
+++ solana_agent-31.2.4/README.md
@@ -281,27 +281,39 @@ Due to the overhead of the router (API call) - realtime only supports a single a
 
 Realtime uses MongoDB for memory so Zep is not needed.
 
+This example will work using expo-audio on Android and iOS.
+
 ```python
 from solana_agent import SolanaAgent
 
 solana_agent = SolanaAgent(config=config)
 
-# Example: mobile sends MP4/AAC; server encodes output to AAC
-audio_content = await audio_file.read() # bytes
-async for audio_chunk in solana_agent.process(
-    "user123", # required
-    audio_content, # required
-    realtime=True, # optional (default False)
-    output_format="audio", # required
-    vad=True, # enable VAD (optional)
-    rt_encode_input=True, # accept compressed input (optional)
-    rt_encode_output=True, # encode output for client (optional)
-    rt_voice="marin" # the voice to use for interactions (optional)
-    audio_input_format="mp4", # client transport (optional)
-    audio_output_format="aac" # client transport (optional)
-):
-    handle_audio(audio_chunk)
-```
+audio_content = await audio_file.read()
+
+async def generate():
+    async for chunk in solana_agent.process(
+        user_id=user_id,
+        message=audio_content,
+        realtime=True,
+        rt_encode_input=True,
+        rt_encode_output=True,
+        rt_voice="marin",
+        output_format="audio",
+        audio_output_format="mp3",
+        audio_input_format="mp4",
+    ):
+        yield chunk
+
+return StreamingResponse(
+    content=generate(),
+    media_type="audio/mp3",
+    headers={
+        "Cache-Control": "no-store",
+        "Pragma": "no-cache",
+        "Content-Disposition": "inline; filename=stream.mp3",
+        "X-Accel-Buffering": "no",
+    },
+)
 
 ### Image/Text Streaming
 
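Note: the new README example reads an `UploadFile` and returns a `StreamingResponse` (a Starlette/FastAPI type), but the imports and route definition around it are not shown. A minimal FastAPI wrapper might look like the sketch below; the route path, `config` contents, and parameter names are illustrative assumptions, while the `process(...)` call mirrors the snippet above.

```python
# A minimal FastAPI wrapper around the README snippet above -- a sketch, not
# part of the package. Route path and parameters are hypothetical.
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import StreamingResponse

from solana_agent import SolanaAgent

app = FastAPI()
config = {}  # fill in per the configuration sections earlier in the README
solana_agent = SolanaAgent(config=config)

@app.post("/realtime/audio")
async def realtime_audio(user_id: str, audio_file: UploadFile = File(...)):
    audio_content = await audio_file.read()

    async def generate():
        # Same call shape as the README example above
        async for chunk in solana_agent.process(
            user_id=user_id,
            message=audio_content,
            realtime=True,
            rt_encode_input=True,
            rt_encode_output=True,
            rt_voice="marin",
            output_format="audio",
            audio_output_format="mp3",
            audio_input_format="mp4",
        ):
            yield chunk

    return StreamingResponse(
        content=generate(),
        media_type="audio/mp3",
        headers={
            "Cache-Control": "no-store",
            "X-Accel-Buffering": "no",  # stop nginx from buffering chunks
        },
    )
```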
--- solana_agent-31.2.2/pyproject.toml
+++ solana_agent-31.2.4/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "solana-agent"
-version = "31.2.2"
+version = "31.2.4"
 description = "AI Agents for Solana"
 authors = ["Bevan Hunt <bevan@bevanhunt.com>"]
 license = "MIT"
--- solana_agent-31.2.2/solana_agent/adapters/ffmpeg_transcoder.py
+++ solana_agent-31.2.4/solana_agent/adapters/ffmpeg_transcoder.py
@@ -4,6 +4,8 @@ import asyncio
 import contextlib
 import logging
 from typing import List, AsyncGenerator
+import tempfile
+import os
 
 from solana_agent.interfaces.providers.audio import AudioTranscoder
 
@@ -49,11 +51,45 @@ class FFmpegTranscoder(AudioTranscoder):
             rate_hz,
             len(audio_bytes),
         )
-        # Prefer to hint format for common containers/codecs; ffmpeg can still autodetect if hint is wrong.
-        hinted_format = None
+        # iOS-recorded MP4/M4A often requires a seekable input for reliable demuxing.
+        # Decode from a temporary file instead of stdin for MP4/M4A.
         if input_mime in ("audio/mp4", "audio/m4a"):
-            hinted_format = "mp4"
-        elif input_mime in ("audio/aac",):
+            suffix = ".m4a" if input_mime == "audio/m4a" else ".mp4"
+            tmp_path = None
+            try:
+                with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tf:
+                    tmp_path = tf.name
+                    tf.write(audio_bytes)
+                args = [
+                    "-hide_banner",
+                    "-loglevel",
+                    "error",
+                    "-i",
+                    tmp_path,
+                    "-vn",  # ignore any video tracks
+                    "-acodec",
+                    "pcm_s16le",
+                    "-ac",
+                    "1",
+                    "-ar",
+                    str(rate_hz),
+                    "-f",
+                    "s16le",
+                    "pipe:1",
+                ]
+                out = await self._run_ffmpeg(args, b"")
+                logger.info(
+                    "Transcoded (MP4/M4A temp-file) to PCM16: output_len=%d", len(out)
+                )
+                return out
+            finally:
+                if tmp_path:
+                    with contextlib.suppress(Exception):
+                        os.remove(tmp_path)
+
+        # For other formats, prefer a format hint when helpful and decode from stdin.
+        hinted_format = None
+        if input_mime in ("audio/aac",):
             # Raw AAC is typically in ADTS stream format
             hinted_format = "adts"
         elif input_mime in ("audio/ogg", "audio/webm"):
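The rewritten decode path above works around a real MP4 constraint: the `moov` atom describing the track layout often sits at the end of iOS recordings, and a stdin pipe cannot seek back to it. A standalone sketch of the same temp-file pattern (not the package's `_run_ffmpeg` helper; assumes an `ffmpeg` binary on PATH):

```python
# Standalone sketch of the temp-file decode pattern used in the diff above.
import asyncio
import os
import tempfile

async def decode_mp4_to_pcm16(audio_bytes: bytes, rate_hz: int = 24000) -> bytes:
    tmp_path = None
    try:
        # MP4/M4A demuxing may need a seekable input: the moov atom can sit at
        # the end of the file, which a stdin pipe cannot seek back to.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tf:
            tmp_path = tf.name
            tf.write(audio_bytes)
        proc = await asyncio.create_subprocess_exec(
            "ffmpeg", "-hide_banner", "-loglevel", "error",
            "-i", tmp_path,
            "-vn",                    # drop any video tracks
            "-acodec", "pcm_s16le",   # 16-bit little-endian PCM
            "-ac", "1",               # mono
            "-ar", str(rate_hz),      # resample
            "-f", "s16le", "pipe:1",  # raw PCM to stdout
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        out, err = await proc.communicate()
        if proc.returncode != 0:
            raise RuntimeError(err.decode(errors="ignore"))
        return out
    finally:
        if tmp_path and os.path.exists(tmp_path):
            os.remove(tmp_path)
```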
@@ -88,13 +124,14 @@ class FFmpegTranscoder(AudioTranscoder):
     async def from_pcm16(  # pragma: no cover
         self, pcm16_bytes: bytes, output_mime: str, rate_hz: int
     ) -> bytes:
-        """Encode PCM16LE to desired format (currently AAC ADTS for mobile streaming)."""
+        """Encode PCM16LE to desired format (AAC ADTS, fragmented MP4, or MP3)."""
        logger.info(
            "Encode from PCM16: output_mime=%s, rate_hz=%d, input_len=%d",
            output_mime,
            rate_hz,
            len(pcm16_bytes),
        )
+
         if output_mime in ("audio/mpeg", "audio/mp3"):
             # Encode to MP3 (often better streaming compatibility on mobile)
             args = [
@@ -122,8 +159,9 @@ class FFmpegTranscoder(AudioTranscoder):
                 "Encoded from PCM16 to %s: output_len=%d", output_mime, len(out)
             )
             return out
-        if output_mime in ("audio/aac", "audio/mp4", "audio/m4a"):
-            # Encode to AAC in ADTS stream; clients can play it as AAC.
+
+        if output_mime in ("audio/aac",):
+            # Encode to AAC in ADTS stream; good for streaming over sockets/HTTP chunked
             args = [
                 "-hide_banner",
                 "-loglevel",
@@ -149,6 +187,38 @@ class FFmpegTranscoder(AudioTranscoder):
                 "Encoded from PCM16 to %s: output_len=%d", output_mime, len(out)
             )
             return out
+
+        if output_mime in ("audio/mp4", "audio/m4a"):
+            # Encode to fragmented MP4 (fMP4) with AAC for better iOS compatibility
+            # For streaming, write an initial moov and fragment over stdout.
+            args = [
+                "-hide_banner",
+                "-loglevel",
+                "error",
+                "-f",
+                "s16le",
+                "-ac",
+                "1",
+                "-ar",
+                str(rate_hz),
+                "-i",
+                "pipe:0",
+                "-c:a",
+                "aac",
+                "-b:a",
+                "96k",
+                "-movflags",
+                "+frag_keyframe+empty_moov",
+                "-f",
+                "mp4",
+                "pipe:1",
+            ]
+            out = await self._run_ffmpeg(args, pcm16_bytes)
+            logger.info(
+                "Encoded from PCM16 to %s (fMP4): output_len=%d", output_mime, len(out)
+            )
+            return out
+
         # Default: passthrough
         logger.info("Encode passthrough (no change), output_len=%d", len(pcm16_bytes))
         return pcm16_bytes
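The `-movflags +frag_keyframe+empty_moov` flags are what make this MP4 output streamable: ffmpeg writes an empty `moov` header up front and then emits self-contained `moof`/`mdat` fragments, instead of seeking back to finalize the file (impossible on `pipe:1`). A quick structural check, offered as a debugging sketch (the `stream.mp4` capture file is hypothetical):

```python
# Walks top-level MP4 boxes of a captured stream and prints their order.
# A streamable fMP4 looks like: ftyp, moov (empty), then moof/mdat pairs.
import struct

def mp4_box_types(data: bytes) -> list:
    types, offset = [], 0
    while offset + 8 <= len(data):
        size = struct.unpack(">I", data[offset:offset + 4])[0]
        types.append(data[offset + 4:offset + 8].decode("latin-1"))
        if size < 8:  # size 0 (to-EOF) or 1 (64-bit) -- stop for this sketch
            break
        offset += size
    return types

with open("stream.mp4", "rb") as fh:  # e.g. captured output of from_pcm16
    print(mp4_box_types(fh.read()))   # ['ftyp', 'moov', 'moof', 'mdat', ...]
```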
@@ -187,7 +257,7 @@ class FFmpegTranscoder(AudioTranscoder):
                 "mp3",
                 "pipe:1",
             ]
-        elif output_mime in ("audio/aac", "audio/mp4", "audio/m4a"):
+        elif output_mime in ("audio/aac",):
             args = [
                 "-hide_banner",
                 "-loglevel",
@@ -208,6 +278,29 @@ class FFmpegTranscoder(AudioTranscoder):
                 "adts",
                 "pipe:1",
             ]
+        elif output_mime in ("audio/mp4", "audio/m4a"):
+            args = [
+                "-hide_banner",
+                "-loglevel",
+                "error",
+                "-f",
+                "s16le",
+                "-ac",
+                "1",
+                "-ar",
+                str(rate_hz),
+                "-i",
+                "pipe:0",
+                "-c:a",
+                "aac",
+                "-b:a",
+                "96k",
+                "-movflags",
+                "+frag_keyframe+empty_moov",
+                "-f",
+                "mp4",
+                "pipe:1",
+            ]
         else:
             # Passthrough streaming: just yield input
             async for chunk in pcm_iter:
--- solana_agent-31.2.2/solana_agent/adapters/openai_realtime_ws.py
+++ solana_agent-31.2.4/solana_agent/adapters/openai_realtime_ws.py
@@ -325,7 +325,26 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
                try:
                    chunk = base64.b64decode(b64)
                    self._audio_queue.put_nowait(chunk)
-                    logger.info("Audio delta bytes=%d", len(chunk))
+                    # Ownership/response tagging for diagnostics
+                    try:
+                        owner = getattr(self, "_owner_user_id", None)
+                    except Exception:
+                        owner = None
+                    try:
+                        rid = getattr(self, "_active_response_id", None)
+                    except Exception:
+                        rid = None
+                    try:
+                        gen = int(getattr(self, "_response_generation", 0))
+                    except Exception:
+                        gen = None
+                    logger.info(
+                        "Audio delta bytes=%d owner=%s rid=%s gen=%s",
+                        len(chunk),
+                        owner,
+                        rid,
+                        gen,
+                    )
                    try:
                        # New response detected if we were previously inactive
                        if not getattr(self, "_response_active", False):
@@ -492,8 +511,25 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
                        "response.audio.done",
                    ):
                        # End of audio stream for the response; stop audio iterator but keep WS open for transcripts
+                        try:
+                            owner = getattr(self, "_owner_user_id", None)
+                        except Exception:
+                            owner = None
+                        try:
+                            rid = (data.get("response") or {}).get("id") or getattr(
+                                self, "_active_response_id", None
+                            )
+                        except Exception:
+                            rid = None
+                        try:
+                            gen = int(getattr(self, "_response_generation", 0))
+                        except Exception:
+                            gen = None
                        logger.info(
-                            "Realtime WS: output audio done; ending audio stream"
+                            "Realtime WS: output audio done; owner=%s rid=%s gen=%s",
+                            owner,
+                            rid,
+                            gen,
                        )
                        # If we have a buffered transcript for this response, flush it now
                        try:
@@ -1001,6 +1037,15 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
        if "tools" in patch:
            patch["tools"] = _strip_tool_strict(patch["tools"])  # idempotent
 
+        # Per server requirements, always include session.type and output_modalities
+        try:
+            patch["type"] = "realtime"
+            # Preserve caller-provided output_modalities if present, otherwise default to audio
+            if "output_modalities" not in patch:
+                patch["output_modalities"] = ["audio"]
+        except Exception:
+            pass
+
        payload = {"type": "session.update", "session": patch}
        # Mark awaiting updated and store last patch
        self._last_session_patch = patch or {}
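With the change above, every `session.update` sent by the adapter now carries `type: "realtime"` and, when the caller did not set one, a default `output_modalities`. An illustrative wire payload (values are examples, not captured traffic):

```python
# Shape of the session.update payload after the patch above is applied.
payload = {
    "type": "session.update",
    "session": {
        "type": "realtime",              # always injected now
        "output_modalities": ["audio"],  # defaulted only when caller omits it
        "voice": "marin",                # example caller-provided fields
        "instructions": "...",
    },
}
```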
--- solana_agent-31.2.2/solana_agent/services/query.py
+++ solana_agent-31.2.4/solana_agent/services/query.py
@@ -67,10 +67,117 @@ class QueryService(QueryServiceInterface):
        self._sticky_sessions: Dict[str, Dict[str, Any]] = {}
        # Optional realtime service attached by factory (populated in factory)
        self.realtime = None  # type: ignore[attr-defined]
-        # Persistent realtime WS per user for push-to-talk reuse
-        self._rt_services = {}
+        # Persistent realtime WS pool per user for reuse across turns/devices
+        # { user_id: [RealtimeService, ...] }
+        self._rt_services: Dict[str, List[Any]] = {}
+        # Global lock for creating/finding per-user sessions
        self._rt_lock = asyncio.Lock()
 
+    async def _try_acquire_lock(self, lock: asyncio.Lock) -> bool:
+        try:
+            await asyncio.wait_for(lock.acquire(), timeout=0)
+            return True
+        except asyncio.TimeoutError:
+            return False
+        except Exception:
+            return False
+
+    async def _alloc_realtime_session(
+        self,
+        user_id: str,
+        *,
+        api_key: str,
+        rt_voice: str,
+        final_instructions: str,
+        initial_tools: Optional[List[Dict[str, Any]]],
+        encode_in: bool,
+        encode_out: bool,
+        audio_input_format: str,
+        audio_output_format: str,
+    ) -> Any:
+        """Get a free (or new) realtime session for this user. Marks it busy via an internal lock.
+
+        Returns the RealtimeService with an acquired _in_use_lock that MUST be released by caller.
+        """
+        from solana_agent.interfaces.providers.realtime import (
+            RealtimeSessionOptions,
+        )
+        from solana_agent.adapters.openai_realtime_ws import (
+            OpenAIRealtimeWebSocketSession,
+        )
+        from solana_agent.adapters.ffmpeg_transcoder import FFmpegTranscoder
+
+        def _mime_from(fmt: str) -> str:
+            f = (fmt or "").lower()
+            return {
+                "aac": "audio/aac",
+                "mp3": "audio/mpeg",
+                "mp4": "audio/mp4",
+                "m4a": "audio/mp4",
+                "mpeg": "audio/mpeg",
+                "mpga": "audio/mpeg",
+                "wav": "audio/wav",
+                "flac": "audio/flac",
+                "opus": "audio/opus",
+                "ogg": "audio/ogg",
+                "webm": "audio/webm",
+                "pcm": "audio/pcm",
+            }.get(f, "audio/pcm")
+
+        async with self._rt_lock:
+            pool = self._rt_services.get(user_id) or []
+            # Try to reuse an idle session strictly owned by this user
+            for rt in pool:
+                # Extra safety: never reuse a session from another user
+                owner = getattr(rt, "_owner_user_id", None)
+                if owner is not None and owner != user_id:
+                    continue
+                lock = getattr(rt, "_in_use_lock", None)
+                if lock is None:
+                    lock = asyncio.Lock()
+                    setattr(rt, "_in_use_lock", lock)
+                if not lock.locked():
+                    if await self._try_acquire_lock(lock):
+                        return rt
+            # None free: create a new session
+            opts = RealtimeSessionOptions(
+                model="gpt-realtime",
+                voice=rt_voice,
+                vad_enabled=False,
+                input_rate_hz=24000,
+                output_rate_hz=24000,
+                input_mime="audio/pcm",
+                output_mime="audio/pcm",
+                tools=initial_tools or None,
+                tool_choice="auto",
+            )
+            try:
+                opts.instructions = final_instructions
+                opts.voice = rt_voice
+            except Exception:
+                pass
+            conv_session = OpenAIRealtimeWebSocketSession(api_key=api_key, options=opts)
+            transcoder = FFmpegTranscoder() if (encode_in or encode_out) else None
+            from solana_agent.services.realtime import RealtimeService
+
+            rt = RealtimeService(
+                session=conv_session,
+                options=opts,
+                transcoder=transcoder,
+                accept_compressed_input=encode_in,
+                client_input_mime=_mime_from(audio_input_format),
+                encode_output=encode_out,
+                client_output_mime=_mime_from(audio_output_format),
+            )
+            # Tag ownership to prevent any cross-user reuse
+            setattr(rt, "_owner_user_id", user_id)
+            setattr(rt, "_in_use_lock", asyncio.Lock())
+            # Mark busy
+            await getattr(rt, "_in_use_lock").acquire()
+            pool.append(rt)
+            self._rt_services[user_id] = pool
+            return rt
+
    def _get_sticky_agent(self, user_id: str) -> Optional[str]:
        sess = self._sticky_sessions.get(user_id)
        return sess.get("agent") if isinstance(sess, dict) else None
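The allocator holds the global `_rt_lock` only while scanning or creating, then hands back a session whose per-session `_in_use_lock` is already acquired, so two concurrent turns for one user get two distinct sessions. A self-contained model of that acquire/release discipline (a sketch mirroring the diff's names, not the package's classes):

```python
# Miniature model of the per-user session pool's locking pattern.
import asyncio

class Pool:
    def __init__(self):
        self._pool = {}               # user_id -> list of sessions
        self._lock = asyncio.Lock()   # held only while scanning/creating

    async def acquire(self, user_id: str):
        async with self._lock:
            pool = self._pool.setdefault(user_id, [])
            for s in pool:
                if not s["busy"].locked():          # idle session: reuse it
                    await s["busy"].acquire()
                    return s
            s = {"id": len(pool), "busy": asyncio.Lock()}  # none free: new one
            await s["busy"].acquire()               # mark busy before returning
            pool.append(s)
            return s

async def main():
    p = Pool()
    a, b = await asyncio.gather(p.acquire("u1"), p.acquire("u1"))
    print(a["id"], b["id"])  # two concurrent turns -> two distinct sessions
    a["busy"].release()
    c = await p.acquire("u1")
    print(c["id"])           # released session is reused

asyncio.run(main())
```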
@@ -554,14 +661,7 @@
            final_instructions = "\n\n".join([p for p in parts if p])
 
            # 4) Open a single WS session for assistant audio
-            from solana_agent.adapters.openai_realtime_ws import (
-                OpenAIRealtimeWebSocketSession,
-            )
-            from solana_agent.interfaces.providers.realtime import (
-                RealtimeSessionOptions,
-            )
-            from solana_agent.services.realtime import RealtimeService
-            from solana_agent.adapters.ffmpeg_transcoder import FFmpegTranscoder
+            # Realtime imports handled inside allocator helper
 
            api_key = None
            try:
@@ -600,171 +700,160 @@
                or (is_audio_bytes and audio_input_format.lower() != "pcm")
            )
 
-            async with self._rt_lock:
-                rt = self._rt_services.get(user_id)
-                if not rt or not isinstance(rt, RealtimeService):
-                    opts = RealtimeSessionOptions(
-                        model="gpt-realtime",
-                        voice=rt_voice,
-                        vad_enabled=False,  # no input audio
-                        input_rate_hz=24000,
-                        output_rate_hz=24000,
-                        input_mime="audio/pcm",
-                        output_mime="audio/pcm",
-                        tools=initial_tools or None,
-                        tool_choice="auto",
-                    )
-                    # Ensure initial session.update carries instructions/voice
+            # Allocate or reuse a realtime session for this specific request/user
+            rt = await self._alloc_realtime_session(
+                user_id,
+                api_key=api_key,
+                rt_voice=rt_voice,
+                final_instructions=final_instructions,
+                initial_tools=initial_tools,
+                encode_in=encode_in,
+                encode_out=encode_out,
+                audio_input_format=audio_input_format,
+                audio_output_format=audio_output_format,
+            )
+            # Ensure lock is released no matter what
+            try:
+                # Tool executor
+                async def _exec(
+                    tool_name: str, args: Dict[str, Any]
+                ) -> Dict[str, Any]:
                    try:
-                        opts.instructions = final_instructions
-                        opts.voice = rt_voice
-                    except Exception:
-                        pass
-                    conv_session = OpenAIRealtimeWebSocketSession(
-                        api_key=api_key, options=opts
-                    )
-                    transcoder = (
-                        FFmpegTranscoder() if (encode_in or encode_out) else None
-                    )
-                    rt = RealtimeService(
-                        session=conv_session,
-                        options=opts,
-                        transcoder=transcoder,
-                        accept_compressed_input=encode_in,
-                        client_input_mime=_mime_from(audio_input_format),
-                        encode_output=encode_out,
-                        client_output_mime=_mime_from(audio_output_format),
-                    )
-                    self._rt_services[user_id] = rt
+                        return await self.agent_service.execute_tool(
+                            agent_name, tool_name, args or {}
+                        )
+                    except Exception as e:
+                        return {"status": "error", "message": str(e)}
 
-            # Tool executor
-            async def _exec(tool_name: str, args: Dict[str, Any]) -> Dict[str, Any]:
+                # If possible, set on underlying session
                try:
-                    return await self.agent_service.execute_tool(
-                        agent_name, tool_name, args or {}
-                    )
-                except Exception as e:
-                    return {"status": "error", "message": str(e)}
-
-            # If possible, set on underlying session
-            try:
-                if hasattr(rt, "_session"):
-                    getattr(rt, "_session").set_tool_executor(_exec)  # type: ignore[attr-defined]
-            except Exception:
-                pass
-
-            # Connect/configure
-            if not getattr(rt, "_connected", False):
-                await rt.start()
-            await rt.configure(
-                voice=rt_voice,
-                vad_enabled=bool(vad) if vad is not None else False,
-                instructions=final_instructions,
-                tools=initial_tools or None,
-                tool_choice="auto",
-            )
+                    if hasattr(rt, "_session"):
+                        getattr(rt, "_session").set_tool_executor(_exec)  # type: ignore[attr-defined]
+                except Exception:
+                    pass
 
-            # Ensure clean input buffers for this turn
-            try:
-                await rt.clear_input()
-            except Exception:
-                pass
-            # Also reset any leftover output audio so new turn doesn't replay old chunks
-            try:
-                if hasattr(rt, "reset_output_stream"):
-                    rt.reset_output_stream()
-            except Exception:
-                pass
+                # Connect/configure
+                if not getattr(rt, "_connected", False):
+                    await rt.start()
+                await rt.configure(
+                    voice=rt_voice,
+                    vad_enabled=bool(vad) if vad is not None else False,
+                    instructions=final_instructions,
+                    tools=initial_tools or None,
+                    tool_choice="auto",
+                )
 
-            # Persist once per turn
-            turn_id = await self.realtime_begin_turn(user_id)
-            if turn_id and user_text:
+                # Ensure clean input buffers for this turn
+                try:
+                    await rt.clear_input()
+                except Exception:
+                    pass
+                # Also reset any leftover output audio so new turn doesn't replay old chunks
                try:
-                    await self.realtime_update_user(user_id, turn_id, user_text)
+                    if hasattr(rt, "reset_output_stream"):
+                        rt.reset_output_stream()
                except Exception:
                    pass
 
-            # Feed audio into WS if audio bytes provided; else use input_text
-            if is_audio_bytes:
-                bq = bytes(query)
-                logger.info(
-                    "Realtime: appending input audio to WS via FFmpeg, len=%d, fmt=%s",
-                    len(bq),
-                    audio_input_format,
-                )
-                await rt.append_audio(bq)
-                vad_enabled_value = bool(vad) if vad is not None else False
-                if not vad_enabled_value:
-                    await rt.commit_input()
-                    # Manually trigger response when VAD is disabled
-                    await rt.create_response({})
+                # Persist once per turn
+                turn_id = await self.realtime_begin_turn(user_id)
+                if turn_id and user_text:
+                    try:
+                        await self.realtime_update_user(user_id, turn_id, user_text)
+                    except Exception:
+                        pass
+
+                # Feed audio into WS if audio bytes provided; else use input_text
+                if is_audio_bytes:
+                    bq = bytes(query)
+                    logger.info(
+                        "Realtime: appending input audio to WS via FFmpeg, len=%d, fmt=%s",
+                        len(bq),
+                        audio_input_format,
+                    )
+                    await rt.append_audio(bq)
+                    vad_enabled_value = bool(vad) if vad is not None else False
+                    if not vad_enabled_value:
+                        await rt.commit_input()
+                        # Manually trigger response when VAD is disabled
+                        await rt.create_response({})
+                    else:
+                        # With server VAD enabled, the model will auto-create a response at end of speech
+                        logger.debug(
+                            "Realtime: VAD enabled — skipping manual response.create"
+                        )
                else:
-                    # With server VAD enabled, the model will auto-create a response at end of speech
-                    logger.debug(
-                        "Realtime: VAD enabled — skipping manual response.create"
+                    # Rely on configured session voice; attach input_text only
+                    await rt.create_response(
+                        {
+                            "modalities": ["audio"],
+                            "input": [
+                                {"type": "input_text", "text": user_text or ""}
+                            ],
+                        }
                    )
-            else:
-                # Rely on configured session voice; attach input_text only
-                await rt.create_response(
-                    {
-                        "modalities": ["audio"],
-                        "input": [{"type": "input_text", "text": user_text or ""}],
-                    }
-                )
 
-            # Collect audio and transcripts
-            user_tr = ""
-            asst_tr = ""
+                # Collect audio and transcripts
+                user_tr = ""
+                asst_tr = ""
 
-            async def _drain_in_tr():
-                nonlocal user_tr
-                async for t in rt.iter_input_transcript():
-                    if t:
-                        user_tr += t
+                async def _drain_in_tr():
+                    nonlocal user_tr
+                    async for t in rt.iter_input_transcript():
+                        if t:
+                            user_tr += t
 
-            async def _drain_out_tr():
-                nonlocal asst_tr
-                async for t in rt.iter_output_transcript():
-                    if t:
-                        asst_tr += t
+                async def _drain_out_tr():
+                    nonlocal asst_tr
+                    async for t in rt.iter_output_transcript():
+                        if t:
+                            asst_tr += t
 
-            in_task = asyncio.create_task(_drain_in_tr())
-            out_task = asyncio.create_task(_drain_out_tr())
-            try:
-                async for audio_chunk in rt.iter_output_audio_encoded():
-                    yield audio_chunk
-            finally:
-                in_task.cancel()
-                out_task.cancel()
-                # If no WS input transcript was captured, fall back to HTTP STT result
-                if not user_tr:
-                    try:
-                        if "stt_task" in locals() and stt_task is not None:
-                            user_tr = await stt_task
-                    except Exception:
-                        pass
-                if turn_id:
-                    try:
-                        if user_tr:
-                            await self.realtime_update_user(
-                                user_id, turn_id, user_tr
-                            )
-                        if asst_tr:
-                            await self.realtime_update_assistant(
-                                user_id, turn_id, asst_tr
-                            )
-                    except Exception:
-                        pass
+                in_task = asyncio.create_task(_drain_in_tr())
+                out_task = asyncio.create_task(_drain_out_tr())
+                try:
+                    async for audio_chunk in rt.iter_output_audio_encoded():
+                        yield audio_chunk
+                finally:
+                    in_task.cancel()
+                    out_task.cancel()
+                    # If no WS input transcript was captured, fall back to HTTP STT result
+                    if not user_tr:
+                        try:
+                            if "stt_task" in locals() and stt_task is not None:
+                                user_tr = await stt_task
+                        except Exception:
+                            pass
+                    if turn_id:
+                        try:
+                            if user_tr:
+                                await self.realtime_update_user(
+                                    user_id, turn_id, user_tr
+                                )
+                            if asst_tr:
+                                await self.realtime_update_assistant(
+                                    user_id, turn_id, asst_tr
+                                )
+                        except Exception:
+                            pass
+                        try:
+                            await self.realtime_finalize_turn(user_id, turn_id)
+                        except Exception:
+                            pass
+                    # Clear input buffer for next turn reuse
                    try:
-                        await self.realtime_finalize_turn(user_id, turn_id)
+                        await rt.clear_input()
                    except Exception:
                        pass
-                # Clear input buffer for next turn reuse
+            finally:
+                # Always release the session for reuse by other concurrent requests/devices
                try:
-                    await rt.clear_input()
+                    lock = getattr(rt, "_in_use_lock", None)
+                    if lock and lock.locked():
+                        lock.release()
                except Exception:
                    pass
-                return
+            return
 
            # 1) Transcribe audio or accept text
            user_text = ""
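Net effect of this restructuring: `_rt_lock` is no longer held for the whole turn, and the outer try/finally guarantees the pooled session's `_in_use_lock` is released even if streaming fails, so requests for the same user can proceed in parallel. A caller-side sketch of the concurrency this enables (payload variables are hypothetical; the `process(...)` call follows the README example):

```python
# Two devices streaming turns for one user no longer serialize on a global
# lock: each turn gets its own pooled session and releases it when done.
import asyncio

async def one_turn(agent, user_id: str, audio: bytes) -> bytes:
    out = bytearray()
    async for chunk in agent.process(
        user_id=user_id,
        message=audio,
        realtime=True,
        rt_encode_input=True,
        rt_encode_output=True,
        output_format="audio",
        audio_input_format="mp4",
        audio_output_format="mp3",
    ):
        out.extend(chunk)
    return bytes(out)

# await asyncio.gather(
#     one_turn(solana_agent, "user123", device_a_bytes),
#     one_turn(solana_agent, "user123", device_b_bytes),
# )
```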