solana-agent 31.2.1__tar.gz → 31.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. {solana_agent-31.2.1 → solana_agent-31.2.3}/PKG-INFO +29 -17
  2. {solana_agent-31.2.1 → solana_agent-31.2.3}/README.md +28 -16
  3. {solana_agent-31.2.1 → solana_agent-31.2.3}/pyproject.toml +1 -1
  4. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/adapters/ffmpeg_transcoder.py +61 -4
  5. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/adapters/openai_realtime_ws.py +71 -9
  6. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/services/query.py +243 -148
  7. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/services/realtime.py +14 -0
  8. {solana_agent-31.2.1 → solana_agent-31.2.3}/LICENSE +0 -0
  9. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/__init__.py +0 -0
  10. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/adapters/__init__.py +0 -0
  11. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/adapters/mongodb_adapter.py +0 -0
  12. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/adapters/openai_adapter.py +0 -0
  13. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/adapters/pinecone_adapter.py +0 -0
  14. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/cli.py +0 -0
  15. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/client/__init__.py +0 -0
  16. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/client/solana_agent.py +0 -0
  17. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/domains/__init__.py +0 -0
  18. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/domains/agent.py +0 -0
  19. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/domains/routing.py +0 -0
  20. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/factories/__init__.py +0 -0
  21. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/factories/agent_factory.py +0 -0
  22. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/guardrails/pii.py +0 -0
  23. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/interfaces/__init__.py +0 -0
  24. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/interfaces/client/client.py +0 -0
  25. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/interfaces/guardrails/guardrails.py +0 -0
  26. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/interfaces/plugins/plugins.py +0 -0
  27. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/interfaces/providers/audio.py +0 -0
  28. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/interfaces/providers/data_storage.py +0 -0
  29. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/interfaces/providers/llm.py +0 -0
  30. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/interfaces/providers/memory.py +0 -0
  31. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/interfaces/providers/realtime.py +0 -0
  32. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/interfaces/providers/vector_storage.py +0 -0
  33. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/interfaces/services/agent.py +0 -0
  34. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/interfaces/services/knowledge_base.py +0 -0
  35. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/interfaces/services/query.py +0 -0
  36. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/interfaces/services/routing.py +0 -0
  37. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/plugins/__init__.py +0 -0
  38. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/plugins/manager.py +0 -0
  39. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/plugins/registry.py +0 -0
  40. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/plugins/tools/__init__.py +0 -0
  41. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/plugins/tools/auto_tool.py +0 -0
  42. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/repositories/__init__.py +0 -0
  43. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/repositories/memory.py +0 -0
  44. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/services/__init__.py +0 -0
  45. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/services/agent.py +0 -0
  46. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/services/knowledge_base.py +0 -0
  47. {solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/services/routing.py +0 -0

{solana_agent-31.2.1 → solana_agent-31.2.3}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: solana-agent
-Version: 31.2.1
+Version: 31.2.3
 Summary: AI Agents for Solana
 License: MIT
 Keywords: solana,solana ai,solana agent,ai,ai agent,ai agents
@@ -317,27 +317,39 @@ Due to the overhead of the router (API call) - realtime only supports a single a

 Realtime uses MongoDB for memory so Zep is not needed.

+This example will work using expo-audio on Android and iOS.
+
 ```python
 from solana_agent import SolanaAgent

 solana_agent = SolanaAgent(config=config)

-# Example: mobile sends MP4/AAC; server encodes output to AAC
-audio_content = await audio_file.read() # bytes
-async for audio_chunk in solana_agent.process(
-    "user123", # required
-    audio_content, # required
-    realtime=True, # optional (default False)
-    output_format="audio", # required
-    vad=True, # enable VAD (optional)
-    rt_encode_input=True, # accept compressed input (optional)
-    rt_encode_output=True, # encode output for client (optional)
-    rt_voice="marin" # the voice to use for interactions (optional)
-    audio_input_format="mp4", # client transport (optional)
-    audio_output_format="aac" # client transport (optional)
-):
-    handle_audio(audio_chunk)
-```
+audio_content = await audio_file.read()
+
+async def generate():
+    async for chunk in solana_agent.process(
+        user_id=user_id,
+        message=audio_content,
+        realtime=True,
+        rt_encode_input=True,
+        rt_encode_output=True,
+        rt_voice="marin",
+        output_format="audio",
+        audio_output_format="m4a",
+        audio_input_format="mp4",
+    ):
+        yield chunk
+
+return StreamingResponse(
+    content=generate(),
+    media_type="audio/mp4",
+    headers={
+        "Cache-Control": "no-store",
+        "Pragma": "no-cache",
+        "Content-Disposition": "inline; filename=stream.m4a",
+        "X-Accel-Buffering": "no",
+    },
+)

 ### Image/Text Streaming


{solana_agent-31.2.1 → solana_agent-31.2.3}/README.md

@@ -281,27 +281,39 @@ Due to the overhead of the router (API call) - realtime only supports a single a

 Realtime uses MongoDB for memory so Zep is not needed.

+This example will work using expo-audio on Android and iOS.
+
 ```python
 from solana_agent import SolanaAgent

 solana_agent = SolanaAgent(config=config)

-# Example: mobile sends MP4/AAC; server encodes output to AAC
-audio_content = await audio_file.read() # bytes
-async for audio_chunk in solana_agent.process(
-    "user123", # required
-    audio_content, # required
-    realtime=True, # optional (default False)
-    output_format="audio", # required
-    vad=True, # enable VAD (optional)
-    rt_encode_input=True, # accept compressed input (optional)
-    rt_encode_output=True, # encode output for client (optional)
-    rt_voice="marin" # the voice to use for interactions (optional)
-    audio_input_format="mp4", # client transport (optional)
-    audio_output_format="aac" # client transport (optional)
-):
-    handle_audio(audio_chunk)
-```
+audio_content = await audio_file.read()
+
+async def generate():
+    async for chunk in solana_agent.process(
+        user_id=user_id,
+        message=audio_content,
+        realtime=True,
+        rt_encode_input=True,
+        rt_encode_output=True,
+        rt_voice="marin",
+        output_format="audio",
+        audio_output_format="m4a",
+        audio_input_format="mp4",
+    ):
+        yield chunk
+
+return StreamingResponse(
+    content=generate(),
+    media_type="audio/mp4",
+    headers={
+        "Cache-Control": "no-store",
+        "Pragma": "no-cache",
+        "Content-Disposition": "inline; filename=stream.m4a",
+        "X-Accel-Buffering": "no",
+    },
+)

 ### Image/Text Streaming

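
The new README snippet presupposes a surrounding HTTP endpoint: `StreamingResponse`, `audio_file`, and `user_id` all come from the enclosing route handler. A minimal sketch of that wrapper, assuming FastAPI (the route path, parameter names, and `config` binding are illustrative, not taken from the diff):

```python
from fastapi import FastAPI, Form, UploadFile
from fastapi.responses import StreamingResponse

from solana_agent import SolanaAgent

app = FastAPI()
solana_agent = SolanaAgent(config=config)  # config as defined earlier in the README

@app.post("/audio")  # hypothetical route
async def audio_turn(audio_file: UploadFile, user_id: str = Form(...)):
    audio_content = await audio_file.read()

    async def generate():
        # Same call as the README example: compressed MP4/AAC in, m4a out.
        async for chunk in solana_agent.process(
            user_id=user_id,
            message=audio_content,
            realtime=True,
            rt_encode_input=True,
            rt_encode_output=True,
            rt_voice="marin",
            output_format="audio",
            audio_output_format="m4a",
            audio_input_format="mp4",
        ):
            yield chunk

    return StreamingResponse(
        content=generate(),
        media_type="audio/mp4",
        headers={
            "Cache-Control": "no-store",
            "Pragma": "no-cache",
            "Content-Disposition": "inline; filename=stream.m4a",
            "X-Accel-Buffering": "no",
        },
    )
```

The `X-Accel-Buffering: no` header disables proxy buffering (e.g., nginx), so the fragmented-MP4 chunks produced by the transcoder changes below reach the player as they are encoded.
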

{solana_agent-31.2.1 → solana_agent-31.2.3}/pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "solana-agent"
-version = "31.2.1"
+version = "31.2.3"
 description = "AI Agents for Solana"
 authors = ["Bevan Hunt <bevan@bevanhunt.com>"]
 license = "MIT"

{solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/adapters/ffmpeg_transcoder.py

@@ -88,13 +88,14 @@ class FFmpegTranscoder(AudioTranscoder):
     async def from_pcm16(  # pragma: no cover
         self, pcm16_bytes: bytes, output_mime: str, rate_hz: int
     ) -> bytes:
-        """Encode PCM16LE to desired format (currently AAC ADTS for mobile streaming)."""
+        """Encode PCM16LE to desired format (AAC ADTS, fragmented MP4, or MP3)."""
         logger.info(
             "Encode from PCM16: output_mime=%s, rate_hz=%d, input_len=%d",
             output_mime,
             rate_hz,
             len(pcm16_bytes),
         )
+
         if output_mime in ("audio/mpeg", "audio/mp3"):
             # Encode to MP3 (often better streaming compatibility on mobile)
             args = [
@@ -122,8 +123,9 @@
             "Encoded from PCM16 to %s: output_len=%d", output_mime, len(out)
         )
         return out
-        if output_mime in ("audio/aac", "audio/mp4", "audio/m4a"):
-            # Encode to AAC in ADTS stream; clients can play it as AAC.
+
+        if output_mime in ("audio/aac",):
+            # Encode to AAC in ADTS stream; good for streaming over sockets/HTTP chunked
             args = [
                 "-hide_banner",
                 "-loglevel",
@@ -149,6 +151,38 @@
             "Encoded from PCM16 to %s: output_len=%d", output_mime, len(out)
         )
         return out
+
+        if output_mime in ("audio/mp4", "audio/m4a"):
+            # Encode to fragmented MP4 (fMP4) with AAC for better iOS compatibility
+            # For streaming, write an initial moov and fragment over stdout.
+            args = [
+                "-hide_banner",
+                "-loglevel",
+                "error",
+                "-f",
+                "s16le",
+                "-ac",
+                "1",
+                "-ar",
+                str(rate_hz),
+                "-i",
+                "pipe:0",
+                "-c:a",
+                "aac",
+                "-b:a",
+                "96k",
+                "-movflags",
+                "+frag_keyframe+empty_moov",
+                "-f",
+                "mp4",
+                "pipe:1",
+            ]
+            out = await self._run_ffmpeg(args, pcm16_bytes)
+            logger.info(
+                "Encoded from PCM16 to %s (fMP4): output_len=%d", output_mime, len(out)
+            )
+            return out
+
         # Default: passthrough
         logger.info("Encode passthrough (no change), output_len=%d", len(pcm16_bytes))
         return pcm16_bytes
@@ -187,7 +221,7 @@
                 "mp3",
                 "pipe:1",
             ]
-        elif output_mime in ("audio/aac", "audio/mp4", "audio/m4a"):
+        elif output_mime in ("audio/aac",):
             args = [
                 "-hide_banner",
                 "-loglevel",
@@ -208,6 +242,29 @@
                 "adts",
                 "pipe:1",
             ]
+        elif output_mime in ("audio/mp4", "audio/m4a"):
+            args = [
+                "-hide_banner",
+                "-loglevel",
+                "error",
+                "-f",
+                "s16le",
+                "-ac",
+                "1",
+                "-ar",
+                str(rate_hz),
+                "-i",
+                "pipe:0",
+                "-c:a",
+                "aac",
+                "-b:a",
+                "96k",
+                "-movflags",
+                "+frag_keyframe+empty_moov",
+                "-f",
+                "mp4",
+                "pipe:1",
+            ]
         else:
             # Passthrough streaming: just yield input
             async for chunk in pcm_iter:
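
Both new branches pass `-movflags +frag_keyframe+empty_moov`. A plain MP4 writes its `moov` index at the end of a seekable file, so it cannot be emitted through `pipe:1`; fragmented MP4 instead writes an empty `moov` up front followed by self-contained fragments, which is what makes chunked HTTP playback work. A standalone sketch of the same invocation (helper name hypothetical; assumes `ffmpeg` is on PATH):

```python
import asyncio

async def pcm16_to_fmp4(pcm16: bytes, rate_hz: int = 24000) -> bytes:
    # Same flags as the diff: empty moov up front plus self-contained
    # fragments, so MP4 can be written to a non-seekable pipe.
    args = [
        "ffmpeg", "-hide_banner", "-loglevel", "error",
        "-f", "s16le", "-ac", "1", "-ar", str(rate_hz), "-i", "pipe:0",
        "-c:a", "aac", "-b:a", "96k",
        "-movflags", "+frag_keyframe+empty_moov",
        "-f", "mp4", "pipe:1",
    ]
    proc = await asyncio.create_subprocess_exec(
        *args,
        stdin=asyncio.subprocess.PIPE,
        stdout=asyncio.subprocess.PIPE,
    )
    out, _ = await proc.communicate(pcm16)
    return out
```
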

{solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/adapters/openai_realtime_ws.py

@@ -325,7 +325,26 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
                 try:
                     chunk = base64.b64decode(b64)
                     self._audio_queue.put_nowait(chunk)
-                    logger.info("Audio delta bytes=%d", len(chunk))
+                    # Ownership/response tagging for diagnostics
+                    try:
+                        owner = getattr(self, "_owner_user_id", None)
+                    except Exception:
+                        owner = None
+                    try:
+                        rid = getattr(self, "_active_response_id", None)
+                    except Exception:
+                        rid = None
+                    try:
+                        gen = int(getattr(self, "_response_generation", 0))
+                    except Exception:
+                        gen = None
+                    logger.info(
+                        "Audio delta bytes=%d owner=%s rid=%s gen=%s",
+                        len(chunk),
+                        owner,
+                        rid,
+                        gen,
+                    )
                     try:
                         # New response detected if we were previously inactive
                         if not getattr(self, "_response_active", False):
@@ -492,8 +511,25 @@
                     "response.audio.done",
                 ):
                     # End of audio stream for the response; stop audio iterator but keep WS open for transcripts
+                    try:
+                        owner = getattr(self, "_owner_user_id", None)
+                    except Exception:
+                        owner = None
+                    try:
+                        rid = (data.get("response") or {}).get("id") or getattr(
+                            self, "_active_response_id", None
+                        )
+                    except Exception:
+                        rid = None
+                    try:
+                        gen = int(getattr(self, "_response_generation", 0))
+                    except Exception:
+                        gen = None
                     logger.info(
-                        "Realtime WS: output audio done; ending audio stream"
+                        "Realtime WS: output audio done; owner=%s rid=%s gen=%s",
+                        owner,
+                        rid,
+                        gen,
                     )
                     # If we have a buffered transcript for this response, flush it now
                     try:
@@ -961,9 +997,6 @@
         if audio_patch:
             patch["audio"] = audio_patch

-        # Always include session.type in updates
-        patch["type"] = "realtime"
-
         # No top-level turn_detection

         def _strip_tool_strict(tools_val):
@@ -1030,7 +1063,8 @@
            )
        except Exception:
            pass
-        await self._send(payload)
+        # Use tracked send to attach an event_id and improve diagnostics
+        await self._send_tracked(payload, label="session.update:patch")

    async def append_audio(self, pcm16_bytes: bytes) -> None:  # pragma: no cover
        b64 = base64.b64encode(pcm16_bytes).decode("ascii")
@@ -1045,10 +1079,16 @@

    async def commit_input(self) -> None:  # pragma: no cover
        try:
-            # Skip commits while a response is active to avoid server errors
+            # If a previous response is still marked active, wait briefly, then proceed.
+            # Skipping commits here can cause new turns to reference old audio and repeat answers.
            if bool(getattr(self, "_response_active", False)):
-                logger.warning("Realtime WS: skipping commit; response active")
-                return
+                logger.warning(
+                    "Realtime WS: response active at commit; waiting briefly before proceeding"
+                )
+                for _ in range(5):  # up to ~0.5s
+                    await asyncio.sleep(0.1)
+                    if not bool(getattr(self, "_response_active", False)):
+                        break
            # Avoid overlapping commits while awaiting server ack
            if bool(getattr(self, "_commit_inflight", False)):
                logger.warning("Realtime WS: skipping commit; commit in-flight")
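
The behavior change here: the old code skipped the commit outright whenever a response was still marked active, which could leave freshly appended audio uncommitted and let the next turn reference stale input. The new code waits up to roughly 0.5 s for the flag to clear and then commits regardless. The pattern generalizes to a small bounded-wait helper (hypothetical, not part of the package):

```python
import asyncio
from typing import Callable

async def wait_until(predicate: Callable[[], bool],
                     attempts: int = 5, interval_s: float = 0.1) -> bool:
    """Poll `predicate`, sleeping between checks; return True once it
    holds, False after the budget (~attempts * interval_s) is spent."""
    for _ in range(attempts):
        if predicate():
            return True
        await asyncio.sleep(interval_s)
    return False

# e.g. before committing:
#     await wait_until(lambda: not self._response_active)
```
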
@@ -1250,6 +1290,24 @@
    def set_tool_executor(self, executor):  # pragma: no cover
        self._tool_executor = executor

+    def reset_output_stream(self) -> None:  # pragma: no cover
+        """Drain any queued output audio and clear per-response text buffers.
+        This avoids replaying stale audio if the client failed to consume previous chunks."""
+        try:
+            while True:
+                try:
+                    _ = self._audio_queue.get_nowait()
+                except asyncio.QueueEmpty:
+                    break
+                except Exception:
+                    break
+            try:
+                self._out_text_buffers.clear()
+            except Exception:
+                pass
+        except Exception:
+            pass
+
    # Expose whether a function/tool call is currently pending
    def has_pending_tool_call(self) -> bool:  # pragma: no cover
        try:
@@ -1611,3 +1669,7 @@ class OpenAITranscriptionWebSocketSession(BaseRealtimeSession):
    def set_tool_executor(self, executor):  # pragma: no cover
        # Not applicable for transcription-only
        return
+
+    def reset_output_stream(self) -> None:  # pragma: no cover
+        # No audio output stream to reset
+        return

{solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/services/query.py

@@ -67,10 +67,117 @@ class QueryService(QueryServiceInterface):
        self._sticky_sessions: Dict[str, Dict[str, Any]] = {}
        # Optional realtime service attached by factory (populated in factory)
        self.realtime = None  # type: ignore[attr-defined]
-        # Persistent realtime WS per user for push-to-talk reuse
-        self._rt_services = {}
+        # Persistent realtime WS pool per user for reuse across turns/devices
+        # { user_id: [RealtimeService, ...] }
+        self._rt_services: Dict[str, List[Any]] = {}
+        # Global lock for creating/finding per-user sessions
        self._rt_lock = asyncio.Lock()

+    async def _try_acquire_lock(self, lock: asyncio.Lock) -> bool:
+        try:
+            await asyncio.wait_for(lock.acquire(), timeout=0)
+            return True
+        except asyncio.TimeoutError:
+            return False
+        except Exception:
+            return False
+
+    async def _alloc_realtime_session(
+        self,
+        user_id: str,
+        *,
+        api_key: str,
+        rt_voice: str,
+        final_instructions: str,
+        initial_tools: Optional[List[Dict[str, Any]]],
+        encode_in: bool,
+        encode_out: bool,
+        audio_input_format: str,
+        audio_output_format: str,
+    ) -> Any:
+        """Get a free (or new) realtime session for this user. Marks it busy via an internal lock.
+
+        Returns the RealtimeService with an acquired _in_use_lock that MUST be released by caller.
+        """
+        from solana_agent.interfaces.providers.realtime import (
+            RealtimeSessionOptions,
+        )
+        from solana_agent.adapters.openai_realtime_ws import (
+            OpenAIRealtimeWebSocketSession,
+        )
+        from solana_agent.adapters.ffmpeg_transcoder import FFmpegTranscoder
+
+        def _mime_from(fmt: str) -> str:
+            f = (fmt or "").lower()
+            return {
+                "aac": "audio/aac",
+                "mp3": "audio/mpeg",
+                "mp4": "audio/mp4",
+                "m4a": "audio/mp4",
+                "mpeg": "audio/mpeg",
+                "mpga": "audio/mpeg",
+                "wav": "audio/wav",
+                "flac": "audio/flac",
+                "opus": "audio/opus",
+                "ogg": "audio/ogg",
+                "webm": "audio/webm",
+                "pcm": "audio/pcm",
+            }.get(f, "audio/pcm")
+
+        async with self._rt_lock:
+            pool = self._rt_services.get(user_id) or []
+            # Try to reuse an idle session strictly owned by this user
+            for rt in pool:
+                # Extra safety: never reuse a session from another user
+                owner = getattr(rt, "_owner_user_id", None)
+                if owner is not None and owner != user_id:
+                    continue
+                lock = getattr(rt, "_in_use_lock", None)
+                if lock is None:
+                    lock = asyncio.Lock()
+                    setattr(rt, "_in_use_lock", lock)
+                if not lock.locked():
+                    if await self._try_acquire_lock(lock):
+                        return rt
+            # None free: create a new session
+            opts = RealtimeSessionOptions(
+                model="gpt-realtime",
+                voice=rt_voice,
+                vad_enabled=False,
+                input_rate_hz=24000,
+                output_rate_hz=24000,
+                input_mime="audio/pcm",
+                output_mime="audio/pcm",
+                tools=initial_tools or None,
+                tool_choice="auto",
+            )
+            try:
+                opts.instructions = final_instructions
+                opts.voice = rt_voice
+            except Exception:
+                pass
+            conv_session = OpenAIRealtimeWebSocketSession(api_key=api_key, options=opts)
+            transcoder = FFmpegTranscoder() if (encode_in or encode_out) else None
+            from solana_agent.services.realtime import RealtimeService
+
+            rt = RealtimeService(
+                session=conv_session,
+                options=opts,
+                transcoder=transcoder,
+                accept_compressed_input=encode_in,
+                client_input_mime=_mime_from(audio_input_format),
+                encode_output=encode_out,
+                client_output_mime=_mime_from(audio_output_format),
+            )
+            # Tag ownership to prevent any cross-user reuse
+            setattr(rt, "_owner_user_id", user_id)
+            setattr(rt, "_in_use_lock", asyncio.Lock())
+            # Mark busy
+            await getattr(rt, "_in_use_lock").acquire()
+            pool.append(rt)
+            self._rt_services[user_id] = pool
+            return rt
+
    def _get_sticky_agent(self, user_id: str) -> Optional[str]:
        sess = self._sticky_sessions.get(user_id)
        return sess.get("agent") if isinstance(sess, dict) else None
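
The allocator's contract: each `RealtimeService` carries an `_in_use_lock` that is held for the whole turn, the pool itself is only touched under the global `_rt_lock`, and `asyncio.wait_for(lock.acquire(), timeout=0)` serves as a non-blocking acquire. A self-contained toy that mirrors this shape (names illustrative; the real code returns fully configured sessions):

```python
import asyncio
from typing import Dict, List

class Session:
    def __init__(self) -> None:
        self.in_use = asyncio.Lock()

class Pool:
    """Per-user pool: reuse an idle session, else create one marked busy."""

    def __init__(self) -> None:
        self._pools: Dict[str, List[Session]] = {}
        self._lock = asyncio.Lock()  # guards lookup/creation only

    async def alloc(self, user_id: str) -> Session:
        async with self._lock:
            pool = self._pools.setdefault(user_id, [])
            for s in pool:
                if not s.in_use.locked():
                    await s.in_use.acquire()  # free lock: acquires without yielding
                    return s
            s = Session()
            await s.in_use.acquire()  # mark busy before publishing
            pool.append(s)
            return s

async def main() -> None:
    pool = Pool()
    a, b = await asyncio.gather(pool.alloc("u1"), pool.alloc("u1"))
    print(a is b)        # False: concurrent turns get distinct sessions
    a.in_use.release()   # caller must release when its turn ends
    print(await pool.alloc("u1") is a)  # True: the idle session is reused

asyncio.run(main())
```

One caveat worth noting: on Python versions before 3.12, `asyncio.wait_for(..., timeout=0)` can raise `TimeoutError` even for a free lock, because the acquire task may be cancelled before it first runs; the toy above sidesteps this by checking `locked()` and acquiring inline.
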
@@ -554,14 +661,7 @@
            final_instructions = "\n\n".join([p for p in parts if p])

            # 4) Open a single WS session for assistant audio
-            from solana_agent.adapters.openai_realtime_ws import (
-                OpenAIRealtimeWebSocketSession,
-            )
-            from solana_agent.interfaces.providers.realtime import (
-                RealtimeSessionOptions,
-            )
-            from solana_agent.services.realtime import RealtimeService
-            from solana_agent.adapters.ffmpeg_transcoder import FFmpegTranscoder
+            # Realtime imports handled inside allocator helper

            api_key = None
            try:
@@ -600,165 +700,160 @@
                or (is_audio_bytes and audio_input_format.lower() != "pcm")
            )

-            async with self._rt_lock:
-                rt = self._rt_services.get(user_id)
-                if not rt or not isinstance(rt, RealtimeService):
-                    opts = RealtimeSessionOptions(
-                        model="gpt-realtime",
-                        voice=rt_voice,
-                        vad_enabled=False,  # no input audio
-                        input_rate_hz=24000,
-                        output_rate_hz=24000,
-                        input_mime="audio/pcm",
-                        output_mime="audio/pcm",
-                        tools=initial_tools or None,
-                        tool_choice="auto",
-                    )
-                    # Ensure initial session.update carries instructions/voice
+            # Allocate or reuse a realtime session for this specific request/user
+            rt = await self._alloc_realtime_session(
+                user_id,
+                api_key=api_key,
+                rt_voice=rt_voice,
+                final_instructions=final_instructions,
+                initial_tools=initial_tools,
+                encode_in=encode_in,
+                encode_out=encode_out,
+                audio_input_format=audio_input_format,
+                audio_output_format=audio_output_format,
+            )
+            # Ensure lock is released no matter what
+            try:
+                # Tool executor
+                async def _exec(
+                    tool_name: str, args: Dict[str, Any]
+                ) -> Dict[str, Any]:
                    try:
-                        opts.instructions = final_instructions
-                        opts.voice = rt_voice
-                    except Exception:
-                        pass
-                    conv_session = OpenAIRealtimeWebSocketSession(
-                        api_key=api_key, options=opts
-                    )
-                    transcoder = (
-                        FFmpegTranscoder() if (encode_in or encode_out) else None
-                    )
-                    rt = RealtimeService(
-                        session=conv_session,
-                        options=opts,
-                        transcoder=transcoder,
-                        accept_compressed_input=encode_in,
-                        client_input_mime=_mime_from(audio_input_format),
-                        encode_output=encode_out,
-                        client_output_mime=_mime_from(audio_output_format),
-                    )
-                    self._rt_services[user_id] = rt
+                        return await self.agent_service.execute_tool(
+                            agent_name, tool_name, args or {}
+                        )
+                    except Exception as e:
+                        return {"status": "error", "message": str(e)}

-            # Tool executor
-            async def _exec(tool_name: str, args: Dict[str, Any]) -> Dict[str, Any]:
+                # If possible, set on underlying session
                try:
-                    return await self.agent_service.execute_tool(
-                        agent_name, tool_name, args or {}
-                    )
-                except Exception as e:
-                    return {"status": "error", "message": str(e)}
-
-            # If possible, set on underlying session
-            try:
-                if hasattr(rt, "_session"):
-                    getattr(rt, "_session").set_tool_executor(_exec)  # type: ignore[attr-defined]
-            except Exception:
-                pass
-
-            # Connect/configure
-            if not getattr(rt, "_connected", False):
-                await rt.start()
-                await rt.configure(
-                    voice=rt_voice,
-                    vad_enabled=bool(vad) if vad is not None else False,
-                    instructions=final_instructions,
-                    tools=initial_tools or None,
-                    tool_choice="auto",
-                )
+                    if hasattr(rt, "_session"):
+                        getattr(rt, "_session").set_tool_executor(_exec)  # type: ignore[attr-defined]
+                except Exception:
+                    pass

-            # Ensure clean input buffers for this turn
-            try:
-                await rt.clear_input()
-            except Exception:
-                pass
+                # Connect/configure
+                if not getattr(rt, "_connected", False):
+                    await rt.start()
+                    await rt.configure(
+                        voice=rt_voice,
+                        vad_enabled=bool(vad) if vad is not None else False,
+                        instructions=final_instructions,
+                        tools=initial_tools or None,
+                        tool_choice="auto",
+                    )

-            # Persist once per turn
-            turn_id = await self.realtime_begin_turn(user_id)
-            if turn_id and user_text:
+                # Ensure clean input buffers for this turn
+                try:
+                    await rt.clear_input()
+                except Exception:
+                    pass
+                # Also reset any leftover output audio so new turn doesn't replay old chunks
                try:
-                    await self.realtime_update_user(user_id, turn_id, user_text)
+                    if hasattr(rt, "reset_output_stream"):
+                        rt.reset_output_stream()
                except Exception:
                    pass

-            # Feed audio into WS if audio bytes provided; else use input_text
-            if is_audio_bytes:
-                bq = bytes(query)
-                logger.info(
-                    "Realtime: appending input audio to WS via FFmpeg, len=%d, fmt=%s",
-                    len(bq),
-                    audio_input_format,
-                )
-                await rt.append_audio(bq)
-                vad_enabled_value = bool(vad) if vad is not None else False
-                if not vad_enabled_value:
-                    await rt.commit_input()
-                    # Manually trigger response when VAD is disabled
-                    await rt.create_response({})
+                # Persist once per turn
+                turn_id = await self.realtime_begin_turn(user_id)
+                if turn_id and user_text:
+                    try:
+                        await self.realtime_update_user(user_id, turn_id, user_text)
+                    except Exception:
+                        pass
+
+                # Feed audio into WS if audio bytes provided; else use input_text
+                if is_audio_bytes:
+                    bq = bytes(query)
+                    logger.info(
+                        "Realtime: appending input audio to WS via FFmpeg, len=%d, fmt=%s",
+                        len(bq),
+                        audio_input_format,
+                    )
+                    await rt.append_audio(bq)
+                    vad_enabled_value = bool(vad) if vad is not None else False
+                    if not vad_enabled_value:
+                        await rt.commit_input()
+                        # Manually trigger response when VAD is disabled
+                        await rt.create_response({})
+                    else:
+                        # With server VAD enabled, the model will auto-create a response at end of speech
+                        logger.debug(
+                            "Realtime: VAD enabled — skipping manual response.create"
+                        )
                else:
-                    # With server VAD enabled, the model will auto-create a response at end of speech
-                    logger.debug(
-                        "Realtime: VAD enabled — skipping manual response.create"
+                    # Rely on configured session voice; attach input_text only
+                    await rt.create_response(
+                        {
+                            "modalities": ["audio"],
+                            "input": [
+                                {"type": "input_text", "text": user_text or ""}
+                            ],
+                        }
                    )
-            else:
-                # Rely on configured session voice; attach input_text only
-                await rt.create_response(
-                    {
-                        "modalities": ["audio"],
-                        "input": [{"type": "input_text", "text": user_text or ""}],
-                    }
-                )

-            # Collect audio and transcripts
-            user_tr = ""
-            asst_tr = ""
+                # Collect audio and transcripts
+                user_tr = ""
+                asst_tr = ""

-            async def _drain_in_tr():
-                nonlocal user_tr
-                async for t in rt.iter_input_transcript():
-                    if t:
-                        user_tr += t
+                async def _drain_in_tr():
+                    nonlocal user_tr
+                    async for t in rt.iter_input_transcript():
+                        if t:
+                            user_tr += t

-            async def _drain_out_tr():
-                nonlocal asst_tr
-                async for t in rt.iter_output_transcript():
-                    if t:
-                        asst_tr += t
+                async def _drain_out_tr():
+                    nonlocal asst_tr
+                    async for t in rt.iter_output_transcript():
+                        if t:
+                            asst_tr += t

-            in_task = asyncio.create_task(_drain_in_tr())
-            out_task = asyncio.create_task(_drain_out_tr())
-            try:
-                async for audio_chunk in rt.iter_output_audio_encoded():
-                    yield audio_chunk
-            finally:
-                in_task.cancel()
-                out_task.cancel()
-                # If no WS input transcript was captured, fall back to HTTP STT result
-                if not user_tr:
-                    try:
-                        if "stt_task" in locals() and stt_task is not None:
-                            user_tr = await stt_task
-                    except Exception:
-                        pass
-                if turn_id:
-                    try:
-                        if user_tr:
-                            await self.realtime_update_user(
-                                user_id, turn_id, user_tr
-                            )
-                        if asst_tr:
-                            await self.realtime_update_assistant(
-                                user_id, turn_id, asst_tr
-                            )
-                    except Exception:
-                        pass
+                in_task = asyncio.create_task(_drain_in_tr())
+                out_task = asyncio.create_task(_drain_out_tr())
+                try:
+                    async for audio_chunk in rt.iter_output_audio_encoded():
+                        yield audio_chunk
+                finally:
+                    in_task.cancel()
+                    out_task.cancel()
+                    # If no WS input transcript was captured, fall back to HTTP STT result
+                    if not user_tr:
+                        try:
+                            if "stt_task" in locals() and stt_task is not None:
+                                user_tr = await stt_task
+                        except Exception:
+                            pass
+                    if turn_id:
+                        try:
+                            if user_tr:
+                                await self.realtime_update_user(
+                                    user_id, turn_id, user_tr
+                                )
+                            if asst_tr:
+                                await self.realtime_update_assistant(
+                                    user_id, turn_id, asst_tr
+                                )
+                        except Exception:
+                            pass
+                    try:
+                        await self.realtime_finalize_turn(user_id, turn_id)
+                    except Exception:
+                        pass
+                    # Clear input buffer for next turn reuse
                    try:
-                        await self.realtime_finalize_turn(user_id, turn_id)
+                        await rt.clear_input()
                    except Exception:
                        pass
-                # Clear input buffer for next turn reuse
+            finally:
+                # Always release the session for reuse by other concurrent requests/devices
                try:
-                    await rt.clear_input()
+                    lock = getattr(rt, "_in_use_lock", None)
+                    if lock and lock.locked():
+                        lock.release()
                except Exception:
                    pass
-            return
+            return

            # 1) Transcribe audio or accept text
            user_text = ""
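
The structural change in this hunk: the whole turn now sits inside `try:` ... `finally:` so the pooled session's `_in_use_lock` is released even when the consumer stops iterating early (a client disconnect mid-stream). That works because closing an async generator raises `GeneratorExit` at the `yield`, which still runs the enclosing `finally`. A runnable toy of that mechanism (not package code, just the pattern):

```python
import asyncio

async def turn(lock: asyncio.Lock):
    # Same shape as the new code: hold a pooled-session lock while
    # streaming chunks, release in `finally` even if the consumer bails.
    await lock.acquire()
    try:
        for i in range(100):
            yield f"chunk-{i}".encode()
            await asyncio.sleep(0)
    finally:
        if lock.locked():
            lock.release()  # runs on normal exit AND on early aclose()

async def main() -> None:
    lock = asyncio.Lock()
    gen = turn(lock)
    async for _ in gen:
        break              # simulate a client that disconnects immediately
    await gen.aclose()     # generator cleanup triggers the finally
    print("lock released:", not lock.locked())  # True

asyncio.run(main())
```
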

{solana_agent-31.2.1 → solana_agent-31.2.3}/solana_agent/services/realtime.py

@@ -185,6 +185,13 @@ class RealtimeService:
    def iter_output_audio(self) -> AsyncGenerator[bytes, None]:  # pragma: no cover
        return self._session.iter_output_audio()

+    def reset_output_stream(self) -> None:  # pragma: no cover
+        try:
+            if hasattr(self._session, "reset_output_stream"):
+                self._session.reset_output_stream()
+        except Exception:
+            pass
+
    async def iter_output_audio_encoded(
        self,
    ) -> AsyncGenerator[bytes, None]:  # pragma: no cover
@@ -447,6 +454,13 @@ class TwinRealtimeService:
    def iter_output_audio(self) -> AsyncGenerator[bytes, None]:  # pragma: no cover
        return self._conv.iter_output_audio()

+    def reset_output_stream(self) -> None:  # pragma: no cover
+        try:
+            if hasattr(self._conv, "reset_output_stream"):
+                self._conv.reset_output_stream()
+        except Exception:
+            pass
+
    async def iter_output_audio_encoded(
        self,
    ) -> AsyncGenerator[bytes, None]:  # pragma: no cover