solana-agent 31.2.6__tar.gz → 31.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. {solana_agent-31.2.6 → solana_agent-31.3.0}/PKG-INFO +115 -9
  2. {solana_agent-31.2.6 → solana_agent-31.3.0}/README.md +114 -8
  3. {solana_agent-31.2.6 → solana_agent-31.3.0}/pyproject.toml +1 -1
  4. {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/adapters/openai_realtime_ws.py +160 -31
  5. {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/client/solana_agent.py +7 -1
  6. {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/interfaces/client/client.py +3 -1
  7. solana_agent-31.3.0/solana_agent/interfaces/providers/__init__.py +0 -0
  8. solana_agent-31.3.0/solana_agent/interfaces/providers/realtime.py +212 -0
  9. {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/interfaces/services/query.py +3 -1
  10. {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/services/query.py +422 -107
  11. {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/services/realtime.py +123 -17
  12. solana_agent-31.2.6/solana_agent/interfaces/providers/realtime.py +0 -100
  13. {solana_agent-31.2.6 → solana_agent-31.3.0}/LICENSE +0 -0
  14. {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/__init__.py +0 -0
  15. {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/adapters/__init__.py +0 -0
  16. {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/adapters/ffmpeg_transcoder.py +0 -0
  17. {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/adapters/mongodb_adapter.py +0 -0
  18. {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/adapters/openai_adapter.py +0 -0
  19. {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/adapters/pinecone_adapter.py +0 -0
  20. {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/cli.py +0 -0
  21. {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/client/__init__.py +0 -0
  22. {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/domains/__init__.py +0 -0
  23. {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/domains/agent.py +0 -0
  24. {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/domains/routing.py +0 -0
  25. {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/factories/__init__.py +0 -0
  26. {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/factories/agent_factory.py +0 -0
  27. {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/guardrails/pii.py +0 -0
  28. {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/interfaces/__init__.py +0 -0
  29. {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/interfaces/guardrails/guardrails.py +0 -0
  30. {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/interfaces/plugins/plugins.py +0 -0
  31. {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/interfaces/providers/audio.py +0 -0
  32. {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/interfaces/providers/data_storage.py +0 -0
  33. {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/interfaces/providers/llm.py +0 -0
  34. {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/interfaces/providers/memory.py +0 -0
  35. {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/interfaces/providers/vector_storage.py +0 -0
  36. {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/interfaces/services/agent.py +0 -0
  37. {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/interfaces/services/knowledge_base.py +0 -0
  38. {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/interfaces/services/routing.py +0 -0
  39. {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/plugins/__init__.py +0 -0
  40. {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/plugins/manager.py +0 -0
  41. {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/plugins/registry.py +0 -0
  42. {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/plugins/tools/__init__.py +0 -0
  43. {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/plugins/tools/auto_tool.py +0 -0
  44. {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/repositories/__init__.py +0 -0
  45. {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/repositories/memory.py +0 -0
  46. {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/services/__init__.py +0 -0
  47. {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/services/agent.py +0 -0
  48. {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/services/knowledge_base.py +0 -0
  49. {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/services/routing.py +0 -0

{solana_agent-31.2.6 → solana_agent-31.3.0}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: solana-agent
- Version: 31.2.6
+ Version: 31.3.0
  Summary: AI Agents for Solana
  License: MIT
  Keywords: solana,solana ai,solana agent,ai,ai agent,ai agents
@@ -98,6 +98,7 @@ Smart workflows are as easy as combining your tools and prompts.
  * Simple agent definition using JSON
  * Designed for a multi-agent swarm
  * Fast multi-modal processing of text, audio, and images
+ * Dual modality realtime streaming with simultaneous audio and text output
  * Smart workflows that keep flows simple and smart
  * Interact with the Solana blockchain with many useful tools
  * MCP tool usage with first-class support for [Zapier](https://zapier.com/mcp)
@@ -132,7 +133,7 @@ Smart workflows are as easy as combining your tools and prompts.
  **OpenAI**
  * [gpt-4.1](https://platform.openai.com/docs/models/gpt-4.1) (agent & router)
  * [text-embedding-3-large](https://platform.openai.com/docs/models/text-embedding-3-large) (embedding)
- * [gpt-realtime](https://platform.openai.com/docs/models/gpt-realtime) (realtime audio agent)
+ * [gpt-realtime](https://platform.openai.com/docs/models/gpt-realtime) (realtime audio agent with dual modality support)
  * [tts-1](https://platform.openai.com/docs/models/tts-1) (audio TTS)
  * [gpt-4o-mini-transcribe](https://platform.openai.com/docs/models/gpt-4o-mini-transcribe) (audio transcription)

@@ -281,6 +282,7 @@ async for response in solana_agent.process("user123", "What is the latest news o
  ### Audio/Text Streaming

  ```python
+ ## Realtime Usage
  from solana_agent import SolanaAgent

  config = {
@@ -311,28 +313,32 @@ async for response in solana_agent.process("user123", audio_content, audio_input

  ### Realtime Audio Streaming

- If input and/or output is encoded (compressed) like mp4/aac then you must have `ffmpeg` installed.
+ If input and/or output is encoded (compressed) like mp4/mp3 then you must have `ffmpeg` installed.

  Due to the overhead of the router (API call) - realtime only supports a single agent setup.

  Realtime uses MongoDB for memory so Zep is not needed.

+ By default, when `realtime=True` and you supply raw/encoded audio bytes as input, the system **always skips the HTTP transcription (STT) path** and relies solely on the realtime websocket session for input transcription. If you don't specify `rt_transcription_model`, a sensible default (`gpt-4o-mini-transcribe`) is auto-selected so you still receive input transcript events with minimal latency.
+
+ Implications:
+ - `llm_provider.transcribe_audio` is never invoked for realtime turns.
+ - Lower end-to-end latency (no duplicate network round trip for STT).
+ - Unified transcript sourcing from realtime events.
+ - If you explicitly want to disable transcription altogether, send text (not audio bytes) or ignore transcript events client-side.
+
  This example will work using expo-audio on Android and iOS.

  ```python
  from solana_agent import SolanaAgent

  solana_agent = SolanaAgent(config=config)
-
- audio_content = await audio_file.read()
-
- async def generate():
- async for chunk in solana_agent.process(
- user_id=user_id,
+ user_id="user123",
  message=audio_content,
  realtime=True,
  rt_encode_input=True,
  rt_encode_output=True,
+ rt_output_modalities=["audio"],
  rt_voice="marin",
  output_format="audio",
  audio_output_format="mp3",
@@ -350,6 +356,106 @@ return StreamingResponse(
  "X-Accel-Buffering": "no",
  },
  )
+ ```
+
+ ### Realtime Text Streaming
+
+ Due to the overhead of the router (API call) - realtime only supports a single agent setup.
+
+ Realtime uses MongoDB for memory so Zep is not needed.
+
+ When using realtime with text input, no audio transcription is needed. The same bypass rules apply—HTTP STT is never called in realtime mode.
+
+ ```python
+ from solana_agent import SolanaAgent
+
+ solana_agent = SolanaAgent(config=config)
+
+ async def generate():
+ async for chunk in solana_agent.process(
+ user_id="user123",
+ message="What is the latest news on Solana?",
+ realtime=True,
+ rt_output_modalities=["text"],
+ ):
+ yield chunk
+ ```
+
+ ### Dual Modality Realtime Streaming
+
+ Solana Agent supports **dual modality realtime streaming**, allowing you to stream both audio and text simultaneously from a single realtime session. This enables rich conversational experiences where users can receive both voice responses and text transcripts in real-time.
+
+ #### Features
+ - **Simultaneous Audio & Text**: Stream both modalities from the same conversation
+ - **Flexible Output**: Choose audio-only, text-only, or both modalities
+ - **Real-time Demuxing**: Automatically separate audio and text streams
+ - **Mobile Optimized**: Works seamlessly with compressed audio formats (MP4/AAC)
+ - **Memory Efficient**: Smart buffering and streaming for optimal performance
+
+ #### Mobile App Integration Example
+
+ ```python
+ from fastapi import UploadFile
+ from fastapi.responses import StreamingResponse
+ from solana_agent import SolanaAgent
+ from solana_agent.interfaces.providers.realtime import RealtimeChunk
+ import base64
+
+ solana_agent = SolanaAgent(config=config)
+
+ @app.post("/realtime/dual")
+ async def realtime_dual_endpoint(audio_file: UploadFile):
+ """
+ Dual modality (audio + text) realtime endpoint using Server-Sent Events (SSE).
+ Emits:
+ event: audio (base64 encoded audio frames)
+ event: transcript (incremental text)
+ Notes:
+ - Do NOT set output_format when using both modalities.
+ - If only one modality is requested, plain str (text) or raw audio bytes may be yielded instead of RealtimeChunk.
+ """
+ audio_content = await audio_file.read()
+
+ async def event_stream():
+ async for chunk in solana_agent.process(
+ user_id="mobile_user",
+ message=audio_content,
+ realtime=True,
+ rt_encode_input=True,
+ rt_encode_output=True,
+ rt_output_modalities=["audio", "text"],
+ rt_voice="marin",
+ audio_input_format="mp4",
+ audio_output_format="mp3",
+ # Optionally lock transcription model (otherwise default is auto-selected):
+ # rt_transcription_model="gpt-4o-mini-transcribe",
+ ):
+ if isinstance(chunk, RealtimeChunk):
+ if chunk.is_audio and chunk.audio_data:
+ b64 = base64.b64encode(chunk.audio_data).decode("ascii")
+ yield f"event: audio\ndata: {b64}\n\n"
+ elif chunk.is_text and chunk.text_data:
+ # Incremental transcript (not duplicated at finalize)
+ yield f"event: transcript\ndata: {chunk.text_data}\n\n"
+ continue
+ # (Defensive) fallback: if something else appears
+ if isinstance(chunk, bytes):
+ b64 = base64.b64encode(chunk).decode("ascii")
+ yield f"event: audio\ndata: {b64}\n\n"
+ elif isinstance(chunk, str):
+ yield f"event: transcript\ndata: {chunk}\n\n"
+
+ yield "event: done\ndata: end\n\n"
+
+ return StreamingResponse(
+ event_stream(),
+ media_type="text/event-stream",
+ headers={
+ "Cache-Control": "no-store",
+ "Access-Control-Allow-Origin": "*",
+ },
+ )
+ ```
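
The `RealtimeChunk` type consumed in the example above is defined in the new `solana_agent/interfaces/providers/realtime.py` (+212 lines, not expanded in this diff). Below is a minimal sketch of the shape implied by the attributes the example relies on (`is_audio`, `is_text`, `audio_data`, `text_data`); any field or property beyond those is an assumption, not the library's actual definition.

```python
# Hypothetical sketch of the RealtimeChunk shape implied by the README example above.
# The real class lives in solana_agent/interfaces/providers/realtime.py (not shown
# in this diff); only is_audio/is_text/audio_data/text_data are taken from the
# example — everything else here is an assumption.
from dataclasses import dataclass
from typing import Literal, Optional, Union


@dataclass
class RealtimeChunkSketch:
    modality: Literal["audio", "text"]  # which stream this chunk belongs to
    data: Union[bytes, str]             # raw/encoded audio bytes or a text delta

    @property
    def is_audio(self) -> bool:
        return self.modality == "audio"

    @property
    def is_text(self) -> bool:
        return self.modality == "text"

    @property
    def audio_data(self) -> Optional[bytes]:
        return self.data if self.is_audio and isinstance(self.data, bytes) else None

    @property
    def text_data(self) -> Optional[str]:
        return self.data if self.is_text and isinstance(self.data, str) else None
```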

  ### Image/Text Streaming

{solana_agent-31.2.6 → solana_agent-31.3.0}/README.md

@@ -62,6 +62,7 @@ Smart workflows are as easy as combining your tools and prompts.
  * Simple agent definition using JSON
  * Designed for a multi-agent swarm
  * Fast multi-modal processing of text, audio, and images
+ * Dual modality realtime streaming with simultaneous audio and text output
  * Smart workflows that keep flows simple and smart
  * Interact with the Solana blockchain with many useful tools
  * MCP tool usage with first-class support for [Zapier](https://zapier.com/mcp)
@@ -96,7 +97,7 @@ Smart workflows are as easy as combining your tools and prompts.
  **OpenAI**
  * [gpt-4.1](https://platform.openai.com/docs/models/gpt-4.1) (agent & router)
  * [text-embedding-3-large](https://platform.openai.com/docs/models/text-embedding-3-large) (embedding)
- * [gpt-realtime](https://platform.openai.com/docs/models/gpt-realtime) (realtime audio agent)
+ * [gpt-realtime](https://platform.openai.com/docs/models/gpt-realtime) (realtime audio agent with dual modality support)
  * [tts-1](https://platform.openai.com/docs/models/tts-1) (audio TTS)
  * [gpt-4o-mini-transcribe](https://platform.openai.com/docs/models/gpt-4o-mini-transcribe) (audio transcription)

@@ -245,6 +246,7 @@ async for response in solana_agent.process("user123", "What is the latest news o
  ### Audio/Text Streaming

  ```python
+ ## Realtime Usage
  from solana_agent import SolanaAgent

  config = {
@@ -275,28 +277,32 @@ async for response in solana_agent.process("user123", audio_content, audio_input

  ### Realtime Audio Streaming

- If input and/or output is encoded (compressed) like mp4/aac then you must have `ffmpeg` installed.
+ If input and/or output is encoded (compressed) like mp4/mp3 then you must have `ffmpeg` installed.

  Due to the overhead of the router (API call) - realtime only supports a single agent setup.

  Realtime uses MongoDB for memory so Zep is not needed.

+ By default, when `realtime=True` and you supply raw/encoded audio bytes as input, the system **always skips the HTTP transcription (STT) path** and relies solely on the realtime websocket session for input transcription. If you don't specify `rt_transcription_model`, a sensible default (`gpt-4o-mini-transcribe`) is auto-selected so you still receive input transcript events with minimal latency.
+
+ Implications:
+ - `llm_provider.transcribe_audio` is never invoked for realtime turns.
+ - Lower end-to-end latency (no duplicate network round trip for STT).
+ - Unified transcript sourcing from realtime events.
+ - If you explicitly want to disable transcription altogether, send text (not audio bytes) or ignore transcript events client-side.
+
  This example will work using expo-audio on Android and iOS.

  ```python
  from solana_agent import SolanaAgent

  solana_agent = SolanaAgent(config=config)
-
- audio_content = await audio_file.read()
-
- async def generate():
- async for chunk in solana_agent.process(
- user_id=user_id,
+ user_id="user123",
  message=audio_content,
  realtime=True,
  rt_encode_input=True,
  rt_encode_output=True,
+ rt_output_modalities=["audio"],
  rt_voice="marin",
  output_format="audio",
  audio_output_format="mp3",
@@ -314,6 +320,106 @@ return StreamingResponse(
  "X-Accel-Buffering": "no",
  },
  )
+ ```
+
+ ### Realtime Text Streaming
+
+ Due to the overhead of the router (API call) - realtime only supports a single agent setup.
+
+ Realtime uses MongoDB for memory so Zep is not needed.
+
+ When using realtime with text input, no audio transcription is needed. The same bypass rules apply—HTTP STT is never called in realtime mode.
+
+ ```python
+ from solana_agent import SolanaAgent
+
+ solana_agent = SolanaAgent(config=config)
+
+ async def generate():
+ async for chunk in solana_agent.process(
+ user_id="user123",
+ message="What is the latest news on Solana?",
+ realtime=True,
+ rt_output_modalities=["text"],
+ ):
+ yield chunk
+ ```
+
+ ### Dual Modality Realtime Streaming
+
+ Solana Agent supports **dual modality realtime streaming**, allowing you to stream both audio and text simultaneously from a single realtime session. This enables rich conversational experiences where users can receive both voice responses and text transcripts in real-time.
+
+ #### Features
+ - **Simultaneous Audio & Text**: Stream both modalities from the same conversation
+ - **Flexible Output**: Choose audio-only, text-only, or both modalities
+ - **Real-time Demuxing**: Automatically separate audio and text streams
+ - **Mobile Optimized**: Works seamlessly with compressed audio formats (MP4/AAC)
+ - **Memory Efficient**: Smart buffering and streaming for optimal performance
+
+ #### Mobile App Integration Example
+
+ ```python
+ from fastapi import UploadFile
+ from fastapi.responses import StreamingResponse
+ from solana_agent import SolanaAgent
+ from solana_agent.interfaces.providers.realtime import RealtimeChunk
+ import base64
+
+ solana_agent = SolanaAgent(config=config)
+
+ @app.post("/realtime/dual")
+ async def realtime_dual_endpoint(audio_file: UploadFile):
+ """
+ Dual modality (audio + text) realtime endpoint using Server-Sent Events (SSE).
+ Emits:
+ event: audio (base64 encoded audio frames)
+ event: transcript (incremental text)
+ Notes:
+ - Do NOT set output_format when using both modalities.
+ - If only one modality is requested, plain str (text) or raw audio bytes may be yielded instead of RealtimeChunk.
+ """
+ audio_content = await audio_file.read()
+
+ async def event_stream():
+ async for chunk in solana_agent.process(
+ user_id="mobile_user",
+ message=audio_content,
+ realtime=True,
+ rt_encode_input=True,
+ rt_encode_output=True,
+ rt_output_modalities=["audio", "text"],
+ rt_voice="marin",
+ audio_input_format="mp4",
+ audio_output_format="mp3",
+ # Optionally lock transcription model (otherwise default is auto-selected):
+ # rt_transcription_model="gpt-4o-mini-transcribe",
+ ):
+ if isinstance(chunk, RealtimeChunk):
+ if chunk.is_audio and chunk.audio_data:
+ b64 = base64.b64encode(chunk.audio_data).decode("ascii")
+ yield f"event: audio\ndata: {b64}\n\n"
+ elif chunk.is_text and chunk.text_data:
+ # Incremental transcript (not duplicated at finalize)
+ yield f"event: transcript\ndata: {chunk.text_data}\n\n"
+ continue
+ # (Defensive) fallback: if something else appears
+ if isinstance(chunk, bytes):
+ b64 = base64.b64encode(chunk).decode("ascii")
+ yield f"event: audio\ndata: {b64}\n\n"
+ elif isinstance(chunk, str):
+ yield f"event: transcript\ndata: {chunk}\n\n"
+
+ yield "event: done\ndata: end\n\n"
+
+ return StreamingResponse(
+ event_stream(),
+ media_type="text/event-stream",
+ headers={
+ "Cache-Control": "no-store",
+ "Access-Control-Allow-Origin": "*",
+ },
+ )
+ ```
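
The endpoint above emits named SSE events (`audio` carrying base64-encoded frames, `transcript` carrying text deltas, and a final `done`). Below is a minimal Python consumer sketch for that protocol; it assumes `httpx`, a server at `http://localhost:8000/realtime/dual`, an upload field named `audio_file`, and a local `input.mp4` — all placeholders, not part of the package.

```python
# Minimal SSE consumer for the dual-modality endpoint sketched above.
# Assumptions: server at http://localhost:8000/realtime/dual, upload field
# "audio_file", local file input.mp4 — all illustrative placeholders.
import asyncio
import base64

import httpx


async def consume_dual_stream() -> None:
    audio_out = bytearray()
    transcript_parts: list[str] = []
    event = None

    async with httpx.AsyncClient(timeout=None) as client:
        with open("input.mp4", "rb") as f:
            async with client.stream(
                "POST",
                "http://localhost:8000/realtime/dual",
                files={"audio_file": ("input.mp4", f, "video/mp4")},
            ) as resp:
                async for line in resp.aiter_lines():
                    if line.startswith("event: "):
                        event = line[len("event: "):]
                    elif line.startswith("data: "):
                        data = line[len("data: "):]
                        if event == "audio":
                            audio_out.extend(base64.b64decode(data))
                        elif event == "transcript":
                            transcript_parts.append(data)
                        elif event == "done":
                            break

    # Reassemble the two modalities on the client side.
    with open("reply.mp3", "wb") as out:
        out.write(audio_out)
    print("Transcript:", "".join(transcript_parts))


if __name__ == "__main__":
    asyncio.run(consume_dual_stream())
```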

  ### Image/Text Streaming

{solana_agent-31.2.6 → solana_agent-31.3.0}/pyproject.toml

@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "solana-agent"
- version = "31.2.6"
+ version = "31.3.0"
  description = "AI Agents for Solana"
  authors = ["Bevan Hunt <bevan@bevanhunt.com>"]
  license = "MIT"

{solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/adapters/openai_realtime_ws.py

@@ -102,16 +102,30 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
  ]
  model = self.options.model or "gpt-realtime"
  uri = f"{self.url}?model={model}"
- logger.info(
- "Realtime WS connecting: uri=%s, input=%s@%sHz, output=%s@%sHz, voice=%s, vad=%s",
- uri,
- self.options.input_mime,
- self.options.input_rate_hz,
- self.options.output_mime,
- self.options.output_rate_hz,
- self.options.voice,
- self.options.vad_enabled,
- )
+
+ # Determine if audio output should be configured for logging
+ modalities = self.options.output_modalities or ["audio", "text"]
+ should_configure_audio_output = "audio" in modalities
+
+ if should_configure_audio_output:
+ logger.info(
+ "Realtime WS connecting: uri=%s, input=%s@%sHz, output=%s@%sHz, voice=%s, vad=%s",
+ uri,
+ self.options.input_mime,
+ self.options.input_rate_hz,
+ self.options.output_mime,
+ self.options.output_rate_hz,
+ self.options.voice,
+ self.options.vad_enabled,
+ )
+ else:
+ logger.info(
+ "Realtime WS connecting: uri=%s, input=%s@%sHz, text-only output, vad=%s",
+ uri,
+ self.options.input_mime,
+ self.options.input_rate_hz,
+ self.options.vad_enabled,
+ )
  self._ws = await websockets.connect(
  uri, additional_headers=headers, max_size=None
  )
@@ -165,11 +179,16 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
  cleaned.append(t)
  return cleaned

+ # Determine if audio output should be configured
+ modalities = self.options.output_modalities or ["audio", "text"]
+ should_configure_audio_output = "audio" in modalities
+
+ # Build session.update per docs (nested audio object)
  session_payload: Dict[str, Any] = {
  "type": "session.update",
  "session": {
  "type": "realtime",
- "output_modalities": ["audio"],
+ "output_modalities": modalities,
  "audio": {
  "input": {
  "format": {
@@ -178,16 +197,22 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
  },
  "turn_detection": td_input,
  },
- "output": {
- "format": {
- "type": self.options.output_mime or "audio/pcm",
- "rate": int(self.options.output_rate_hz or 24000),
- },
- "voice": self.options.voice,
- "speed": float(
- getattr(self.options, "voice_speed", 1.0) or 1.0
- ),
- },
+ **(
+ {
+ "output": {
+ "format": {
+ "type": self.options.output_mime or "audio/pcm",
+ "rate": int(self.options.output_rate_hz or 24000),
+ },
+ "voice": self.options.voice,
+ "speed": float(
+ getattr(self.options, "voice_speed", 1.0) or 1.0
+ ),
+ }
+ }
+ if should_configure_audio_output
+ else {}
+ ),
  },
  # Note: no top-level turn_detection; nested under audio.input
  **({"prompt": prompt_block} if prompt_block else {}),
@@ -204,13 +229,45 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
  ),
  },
  }
- logger.info(
- "Realtime WS: sending session.update (voice=%s, vad=%s, output=%s@%s)",
- self.options.voice,
- self.options.vad_enabled,
- (self.options.output_mime or "audio/pcm"),
- int(self.options.output_rate_hz or 24000),
- )
+ # Optional realtime transcription configuration
+ try:
+ tr_model = getattr(self.options, "transcription_model", None)
+ if tr_model:
+ audio_obj = session_payload["session"].setdefault("audio", {})
+ # Attach input transcription config per GA schema
+ transcription_cfg: Dict[str, Any] = {"model": tr_model}
+ lang = getattr(self.options, "transcription_language", None)
+ if lang:
+ transcription_cfg["language"] = lang
+ prompt_txt = getattr(self.options, "transcription_prompt", None)
+ if prompt_txt is not None:
+ transcription_cfg["prompt"] = prompt_txt
+ if getattr(self.options, "transcription_include_logprobs", False):
+ session_payload["session"].setdefault("include", []).append(
+ "item.input_audio_transcription.logprobs"
+ )
+ nr = getattr(self.options, "transcription_noise_reduction", None)
+ if nr is not None:
+ audio_obj["noise_reduction"] = bool(nr)
+ # Place under audio.input.transcription per current server conventions
+ audio_obj.setdefault("input", {}).setdefault(
+ "transcription", transcription_cfg
+ )
+ except Exception:
+ logger.exception("Failed to attach transcription config to session.update")
+ if should_configure_audio_output:
+ logger.info(
+ "Realtime WS: sending session.update (voice=%s, vad=%s, output=%s@%s)",
+ self.options.voice,
+ self.options.vad_enabled,
+ (self.options.output_mime or "audio/pcm"),
+ int(self.options.output_rate_hz or 24000),
+ )
+ else:
+ logger.info(
+ "Realtime WS: sending session.update (text-only, vad=%s)",
+ self.options.vad_enabled,
+ )
  # Log exact session.update payload and mark awaiting session.updated
  try:
  logger.info(
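
Taken together, the changes above mean the `session.update` event now varies with the configured output modalities: the nested `audio.output` block (format, voice, speed) is emitted only when `"audio"` is among them, and an `audio.input.transcription` block is attached when a transcription model is set. A rough sketch of the resulting payload for a dual-modality session follows; the values are illustrative, and the exact input-format and turn-detection fields come from code not shown in this hunk.

```python
# Illustrative shape of the session.update event produced by the code above for a
# dual-modality session with input transcription enabled. Rates, voice, and model
# names are example placeholders, not defaults asserted by the library.
session_update = {
    "type": "session.update",
    "session": {
        "type": "realtime",
        "output_modalities": ["audio", "text"],
        "audio": {
            "input": {
                "format": {"type": "audio/pcm", "rate": 24000},
                "turn_detection": {"type": "server_vad"},  # actual shape comes from td_input
                "transcription": {
                    "model": "gpt-4o-mini-transcribe",
                    # "language" / "prompt" added only when configured
                },
            },
            # Present only when "audio" is in output_modalities:
            "output": {
                "format": {"type": "audio/pcm", "rate": 24000},
                "voice": "marin",
                "speed": 1.0,
            },
        },
        # "instructions", "prompt", "tools", and the optional "include" list for
        # transcription logprobs are merged in by the surrounding code.
    },
}
```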
@@ -231,7 +288,7 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
  logger.warning(
  "Realtime WS: instructions missing/empty in session.update"
  )
- if not voice:
+ if not voice and should_configure_audio_output:
  logger.warning("Realtime WS: voice missing in session.update")
  except Exception:
  pass
@@ -632,6 +689,20 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
  len(final),
  )
  self._out_text_buffers.pop(rid, None)
+ # Always terminate the output transcript stream for this response when text-only.
+ try:
+ # Only enqueue sentinel when no audio modality is configured
+ modalities = (
+ getattr(self.options, "output_modalities", None)
+ or []
+ )
+ if "audio" not in modalities:
+ self._out_tr_queue.put_nowait(None)
+ logger.debug(
+ "Enqueued transcript termination sentinel (text-only response)"
+ )
+ except Exception:
+ pass
  except Exception:
  pass
  elif (
@@ -1033,6 +1104,47 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
  else:
  patch[k] = raw[k]

+ # --- Inject realtime transcription config if options were updated after initial connect ---
+ try:
+ tr_model = getattr(self.options, "transcription_model", None)
+ if tr_model and isinstance(patch, dict):
+ # Ensure audio/input containers exist without overwriting caller provided fields
+ aud = patch.setdefault("audio", {})
+ inp = aud.setdefault("input", {})
+ # Only add if not explicitly provided in this patch
+ if "transcription" not in inp:
+ transcription_cfg: Dict[str, Any] = {"model": tr_model}
+ lang = getattr(self.options, "transcription_language", None)
+ if lang:
+ transcription_cfg["language"] = lang
+ prompt_txt = getattr(self.options, "transcription_prompt", None)
+ if prompt_txt is not None:
+ transcription_cfg["prompt"] = prompt_txt
+ nr = getattr(self.options, "transcription_noise_reduction", None)
+ if nr is not None:
+ aud["noise_reduction"] = bool(nr)
+ if getattr(self.options, "transcription_include_logprobs", False):
+ patch.setdefault("include", [])
+ if (
+ "item.input_audio_transcription.logprobs"
+ not in patch["include"]
+ ):
+ patch["include"].append(
+ "item.input_audio_transcription.logprobs"
+ )
+ inp["transcription"] = transcription_cfg
+ try:
+ logger.debug(
+ "Realtime WS: update_session injected transcription config model=%s",
+ tr_model,
+ )
+ except Exception:
+ pass
+ except Exception:
+ logger.exception(
+ "Realtime WS: failed injecting transcription config in update_session"
+ )
+
  # Ensure tools are cleaned even if provided only under audio or elsewhere
  if "tools" in patch:
  patch["tools"] = _strip_tool_strict(patch["tools"]) # idempotent
@@ -1040,9 +1152,12 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
  # Per server requirements, always include session.type and output_modalities
  try:
  patch["type"] = "realtime"
- # Preserve caller-provided output_modalities if present, otherwise default to audio
+ # Preserve caller-provided output_modalities if present, otherwise default to configured modalities
  if "output_modalities" not in patch:
- patch["output_modalities"] = ["audio"]
+ patch["output_modalities"] = self.options.output_modalities or [
+ "audio",
+ "text",
+ ]
  except Exception:
  pass

@@ -1148,6 +1263,13 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
  except Exception:
  pass

+ async def create_conversation_item(
+ self, item: Dict[str, Any]
+ ) -> None: # pragma: no cover
+ """Create a conversation item (e.g., for text input)."""
+ payload = {"type": "conversation.item.create", "item": item}
+ await self._send_tracked(payload, label="conversation.item.create")
+
  async def create_response(
  self, response_patch: Optional[Dict[str, Any]] = None
  ) -> None: # pragma: no cover
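
The new `create_conversation_item` is a thin wrapper that sends a `conversation.item.create` event with a caller-supplied item, which is how text input reaches a realtime session. The item schema itself is not shown in this diff; the sketch below uses the user-message shape from the OpenAI Realtime API (`input_text` content) and a hypothetical, already-connected `session`, so treat it as illustrative rather than the library's documented call pattern.

```python
# Hypothetical usage of the new create_conversation_item() helper for text input.
# The item structure follows the OpenAI Realtime API's conversation.item.create
# message shape; `session` stands in for an already-connected
# OpenAIRealtimeWebSocketSession and is not taken from this diff.
async def send_user_text(session, text: str) -> None:
    await session.create_conversation_item(
        {
            "type": "message",
            "role": "user",
            "content": [{"type": "input_text", "text": text}],
        }
    )
    # Ask the server to generate a reply for the item just added.
    await session.create_response()
```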
@@ -1639,6 +1761,13 @@ class OpenAITranscriptionWebSocketSession(BaseRealtimeSession):
  async def clear_input(self) -> None: # pragma: no cover
  await self._send({"type": "input_audio_buffer.clear"})

+ async def create_conversation_item(
+ self, item: Dict[str, Any]
+ ) -> None: # pragma: no cover
+ """Create a conversation item (e.g., for text input)."""
+ payload = {"type": "conversation.item.create", "item": item}
+ await self._send_tracked(payload, label="conversation.item.create")
+
  async def create_response(
  self, response_patch: Optional[Dict[str, Any]] = None
  ) -> None: # pragma: no cover

{solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/client/solana_agent.py

@@ -16,6 +16,7 @@ from solana_agent.interfaces.client.client import SolanaAgent as SolanaAgentInte
  from solana_agent.interfaces.plugins.plugins import Tool
  from solana_agent.services.knowledge_base import KnowledgeBaseService
  from solana_agent.interfaces.services.routing import RoutingService as RoutingInterface
+ from solana_agent.interfaces.providers.realtime import RealtimeChunk


  class SolanaAgent(SolanaAgentInterface):
@@ -57,6 +58,7 @@ class SolanaAgent(SolanaAgentInterface):
  vad: Optional[bool] = False,
  rt_encode_input: bool = False,
  rt_encode_output: bool = False,
+ rt_output_modalities: Optional[List[Literal["audio", "text"]]] = None,
  rt_voice: Literal[
  "alloy",
  "ash",
@@ -90,7 +92,9 @@ class SolanaAgent(SolanaAgentInterface):
  router: Optional[RoutingInterface] = None,
  images: Optional[List[Union[str, bytes]]] = None,
  output_model: Optional[Type[BaseModel]] = None,
- ) -> AsyncGenerator[Union[str, bytes, BaseModel], None]: # pragma: no cover
+ ) -> AsyncGenerator[
+ Union[str, bytes, BaseModel, RealtimeChunk], None
+ ]: # pragma: no cover
  """Process a user message (text or audio) and optional images, returning the response stream.

  Args:
@@ -104,6 +108,7 @@ class SolanaAgent(SolanaAgentInterface):
  vad: Whether to use voice activity detection (for audio input)
  rt_encode_input: Whether to re-encode input audio for compatibility
  rt_encode_output: Whether to re-encode output audio for compatibility
+ rt_output_modalities: Modalities to return in realtime (default both if None)
  rt_voice: Voice to use for realtime audio output
  audio_voice: Voice to use for audio output
  audio_output_format: Audio output format
@@ -124,6 +129,7 @@ class SolanaAgent(SolanaAgentInterface):
  vad=vad,
  rt_encode_input=rt_encode_input,
  rt_encode_output=rt_encode_output,
+ rt_output_modalities=rt_output_modalities,
  rt_voice=rt_voice,
  audio_voice=audio_voice,
  audio_output_format=audio_output_format,