solana-agent 31.2.6__tar.gz → 31.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {solana_agent-31.2.6 → solana_agent-31.3.0}/PKG-INFO +115 -9
- {solana_agent-31.2.6 → solana_agent-31.3.0}/README.md +114 -8
- {solana_agent-31.2.6 → solana_agent-31.3.0}/pyproject.toml +1 -1
- {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/adapters/openai_realtime_ws.py +160 -31
- {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/client/solana_agent.py +7 -1
- {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/interfaces/client/client.py +3 -1
- solana_agent-31.3.0/solana_agent/interfaces/providers/__init__.py +0 -0
- solana_agent-31.3.0/solana_agent/interfaces/providers/realtime.py +212 -0
- {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/interfaces/services/query.py +3 -1
- {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/services/query.py +422 -107
- {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/services/realtime.py +123 -17
- solana_agent-31.2.6/solana_agent/interfaces/providers/realtime.py +0 -100
- {solana_agent-31.2.6 → solana_agent-31.3.0}/LICENSE +0 -0
- {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/__init__.py +0 -0
- {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/adapters/__init__.py +0 -0
- {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/adapters/ffmpeg_transcoder.py +0 -0
- {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/adapters/mongodb_adapter.py +0 -0
- {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/adapters/openai_adapter.py +0 -0
- {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/adapters/pinecone_adapter.py +0 -0
- {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/cli.py +0 -0
- {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/client/__init__.py +0 -0
- {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/domains/__init__.py +0 -0
- {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/domains/agent.py +0 -0
- {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/domains/routing.py +0 -0
- {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/factories/__init__.py +0 -0
- {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/factories/agent_factory.py +0 -0
- {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/guardrails/pii.py +0 -0
- {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/interfaces/__init__.py +0 -0
- {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/interfaces/guardrails/guardrails.py +0 -0
- {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/interfaces/plugins/plugins.py +0 -0
- {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/interfaces/providers/audio.py +0 -0
- {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/interfaces/providers/data_storage.py +0 -0
- {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/interfaces/providers/llm.py +0 -0
- {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/interfaces/providers/memory.py +0 -0
- {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/interfaces/providers/vector_storage.py +0 -0
- {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/interfaces/services/agent.py +0 -0
- {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/interfaces/services/knowledge_base.py +0 -0
- {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/interfaces/services/routing.py +0 -0
- {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/plugins/__init__.py +0 -0
- {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/plugins/manager.py +0 -0
- {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/plugins/registry.py +0 -0
- {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/plugins/tools/__init__.py +0 -0
- {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/plugins/tools/auto_tool.py +0 -0
- {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/repositories/__init__.py +0 -0
- {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/repositories/memory.py +0 -0
- {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/services/__init__.py +0 -0
- {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/services/agent.py +0 -0
- {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/services/knowledge_base.py +0 -0
- {solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/services/routing.py +0 -0
{solana_agent-31.2.6 → solana_agent-31.3.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: solana-agent
-Version: 31.2.6
+Version: 31.3.0
 Summary: AI Agents for Solana
 License: MIT
 Keywords: solana,solana ai,solana agent,ai,ai agent,ai agents
@@ -98,6 +98,7 @@ Smart workflows are as easy as combining your tools and prompts.
 * Simple agent definition using JSON
 * Designed for a multi-agent swarm
 * Fast multi-modal processing of text, audio, and images
+* Dual modality realtime streaming with simultaneous audio and text output
 * Smart workflows that keep flows simple and smart
 * Interact with the Solana blockchain with many useful tools
 * MCP tool usage with first-class support for [Zapier](https://zapier.com/mcp)
@@ -132,7 +133,7 @@ Smart workflows are as easy as combining your tools and prompts.
 **OpenAI**
 * [gpt-4.1](https://platform.openai.com/docs/models/gpt-4.1) (agent & router)
 * [text-embedding-3-large](https://platform.openai.com/docs/models/text-embedding-3-large) (embedding)
-* [gpt-realtime](https://platform.openai.com/docs/models/gpt-realtime) (realtime audio agent)
+* [gpt-realtime](https://platform.openai.com/docs/models/gpt-realtime) (realtime audio agent with dual modality support)
 * [tts-1](https://platform.openai.com/docs/models/tts-1) (audio TTS)
 * [gpt-4o-mini-transcribe](https://platform.openai.com/docs/models/gpt-4o-mini-transcribe) (audio transcription)
 
@@ -281,6 +282,7 @@ async for response in solana_agent.process("user123", "What is the latest news o
 ### Audio/Text Streaming
 
 ```python
+## Realtime Usage
 from solana_agent import SolanaAgent
 
 config = {
@@ -311,28 +313,32 @@ async for response in solana_agent.process("user123", audio_content, audio_input
 
 ### Realtime Audio Streaming
 
-If input and/or output is encoded (compressed) like mp4/
+If input and/or output is encoded (compressed) like mp4/mp3 then you must have `ffmpeg` installed.
 
 Due to the overhead of the router (API call) - realtime only supports a single agent setup.
 
 Realtime uses MongoDB for memory so Zep is not needed.
 
+By default, when `realtime=True` and you supply raw/encoded audio bytes as input, the system **always skips the HTTP transcription (STT) path** and relies solely on the realtime websocket session for input transcription. If you don't specify `rt_transcription_model`, a sensible default (`gpt-4o-mini-transcribe`) is auto-selected so you still receive input transcript events with minimal latency.
+
+Implications:
+- `llm_provider.transcribe_audio` is never invoked for realtime turns.
+- Lower end-to-end latency (no duplicate network round trip for STT).
+- Unified transcript sourcing from realtime events.
+- If you explicitly want to disable transcription altogether, send text (not audio bytes) or ignore transcript events client-side.
+
 This example will work using expo-audio on Android and iOS.
 
 ```python
 from solana_agent import SolanaAgent
 
 solana_agent = SolanaAgent(config=config)
-
-audio_content = await audio_file.read()
-
-async def generate():
-    async for chunk in solana_agent.process(
-        user_id=user_id,
+        user_id="user123",
         message=audio_content,
         realtime=True,
         rt_encode_input=True,
         rt_encode_output=True,
+        rt_output_modalities=["audio"],
         rt_voice="marin",
         output_format="audio",
         audio_output_format="mp3",
@@ -350,6 +356,106 @@ return StreamingResponse(
         "X-Accel-Buffering": "no",
     },
 )
+```
+
+### Realtime Text Streaming
+
+Due to the overhead of the router (API call) - realtime only supports a single agent setup.
+
+Realtime uses MongoDB for memory so Zep is not needed.
+
+When using realtime with text input, no audio transcription is needed. The same bypass rules apply—HTTP STT is never called in realtime mode.
+
+```python
+from solana_agent import SolanaAgent
+
+solana_agent = SolanaAgent(config=config)
+
+async def generate():
+    async for chunk in solana_agent.process(
+        user_id="user123",
+        message="What is the latest news on Solana?",
+        realtime=True,
+        rt_output_modalities=["text"],
+    ):
+        yield chunk
+```
+
+### Dual Modality Realtime Streaming
+
+Solana Agent supports **dual modality realtime streaming**, allowing you to stream both audio and text simultaneously from a single realtime session. This enables rich conversational experiences where users can receive both voice responses and text transcripts in real-time.
+
+#### Features
+- **Simultaneous Audio & Text**: Stream both modalities from the same conversation
+- **Flexible Output**: Choose audio-only, text-only, or both modalities
+- **Real-time Demuxing**: Automatically separate audio and text streams
+- **Mobile Optimized**: Works seamlessly with compressed audio formats (MP4/AAC)
+- **Memory Efficient**: Smart buffering and streaming for optimal performance
+
+#### Mobile App Integration Example
+
+```python
+from fastapi import UploadFile
+from fastapi.responses import StreamingResponse
+from solana_agent import SolanaAgent
+from solana_agent.interfaces.providers.realtime import RealtimeChunk
+import base64
+
+solana_agent = SolanaAgent(config=config)
+
+@app.post("/realtime/dual")
+async def realtime_dual_endpoint(audio_file: UploadFile):
+    """
+    Dual modality (audio + text) realtime endpoint using Server-Sent Events (SSE).
+    Emits:
+      event: audio (base64 encoded audio frames)
+      event: transcript (incremental text)
+    Notes:
+      - Do NOT set output_format when using both modalities.
+      - If only one modality is requested, plain str (text) or raw audio bytes may be yielded instead of RealtimeChunk.
+    """
+    audio_content = await audio_file.read()
+
+    async def event_stream():
+        async for chunk in solana_agent.process(
+            user_id="mobile_user",
+            message=audio_content,
+            realtime=True,
+            rt_encode_input=True,
+            rt_encode_output=True,
+            rt_output_modalities=["audio", "text"],
+            rt_voice="marin",
+            audio_input_format="mp4",
+            audio_output_format="mp3",
+            # Optionally lock transcription model (otherwise default is auto-selected):
+            # rt_transcription_model="gpt-4o-mini-transcribe",
+        ):
+            if isinstance(chunk, RealtimeChunk):
+                if chunk.is_audio and chunk.audio_data:
+                    b64 = base64.b64encode(chunk.audio_data).decode("ascii")
+                    yield f"event: audio\ndata: {b64}\n\n"
+                elif chunk.is_text and chunk.text_data:
+                    # Incremental transcript (not duplicated at finalize)
+                    yield f"event: transcript\ndata: {chunk.text_data}\n\n"
+                continue
+            # (Defensive) fallback: if something else appears
+            if isinstance(chunk, bytes):
+                b64 = base64.b64encode(chunk).decode("ascii")
+                yield f"event: audio\ndata: {b64}\n\n"
+            elif isinstance(chunk, str):
+                yield f"event: transcript\ndata: {chunk}\n\n"
+
+        yield "event: done\ndata: end\n\n"
+
+    return StreamingResponse(
+        event_stream(),
+        media_type="text/event-stream",
+        headers={
+            "Cache-Control": "no-store",
+            "Access-Control-Allow-Origin": "*",
+        },
+    )
+```
 
 ### Image/Text Streaming
 
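For context on the dual-modality output documented above, here is a minimal consumption sketch outside a web framework. It reuses the `RealtimeChunk` fields (`is_audio`/`audio_data`, `is_text`/`text_data`) and the `process()` parameters shown in the README example; the `config` contents and file names are placeholders, not part of the package.

```python
import asyncio

from solana_agent import SolanaAgent
from solana_agent.interfaces.providers.realtime import RealtimeChunk

config: dict = {}  # placeholder: use the OpenAI/MongoDB config dict from the README examples


async def main() -> None:
    solana_agent = SolanaAgent(config=config)

    # Placeholder input file recorded by the client (e.g. expo-audio on mobile).
    with open("input.mp4", "rb") as f:
        audio_content = f.read()

    transcript_parts = []
    with open("reply.mp3", "wb") as out:
        async for chunk in solana_agent.process(
            user_id="user123",
            message=audio_content,
            realtime=True,
            rt_encode_input=True,
            rt_encode_output=True,
            rt_output_modalities=["audio", "text"],
            rt_voice="marin",
            audio_input_format="mp4",
            audio_output_format="mp3",
        ):
            if isinstance(chunk, RealtimeChunk):
                if chunk.is_audio and chunk.audio_data:
                    out.write(chunk.audio_data)  # append encoded audio as it streams
                elif chunk.is_text and chunk.text_data:
                    transcript_parts.append(chunk.text_data)  # incremental transcript

    print("".join(transcript_parts))


asyncio.run(main())
```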
{solana_agent-31.2.6 → solana_agent-31.3.0}/README.md

@@ -62,6 +62,7 @@ Smart workflows are as easy as combining your tools and prompts.
 * Simple agent definition using JSON
 * Designed for a multi-agent swarm
 * Fast multi-modal processing of text, audio, and images
+* Dual modality realtime streaming with simultaneous audio and text output
 * Smart workflows that keep flows simple and smart
 * Interact with the Solana blockchain with many useful tools
 * MCP tool usage with first-class support for [Zapier](https://zapier.com/mcp)
@@ -96,7 +97,7 @@ Smart workflows are as easy as combining your tools and prompts.
 **OpenAI**
 * [gpt-4.1](https://platform.openai.com/docs/models/gpt-4.1) (agent & router)
 * [text-embedding-3-large](https://platform.openai.com/docs/models/text-embedding-3-large) (embedding)
-* [gpt-realtime](https://platform.openai.com/docs/models/gpt-realtime) (realtime audio agent)
+* [gpt-realtime](https://platform.openai.com/docs/models/gpt-realtime) (realtime audio agent with dual modality support)
 * [tts-1](https://platform.openai.com/docs/models/tts-1) (audio TTS)
 * [gpt-4o-mini-transcribe](https://platform.openai.com/docs/models/gpt-4o-mini-transcribe) (audio transcription)
 
@@ -245,6 +246,7 @@ async for response in solana_agent.process("user123", "What is the latest news o
 ### Audio/Text Streaming
 
 ```python
+## Realtime Usage
 from solana_agent import SolanaAgent
 
 config = {
@@ -275,28 +277,32 @@ async for response in solana_agent.process("user123", audio_content, audio_input
 
 ### Realtime Audio Streaming
 
-If input and/or output is encoded (compressed) like mp4/
+If input and/or output is encoded (compressed) like mp4/mp3 then you must have `ffmpeg` installed.
 
 Due to the overhead of the router (API call) - realtime only supports a single agent setup.
 
 Realtime uses MongoDB for memory so Zep is not needed.
 
+By default, when `realtime=True` and you supply raw/encoded audio bytes as input, the system **always skips the HTTP transcription (STT) path** and relies solely on the realtime websocket session for input transcription. If you don't specify `rt_transcription_model`, a sensible default (`gpt-4o-mini-transcribe`) is auto-selected so you still receive input transcript events with minimal latency.
+
+Implications:
+- `llm_provider.transcribe_audio` is never invoked for realtime turns.
+- Lower end-to-end latency (no duplicate network round trip for STT).
+- Unified transcript sourcing from realtime events.
+- If you explicitly want to disable transcription altogether, send text (not audio bytes) or ignore transcript events client-side.
+
 This example will work using expo-audio on Android and iOS.
 
 ```python
 from solana_agent import SolanaAgent
 
 solana_agent = SolanaAgent(config=config)
-
-audio_content = await audio_file.read()
-
-async def generate():
-    async for chunk in solana_agent.process(
-        user_id=user_id,
+        user_id="user123",
         message=audio_content,
         realtime=True,
         rt_encode_input=True,
         rt_encode_output=True,
+        rt_output_modalities=["audio"],
         rt_voice="marin",
         output_format="audio",
         audio_output_format="mp3",
@@ -314,6 +320,106 @@ return StreamingResponse(
         "X-Accel-Buffering": "no",
     },
 )
+```
+
+### Realtime Text Streaming
+
+Due to the overhead of the router (API call) - realtime only supports a single agent setup.
+
+Realtime uses MongoDB for memory so Zep is not needed.
+
+When using realtime with text input, no audio transcription is needed. The same bypass rules apply—HTTP STT is never called in realtime mode.
+
+```python
+from solana_agent import SolanaAgent
+
+solana_agent = SolanaAgent(config=config)
+
+async def generate():
+    async for chunk in solana_agent.process(
+        user_id="user123",
+        message="What is the latest news on Solana?",
+        realtime=True,
+        rt_output_modalities=["text"],
+    ):
+        yield chunk
+```
+
+### Dual Modality Realtime Streaming
+
+Solana Agent supports **dual modality realtime streaming**, allowing you to stream both audio and text simultaneously from a single realtime session. This enables rich conversational experiences where users can receive both voice responses and text transcripts in real-time.
+
+#### Features
+- **Simultaneous Audio & Text**: Stream both modalities from the same conversation
+- **Flexible Output**: Choose audio-only, text-only, or both modalities
+- **Real-time Demuxing**: Automatically separate audio and text streams
+- **Mobile Optimized**: Works seamlessly with compressed audio formats (MP4/AAC)
+- **Memory Efficient**: Smart buffering and streaming for optimal performance
+
+#### Mobile App Integration Example
+
+```python
+from fastapi import UploadFile
+from fastapi.responses import StreamingResponse
+from solana_agent import SolanaAgent
+from solana_agent.interfaces.providers.realtime import RealtimeChunk
+import base64
+
+solana_agent = SolanaAgent(config=config)
+
+@app.post("/realtime/dual")
+async def realtime_dual_endpoint(audio_file: UploadFile):
+    """
+    Dual modality (audio + text) realtime endpoint using Server-Sent Events (SSE).
+    Emits:
+      event: audio (base64 encoded audio frames)
+      event: transcript (incremental text)
+    Notes:
+      - Do NOT set output_format when using both modalities.
+      - If only one modality is requested, plain str (text) or raw audio bytes may be yielded instead of RealtimeChunk.
+    """
+    audio_content = await audio_file.read()
+
+    async def event_stream():
+        async for chunk in solana_agent.process(
+            user_id="mobile_user",
+            message=audio_content,
+            realtime=True,
+            rt_encode_input=True,
+            rt_encode_output=True,
+            rt_output_modalities=["audio", "text"],
+            rt_voice="marin",
+            audio_input_format="mp4",
+            audio_output_format="mp3",
+            # Optionally lock transcription model (otherwise default is auto-selected):
+            # rt_transcription_model="gpt-4o-mini-transcribe",
+        ):
+            if isinstance(chunk, RealtimeChunk):
+                if chunk.is_audio and chunk.audio_data:
+                    b64 = base64.b64encode(chunk.audio_data).decode("ascii")
+                    yield f"event: audio\ndata: {b64}\n\n"
+                elif chunk.is_text and chunk.text_data:
+                    # Incremental transcript (not duplicated at finalize)
+                    yield f"event: transcript\ndata: {chunk.text_data}\n\n"
+                continue
+            # (Defensive) fallback: if something else appears
+            if isinstance(chunk, bytes):
+                b64 = base64.b64encode(chunk).decode("ascii")
+                yield f"event: audio\ndata: {b64}\n\n"
+            elif isinstance(chunk, str):
+                yield f"event: transcript\ndata: {chunk}\n\n"
+
+        yield "event: done\ndata: end\n\n"
+
+    return StreamingResponse(
+        event_stream(),
+        media_type="text/event-stream",
+        headers={
+            "Cache-Control": "no-store",
+            "Access-Control-Allow-Origin": "*",
+        },
+    )
+```
 
 ### Image/Text Streaming
 
{solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/adapters/openai_realtime_ws.py

@@ -102,16 +102,30 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
         ]
         model = self.options.model or "gpt-realtime"
         uri = f"{self.url}?model={model}"
-
-
-
-
-
-
-
-
-
-
+
+        # Determine if audio output should be configured for logging
+        modalities = self.options.output_modalities or ["audio", "text"]
+        should_configure_audio_output = "audio" in modalities
+
+        if should_configure_audio_output:
+            logger.info(
+                "Realtime WS connecting: uri=%s, input=%s@%sHz, output=%s@%sHz, voice=%s, vad=%s",
+                uri,
+                self.options.input_mime,
+                self.options.input_rate_hz,
+                self.options.output_mime,
+                self.options.output_rate_hz,
+                self.options.voice,
+                self.options.vad_enabled,
+            )
+        else:
+            logger.info(
+                "Realtime WS connecting: uri=%s, input=%s@%sHz, text-only output, vad=%s",
+                uri,
+                self.options.input_mime,
+                self.options.input_rate_hz,
+                self.options.vad_enabled,
+            )
         self._ws = await websockets.connect(
             uri, additional_headers=headers, max_size=None
         )
@@ -165,11 +179,16 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
                 cleaned.append(t)
             return cleaned
 
+        # Determine if audio output should be configured
+        modalities = self.options.output_modalities or ["audio", "text"]
+        should_configure_audio_output = "audio" in modalities
+
+        # Build session.update per docs (nested audio object)
         session_payload: Dict[str, Any] = {
             "type": "session.update",
             "session": {
                 "type": "realtime",
-                "output_modalities":
+                "output_modalities": modalities,
                 "audio": {
                     "input": {
                         "format": {
@@ -178,16 +197,22 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
                         },
                         "turn_detection": td_input,
                     },
-
-
-                    "
-
-
-
-
-
-
-
+                    **(
+                        {
+                            "output": {
+                                "format": {
+                                    "type": self.options.output_mime or "audio/pcm",
+                                    "rate": int(self.options.output_rate_hz or 24000),
+                                },
+                                "voice": self.options.voice,
+                                "speed": float(
+                                    getattr(self.options, "voice_speed", 1.0) or 1.0
+                                ),
+                            }
+                        }
+                        if should_configure_audio_output
+                        else {}
+                    ),
                 },
                 # Note: no top-level turn_detection; nested under audio.input
                 **({"prompt": prompt_block} if prompt_block else {}),
@@ -204,13 +229,45 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
                 ),
             },
         }
-
-
-        self.options
-
-
-
-
+        # Optional realtime transcription configuration
+        try:
+            tr_model = getattr(self.options, "transcription_model", None)
+            if tr_model:
+                audio_obj = session_payload["session"].setdefault("audio", {})
+                # Attach input transcription config per GA schema
+                transcription_cfg: Dict[str, Any] = {"model": tr_model}
+                lang = getattr(self.options, "transcription_language", None)
+                if lang:
+                    transcription_cfg["language"] = lang
+                prompt_txt = getattr(self.options, "transcription_prompt", None)
+                if prompt_txt is not None:
+                    transcription_cfg["prompt"] = prompt_txt
+                if getattr(self.options, "transcription_include_logprobs", False):
+                    session_payload["session"].setdefault("include", []).append(
+                        "item.input_audio_transcription.logprobs"
+                    )
+                nr = getattr(self.options, "transcription_noise_reduction", None)
+                if nr is not None:
+                    audio_obj["noise_reduction"] = bool(nr)
+                # Place under audio.input.transcription per current server conventions
+                audio_obj.setdefault("input", {}).setdefault(
+                    "transcription", transcription_cfg
+                )
+        except Exception:
+            logger.exception("Failed to attach transcription config to session.update")
+        if should_configure_audio_output:
+            logger.info(
+                "Realtime WS: sending session.update (voice=%s, vad=%s, output=%s@%s)",
+                self.options.voice,
+                self.options.vad_enabled,
+                (self.options.output_mime or "audio/pcm"),
+                int(self.options.output_rate_hz or 24000),
+            )
+        else:
+            logger.info(
+                "Realtime WS: sending session.update (text-only, vad=%s)",
+                self.options.vad_enabled,
+            )
         # Log exact session.update payload and mark awaiting session.updated
         try:
             logger.info(
@@ -231,7 +288,7 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
                 logger.warning(
                     "Realtime WS: instructions missing/empty in session.update"
                 )
-            if not voice:
+            if not voice and should_configure_audio_output:
                 logger.warning("Realtime WS: voice missing in session.update")
         except Exception:
             pass
@@ -632,6 +689,20 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
                             len(final),
                         )
                         self._out_text_buffers.pop(rid, None)
+                        # Always terminate the output transcript stream for this response when text-only.
+                        try:
+                            # Only enqueue sentinel when no audio modality is configured
+                            modalities = (
+                                getattr(self.options, "output_modalities", None)
+                                or []
+                            )
+                            if "audio" not in modalities:
+                                self._out_tr_queue.put_nowait(None)
+                                logger.debug(
+                                    "Enqueued transcript termination sentinel (text-only response)"
+                                )
+                        except Exception:
+                            pass
                     except Exception:
                         pass
                 elif (
@@ -1033,6 +1104,47 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
             else:
                 patch[k] = raw[k]
 
+        # --- Inject realtime transcription config if options were updated after initial connect ---
+        try:
+            tr_model = getattr(self.options, "transcription_model", None)
+            if tr_model and isinstance(patch, dict):
+                # Ensure audio/input containers exist without overwriting caller provided fields
+                aud = patch.setdefault("audio", {})
+                inp = aud.setdefault("input", {})
+                # Only add if not explicitly provided in this patch
+                if "transcription" not in inp:
+                    transcription_cfg: Dict[str, Any] = {"model": tr_model}
+                    lang = getattr(self.options, "transcription_language", None)
+                    if lang:
+                        transcription_cfg["language"] = lang
+                    prompt_txt = getattr(self.options, "transcription_prompt", None)
+                    if prompt_txt is not None:
+                        transcription_cfg["prompt"] = prompt_txt
+                    nr = getattr(self.options, "transcription_noise_reduction", None)
+                    if nr is not None:
+                        aud["noise_reduction"] = bool(nr)
+                    if getattr(self.options, "transcription_include_logprobs", False):
+                        patch.setdefault("include", [])
+                        if (
+                            "item.input_audio_transcription.logprobs"
+                            not in patch["include"]
+                        ):
+                            patch["include"].append(
+                                "item.input_audio_transcription.logprobs"
+                            )
+                    inp["transcription"] = transcription_cfg
+                    try:
+                        logger.debug(
+                            "Realtime WS: update_session injected transcription config model=%s",
+                            tr_model,
+                        )
+                    except Exception:
+                        pass
+        except Exception:
+            logger.exception(
+                "Realtime WS: failed injecting transcription config in update_session"
+            )
+
         # Ensure tools are cleaned even if provided only under audio or elsewhere
         if "tools" in patch:
             patch["tools"] = _strip_tool_strict(patch["tools"]) # idempotent
@@ -1040,9 +1152,12 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
         # Per server requirements, always include session.type and output_modalities
         try:
             patch["type"] = "realtime"
-            # Preserve caller-provided output_modalities if present, otherwise default to
+            # Preserve caller-provided output_modalities if present, otherwise default to configured modalities
            if "output_modalities" not in patch:
-                patch["output_modalities"] = [
+                patch["output_modalities"] = self.options.output_modalities or [
+                    "audio",
+                    "text",
+                ]
         except Exception:
             pass
 
@@ -1148,6 +1263,13 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
             except Exception:
                 pass
 
+    async def create_conversation_item(
+        self, item: Dict[str, Any]
+    ) -> None: # pragma: no cover
+        """Create a conversation item (e.g., for text input)."""
+        payload = {"type": "conversation.item.create", "item": item}
+        await self._send_tracked(payload, label="conversation.item.create")
+
     async def create_response(
         self, response_patch: Optional[Dict[str, Any]] = None
     ) -> None: # pragma: no cover
@@ -1639,6 +1761,13 @@ class OpenAITranscriptionWebSocketSession(BaseRealtimeSession):
     async def clear_input(self) -> None: # pragma: no cover
         await self._send({"type": "input_audio_buffer.clear"})
 
+    async def create_conversation_item(
+        self, item: Dict[str, Any]
+    ) -> None: # pragma: no cover
+        """Create a conversation item (e.g., for text input)."""
+        payload = {"type": "conversation.item.create", "item": item}
+        await self._send_tracked(payload, label="conversation.item.create")
+
     async def create_response(
         self, response_patch: Optional[Dict[str, Any]] = None
     ) -> None: # pragma: no cover
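For orientation, the session-configuration changes above appear to assemble a `session.update` payload roughly shaped like the sketch below. This is illustrative only and inferred from the fragments visible in this diff; the concrete values (formats, rates, voice, the `turn_detection` block, the transcription model) are assumptions rather than a verbatim dump from the adapter.

```python
# Approximate shape of the session.update event implied by the diff above when
# output_modalities includes "audio" and a transcription model is configured.
# All concrete values below are assumed examples.
session_update = {
    "type": "session.update",
    "session": {
        "type": "realtime",
        "output_modalities": ["audio", "text"],
        "audio": {
            "input": {
                "format": {"type": "audio/pcm", "rate": 24000},
                "turn_detection": {"type": "server_vad"},  # assumed VAD setting
                "transcription": {"model": "gpt-4o-mini-transcribe"},
            },
            "output": {
                "format": {"type": "audio/pcm", "rate": 24000},
                "voice": "marin",
                "speed": 1.0,
            },
        },
    },
}

# For a text-only session the adapter omits the audio "output" block and sends
# output_modalities=["text"]; on response completion it enqueues a None sentinel
# to terminate the transcript stream (see the @@ -632 hunk above).
```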
{solana_agent-31.2.6 → solana_agent-31.3.0}/solana_agent/client/solana_agent.py

@@ -16,6 +16,7 @@ from solana_agent.interfaces.client.client import SolanaAgent as SolanaAgentInte
 from solana_agent.interfaces.plugins.plugins import Tool
 from solana_agent.services.knowledge_base import KnowledgeBaseService
 from solana_agent.interfaces.services.routing import RoutingService as RoutingInterface
+from solana_agent.interfaces.providers.realtime import RealtimeChunk
 
 
 class SolanaAgent(SolanaAgentInterface):
@@ -57,6 +58,7 @@ class SolanaAgent(SolanaAgentInterface):
         vad: Optional[bool] = False,
         rt_encode_input: bool = False,
         rt_encode_output: bool = False,
+        rt_output_modalities: Optional[List[Literal["audio", "text"]]] = None,
         rt_voice: Literal[
             "alloy",
             "ash",
@@ -90,7 +92,9 @@ class SolanaAgent(SolanaAgentInterface):
         router: Optional[RoutingInterface] = None,
         images: Optional[List[Union[str, bytes]]] = None,
         output_model: Optional[Type[BaseModel]] = None,
-    ) -> AsyncGenerator[
+    ) -> AsyncGenerator[
+        Union[str, bytes, BaseModel, RealtimeChunk], None
+    ]: # pragma: no cover
         """Process a user message (text or audio) and optional images, returning the response stream.
 
         Args:
@@ -104,6 +108,7 @@ class SolanaAgent(SolanaAgentInterface):
             vad: Whether to use voice activity detection (for audio input)
             rt_encode_input: Whether to re-encode input audio for compatibility
             rt_encode_output: Whether to re-encode output audio for compatibility
+            rt_output_modalities: Modalities to return in realtime (default both if None)
             rt_voice: Voice to use for realtime audio output
             audio_voice: Voice to use for audio output
             audio_output_format: Audio output format
@@ -124,6 +129,7 @@ class SolanaAgent(SolanaAgentInterface):
             vad=vad,
             rt_encode_input=rt_encode_input,
             rt_encode_output=rt_encode_output,
+            rt_output_modalities=rt_output_modalities,
             rt_voice=rt_voice,
             audio_voice=audio_voice,
             audio_output_format=audio_output_format,
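As a usage sketch for the new `rt_output_modalities` parameter and the widened `RealtimeChunk` return union shown above: a text-only realtime call that tolerates both plain strings and `RealtimeChunk` items, per the docstring note in the dual-modality example. `config` is again a placeholder for the config dict from the README examples.

```python
from solana_agent import SolanaAgent
from solana_agent.interfaces.providers.realtime import RealtimeChunk

config: dict = {}  # placeholder: use the config dict from the README examples

solana_agent = SolanaAgent(config=config)


async def collect_text_reply(user_id: str, question: str) -> str:
    """Run a text-only realtime turn and return the concatenated reply."""
    parts = []
    async for chunk in solana_agent.process(
        user_id=user_id,
        message=question,
        realtime=True,
        rt_output_modalities=["text"],
    ):
        if isinstance(chunk, str):
            # Single-modality requests may yield plain text chunks.
            parts.append(chunk)
        elif isinstance(chunk, RealtimeChunk) and chunk.is_text and chunk.text_data:
            parts.append(chunk.text_data)
    return "".join(parts)
```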