solana-agent 31.2.5__py3-none-any.whl → 31.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- solana_agent/adapters/openai_realtime_ws.py +160 -31
- solana_agent/client/solana_agent.py +7 -1
- solana_agent/domains/routing.py +10 -4
- solana_agent/interfaces/client/client.py +3 -1
- solana_agent/interfaces/providers/__init__.py +0 -0
- solana_agent/interfaces/providers/realtime.py +113 -1
- solana_agent/interfaces/services/query.py +3 -1
- solana_agent/services/query.py +422 -107
- solana_agent/services/realtime.py +123 -17
- solana_agent/services/routing.py +17 -22
- {solana_agent-31.2.5.dist-info → solana_agent-31.3.0.dist-info}/METADATA +116 -10
- {solana_agent-31.2.5.dist-info → solana_agent-31.3.0.dist-info}/RECORD +15 -14
- {solana_agent-31.2.5.dist-info → solana_agent-31.3.0.dist-info}/LICENSE +0 -0
- {solana_agent-31.2.5.dist-info → solana_agent-31.3.0.dist-info}/WHEEL +0 -0
- {solana_agent-31.2.5.dist-info → solana_agent-31.3.0.dist-info}/entry_points.txt +0 -0
solana_agent/adapters/openai_realtime_ws.py
CHANGED

@@ -102,16 +102,30 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
         ]
         model = self.options.model or "gpt-realtime"
         uri = f"{self.url}?model={model}"
-        … (10 removed lines not captured in this view)
+
+        # Determine if audio output should be configured for logging
+        modalities = self.options.output_modalities or ["audio", "text"]
+        should_configure_audio_output = "audio" in modalities
+
+        if should_configure_audio_output:
+            logger.info(
+                "Realtime WS connecting: uri=%s, input=%s@%sHz, output=%s@%sHz, voice=%s, vad=%s",
+                uri,
+                self.options.input_mime,
+                self.options.input_rate_hz,
+                self.options.output_mime,
+                self.options.output_rate_hz,
+                self.options.voice,
+                self.options.vad_enabled,
+            )
+        else:
+            logger.info(
+                "Realtime WS connecting: uri=%s, input=%s@%sHz, text-only output, vad=%s",
+                uri,
+                self.options.input_mime,
+                self.options.input_rate_hz,
+                self.options.vad_enabled,
+            )
         self._ws = await websockets.connect(
             uri, additional_headers=headers, max_size=None
         )

@@ -165,11 +179,16 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
                 cleaned.append(t)
             return cleaned

+        # Determine if audio output should be configured
+        modalities = self.options.output_modalities or ["audio", "text"]
+        should_configure_audio_output = "audio" in modalities
+
+        # Build session.update per docs (nested audio object)
         session_payload: Dict[str, Any] = {
             "type": "session.update",
             "session": {
                 "type": "realtime",
-                "output_modalities": …
+                "output_modalities": modalities,
                 "audio": {
                     "input": {
                         "format": {

@@ -178,16 +197,22 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
                         },
                         "turn_detection": td_input,
                     },
-                    … (10 removed lines not captured in this view)
+                    **(
+                        {
+                            "output": {
+                                "format": {
+                                    "type": self.options.output_mime or "audio/pcm",
+                                    "rate": int(self.options.output_rate_hz or 24000),
+                                },
+                                "voice": self.options.voice,
+                                "speed": float(
+                                    getattr(self.options, "voice_speed", 1.0) or 1.0
+                                ),
+                            }
+                        }
+                        if should_configure_audio_output
+                        else {}
+                    ),
                 },
                 # Note: no top-level turn_detection; nested under audio.input
                 **({"prompt": prompt_block} if prompt_block else {}),

@@ -204,13 +229,45 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
                 ),
             },
         }
-        … (7 removed lines not captured in this view)
+        # Optional realtime transcription configuration
+        try:
+            tr_model = getattr(self.options, "transcription_model", None)
+            if tr_model:
+                audio_obj = session_payload["session"].setdefault("audio", {})
+                # Attach input transcription config per GA schema
+                transcription_cfg: Dict[str, Any] = {"model": tr_model}
+                lang = getattr(self.options, "transcription_language", None)
+                if lang:
+                    transcription_cfg["language"] = lang
+                prompt_txt = getattr(self.options, "transcription_prompt", None)
+                if prompt_txt is not None:
+                    transcription_cfg["prompt"] = prompt_txt
+                if getattr(self.options, "transcription_include_logprobs", False):
+                    session_payload["session"].setdefault("include", []).append(
+                        "item.input_audio_transcription.logprobs"
+                    )
+                nr = getattr(self.options, "transcription_noise_reduction", None)
+                if nr is not None:
+                    audio_obj["noise_reduction"] = bool(nr)
+                # Place under audio.input.transcription per current server conventions
+                audio_obj.setdefault("input", {}).setdefault(
+                    "transcription", transcription_cfg
+                )
+        except Exception:
+            logger.exception("Failed to attach transcription config to session.update")
+        if should_configure_audio_output:
+            logger.info(
+                "Realtime WS: sending session.update (voice=%s, vad=%s, output=%s@%s)",
+                self.options.voice,
+                self.options.vad_enabled,
+                (self.options.output_mime or "audio/pcm"),
+                int(self.options.output_rate_hz or 24000),
+            )
+        else:
+            logger.info(
+                "Realtime WS: sending session.update (text-only, vad=%s)",
+                self.options.vad_enabled,
+            )
         # Log exact session.update payload and mark awaiting session.updated
         try:
             logger.info(

@@ -231,7 +288,7 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
                 logger.warning(
                     "Realtime WS: instructions missing/empty in session.update"
                 )
-            if not voice:
+            if not voice and should_configure_audio_output:
                 logger.warning("Realtime WS: voice missing in session.update")
         except Exception:
             pass

@@ -632,6 +689,20 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
                         len(final),
                     )
                     self._out_text_buffers.pop(rid, None)
+                    # Always terminate the output transcript stream for this response when text-only.
+                    try:
+                        # Only enqueue sentinel when no audio modality is configured
+                        modalities = (
+                            getattr(self.options, "output_modalities", None)
+                            or []
+                        )
+                        if "audio" not in modalities:
+                            self._out_tr_queue.put_nowait(None)
+                            logger.debug(
+                                "Enqueued transcript termination sentinel (text-only response)"
+                            )
+                    except Exception:
+                        pass
                 except Exception:
                     pass
             elif (

@@ -1033,6 +1104,47 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
             else:
                 patch[k] = raw[k]

+        # --- Inject realtime transcription config if options were updated after initial connect ---
+        try:
+            tr_model = getattr(self.options, "transcription_model", None)
+            if tr_model and isinstance(patch, dict):
+                # Ensure audio/input containers exist without overwriting caller provided fields
+                aud = patch.setdefault("audio", {})
+                inp = aud.setdefault("input", {})
+                # Only add if not explicitly provided in this patch
+                if "transcription" not in inp:
+                    transcription_cfg: Dict[str, Any] = {"model": tr_model}
+                    lang = getattr(self.options, "transcription_language", None)
+                    if lang:
+                        transcription_cfg["language"] = lang
+                    prompt_txt = getattr(self.options, "transcription_prompt", None)
+                    if prompt_txt is not None:
+                        transcription_cfg["prompt"] = prompt_txt
+                    nr = getattr(self.options, "transcription_noise_reduction", None)
+                    if nr is not None:
+                        aud["noise_reduction"] = bool(nr)
+                    if getattr(self.options, "transcription_include_logprobs", False):
+                        patch.setdefault("include", [])
+                        if (
+                            "item.input_audio_transcription.logprobs"
+                            not in patch["include"]
+                        ):
+                            patch["include"].append(
+                                "item.input_audio_transcription.logprobs"
+                            )
+                    inp["transcription"] = transcription_cfg
+                    try:
+                        logger.debug(
+                            "Realtime WS: update_session injected transcription config model=%s",
+                            tr_model,
+                        )
+                    except Exception:
+                        pass
+        except Exception:
+            logger.exception(
+                "Realtime WS: failed injecting transcription config in update_session"
+            )
+
         # Ensure tools are cleaned even if provided only under audio or elsewhere
         if "tools" in patch:
             patch["tools"] = _strip_tool_strict(patch["tools"])  # idempotent

@@ -1040,9 +1152,12 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
         # Per server requirements, always include session.type and output_modalities
         try:
             patch["type"] = "realtime"
-            # Preserve caller-provided output_modalities if present, otherwise default to …
+            # Preserve caller-provided output_modalities if present, otherwise default to configured modalities
            if "output_modalities" not in patch:
-                patch["output_modalities"] = [ …
+                patch["output_modalities"] = self.options.output_modalities or [
+                    "audio",
+                    "text",
+                ]
         except Exception:
             pass

@@ -1148,6 +1263,13 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
         except Exception:
             pass

+    async def create_conversation_item(
+        self, item: Dict[str, Any]
+    ) -> None:  # pragma: no cover
+        """Create a conversation item (e.g., for text input)."""
+        payload = {"type": "conversation.item.create", "item": item}
+        await self._send_tracked(payload, label="conversation.item.create")
+
     async def create_response(
         self, response_patch: Optional[Dict[str, Any]] = None
     ) -> None:  # pragma: no cover

@@ -1639,6 +1761,13 @@ class OpenAITranscriptionWebSocketSession(BaseRealtimeSession):
     async def clear_input(self) -> None:  # pragma: no cover
         await self._send({"type": "input_audio_buffer.clear"})

+    async def create_conversation_item(
+        self, item: Dict[str, Any]
+    ) -> None:  # pragma: no cover
+        """Create a conversation item (e.g., for text input)."""
+        payload = {"type": "conversation.item.create", "item": item}
+        await self._send_tracked(payload, label="conversation.item.create")
+
     async def create_response(
         self, response_patch: Optional[Dict[str, Any]] = None
     ) -> None:  # pragma: no cover
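For orientation, the session.update message assembled by the hunks above ends up shaped roughly as follows when audio output is enabled and a transcription model is configured. This is an illustrative sketch reconstructed from the added code, not a payload captured from the package: placeholder values stand in for whatever RealtimeSessionOptions carries, and the unchanged input format / turn_detection blocks are omitted.

# Illustrative sketch of the session.update built above (values are placeholders)
session_update_example = {
    "type": "session.update",
    "session": {
        "type": "realtime",
        "output_modalities": ["audio", "text"],
        "audio": {
            "input": {
                # "format" and "turn_detection" omitted (unchanged in this diff)
                "transcription": {
                    "model": "<options.transcription_model>",
                    "language": "en",  # only when transcription_language is set
                },
            },
            "output": {  # present only when "audio" is in output_modalities
                "format": {"type": "audio/pcm", "rate": 24000},
                "voice": "<options.voice>",
                "speed": 1.0,
            },
        },
        # "include": ["item.input_audio_transcription.logprobs"]  # only when logprobs requested
    },
}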
solana_agent/client/solana_agent.py
CHANGED

@@ -16,6 +16,7 @@ from solana_agent.interfaces.client.client import SolanaAgent as SolanaAgentInte
 from solana_agent.interfaces.plugins.plugins import Tool
 from solana_agent.services.knowledge_base import KnowledgeBaseService
 from solana_agent.interfaces.services.routing import RoutingService as RoutingInterface
+from solana_agent.interfaces.providers.realtime import RealtimeChunk


 class SolanaAgent(SolanaAgentInterface):

@@ -57,6 +58,7 @@ class SolanaAgent(SolanaAgentInterface):
         vad: Optional[bool] = False,
         rt_encode_input: bool = False,
         rt_encode_output: bool = False,
+        rt_output_modalities: Optional[List[Literal["audio", "text"]]] = None,
         rt_voice: Literal[
             "alloy",
             "ash",

@@ -90,7 +92,9 @@ class SolanaAgent(SolanaAgentInterface):
         router: Optional[RoutingInterface] = None,
         images: Optional[List[Union[str, bytes]]] = None,
         output_model: Optional[Type[BaseModel]] = None,
-    ) -> AsyncGenerator[…
+    ) -> AsyncGenerator[
+        Union[str, bytes, BaseModel, RealtimeChunk], None
+    ]:  # pragma: no cover
         """Process a user message (text or audio) and optional images, returning the response stream.

         Args:

@@ -104,6 +108,7 @@ class SolanaAgent(SolanaAgentInterface):
             vad: Whether to use voice activity detection (for audio input)
             rt_encode_input: Whether to re-encode input audio for compatibility
             rt_encode_output: Whether to re-encode output audio for compatibility
+            rt_output_modalities: Modalities to return in realtime (default both if None)
             rt_voice: Voice to use for realtime audio output
             audio_voice: Voice to use for audio output
             audio_output_format: Audio output format

@@ -124,6 +129,7 @@ class SolanaAgent(SolanaAgentInterface):
             vad=vad,
             rt_encode_input=rt_encode_input,
             rt_encode_output=rt_encode_output,
+            rt_output_modalities=rt_output_modalities,
             rt_voice=rt_voice,
             audio_voice=audio_voice,
             audio_output_format=audio_output_format,
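A minimal consumption sketch for the widened return type and the new rt_output_modalities parameter. Only the parameter names shown in this diff are confirmed; the positional user_id/message arguments and the surrounding setup are assumptions for illustration.

# Sketch only: handling the mixed realtime stream. Argument names other than
# rt_output_modalities are illustrative, not taken from this diff.
from solana_agent.interfaces.providers.realtime import RealtimeChunk

async def consume(agent, user_id: str, message: str) -> None:
    audio_frames: list[bytes] = []
    async for chunk in agent.process(
        user_id,
        message,
        rt_output_modalities=["audio", "text"],  # new in 31.3.0; None keeps the default
    ):
        if isinstance(chunk, RealtimeChunk):
            if chunk.is_text and chunk.text_data:
                print(chunk.text_data, end="", flush=True)
            elif chunk.is_audio and chunk.audio_data:
                audio_frames.append(chunk.audio_data)
        elif isinstance(chunk, str):
            # Non-realtime paths still yield plain text (or bytes / BaseModel)
            print(chunk, end="", flush=True)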
solana_agent/domains/routing.py
CHANGED

@@ -10,9 +10,15 @@ class QueryAnalysis(BaseModel):
         description="Name of the primary agent that should handle this query (must be one of the available agent names)",
     )
     secondary_agents: List[str] = Field(
-        … (1 removed line not captured in this view)
+        default_factory=list,
         description="Names of secondary agents that might be helpful (must be from the available agent names)",
     )
-    complexity_level: int = Field(
-    … (2 removed lines not captured in this view)
+    complexity_level: int = Field(
+        default=1, description="Complexity level (1-5)", ge=1, le=5
+    )
+    topics: List[str] = Field(
+        default_factory=list, description="Key topics in the query"
+    )
+    confidence: float = Field(
+        default=0.5, description="Confidence in the analysis", ge=0.0, le=1.0
+    )
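With these defaults in place, a routing result now validates even when the model response omits the optional fields. A quick sketch, assuming primary_agent remains the only field without a default:

from solana_agent.domains.routing import QueryAnalysis

# Assumption: primary_agent still has no default and must be supplied.
analysis = QueryAnalysis(primary_agent="research_agent")
assert analysis.secondary_agents == []
assert analysis.complexity_level == 1   # constrained to 1-5
assert analysis.topics == []
assert analysis.confidence == 0.5       # constrained to 0.0-1.0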
solana_agent/interfaces/client/client.py
CHANGED

@@ -4,6 +4,7 @@ from typing import AsyncGenerator, Dict, Any, List, Literal, Optional, Type, Uni
 from pydantic import BaseModel
 from solana_agent.interfaces.plugins.plugins import Tool
 from solana_agent.interfaces.services.routing import RoutingService as RoutingInterface
+from solana_agent.interfaces.providers.realtime import RealtimeChunk


 class SolanaAgent(ABC):

@@ -22,6 +23,7 @@ class SolanaAgent(ABC):
         vad: bool = False,
         rt_encode_input: bool = False,
         rt_encode_output: bool = False,
+        rt_output_modalities: Optional[List[Literal["audio", "text"]]] = None,
         rt_voice: Literal[
             "alloy",
             "ash",

@@ -55,7 +57,7 @@
         router: Optional[RoutingInterface] = None,
         images: Optional[List[Union[str, bytes]]] = None,
         output_model: Optional[Type[BaseModel]] = None,
-    ) -> AsyncGenerator[Union[str, bytes, BaseModel], None]:
+    ) -> AsyncGenerator[Union[str, bytes, BaseModel, RealtimeChunk], None]:
         """Process a user message and return the response stream."""
         pass
solana_agent/interfaces/providers/__init__.py
File without changes
solana_agent/interfaces/providers/realtime.py
CHANGED

@@ -1,7 +1,17 @@
 from __future__ import annotations
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import …
+from typing import (
+    Any,
+    AsyncGenerator,
+    Dict,
+    Literal,
+    Optional,
+    Awaitable,
+    Callable,
+    List,
+    Union,
+)


 @dataclass

@@ -24,6 +34,7 @@ class RealtimeSessionOptions:
     output_rate_hz: int = 24000
     input_mime: str = "audio/pcm"  # 16-bit PCM
     output_mime: str = "audio/pcm"  # 16-bit PCM
+    output_modalities: List[Literal["audio", "text"]] = None  # None means auto-detect
     instructions: Optional[str] = None
     # Optional: tools payload compatible with OpenAI Realtime session.update
     tools: Optional[list[dict[str, Any]]] = None

@@ -34,6 +45,107 @@ class RealtimeSessionOptions:
     # Optional guard: if a tool takes longer than this to complete, skip sending
     # function_call_output to avoid stale/expired call_id issues. Set to None to always send.
     tool_result_max_age_s: Optional[float] = None
+    # --- Realtime transcription configuration (optional) ---
+    # When transcription_model is set, QueryService should skip the HTTP STT path and rely on
+    # realtime websocket transcription events. Other fields customize that behavior.
+    transcription_model: Optional[str] = None
+    transcription_language: Optional[str] = None  # e.g. 'en'
+    transcription_prompt: Optional[str] = None
+    transcription_noise_reduction: Optional[bool] = None
+    transcription_include_logprobs: bool = False
+
+
+@dataclass
+class RealtimeChunk:
+    """Represents a chunk of data from a realtime session with its modality type."""
+
+    modality: Literal["audio", "text"]
+    data: Union[str, bytes]
+    timestamp: Optional[float] = None  # Optional timestamp for ordering
+    metadata: Optional[Dict[str, Any]] = None  # Optional additional metadata
+
+    @property
+    def is_audio(self) -> bool:
+        """Check if this is an audio chunk."""
+        return self.modality == "audio"
+
+    @property
+    def is_text(self) -> bool:
+        """Check if this is a text chunk."""
+        return self.modality == "text"
+
+    @property
+    def text_data(self) -> Optional[str]:
+        """Get text data if this is a text chunk."""
+        return self.data if isinstance(self.data, str) else None
+
+    @property
+    def audio_data(self) -> Optional[bytes]:
+        """Get audio data if this is an audio chunk."""
+        return self.data if isinstance(self.data, bytes) else None
+
+
+async def separate_audio_chunks(
+    chunks: AsyncGenerator[RealtimeChunk, None],
+) -> AsyncGenerator[bytes, None]:
+    """Extract only audio chunks from a stream of RealtimeChunk objects.
+
+    Args:
+        chunks: Stream of RealtimeChunk objects
+
+    Yields:
+        Audio data bytes from audio chunks only
+    """
+    async for chunk in chunks:
+        if chunk.is_audio and chunk.audio_data:
+            yield chunk.audio_data
+
+
+async def separate_text_chunks(
+    chunks: AsyncGenerator[RealtimeChunk, None],
+) -> AsyncGenerator[str, None]:
+    """Extract only text chunks from a stream of RealtimeChunk objects.
+
+    Args:
+        chunks: Stream of RealtimeChunk objects
+
+    Yields:
+        Text data from text chunks only
+    """
+    async for chunk in chunks:
+        if chunk.is_text and chunk.text_data:
+            yield chunk.text_data
+
+
+async def demux_realtime_chunks(
+    chunks: AsyncGenerator[RealtimeChunk, None],
+) -> tuple[AsyncGenerator[bytes, None], AsyncGenerator[str, None]]:
+    """Demux a stream of RealtimeChunk objects into separate audio and text streams.
+
+    Note: This function consumes the input generator, so each output stream can only be consumed once.
+
+    Args:
+        chunks: Stream of RealtimeChunk objects
+
+    Returns:
+        Tuple of (audio_stream, text_stream) async generators
+    """
+    # Collect all chunks first since we can't consume the generator twice
+    collected_chunks = []
+    async for chunk in chunks:
+        collected_chunks.append(chunk)
+
+    async def audio_stream():
+        for chunk in collected_chunks:
+            if chunk.is_audio and chunk.audio_data:
+                yield chunk.audio_data
+
+    async def text_stream():
+        for chunk in collected_chunks:
+            if chunk.is_text and chunk.text_data:
+                yield chunk.text_data
+
+    return audio_stream(), text_stream()


 class BaseRealtimeSession(ABC):
|
solana_agent/interfaces/services/query.py
CHANGED

@@ -4,6 +4,7 @@ from typing import Any, AsyncGenerator, Dict, List, Literal, Optional, Type, Uni
 from pydantic import BaseModel

 from solana_agent.interfaces.services.routing import RoutingService as RoutingInterface
+from solana_agent.interfaces.providers.realtime import RealtimeChunk


 class QueryService(ABC):

@@ -15,6 +16,7 @@ class QueryService(ABC):
         user_id: str,
         query: Union[str, bytes],
         output_format: Literal["text", "audio"] = "text",
+        rt_output_modalities: Optional[List[Literal["audio", "text"]]] = None,
         rt_voice: Literal[
             "alloy",
             "ash",

@@ -51,7 +53,7 @@
         output_model: Optional[Type[BaseModel]] = None,
         capture_schema: Optional[Dict[str, Any]] = None,
         capture_name: Optional[str] = None,
-    ) -> AsyncGenerator[Union[str, bytes, BaseModel], None]:
+    ) -> AsyncGenerator[Union[str, bytes, BaseModel, RealtimeChunk], None]:
         """Process the user request and generate a response."""
         pass