solana-agent 31.2.6__py3-none-any.whl → 31.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -102,16 +102,30 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
  ]
  model = self.options.model or "gpt-realtime"
  uri = f"{self.url}?model={model}"
- logger.info(
- "Realtime WS connecting: uri=%s, input=%s@%sHz, output=%s@%sHz, voice=%s, vad=%s",
- uri,
- self.options.input_mime,
- self.options.input_rate_hz,
- self.options.output_mime,
- self.options.output_rate_hz,
- self.options.voice,
- self.options.vad_enabled,
- )
+
+ # Determine if audio output should be configured for logging
+ modalities = self.options.output_modalities or ["audio", "text"]
+ should_configure_audio_output = "audio" in modalities
+
+ if should_configure_audio_output:
+ logger.info(
+ "Realtime WS connecting: uri=%s, input=%s@%sHz, output=%s@%sHz, voice=%s, vad=%s",
+ uri,
+ self.options.input_mime,
+ self.options.input_rate_hz,
+ self.options.output_mime,
+ self.options.output_rate_hz,
+ self.options.voice,
+ self.options.vad_enabled,
+ )
+ else:
+ logger.info(
+ "Realtime WS connecting: uri=%s, input=%s@%sHz, text-only output, vad=%s",
+ uri,
+ self.options.input_mime,
+ self.options.input_rate_hz,
+ self.options.vad_enabled,
+ )
  self._ws = await websockets.connect(
  uri, additional_headers=headers, max_size=None
  )
@@ -165,11 +179,16 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
  cleaned.append(t)
  return cleaned

+ # Determine if audio output should be configured
+ modalities = self.options.output_modalities or ["audio", "text"]
+ should_configure_audio_output = "audio" in modalities
+
+ # Build session.update per docs (nested audio object)
  session_payload: Dict[str, Any] = {
  "type": "session.update",
  "session": {
  "type": "realtime",
- "output_modalities": ["audio"],
+ "output_modalities": modalities,
  "audio": {
  "input": {
  "format": {
@@ -178,16 +197,22 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
  },
  "turn_detection": td_input,
  },
- "output": {
- "format": {
- "type": self.options.output_mime or "audio/pcm",
- "rate": int(self.options.output_rate_hz or 24000),
- },
- "voice": self.options.voice,
- "speed": float(
- getattr(self.options, "voice_speed", 1.0) or 1.0
- ),
- },
+ **(
+ {
+ "output": {
+ "format": {
+ "type": self.options.output_mime or "audio/pcm",
+ "rate": int(self.options.output_rate_hz or 24000),
+ },
+ "voice": self.options.voice,
+ "speed": float(
+ getattr(self.options, "voice_speed", 1.0) or 1.0
+ ),
+ }
+ }
+ if should_configure_audio_output
+ else {}
+ ),
  },
  # Note: no top-level turn_detection; nested under audio.input
  **({"prompt": prompt_block} if prompt_block else {}),
@@ -204,13 +229,45 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
  ),
  },
  }
- logger.info(
- "Realtime WS: sending session.update (voice=%s, vad=%s, output=%s@%s)",
- self.options.voice,
- self.options.vad_enabled,
- (self.options.output_mime or "audio/pcm"),
- int(self.options.output_rate_hz or 24000),
- )
+ # Optional realtime transcription configuration
+ try:
+ tr_model = getattr(self.options, "transcription_model", None)
+ if tr_model:
+ audio_obj = session_payload["session"].setdefault("audio", {})
+ # Attach input transcription config per GA schema
+ transcription_cfg: Dict[str, Any] = {"model": tr_model}
+ lang = getattr(self.options, "transcription_language", None)
+ if lang:
+ transcription_cfg["language"] = lang
+ prompt_txt = getattr(self.options, "transcription_prompt", None)
+ if prompt_txt is not None:
+ transcription_cfg["prompt"] = prompt_txt
+ if getattr(self.options, "transcription_include_logprobs", False):
+ session_payload["session"].setdefault("include", []).append(
+ "item.input_audio_transcription.logprobs"
+ )
+ nr = getattr(self.options, "transcription_noise_reduction", None)
+ if nr is not None:
+ audio_obj["noise_reduction"] = bool(nr)
+ # Place under audio.input.transcription per current server conventions
+ audio_obj.setdefault("input", {}).setdefault(
+ "transcription", transcription_cfg
+ )
+ except Exception:
+ logger.exception("Failed to attach transcription config to session.update")
+ if should_configure_audio_output:
+ logger.info(
+ "Realtime WS: sending session.update (voice=%s, vad=%s, output=%s@%s)",
+ self.options.voice,
+ self.options.vad_enabled,
+ (self.options.output_mime or "audio/pcm"),
+ int(self.options.output_rate_hz or 24000),
+ )
+ else:
+ logger.info(
+ "Realtime WS: sending session.update (text-only, vad=%s)",
+ self.options.vad_enabled,
+ )
  # Log exact session.update payload and mark awaiting session.updated
  try:
  logger.info(
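
When `transcription_model` is set on the session options, the branch above nests a transcription block under `audio.input` and can add a logprobs entry to `include`. An illustrative sketch of the resulting `session` object for a text-only session (the model name, language, and prompt below are placeholders, not values from the package):

    session = {
        "type": "realtime",
        "output_modalities": ["text"],
        "audio": {
            "input": {
                "format": {"type": "audio/pcm", "rate": 24000},
                "transcription": {
                    "model": "whisper-1",   # placeholder model name
                    "language": "en",
                    "prompt": "Project-specific vocabulary",
                },
            },
            "noise_reduction": True,  # derived from transcription_noise_reduction
        },
        "include": ["item.input_audio_transcription.logprobs"],
    }
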
@@ -231,7 +288,7 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
  logger.warning(
  "Realtime WS: instructions missing/empty in session.update"
  )
- if not voice:
+ if not voice and should_configure_audio_output:
  logger.warning("Realtime WS: voice missing in session.update")
  except Exception:
  pass
@@ -632,6 +689,20 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
  len(final),
  )
  self._out_text_buffers.pop(rid, None)
+ # Always terminate the output transcript stream for this response when text-only.
+ try:
+ # Only enqueue sentinel when no audio modality is configured
+ modalities = (
+ getattr(self.options, "output_modalities", None)
+ or []
+ )
+ if "audio" not in modalities:
+ self._out_tr_queue.put_nowait(None)
+ logger.debug(
+ "Enqueued transcript termination sentinel (text-only response)"
+ )
+ except Exception:
+ pass
  except Exception:
  pass
  elif (
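
The `None` pushed into `_out_tr_queue` acts as an end-of-stream sentinel so text-only responses still terminate the output-transcript stream. A generic consumer of such a queue (a standard asyncio pattern, not code from the package) stops on the sentinel:

    import asyncio

    async def drain_transcript(queue: asyncio.Queue) -> str:
        parts = []
        while True:
            item = await queue.get()
            if item is None:  # termination sentinel for this response
                break
            parts.append(item)
        return "".join(parts)
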
@@ -1033,6 +1104,47 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
  else:
  patch[k] = raw[k]

+ # --- Inject realtime transcription config if options were updated after initial connect ---
+ try:
+ tr_model = getattr(self.options, "transcription_model", None)
+ if tr_model and isinstance(patch, dict):
+ # Ensure audio/input containers exist without overwriting caller provided fields
+ aud = patch.setdefault("audio", {})
+ inp = aud.setdefault("input", {})
+ # Only add if not explicitly provided in this patch
+ if "transcription" not in inp:
+ transcription_cfg: Dict[str, Any] = {"model": tr_model}
+ lang = getattr(self.options, "transcription_language", None)
+ if lang:
+ transcription_cfg["language"] = lang
+ prompt_txt = getattr(self.options, "transcription_prompt", None)
+ if prompt_txt is not None:
+ transcription_cfg["prompt"] = prompt_txt
+ nr = getattr(self.options, "transcription_noise_reduction", None)
+ if nr is not None:
+ aud["noise_reduction"] = bool(nr)
+ if getattr(self.options, "transcription_include_logprobs", False):
+ patch.setdefault("include", [])
+ if (
+ "item.input_audio_transcription.logprobs"
+ not in patch["include"]
+ ):
+ patch["include"].append(
+ "item.input_audio_transcription.logprobs"
+ )
+ inp["transcription"] = transcription_cfg
+ try:
+ logger.debug(
+ "Realtime WS: update_session injected transcription config model=%s",
+ tr_model,
+ )
+ except Exception:
+ pass
+ except Exception:
+ logger.exception(
+ "Realtime WS: failed injecting transcription config in update_session"
+ )
+
  # Ensure tools are cleaned even if provided only under audio or elsewhere
  if "tools" in patch:
  patch["tools"] = _strip_tool_strict(patch["tools"]) # idempotent
@@ -1040,9 +1152,12 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
  # Per server requirements, always include session.type and output_modalities
  try:
  patch["type"] = "realtime"
- # Preserve caller-provided output_modalities if present, otherwise default to audio
+ # Preserve caller-provided output_modalities if present, otherwise default to configured modalities
  if "output_modalities" not in patch:
- patch["output_modalities"] = ["audio"]
+ patch["output_modalities"] = self.options.output_modalities or [
+ "audio",
+ "text",
+ ]
  except Exception:
  pass

@@ -1148,6 +1263,13 @@ class OpenAIRealtimeWebSocketSession(BaseRealtimeSession):
  except Exception:
  pass

+ async def create_conversation_item(
+ self, item: Dict[str, Any]
+ ) -> None: # pragma: no cover
+ """Create a conversation item (e.g., for text input)."""
+ payload = {"type": "conversation.item.create", "item": item}
+ await self._send_tracked(payload, label="conversation.item.create")
+
  async def create_response(
  self, response_patch: Optional[Dict[str, Any]] = None
  ) -> None: # pragma: no cover
@@ -1639,6 +1761,13 @@ class OpenAITranscriptionWebSocketSession(BaseRealtimeSession):
  async def clear_input(self) -> None: # pragma: no cover
  await self._send({"type": "input_audio_buffer.clear"})

+ async def create_conversation_item(
+ self, item: Dict[str, Any]
+ ) -> None: # pragma: no cover
+ """Create a conversation item (e.g., for text input)."""
+ payload = {"type": "conversation.item.create", "item": item}
+ await self._send_tracked(payload, label="conversation.item.create")
+
  async def create_response(
  self, response_patch: Optional[Dict[str, Any]] = None
  ) -> None: # pragma: no cover
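
Both the realtime and transcription session classes gain `create_conversation_item`, a thin wrapper over the Realtime `conversation.item.create` event. A hedged usage sketch, assuming the standard Realtime message-item shape for text input (the item payload is not defined by this package):

    # `session` is an already-connected OpenAIRealtimeWebSocketSession.
    await session.create_conversation_item(
        {
            "type": "message",
            "role": "user",
            "content": [{"type": "input_text", "text": "What is my wallet balance?"}],
        }
    )
    await session.create_response()  # ask the model to respond to the new item
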
@@ -16,6 +16,7 @@ from solana_agent.interfaces.client.client import SolanaAgent as SolanaAgentInte
  from solana_agent.interfaces.plugins.plugins import Tool
  from solana_agent.services.knowledge_base import KnowledgeBaseService
  from solana_agent.interfaces.services.routing import RoutingService as RoutingInterface
+ from solana_agent.interfaces.providers.realtime import RealtimeChunk


  class SolanaAgent(SolanaAgentInterface):
@@ -57,6 +58,7 @@ class SolanaAgent(SolanaAgentInterface):
  vad: Optional[bool] = False,
  rt_encode_input: bool = False,
  rt_encode_output: bool = False,
+ rt_output_modalities: Optional[List[Literal["audio", "text"]]] = None,
  rt_voice: Literal[
  "alloy",
  "ash",
@@ -90,7 +92,9 @@ class SolanaAgent(SolanaAgentInterface):
  router: Optional[RoutingInterface] = None,
  images: Optional[List[Union[str, bytes]]] = None,
  output_model: Optional[Type[BaseModel]] = None,
- ) -> AsyncGenerator[Union[str, bytes, BaseModel], None]: # pragma: no cover
+ ) -> AsyncGenerator[
+ Union[str, bytes, BaseModel, RealtimeChunk], None
+ ]: # pragma: no cover
  """Process a user message (text or audio) and optional images, returning the response stream.

  Args:
@@ -104,6 +108,7 @@ class SolanaAgent(SolanaAgentInterface):
  vad: Whether to use voice activity detection (for audio input)
  rt_encode_input: Whether to re-encode input audio for compatibility
  rt_encode_output: Whether to re-encode output audio for compatibility
+ rt_output_modalities: Modalities to return in realtime (default both if None)
  rt_voice: Voice to use for realtime audio output
  audio_voice: Voice to use for audio output
  audio_output_format: Audio output format
@@ -124,6 +129,7 @@ class SolanaAgent(SolanaAgentInterface):
  vad=vad,
  rt_encode_input=rt_encode_input,
  rt_encode_output=rt_encode_output,
+ rt_output_modalities=rt_output_modalities,
  rt_voice=rt_voice,
  audio_voice=audio_voice,
  audio_output_format=audio_output_format,
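
Because `rt_output_modalities` is now forwarded to the query service, a realtime stream may yield `RealtimeChunk` objects in addition to plain `str`/`bytes`. A consumption sketch that relies only on attributes defined later in this diff (`stream` stands for whatever `process(...)` returns):

    from solana_agent.interfaces.providers.realtime import RealtimeChunk

    async def handle(stream) -> bytes:
        audio = bytearray()
        async for item in stream:
            if isinstance(item, RealtimeChunk):
                if item.is_text and item.text_data:
                    print(item.text_data, end="", flush=True)
                elif item.is_audio and item.audio_data:
                    audio.extend(item.audio_data)
            elif isinstance(item, bytes):
                audio.extend(item)
            else:
                print(item, end="", flush=True)
        return bytes(audio)
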
@@ -4,6 +4,7 @@ from typing import AsyncGenerator, Dict, Any, List, Literal, Optional, Type, Uni
  from pydantic import BaseModel
  from solana_agent.interfaces.plugins.plugins import Tool
  from solana_agent.interfaces.services.routing import RoutingService as RoutingInterface
+ from solana_agent.interfaces.providers.realtime import RealtimeChunk


  class SolanaAgent(ABC):
@@ -22,6 +23,7 @@ class SolanaAgent(ABC):
  vad: bool = False,
  rt_encode_input: bool = False,
  rt_encode_output: bool = False,
+ rt_output_modalities: Optional[List[Literal["audio", "text"]]] = None,
  rt_voice: Literal[
  "alloy",
  "ash",
@@ -55,7 +57,7 @@ class SolanaAgent(ABC):
  router: Optional[RoutingInterface] = None,
  images: Optional[List[Union[str, bytes]]] = None,
  output_model: Optional[Type[BaseModel]] = None,
- ) -> AsyncGenerator[Union[str, bytes, BaseModel], None]:
+ ) -> AsyncGenerator[Union[str, bytes, BaseModel, RealtimeChunk], None]:
  """Process a user message and return the response stream."""
  pass

@@ -1,7 +1,17 @@
  from __future__ import annotations
  from abc import ABC, abstractmethod
  from dataclasses import dataclass
- from typing import Any, AsyncGenerator, Dict, Literal, Optional, Awaitable, Callable
+ from typing import (
+ Any,
+ AsyncGenerator,
+ Dict,
+ Literal,
+ Optional,
+ Awaitable,
+ Callable,
+ List,
+ Union,
+ )


  @dataclass
@@ -24,6 +34,7 @@ class RealtimeSessionOptions:
  output_rate_hz: int = 24000
  input_mime: str = "audio/pcm" # 16-bit PCM
  output_mime: str = "audio/pcm" # 16-bit PCM
+ output_modalities: List[Literal["audio", "text"]] = None # None means auto-detect
  instructions: Optional[str] = None
  # Optional: tools payload compatible with OpenAI Realtime session.update
  tools: Optional[list[dict[str, Any]]] = None
@@ -34,6 +45,107 @@ class RealtimeSessionOptions:
  # Optional guard: if a tool takes longer than this to complete, skip sending
  # function_call_output to avoid stale/expired call_id issues. Set to None to always send.
  tool_result_max_age_s: Optional[float] = None
+ # --- Realtime transcription configuration (optional) ---
+ # When transcription_model is set, QueryService should skip the HTTP STT path and rely on
+ # realtime websocket transcription events. Other fields customize that behavior.
+ transcription_model: Optional[str] = None
+ transcription_language: Optional[str] = None # e.g. 'en'
+ transcription_prompt: Optional[str] = None
+ transcription_noise_reduction: Optional[bool] = None
+ transcription_include_logprobs: bool = False
+
+
+ @dataclass
+ class RealtimeChunk:
+ """Represents a chunk of data from a realtime session with its modality type."""
+
+ modality: Literal["audio", "text"]
+ data: Union[str, bytes]
+ timestamp: Optional[float] = None # Optional timestamp for ordering
+ metadata: Optional[Dict[str, Any]] = None # Optional additional metadata
+
+ @property
+ def is_audio(self) -> bool:
+ """Check if this is an audio chunk."""
+ return self.modality == "audio"
+
+ @property
+ def is_text(self) -> bool:
+ """Check if this is a text chunk."""
+ return self.modality == "text"
+
+ @property
+ def text_data(self) -> Optional[str]:
+ """Get text data if this is a text chunk."""
+ return self.data if isinstance(self.data, str) else None
+
+ @property
+ def audio_data(self) -> Optional[bytes]:
+ """Get audio data if this is an audio chunk."""
+ return self.data if isinstance(self.data, bytes) else None
+
+
+ async def separate_audio_chunks(
+ chunks: AsyncGenerator[RealtimeChunk, None],
+ ) -> AsyncGenerator[bytes, None]:
+ """Extract only audio chunks from a stream of RealtimeChunk objects.
+
+ Args:
+ chunks: Stream of RealtimeChunk objects
+
+ Yields:
+ Audio data bytes from audio chunks only
+ """
+ async for chunk in chunks:
+ if chunk.is_audio and chunk.audio_data:
+ yield chunk.audio_data
+
+
+ async def separate_text_chunks(
+ chunks: AsyncGenerator[RealtimeChunk, None],
+ ) -> AsyncGenerator[str, None]:
+ """Extract only text chunks from a stream of RealtimeChunk objects.
+
+ Args:
+ chunks: Stream of RealtimeChunk objects
+
+ Yields:
+ Text data from text chunks only
+ """
+ async for chunk in chunks:
+ if chunk.is_text and chunk.text_data:
+ yield chunk.text_data
+
+
+ async def demux_realtime_chunks(
+ chunks: AsyncGenerator[RealtimeChunk, None],
+ ) -> tuple[AsyncGenerator[bytes, None], AsyncGenerator[str, None]]:
+ """Demux a stream of RealtimeChunk objects into separate audio and text streams.
+
+ Note: This function consumes the input generator, so each output stream can only be consumed once.
+
+ Args:
+ chunks: Stream of RealtimeChunk objects
+
+ Returns:
+ Tuple of (audio_stream, text_stream) async generators
+ """
+ # Collect all chunks first since we can't consume the generator twice
+ collected_chunks = []
+ async for chunk in chunks:
+ collected_chunks.append(chunk)
+
+ async def audio_stream():
+ for chunk in collected_chunks:
+ if chunk.is_audio and chunk.audio_data:
+ yield chunk.audio_data
+
+ async def text_stream():
+ for chunk in collected_chunks:
+ if chunk.is_text and chunk.text_data:
+ yield chunk.text_data
+
+ return audio_stream(), text_stream()


  class BaseRealtimeSession(ABC):
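
The new helpers above split a `RealtimeChunk` stream by modality. A small usage sketch, assuming the helpers are importable from `solana_agent.interfaces.providers.realtime` alongside `RealtimeChunk` as the surrounding hunks suggest (the chunk values are made up):

    import asyncio
    from solana_agent.interfaces.providers.realtime import (
        RealtimeChunk,
        demux_realtime_chunks,
    )

    async def example() -> None:
        async def chunks():
            yield RealtimeChunk(modality="text", data="hello ")
            yield RealtimeChunk(modality="audio", data=b"\x00\x01")
            yield RealtimeChunk(modality="text", data="world")

        # demux buffers the whole stream before returning the two generators
        audio_stream, text_stream = await demux_realtime_chunks(chunks())
        print("".join([t async for t in text_stream]))
        print(len(b"".join([a async for a in audio_stream])))

    asyncio.run(example())
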
@@ -4,6 +4,7 @@ from typing import Any, AsyncGenerator, Dict, List, Literal, Optional, Type, Uni
  from pydantic import BaseModel

  from solana_agent.interfaces.services.routing import RoutingService as RoutingInterface
+ from solana_agent.interfaces.providers.realtime import RealtimeChunk


  class QueryService(ABC):
@@ -15,6 +16,7 @@ class QueryService(ABC):
  user_id: str,
  query: Union[str, bytes],
  output_format: Literal["text", "audio"] = "text",
+ rt_output_modalities: Optional[List[Literal["audio", "text"]]] = None,
  rt_voice: Literal[
  "alloy",
  "ash",
@@ -51,7 +53,7 @@ class QueryService(ABC):
  output_model: Optional[Type[BaseModel]] = None,
  capture_schema: Optional[Dict[str, Any]] = None,
  capture_name: Optional[str] = None,
- ) -> AsyncGenerator[Union[str, bytes, BaseModel], None]:
+ ) -> AsyncGenerator[Union[str, bytes, BaseModel, RealtimeChunk], None]:
  """Process the user request and generate a response."""
  pass