solana-agent 24.0.0__py3-none-any.whl → 24.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,10 +3,14 @@ LLM provider adapters for the Solana Agent system.
3
3
 
4
4
  These adapters implement the LLMProvider interface for different LLM services.
5
5
  """
6
- from typing import AsyncGenerator, List, Literal, Type, TypeVar, Union
6
+ import asyncio
7
+ import json
8
+ from typing import Any, AsyncGenerator, Callable, Dict, Literal, Optional, Type, TypeVar
7
9
 
8
- from openai import OpenAI
10
+ import httpx
11
+ from openai import AsyncOpenAI
9
12
  from pydantic import BaseModel
13
+ import websockets
10
14
 
11
15
  from solana_agent.interfaces.providers.llm import LLMProvider
12
16
 
@@ -17,7 +21,7 @@ class OpenAIAdapter(LLMProvider):
17
21
  """OpenAI implementation of LLMProvider with web search capabilities."""
18
22
 
19
23
  def __init__(self, api_key: str):
20
- self.client = OpenAI(api_key=api_key)
24
+ self.client = AsyncOpenAI(api_key=api_key)
21
25
  self.parse_model = "gpt-4o-mini"
22
26
  self.text_model = "gpt-4o-mini"
23
27
  self.transcription_model = "gpt-4o-mini-transcribe"
@@ -44,7 +48,7 @@ class OpenAIAdapter(LLMProvider):
44
48
  Audio bytes as they become available
45
49
  """
46
50
  try:
47
- with self.client.audio.speech.with_streaming_response.create(
51
+ async with self.client.audio.speech.with_streaming_response.create(
48
52
  model=self.tts_model,
49
53
  voice=voice,
50
54
  instructions=instructions,
@@ -52,7 +56,7 @@ class OpenAIAdapter(LLMProvider):
52
56
  response_format=response_format
53
57
  ) as stream:
54
58
  # Stream the bytes in 16KB chunks
55
- for chunk in stream.iter_bytes(chunk_size=1024 * 16):
59
+ async for chunk in stream.iter_bytes(chunk_size=1024 * 16):
56
60
  yield chunk
57
61
 
58
62
  except Exception as e:
@@ -84,13 +88,13 @@ class OpenAIAdapter(LLMProvider):
84
88
  Transcript text chunks as they become available
85
89
  """
86
90
  try:
87
- with self.client.audio.transcriptions.with_streaming_response.create(
91
+ async with self.client.audio.transcriptions.with_streaming_response.create(
88
92
  model=self.transcription_model,
89
93
  file=(f"file.{input_format}", audio_bytes),
90
94
  response_format="text",
91
95
  ) as stream:
92
96
  # Stream the text in 16KB chunks
93
- for chunk in stream.iter_text(chunk_size=1024 * 16):
97
+ async for chunk in stream.iter_text(chunk_size=1024 * 16):
94
98
  yield chunk
95
99
 
96
100
  except Exception as e:
@@ -119,9 +123,9 @@ class OpenAIAdapter(LLMProvider):
119
123
  "model": self.text_model,
120
124
  }
121
125
  try:
122
- response = self.client.chat.completions.create(**request_params)
126
+ response = await self.client.chat.completions.create(**request_params)
123
127
 
124
- for chunk in response:
128
+ async for chunk in response:
125
129
  if chunk.choices:
126
130
  if chunk.choices[0].delta.content:
127
131
  text = chunk.choices[0].delta.content
@@ -148,7 +152,7 @@ class OpenAIAdapter(LLMProvider):
148
152
 
149
153
  try:
150
154
  # First try the beta parsing API
151
- completion = self.client.beta.chat.completions.parse(
155
+ completion = await self.client.beta.chat.completions.parse(
152
156
  model=self.parse_model,
153
157
  messages=messages,
154
158
  response_format=model_class,
@@ -156,3 +160,173 @@ class OpenAIAdapter(LLMProvider):
156
160
  return completion.choices[0].message.parsed
157
161
  except Exception as e:
158
162
  print(f"Error with beta.parse method: {e}")
163
+
164
async def create_realtime_session(
    self,
    model: str = "gpt-4o-mini-realtime-preview",
    modalities: Optional[list] = None,
    instructions: str = "You are a helpful assistant.",
    voice: str = "alloy",
    input_audio_format: str = "pcm16",
    output_audio_format: str = "pcm16",
) -> Dict[str, Any]:  # pragma: no cover
    """Create a realtime session token for WebSocket communication.

    Posts the session settings to the OpenAI ``/v1/realtime/sessions``
    endpoint, authenticating with the API key held by ``self.client``.

    Args:
        model: Realtime model name sent in the request body.
        modalities: Modalities to enable; ``None`` (the default) means
            ``["audio", "text"]``.
        instructions: System instructions for the session.
        voice: Voice name sent in the request body.
        input_audio_format: Format name for audio sent to the session.
        output_audio_format: Format name for audio produced by the session.

    Returns:
        The decoded JSON body of the session-creation response.

    Raises:
        Exception: If the endpoint answers with a status other than 200,
            or if the request itself fails (the error is printed and
            re-raised unchanged).
    """
    # A fresh list per call: a list literal in the signature would be one
    # shared object reused by every call that relies on the default.
    if modalities is None:
        modalities = ["audio", "text"]

    try:
        # Get the API key from the AsyncOpenAI client
        api_key = self.client.api_key

        payload = {
            "model": model,
            "modalities": modalities,
            "instructions": instructions,
            "voice": voice,
            "input_audio_format": input_audio_format,
            "output_audio_format": output_audio_format,
        }
        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
            "OpenAI-Beta": "realtime=v1"
        }

        # Create an async HTTP client scoped to this single request
        async with httpx.AsyncClient() as client:
            response = await client.post(
                "https://api.openai.com/v1/realtime/sessions",
                json=payload,
                headers=headers,
            )

        if response.status_code == 200:
            return response.json()

        raise Exception(
            f"Failed to create realtime session: {response.text}")
    except Exception as e:
        print(f"Error creating realtime session: {str(e)}")
        raise
205
+
206
async def realtime_audio_transcription(
    self,
    audio_generator: AsyncGenerator[bytes, None],
    transcription_config: Optional[Dict[str, Any]] = None,
    on_event: Optional[Callable[[Dict[str, Any]], Any]] = None,
) -> AsyncGenerator[str, None]:  # pragma: no cover
    """Stream real-time audio transcription using the Realtime API.

    Creates a transcription session over HTTPS, opens a WebSocket with the
    returned client secret, sends the audio in a background task and yields
    transcript text from the incoming events until the first
    ``...input_audio_transcription.completed`` event arrives.

    Args:
        audio_generator: Async generator that yields audio chunks
        transcription_config: Optional custom configuration for transcription;
            when omitted, pcm16 input with server-side VAD is requested
        on_event: Optional callback (plain or coroutine function) invoked
            with every raw event before it is interpreted

    Yields:
        Transcription text as it becomes available. If any non-empty delta
        was streamed, the full transcript from the ``completed`` event is
        NOT yielded again, so concatenating the yielded chunks gives the
        transcript exactly once. On failure a single apology string
        containing the error message is yielded instead of raising.
    """
    # Imported once per call instead of once per audio chunk.
    import base64

    # Create default transcription config if none provided
    if transcription_config is None:
        transcription_config = {
            "input_audio_format": "pcm16",
            "input_audio_transcription": {
                "model": "gpt-4o-mini-transcribe"
            },
            "turn_detection": {
                "type": "server_vad",
                "threshold": 0.5,
                "prefix_padding_ms": 300,
                "silence_duration_ms": 200
            }
        }

    try:
        # Get the API key from the AsyncOpenAI client
        api_key = self.client.api_key

        # Create transcription session; the HTTP client is only needed
        # for this one request.
        async with httpx.AsyncClient() as client:
            response = await client.post(
                "https://api.openai.com/v1/realtime/transcription_sessions",
                json=transcription_config,
                headers={
                    "Authorization": f"Bearer {api_key}",
                    "Content-Type": "application/json",
                    "OpenAI-Beta": "realtime=v1"
                }
            )

        if response.status_code != 200:
            raise Exception(
                f"Failed to create transcription session: {response.text}")

        session = response.json()
        client_secret = session["client_secret"]["value"]

        # Connect to WebSocket with proper headers as dictionary
        url = "wss://api.openai.com/v1/realtime?model=gpt-4o-mini-transcribe"
        headers = {
            "Authorization": f"Bearer {client_secret}",
            "OpenAI-Beta": "realtime=v1"
        }

        async with websockets.connect(url, additional_headers=headers) as websocket:

            async def send_audio():
                """Forward every audio chunk, then commit the input buffer."""
                try:
                    async for audio_chunk in audio_generator:
                        # Base64 encode the audio
                        encoded_audio = base64.b64encode(
                            audio_chunk).decode('utf-8')

                        # Send audio chunk
                        await websocket.send(json.dumps({
                            "type": "input_audio_buffer.append",
                            "audio": encoded_audio
                        }))

                        # Small delay to prevent flooding
                        await asyncio.sleep(0.05)

                    # Commit the audio buffer when done
                    await websocket.send(json.dumps({
                        "type": "input_audio_buffer.commit"
                    }))
                except Exception as e:
                    # Best effort: a send failure is reported, and the
                    # receive loop below surfaces connection errors.
                    print(f"Error sending audio: {str(e)}")

            # Start sending audio in the background
            audio_task = asyncio.create_task(send_audio())

            # Tracks whether any text has already been streamed via deltas.
            received_delta = False

            # Process transcription events
            try:
                while True:
                    message = await websocket.recv()
                    event = json.loads(message)

                    if on_event:
                        # Await the callback only if it is a coroutine function
                        if asyncio.iscoroutinefunction(on_event):
                            await on_event(event)
                        else:
                            on_event(event)

                    # .get(): an event without a "type" is simply ignored
                    # rather than aborting the whole transcription.
                    event_type = event.get("type")

                    # Extract transcription deltas
                    if event_type == "conversation.item.input_audio_transcription.delta":
                        delta = event["delta"]
                        if delta:
                            received_delta = True
                        yield delta

                    # Also handle completed transcriptions
                    elif event_type == "conversation.item.input_audio_transcription.completed":
                        # The completed event carries the whole transcript;
                        # yielding it after deltas would duplicate the text
                        # for callers that concatenate chunks.
                        if not received_delta:
                            yield event["transcript"]
                        break
            finally:
                # Clean up audio task if it's still running
                if not audio_task.done():
                    audio_task.cancel()
                    try:
                        await audio_task
                    except asyncio.CancelledError:
                        pass

    except Exception as e:
        print(f"Error in realtime audio transcription: {str(e)}")
        import traceback
        print(traceback.format_exc())
        yield f"I apologize, but I encountered an error transcribing the audio: {str(e)}"
@@ -55,6 +55,7 @@ class SolanaAgent(SolanaAgentInterface):
55
55
  audio_input_format: Literal[
56
56
  "flac", "mp3", "mp4", "mpeg", "mpga", "m4a", "ogg", "wav", "webm"
57
57
  ] = "mp4",
58
+ audio_transcription_real_time: bool = True,
58
59
  router: Optional[RoutingInterface] = None,
59
60
  ) -> AsyncGenerator[Union[str, bytes], None]: # pragma: no cover
60
61
  """Process a user message and return the response stream.
@@ -68,6 +69,7 @@ class SolanaAgent(SolanaAgentInterface):
68
69
  audio_instructions: Audio voice instructions
69
70
  audio_output_format: Audio output format
70
71
  audio_input_format: Audio input format
72
+ audio_transcription_real_time: Flag for real-time audio transcription
71
73
  router: Optional routing service for processing
72
74
 
73
75
  Returns:
@@ -83,6 +85,7 @@ class SolanaAgent(SolanaAgentInterface):
83
85
  audio_input_format=audio_input_format,
84
86
  prompt=prompt,
85
87
  router=router,
88
+ audio_transcription_real_time=audio_transcription_real_time,
86
89
  ):
87
90
  yield chunk
88
91
 
@@ -24,6 +24,7 @@ class SolanaAgent(ABC):
24
24
  "flac", "mp3", "mp4", "mpeg", "mpga", "m4a", "ogg", "wav", "webm"
25
25
  ] = "mp4",
26
26
  router: Optional[RoutingInterface] = None,
27
+ audio_transcription_real_time: bool = True,
27
28
  ) -> AsyncGenerator[Union[str, bytes], None]:
28
29
  """Process a user message and return the response stream."""
29
30
  pass
@@ -1,5 +1,5 @@
1
1
  from abc import ABC, abstractmethod
2
- from typing import AsyncGenerator, List, Literal, Type, TypeVar, Union
2
+ from typing import Any, AsyncGenerator, Callable, Dict, List, Literal, Optional, Type, TypeVar, Union
3
3
 
4
4
  from pydantic import BaseModel
5
5
 
@@ -49,3 +49,13 @@ class LLMProvider(ABC):
49
49
  ) -> AsyncGenerator[str, None]:
50
50
  """Transcribe audio from the language model."""
51
51
  pass
52
+
53
+ @abstractmethod
54
+ async def realtime_audio_transcription(
55
+ self,
56
+ audio_generator: AsyncGenerator[bytes, None],
57
+ transcription_config: Optional[Dict[str, Any]] = None,
58
+ on_event: Optional[Callable[[Dict[str, Any]], Any]] = None,
59
+ ) -> AsyncGenerator[str, None]:
60
+ """Stream real-time audio transcription from the language model."""
61
+ pass
@@ -34,6 +34,7 @@ class AgentService(ABC):
34
34
  "flac", "mp3", "mp4", "mpeg", "mpga", "m4a", "ogg", "wav", "webm"
35
35
  ] = "mp4",
36
36
  prompt: Optional[str] = None,
37
+ audio_transcription_real_time: bool = True,
37
38
  ) -> AsyncGenerator[Union[str, bytes], None]:
38
39
  """Generate a response from an agent."""
39
40
  pass
@@ -1,6 +1,8 @@
1
1
  from abc import ABC, abstractmethod
2
2
  from typing import Any, AsyncGenerator, Dict, Literal, Optional, Union
3
3
 
4
+ from solana_agent.interfaces.services.routing import RoutingService as RoutingInterface
5
+
4
6
 
5
7
  class QueryService(ABC):
6
8
  """Interface for processing user queries."""
@@ -20,6 +22,8 @@ class QueryService(ABC):
20
22
  "flac", "mp3", "mp4", "mpeg", "mpga", "m4a", "ogg", "wav", "webm"
21
23
  ] = "mp4",
22
24
  prompt: Optional[str] = None,
25
+ router: Optional[RoutingInterface] = None,
26
+ audio_transcription_real_time: bool = True,
23
27
  ) -> AsyncGenerator[Union[str, bytes], None]:
24
28
  """Process the user request and generate a response."""
25
29
  pass
@@ -69,8 +69,8 @@ class MemoryRepository(MemoryProvider):
69
69
  # Store truncated messages
70
70
  doc = {
71
71
  "user_id": user_id,
72
- "user_message": self._truncate(user_msg),
73
- "assistant_message": self._truncate(assistant_msg),
72
+ "user_message": user_msg,
73
+ "assistant_message": assistant_msg,
74
74
  "timestamp": datetime.now(timezone.utc)
75
75
  }
76
76
  self.mongo.insert_one(self.collection, doc)
@@ -176,6 +176,7 @@ class AgentService(AgentServiceInterface):
176
176
  audio_input_format: Literal[
177
177
  "flac", "mp3", "mp4", "mpeg", "mpga", "m4a", "ogg", "wav", "webm"
178
178
  ] = "mp4",
179
+ audio_transcription_real_time: bool = True,
179
180
  prompt: Optional[str] = None,
180
181
  ) -> AsyncGenerator[Union[str, bytes], None]: # pragma: no cover
181
182
  """Generate a response with support for text/audio input/output."""
@@ -191,11 +192,25 @@ class AgentService(AgentServiceInterface):
191
192
  return
192
193
 
193
194
  try:
194
- # Handle audio input if provided
195
+ # Handle audio input if provided - KEEP REAL-TIME AUDIO TRANSCRIPTION
195
196
  query_text = ""
196
197
  if not isinstance(query, str):
197
- async for transcript in self.llm_provider.transcribe_audio(query, input_format=audio_input_format):
198
- query_text += transcript
198
+ if audio_transcription_real_time and hasattr(self.llm_provider, "realtime_audio_transcription"):
199
+ # Use realtime transcription for faster processing if available
200
+ print("Using realtime audio transcription")
201
+ async for transcript in self.llm_provider.realtime_audio_transcription(
202
+ audio_generator=self._bytes_to_generator(query),
203
+ transcription_config={
204
+ "input_audio_format": audio_input_format}
205
+ ):
206
+ query_text += transcript
207
+ else:
208
+ # Fall back to standard transcription
209
+ print("Using standard audio transcription")
210
+ async for transcript in self.llm_provider.transcribe_audio(query, input_format=audio_input_format):
211
+ query_text += transcript
212
+
213
+ print(f"Transcribed query: {query_text}")
199
214
  else:
200
215
  query_text = query
201
216
 
@@ -209,118 +224,172 @@ class AgentService(AgentServiceInterface):
209
224
  if prompt:
210
225
  system_prompt += f"\n\nADDITIONAL PROMPT: {prompt}"
211
226
 
212
- # make tool calling prompt
227
+ # Add tool usage prompt if tools are available
213
228
  tool_calling_system_prompt = deepcopy(system_prompt)
214
229
  if self.tool_registry:
215
230
  tool_usage_prompt = self._get_tool_usage_prompt(agent_name)
216
231
  if tool_usage_prompt:
217
232
  tool_calling_system_prompt += f"\n\nTOOL CALLING PROMPT: {tool_usage_prompt}"
233
+ print(
234
+ f"Tools available to agent {agent_name}: {[t.get('name') for t in self.get_agent_tools(agent_name)]}")
218
235
 
219
- # Variables for tracking the response
236
+ # Variables for tracking the complete response
220
237
  complete_text_response = ""
221
-
222
- # For audio output, we'll collect everything first
223
238
  full_response_buffer = ""
224
239
 
225
- # Variables for handling JSON processing
226
- json_buffer = ""
227
- is_json = False
240
+ # Variables for robust handling of tool call markers that may be split across chunks
241
+ tool_buffer = ""
242
+ pending_chunk = "" # To hold text that might contain partial markers
243
+ is_tool_call = False
244
+ window_size = 30 # Increased window size for better detection
228
245
 
229
- # Generate and stream response
246
+ # Define start and end markers
247
+ start_marker = "[TOOL]"
248
+ end_marker = "[/TOOL]"
249
+
250
+ # Generate and stream response (ALWAYS use non-realtime for text generation)
251
+ print(
252
+ f"Generating response with {len(query_text)} characters of query text")
230
253
  async for chunk in self.llm_provider.generate_text(
231
254
  prompt=query_text,
232
255
  system_prompt=tool_calling_system_prompt,
233
256
  ):
234
- # Check if the chunk is JSON or a tool call
235
- if (chunk.strip().startswith("{") or "{\"tool_call\":" in chunk) and not is_json:
236
- is_json = True
237
- json_buffer = chunk
257
+ # If we have pending text from the previous chunk, combine it with this chunk
258
+ if pending_chunk:
259
+ combined_chunk = pending_chunk + chunk
260
+ pending_chunk = "" # Reset pending chunk
261
+ else:
262
+ combined_chunk = chunk
263
+
264
+ # STEP 1: Check for tool call start marker
265
+ if start_marker in combined_chunk and not is_tool_call:
266
+ print(
267
+ f"Found tool start marker in chunk of length {len(combined_chunk)}")
268
+ is_tool_call = True
269
+
270
+ # Extract text before the marker and the marker itself with everything after
271
+ start_pos = combined_chunk.find(start_marker)
272
+ before_marker = combined_chunk[:start_pos]
273
+ after_marker = combined_chunk[start_pos:]
274
+
275
+ # Yield text that appeared before the marker
276
+ if before_marker and output_format == "text":
277
+ yield before_marker
278
+
279
+ # Start collecting the tool call
280
+ tool_buffer = after_marker
281
+ continue # Skip to next chunk
282
+
283
+ # STEP 2: Handle ongoing tool call collection
284
+ if is_tool_call:
285
+ tool_buffer += combined_chunk
286
+
287
+ # Check if the tool call is complete
288
+ if end_marker in tool_buffer:
289
+ print(
290
+ f"Tool call complete, buffer size: {len(tool_buffer)}")
291
+
292
+ # Process the tool call
293
+ response_text = await self._handle_tool_call(
294
+ agent_name=agent_name,
295
+ tool_text=tool_buffer
296
+ )
297
+
298
+ # Clean the response to remove any markers or formatting
299
+ response_text = self._clean_tool_response(
300
+ response_text)
301
+ print(
302
+ f"Tool execution complete, result size: {len(response_text)}")
303
+
304
+ # Create new prompt with search/tool results
305
+ # Using "Search Result" instead of "TOOL RESPONSE" to avoid model repeating "TOOL"
306
+ user_prompt = f"{query_text}\n\nSearch Result: {response_text}"
307
+ tool_system_prompt = system_prompt + \
308
+ "\n DO NOT use the tool calling format again."
309
+
310
+ # Generate a new response with the tool results
311
+ print("Generating new response with tool results")
312
+ if output_format == "text":
313
+ # Stream the follow-up response for text output
314
+ async for processed_chunk in self.llm_provider.generate_text(
315
+ prompt=user_prompt,
316
+ system_prompt=tool_system_prompt,
317
+ ):
318
+ complete_text_response += processed_chunk
319
+ yield processed_chunk
320
+ else:
321
+ # For audio output, collect the full response first
322
+ tool_response = ""
323
+ async for processed_chunk in self.llm_provider.generate_text(
324
+ prompt=user_prompt,
325
+ system_prompt=tool_system_prompt,
326
+ ):
327
+ tool_response += processed_chunk
328
+
329
+ # Clean and add to our complete text record and audio buffer
330
+ tool_response = self._clean_for_audio(
331
+ tool_response)
332
+ complete_text_response += tool_response
333
+ full_response_buffer += tool_response
334
+
335
+ # Reset tool handling state
336
+ is_tool_call = False
337
+ tool_buffer = ""
338
+ pending_chunk = ""
339
+ break # Exit the original generation loop after tool processing
340
+
341
+ # Continue collecting tool call content without yielding
238
342
  continue
239
343
 
240
- # Collect JSON or handle normal text
241
- if is_json:
242
- json_buffer += chunk
243
- try:
244
- # Try to parse complete JSON
245
- data = json.loads(json_buffer)
246
-
247
- # Valid JSON found, handle it
248
- if "tool_call" in data:
249
- response_text = await self._handle_tool_call(
250
- agent_name=agent_name,
251
- json_chunk=json_buffer
252
- )
253
-
254
- # Update system prompt to prevent further tool calls
255
- tool_system_prompt = system_prompt + \
256
- "\n DO NOT make any tool calls or return JSON."
257
-
258
- # Create prompt with tool response
259
- user_prompt = f"\n USER QUERY: {query_text} \n"
260
- user_prompt += f"\n TOOL RESPONSE: {response_text} \n"
261
-
262
- # For text output, process chunks directly
263
- if output_format == "text":
264
- # Stream text response for text output
265
- async for processed_chunk in self.llm_provider.generate_text(
266
- prompt=user_prompt,
267
- system_prompt=tool_system_prompt,
268
- ):
269
- complete_text_response += processed_chunk
270
- yield processed_chunk
271
- else:
272
- # For audio output, collect the full tool response first
273
- tool_response = ""
274
- async for processed_chunk in self.llm_provider.generate_text(
275
- prompt=user_prompt,
276
- system_prompt=tool_system_prompt,
277
- ):
278
- tool_response += processed_chunk
279
-
280
- # Add to our complete text record and full audio buffer
281
- tool_response = self._clean_for_audio(
282
- tool_response)
283
- complete_text_response += tool_response
284
- full_response_buffer += tool_response
285
- else:
286
- # For non-tool JSON, still capture the text
287
- complete_text_response += json_buffer
288
-
289
- if output_format == "text":
290
- yield json_buffer
291
- else:
292
- # Add to full response buffer for audio
293
- full_response_buffer += json_buffer
294
-
295
- # Reset JSON handling
296
- is_json = False
297
- json_buffer = ""
298
-
299
- except json.JSONDecodeError:
300
- # JSON not complete yet, continue collecting
301
- pass
302
- else:
303
- # For regular text
304
- complete_text_response += chunk
305
-
306
- if output_format == "text":
307
- # For text output, yield directly
308
- yield chunk
309
- else:
310
- # For audio output, add to the full response buffer
311
- full_response_buffer += chunk
312
-
313
- # Handle any leftover JSON buffer
314
- if json_buffer:
315
- complete_text_response += json_buffer
344
+ # STEP 3: Check for possible partial start markers at the end of the chunk
345
+ # This helps detect markers split across chunks
346
+ potential_marker = False
347
+ for i in range(1, len(start_marker)):
348
+ if combined_chunk.endswith(start_marker[:i]):
349
+ # Found a partial marker at the end
350
+ # Save the partial marker
351
+ pending_chunk = combined_chunk[-i:]
352
+ # Everything except the partial marker
353
+ chunk_to_yield = combined_chunk[:-i]
354
+ potential_marker = True
355
+ print(
356
+ f"Potential partial marker detected: '{pending_chunk}'")
357
+ break
358
+
359
+ if potential_marker:
360
+ # Process the safe part of the chunk
361
+ if chunk_to_yield and output_format == "text":
362
+ yield chunk_to_yield
363
+ if chunk_to_yield:
364
+ complete_text_response += chunk_to_yield
365
+ if output_format == "audio":
366
+ full_response_buffer += chunk_to_yield
367
+ continue
368
+
369
+ # STEP 4: Normal text processing for non-tool call content
316
370
  if output_format == "text":
317
- yield json_buffer
318
- else:
319
- full_response_buffer += json_buffer
371
+ yield combined_chunk
320
372
 
321
- # For audio output, now process the complete response
373
+ complete_text_response += combined_chunk
374
+ if output_format == "audio":
375
+ full_response_buffer += combined_chunk
376
+
377
+ # Process any incomplete tool call as regular text
378
+ if is_tool_call and tool_buffer:
379
+ print(
380
+ f"Incomplete tool call detected, returning as regular text: {len(tool_buffer)} chars")
381
+ if output_format == "text":
382
+ yield tool_buffer
383
+
384
+ complete_text_response += tool_buffer
385
+ if output_format == "audio":
386
+ full_response_buffer += tool_buffer
387
+
388
+ # For audio output, generate speech from the complete buffer
322
389
  if output_format == "audio" and full_response_buffer:
323
390
  # Clean text before TTS
391
+ print(
392
+ f"Processing {len(full_response_buffer)} characters for audio output")
324
393
  full_response_buffer = self._clean_for_audio(
325
394
  full_response_buffer)
326
395
 
@@ -335,9 +404,15 @@ class AgentService(AgentServiceInterface):
335
404
 
336
405
  # Store the complete text response
337
406
  self.last_text_response = complete_text_response
407
+ print(
408
+ f"Response generation complete: {len(complete_text_response)} chars")
338
409
 
339
410
  except Exception as e:
340
411
  error_msg = f"I apologize, but I encountered an error: {str(e)}"
412
+ print(f"Error in generate_response: {str(e)}")
413
+ import traceback
414
+ print(traceback.format_exc())
415
+
341
416
  if output_format == "audio":
342
417
  async for chunk in self.llm_provider.tts(
343
418
  error_msg,
@@ -349,52 +424,73 @@ class AgentService(AgentServiceInterface):
349
424
  else:
350
425
  yield error_msg
351
426
 
352
- print(f"Error in generate_response: {str(e)}")
353
- import traceback
354
- print(traceback.format_exc())
427
+ async def _bytes_to_generator(self, data: bytes) -> AsyncGenerator[bytes, None]:
428
+ """Convert bytes to an async generator for streaming.
355
429
 
356
- async def _handle_tool_call(
357
- self,
358
- agent_name: str,
359
- json_chunk: str,
360
- ) -> str:
361
- """Handle tool calls and return formatted response."""
430
+ Args:
431
+ data: Bytes of audio data
432
+
433
+ Yields:
434
+ Chunks of audio data
435
+ """
436
+ # Define a reasonable chunk size (adjust based on your needs)
437
+ chunk_size = 4096
438
+
439
+ for i in range(0, len(data), chunk_size):
440
+ yield data[i:i + chunk_size]
441
+ # Small delay to simulate streaming
442
+ await asyncio.sleep(0.01)
443
+
444
+ async def _handle_tool_call(self, agent_name: str, tool_text: str) -> str:
445
+ """Handle marker-based tool calls."""
362
446
  try:
363
- data = json.loads(json_chunk)
364
- if "tool_call" in data:
365
- tool_data = data["tool_call"]
366
- tool_name = tool_data.get("name")
367
- parameters = tool_data.get("parameters", {})
368
-
369
- if tool_name:
370
- # Execute the tool and get the result
371
- result = await self.execute_tool(agent_name, tool_name, parameters)
372
-
373
- if result.get("status") == "success":
374
- tool_result = result.get("result", "")
375
- return tool_result
376
- else:
377
- error_message = f"I apologize, but I encountered an issue with the {tool_name} tool: {result.get('message', 'Unknown error')}"
378
- print(f"Tool error: {error_message}")
379
- return error_message
380
- else:
381
- return "Tool name was not provided in the tool call."
447
+ # Extract the content between markers
448
+ start_marker = "[TOOL]"
449
+ end_marker = "[/TOOL]"
450
+
451
+ start_idx = tool_text.find(start_marker) + len(start_marker)
452
+ end_idx = tool_text.find(end_marker)
453
+
454
+ tool_content = tool_text[start_idx:end_idx].strip()
455
+
456
+ # Parse the lines to extract name and parameters
457
+ tool_name = None
458
+ parameters = {}
459
+
460
+ for line in tool_content.split("\n"):
461
+ line = line.strip()
462
+ if not line:
463
+ continue
464
+
465
+ if line.startswith("name:"):
466
+ tool_name = line[5:].strip()
467
+ elif line.startswith("parameters:"):
468
+ params_text = line[11:].strip()
469
+ # Parse comma-separated parameters
470
+ param_pairs = params_text.split(",")
471
+ for pair in param_pairs:
472
+ if "=" in pair:
473
+ k, v = pair.split("=", 1)
474
+ parameters[k.strip()] = v.strip()
475
+
476
+ # Execute the tool
477
+ result = await self.execute_tool(agent_name, tool_name, parameters)
478
+
479
+ # Return the result as string
480
+ if result.get("status") == "success":
481
+ tool_result = str(result.get("result", ""))
482
+ return tool_result
382
483
  else:
383
- print(f"JSON received but no tool_call found: {json_chunk}")
484
+ error_msg = f"Error calling {tool_name}: {result.get('message', 'Unknown error')}"
485
+ return error_msg
384
486
 
385
- # If we get here, it wasn't properly handled as a tool
386
- return f"The following request was not processed as a valid tool call:\n{json_chunk}"
387
- except json.JSONDecodeError as e:
388
- print(f"JSON decode error in tool call: {e}")
389
- return json_chunk
390
487
  except Exception as e:
391
- print(f"Unexpected error in tool call handling: {str(e)}")
392
488
  import traceback
393
489
  print(traceback.format_exc())
394
490
  return f"Error processing tool call: {str(e)}"
395
491
 
396
492
  def _get_tool_usage_prompt(self, agent_name: str) -> str:
397
- """Generate JSON-based instructions for tool usage."""
493
+ """Generate marker-based instructions for tool usage."""
398
494
  # Get tools assigned to this agent
399
495
  tools = self.get_agent_tools(agent_name)
400
496
  if not tools:
@@ -405,29 +501,38 @@ class AgentService(AgentServiceInterface):
405
501
  tools_json = json.dumps(tools, indent=2)
406
502
 
407
503
  return f"""
408
- AVAILABLE TOOLS:
409
- {tools_json}
410
-
411
- TOOL USAGE FORMAT:
412
- {{
413
- "tool_call": {{
414
- "name": "<one_of:{', '.join(available_tool_names)}>",
415
- "parameters": {{
416
- // parameters as specified in tool definition above
417
- }}
418
- }}
419
- }}
420
-
421
- RESPONSE RULES:
422
- 1. For tool usage:
423
- - Only use tools from the AVAILABLE TOOLS list above
424
- - Follow the exact parameter format shown in the tool definition
425
-
426
- 2. Format Requirements:
427
- - Return ONLY the JSON object for tool calls
428
- - No explanation text before or after
429
- - Use exact tool names as shown in AVAILABLE TOOLS
430
- """
504
+ AVAILABLE TOOLS:
505
+ {tools_json}
506
+
507
+ ⚠️ CRITICAL INSTRUCTION: When using a tool, NEVER include explanatory text.
508
+ Only output the exact tool call format shown below with NO other text.
509
+
510
+ TOOL USAGE FORMAT:
511
+ [TOOL]
512
+ name: tool_name
513
+ parameters: key1=value1, key2=value2
514
+ [/TOOL]
515
+
516
+ EXAMPLES:
517
+
518
+ CORRECT - ONLY the tool call with NOTHING else:
519
+ [TOOL]
520
+ name: search_internet
521
+ parameters: query=latest news on Solana
522
+ [/TOOL]
523
+
524
+ ❌ INCORRECT - Never add explanatory text like this:
525
+ To get the latest news on Solana, I will search the internet.
526
+ [TOOL]
527
+ name: search_internet
528
+ parameters: query=latest news on Solana
529
+ [/TOOL]
530
+
531
+ REMEMBER:
532
+ 1. Output ONLY the exact tool call format with NO additional text
533
+ 2. After seeing your tool call, I will execute it automatically
534
+ 3. You will receive the tool results and can then respond to the user
535
+ """
431
536
 
432
537
  def _clean_for_audio(self, text: str) -> str:
433
538
  """Remove Markdown formatting, emojis, and non-pronounceable characters from text.
@@ -502,3 +607,18 @@ class AgentService(AgentServiceInterface):
502
607
  text = re.sub(r'\s+', ' ', text)
503
608
 
504
609
  return text.strip()
610
+
611
+ def _clean_tool_response(self, text: str) -> str:
612
+ """Remove any tool markers or formatting that might have leaked into the response."""
613
+ if not text:
614
+ return ""
615
+
616
+ # Remove any tool markers that might be in the response
617
+ text = text.replace("[TOOL]", "")
618
+ text = text.replace("[/TOOL]", "")
619
+
620
+ # Remove the word TOOL from start if it appears
621
+ if text.lstrip().startswith("TOOL"):
622
+ text = text.lstrip().replace("TOOL", "", 1)
623
+
624
+ return text.strip()
@@ -47,6 +47,7 @@ class QueryService(QueryServiceInterface):
47
47
  audio_input_format: Literal[
48
48
  "flac", "mp3", "mp4", "mpeg", "mpga", "m4a", "ogg", "wav", "webm"
49
49
  ] = "mp4",
50
+ audio_transcription_real_time: bool = True,
50
51
  prompt: Optional[str] = None,
51
52
  router: Optional[RoutingServiceInterface] = None,
52
53
  ) -> AsyncGenerator[Union[str, bytes], None]: # pragma: no cover
@@ -60,6 +61,7 @@ class QueryService(QueryServiceInterface):
60
61
  audio_instructions: Audio voice instructions
61
62
  audio_output_format: Audio output format
62
63
  audio_input_format: Audio input format
64
+ audio_transcription_real_time: Flag for real-time audio transcription
63
65
  prompt: Optional prompt for the agent
64
66
  router: Optional routing service for processing
65
67
 
@@ -120,6 +122,7 @@ class QueryService(QueryServiceInterface):
120
122
  audio_output_format=audio_output_format,
121
123
  audio_instructions=audio_instructions,
122
124
  prompt=prompt,
125
+ audio_transcription_real_time=audio_transcription_real_time,
123
126
  ):
124
127
  yield audio_chunk
125
128
 
@@ -138,6 +141,7 @@ class QueryService(QueryServiceInterface):
138
141
  memory_context=memory_context,
139
142
  output_format="text",
140
143
  prompt=prompt,
144
+ audio_transcription_real_time=audio_transcription_real_time,
141
145
  ):
142
146
  yield chunk
143
147
  full_text_response += chunk
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: solana-agent
3
- Version: 24.0.0
3
+ Version: 24.1.0
4
4
  Summary: Agentic IQ
5
5
  License: MIT
6
6
  Keywords: ai,openai,ai agents,agi
@@ -14,9 +14,11 @@ Classifier: Programming Language :: Python :: 3
14
14
  Classifier: Programming Language :: Python :: 3.12
15
15
  Classifier: Programming Language :: Python :: 3.13
16
16
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
+ Requires-Dist: httpx (>=0.28.1,<0.29.0)
17
18
  Requires-Dist: openai (>=1.71.0,<2.0.0)
18
19
  Requires-Dist: pydantic (>=2.11.2,<3.0.0)
19
20
  Requires-Dist: pymongo (>=4.11.3,<5.0.0)
21
+ Requires-Dist: websockets (>=15.0.1,<16.0.0)
20
22
  Requires-Dist: zep-cloud (>=2.9.0,<3.0.0)
21
23
  Project-URL: Documentation, https://docs.solana-agent.com
22
24
  Project-URL: Repository, https://github.com/truemagic-coder/solana-agent
@@ -41,6 +43,7 @@ Build your AI business in three lines of code!
41
43
 
42
44
  ## Why?
43
45
  * Three lines of code setup
46
+ * Fast Responses
44
47
  * Multi-Agent Swarm
45
48
  * Multi-Modal Streaming (Text & Audio)
46
49
  * Conversational Memory & History
@@ -56,6 +59,7 @@ Build your AI business in three lines of code!
56
59
  ## Features
57
60
 
58
61
  * Easy three lines of code setup
62
+ * Fast AI responses
59
63
  * Designed for a multi-agent swarm
60
64
  * Seamless text and audio streaming with real-time multi-modal processing
61
65
  * Configurable audio voice characteristics via prompting
@@ -371,6 +375,15 @@ async for response in solana_agent.process("user123", audio_content, output_form
371
375
  print(response, end="")
372
376
  ```
373
377
 
378
+ ### Real-Time Audio Transcription
379
+
380
+ Real-time audio transcription is enabled by default; pass `audio_transcription_real_time=False` to disable it and save on costs.
381
+
382
+ ```python
383
+ async for response in solana_agent.process("user123", "What is the latest news on Canada?", audio_transcription_real_time=False):
384
+ print(response, end="")
385
+ ```
386
+
374
387
  ## Tools
375
388
 
376
389
  Tools can be used from plugins like Solana Agent Kit (sakit) or via inline tools. Tools available via plugins integrate automatically with Solana Agent.
@@ -1,22 +1,22 @@
1
1
  solana_agent/__init__.py,sha256=ceYeUpjIitpln8YK1r0JVJU8mzG6cRPYu-HLny3d-Tw,887
2
2
  solana_agent/adapters/__init__.py,sha256=tiEEuuy0NF3ngc_tGEcRTt71zVI58v3dYY9RvMrF2Cg,204
3
- solana_agent/adapters/llm_adapter.py,sha256=Q1oCOV3Zzk_hEtcr7OgclwEss_4M61B5do1TFN1541M,5534
3
+ solana_agent/adapters/llm_adapter.py,sha256=LLRRIhtJcPrNd2qIAHmEsFE5YyuUg53-POoiNKIradQ,12833
4
4
  solana_agent/adapters/mongodb_adapter.py,sha256=qqEFbY_v1XGyFXBmwd5HSXSSHnA9wWo-Hm1vGEyIG0k,2718
5
5
  solana_agent/client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- solana_agent/client/solana_agent.py,sha256=M2AHloEFXEAM321je9xRdos5dXNQigQ0uYqnzXv7-iA,5208
6
+ solana_agent/client/solana_agent.py,sha256=iIRuwOP1jChAgiP_ewW2lEOV-PE6AtVROlt-s8mBbyg,5415
7
7
  solana_agent/domains/__init__.py,sha256=HiC94wVPRy-QDJSSRywCRrhrFfTBeHjfi5z-QfZv46U,168
8
8
  solana_agent/domains/agent.py,sha256=WTo-pEc66V6D_35cpDE-kTsw1SJM-dtylPZ7em5em7Q,2659
9
9
  solana_agent/domains/routing.py,sha256=UDlgTjUoC9xIBVYu_dnf9-KG_bBgdEXAv_UtDOrYo0w,650
10
10
  solana_agent/factories/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
11
  solana_agent/factories/agent_factory.py,sha256=mJQb1G0-gebizZvSVHm4NAxRMB1kemm2w_BAcYlN15Y,5496
12
12
  solana_agent/interfaces/__init__.py,sha256=IQs1WIM1FeKP1-kY2FEfyhol_dB-I-VAe2rD6jrVF6k,355
13
- solana_agent/interfaces/client/client.py,sha256=CB8YuSsn-Lvinrb12huyIVaFpJqVDh8EHsHJi9SVXM4,1690
13
+ solana_agent/interfaces/client/client.py,sha256=ymZiJEVy966HKVTZR75MdcrTfct6MPielHKdvfCYF_g,1742
14
14
  solana_agent/interfaces/plugins/plugins.py,sha256=T8HPBsekmzVwfU_Rizp-vtzAeYkMlKMYD7U9d0Wjq9c,3338
15
15
  solana_agent/interfaces/providers/data_storage.py,sha256=NqGeFvAzhz9rr-liLPRNCGjooB2EIhe-EVsMmX__b0M,1658
16
- solana_agent/interfaces/providers/llm.py,sha256=AxfUCBVbyN2GaBOdAo_Oxoy7nP9-IvHQl8Xo8H-ZLNs,1552
16
+ solana_agent/interfaces/providers/llm.py,sha256=09E6NgMcIpf_nJGgdVLjlZAF2HGHtW5EmhIbaEiylt0,1972
17
17
  solana_agent/interfaces/providers/memory.py,sha256=oNOH8WZXVW8assDigIWZAWiwkxbpDiKupxA2RB6tQvQ,1010
18
- solana_agent/interfaces/services/agent.py,sha256=ETAfz_VbtOgpTDIpo9tMSJnUAM5boPJXw9R7b_WEu3o,2113
19
- solana_agent/interfaces/services/query.py,sha256=0C5yD8DYNrsJd3SA0lVb-ajm0fMqJJNPuPeHoab7-WQ,1340
18
+ solana_agent/interfaces/services/agent.py,sha256=KHGFjmxj0yE04VTeNa6Jpk-34OEMhDgAtzmPkpUBdRA,2165
19
+ solana_agent/interfaces/services/query.py,sha256=2i-Qq4Bel5P5U1O5wWUYzYoECFwiMkNj7n0K1v1edd4,1532
20
20
  solana_agent/interfaces/services/routing.py,sha256=UzJC-z-Q9puTWPFGEo2_CAhIxuxP5IRnze7S66NSrsI,397
21
21
  solana_agent/plugins/__init__.py,sha256=coZdgJKq1ExOaj6qB810i3rEhbjdVlrkN76ozt_Ojgo,193
22
22
  solana_agent/plugins/manager.py,sha256=Il49hXeqvu0b02pURNNp7mY8kp9_sqpi_vJIWBW5Hc0,5044
@@ -24,12 +24,12 @@ solana_agent/plugins/registry.py,sha256=5S0DlUQKogsg1zLiRUIGMHEmGYHtOovU-S-5W1Mw
24
24
  solana_agent/plugins/tools/__init__.py,sha256=c0z7ij42gs94_VJrcn4Y8gUlTxMhsFNY6ahIsNswdLk,231
25
25
  solana_agent/plugins/tools/auto_tool.py,sha256=DgES_cZ6xKSf_HJpFINpvJxrjVlk5oeqa7pZRBsR9SM,1575
26
26
  solana_agent/repositories/__init__.py,sha256=fP83w83CGzXLnSdq-C5wbw9EhWTYtqE2lQTgp46-X_4,163
27
- solana_agent/repositories/memory.py,sha256=GYyNcwdQZKqfCjG_6uYh7YqjwwbUwvuVwbNim4aHN3I,7329
27
+ solana_agent/repositories/memory.py,sha256=75zuqAMn4YFafiLsE8RvjFNd3p5ensXbFWv6VvlhFtE,7297
28
28
  solana_agent/services/__init__.py,sha256=ab_NXJmwYUCmCrCzuTlZ47bJZINW0Y0F5jfQ9OovidU,163
29
- solana_agent/services/agent.py,sha256=hvgZPOcGuojALlpf-zpZ20ga7j45CFCBxZyHLz6ge04,19960
30
- solana_agent/services/query.py,sha256=IFEWYfkDCbp8W0FDooAor_UZe7H1cqgrud-CzoGlu-8,11154
29
+ solana_agent/services/agent.py,sha256=d6Sv6W6Vtuhf5JHknUchjAD8XSUOkXALkIImnre93j8,25524
30
+ solana_agent/services/query.py,sha256=vWopHKES-K0KpxPCSZNyunRJrkBVGGQC13awd0Sd56M,11450
31
31
  solana_agent/services/routing.py,sha256=PMCSG5m3uLMaHMj3dxNvNfcFZaeaDi7kMr7AEBCzwDE,6499
32
- solana_agent-24.0.0.dist-info/LICENSE,sha256=BnSRc-NSFuyF2s496l_4EyrwAP6YimvxWcjPiJ0J7g4,1057
33
- solana_agent-24.0.0.dist-info/METADATA,sha256=FnvqQI4jAN_q3wAEArOQXe0YDQ0wmfxfx5XJ2lVKgH8,20270
34
- solana_agent-24.0.0.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
35
- solana_agent-24.0.0.dist-info/RECORD,,
32
+ solana_agent-24.1.0.dist-info/LICENSE,sha256=BnSRc-NSFuyF2s496l_4EyrwAP6YimvxWcjPiJ0J7g4,1057
33
+ solana_agent-24.1.0.dist-info/METADATA,sha256=WC9LoaQVgFHhA0bfXC_c57iYU7V-ZW1TJVDrCbmUmh0,20685
34
+ solana_agent-24.1.0.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
35
+ solana_agent-24.1.0.dist-info/RECORD,,