solana-agent 23.0.7__tar.gz → 24.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {solana_agent-23.0.7 → solana_agent-24.1.0}/PKG-INFO +20 -23
- {solana_agent-23.0.7 → solana_agent-24.1.0}/README.md +16 -21
- {solana_agent-23.0.7 → solana_agent-24.1.0}/pyproject.toml +4 -2
- solana_agent-24.1.0/solana_agent/adapters/llm_adapter.py +332 -0
- {solana_agent-23.0.7 → solana_agent-24.1.0}/solana_agent/client/solana_agent.py +3 -3
- {solana_agent-23.0.7 → solana_agent-24.1.0}/solana_agent/interfaces/client/client.py +1 -1
- {solana_agent-23.0.7 → solana_agent-24.1.0}/solana_agent/interfaces/providers/llm.py +11 -2
- {solana_agent-23.0.7 → solana_agent-24.1.0}/solana_agent/interfaces/services/agent.py +1 -1
- {solana_agent-23.0.7 → solana_agent-24.1.0}/solana_agent/interfaces/services/query.py +4 -1
- {solana_agent-23.0.7 → solana_agent-24.1.0}/solana_agent/repositories/memory.py +2 -2
- {solana_agent-23.0.7 → solana_agent-24.1.0}/solana_agent/services/agent.py +274 -156
- {solana_agent-23.0.7 → solana_agent-24.1.0}/solana_agent/services/query.py +4 -4
- solana_agent-23.0.7/solana_agent/adapters/llm_adapter.py +0 -164
- {solana_agent-23.0.7 → solana_agent-24.1.0}/LICENSE +0 -0
- {solana_agent-23.0.7 → solana_agent-24.1.0}/solana_agent/__init__.py +0 -0
- {solana_agent-23.0.7 → solana_agent-24.1.0}/solana_agent/adapters/__init__.py +0 -0
- {solana_agent-23.0.7 → solana_agent-24.1.0}/solana_agent/adapters/mongodb_adapter.py +0 -0
- {solana_agent-23.0.7 → solana_agent-24.1.0}/solana_agent/client/__init__.py +0 -0
- {solana_agent-23.0.7 → solana_agent-24.1.0}/solana_agent/domains/__init__.py +0 -0
- {solana_agent-23.0.7 → solana_agent-24.1.0}/solana_agent/domains/agent.py +0 -0
- {solana_agent-23.0.7 → solana_agent-24.1.0}/solana_agent/domains/routing.py +0 -0
- {solana_agent-23.0.7 → solana_agent-24.1.0}/solana_agent/factories/__init__.py +0 -0
- {solana_agent-23.0.7 → solana_agent-24.1.0}/solana_agent/factories/agent_factory.py +0 -0
- {solana_agent-23.0.7 → solana_agent-24.1.0}/solana_agent/interfaces/__init__.py +0 -0
- {solana_agent-23.0.7 → solana_agent-24.1.0}/solana_agent/interfaces/plugins/plugins.py +0 -0
- {solana_agent-23.0.7 → solana_agent-24.1.0}/solana_agent/interfaces/providers/data_storage.py +0 -0
- {solana_agent-23.0.7 → solana_agent-24.1.0}/solana_agent/interfaces/providers/memory.py +0 -0
- {solana_agent-23.0.7 → solana_agent-24.1.0}/solana_agent/interfaces/services/routing.py +0 -0
- {solana_agent-23.0.7 → solana_agent-24.1.0}/solana_agent/plugins/__init__.py +0 -0
- {solana_agent-23.0.7 → solana_agent-24.1.0}/solana_agent/plugins/manager.py +0 -0
- {solana_agent-23.0.7 → solana_agent-24.1.0}/solana_agent/plugins/registry.py +0 -0
- {solana_agent-23.0.7 → solana_agent-24.1.0}/solana_agent/plugins/tools/__init__.py +0 -0
- {solana_agent-23.0.7 → solana_agent-24.1.0}/solana_agent/plugins/tools/auto_tool.py +0 -0
- {solana_agent-23.0.7 → solana_agent-24.1.0}/solana_agent/repositories/__init__.py +0 -0
- {solana_agent-23.0.7 → solana_agent-24.1.0}/solana_agent/services/__init__.py +0 -0
- {solana_agent-23.0.7 → solana_agent-24.1.0}/solana_agent/services/routing.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: solana-agent
|
3
|
-
Version: 23.0.7
|
3
|
+
Version: 24.1.0
|
4
4
|
Summary: Agentic IQ
|
5
5
|
License: MIT
|
6
6
|
Keywords: ai,openai,ai agents,agi
|
@@ -14,9 +14,11 @@ Classifier: Programming Language :: Python :: 3
|
|
14
14
|
Classifier: Programming Language :: Python :: 3.12
|
15
15
|
Classifier: Programming Language :: Python :: 3.13
|
16
16
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
17
|
-
Requires-Dist:
|
17
|
+
Requires-Dist: httpx (>=0.28.1,<0.29.0)
|
18
|
+
Requires-Dist: openai (>=1.71.0,<2.0.0)
|
18
19
|
Requires-Dist: pydantic (>=2.11.2,<3.0.0)
|
19
20
|
Requires-Dist: pymongo (>=4.11.3,<5.0.0)
|
21
|
+
Requires-Dist: websockets (>=15.0.1,<16.0.0)
|
20
22
|
Requires-Dist: zep-cloud (>=2.9.0,<3.0.0)
|
21
23
|
Project-URL: Documentation, https://docs.solana-agent.com
|
22
24
|
Project-URL: Repository, https://github.com/truemagic-coder/solana-agent
|
@@ -41,10 +43,11 @@ Build your AI business in three lines of code!
|
|
41
43
|
|
42
44
|
## Why?
|
43
45
|
* Three lines of code setup
|
46
|
+
* Fast Responses
|
44
47
|
* Multi-Agent Swarm
|
45
48
|
* Multi-Modal Streaming (Text & Audio)
|
46
49
|
* Conversational Memory & History
|
47
|
-
*
|
50
|
+
* Internet Search
|
48
51
|
* Intelligent Routing
|
49
52
|
* Business Alignment
|
50
53
|
* Extensible Tooling
|
@@ -56,11 +59,12 @@ Build your AI business in three lines of code!
|
|
56
59
|
## Features
|
57
60
|
|
58
61
|
* Easy three lines of code setup
|
62
|
+
* Fast AI responses
|
59
63
|
* Designed for a multi-agent swarm
|
60
64
|
* Seamless text and audio streaming with real-time multi-modal processing
|
61
65
|
* Configurable audio voice characteristics via prompting
|
62
66
|
* Persistent memory that preserves context across all agent interactions
|
63
|
-
* Quick
|
67
|
+
* Quick Internet search to answer users' queries
|
64
68
|
* Streamlined message history for all agent interactions
|
65
69
|
* Intelligent query routing to agents with optimal domain expertise or your own custom routing
|
66
70
|
* Unified value system ensuring brand-aligned agent responses
|
@@ -82,7 +86,6 @@ Build your AI business in three lines of code!
|
|
82
86
|
* [gpt-4o-mini](https://platform.openai.com/docs/models/gpt-4o-mini)
|
83
87
|
* [gpt-4o-mini-tts](https://platform.openai.com/docs/models/gpt-4o-mini-tts)
|
84
88
|
* [gpt-4o-mini-transcribe](https://platform.openai.com/docs/models/gpt-4o-mini-transcribe)
|
85
|
-
* [gpt-4o-mini-search-preview](https://platform.openai.com/docs/models/gpt-4o-mini-search-preview)
|
86
89
|
|
87
90
|
## Installation
|
88
91
|
|
@@ -353,21 +356,6 @@ API Calls:
|
|
353
356
|
|
354
357
|
* If the Zep user and session isn't created it creates them for 2 API calls (POST)
|
355
358
|
|
356
|
-
### Internet Search
|
357
|
-
|
358
|
-
This mode is great for text output where the default response from OpenAI is enough.
|
359
|
-
|
360
|
-
It is not suitable for audio as the OpenAI search results contain links and markdown.
|
361
|
-
|
362
|
-
Also it may not call tools when they should be called as it thinks the search results answer the user query.
|
363
|
-
|
364
|
-
It is much faster than calling `search_internet` from `sakit` as it saves 2 API calls.
|
365
|
-
|
366
|
-
```python
|
367
|
-
async for response in solana_agent.process("user123", "What is the latest news on Canada?", internet_search=True):
|
368
|
-
print(response, end="")
|
369
|
-
```
|
370
|
-
|
371
359
|
### Customize Speech
|
372
360
|
|
373
361
|
This is an audio to audio example using the `audio_instructions` parameter.
|
@@ -387,16 +375,25 @@ async for response in solana_agent.process("user123", audio_content, output_form
|
|
387
375
|
print(response, end="")
|
388
376
|
```
|
389
377
|
|
378
|
+
### Real-Time Audio Transcription
|
379
|
+
|
380
|
+
It is possible to disable real-time audio transcription responses to save on costs.
|
381
|
+
|
382
|
+
```python
|
383
|
+
async for response in solana_agent.process("user123", "What is the latest news on Canada?", audio_transcription_real_time=False):
|
384
|
+
print(response, end="")
|
385
|
+
```
|
386
|
+
|
390
387
|
## Tools
|
391
388
|
|
392
389
|
Tools can be used from plugins like Solana Agent Kit (sakit) or via inline tools. Tools available via plugins integrate automatically with Solana Agent.
|
393
390
|
|
394
391
|
* Agents can only call one tool per response
|
395
392
|
* Agents choose the best tool for the job
|
396
|
-
*
|
397
|
-
*
|
393
|
+
* Solana Agent doesn't use OpenAI function calling (tools) as they don't support async functions
|
394
|
+
* Solana Agent tools are async functions
|
398
395
|
|
399
|
-
### Plugin
|
396
|
+
### Internet Search (Plugin Example)
|
400
397
|
|
401
398
|
`pip install sakit`
|
402
399
|
|
@@ -17,10 +17,11 @@ Build your AI business in three lines of code!
|
|
17
17
|
|
18
18
|
## Why?
|
19
19
|
* Three lines of code setup
|
20
|
+
* Fast Responses
|
20
21
|
* Multi-Agent Swarm
|
21
22
|
* Multi-Modal Streaming (Text & Audio)
|
22
23
|
* Conversational Memory & History
|
23
|
-
*
|
24
|
+
* Internet Search
|
24
25
|
* Intelligent Routing
|
25
26
|
* Business Alignment
|
26
27
|
* Extensible Tooling
|
@@ -32,11 +33,12 @@ Build your AI business in three lines of code!
|
|
32
33
|
## Features
|
33
34
|
|
34
35
|
* Easy three lines of code setup
|
36
|
+
* Fast AI responses
|
35
37
|
* Designed for a multi-agent swarm
|
36
38
|
* Seamless text and audio streaming with real-time multi-modal processing
|
37
39
|
* Configurable audio voice characteristics via prompting
|
38
40
|
* Persistent memory that preserves context across all agent interactions
|
39
|
-
* Quick
|
41
|
+
* Quick Internet search to answer users' queries
|
40
42
|
* Streamlined message history for all agent interactions
|
41
43
|
* Intelligent query routing to agents with optimal domain expertise or your own custom routing
|
42
44
|
* Unified value system ensuring brand-aligned agent responses
|
@@ -58,7 +60,6 @@ Build your AI business in three lines of code!
|
|
58
60
|
* [gpt-4o-mini](https://platform.openai.com/docs/models/gpt-4o-mini)
|
59
61
|
* [gpt-4o-mini-tts](https://platform.openai.com/docs/models/gpt-4o-mini-tts)
|
60
62
|
* [gpt-4o-mini-transcribe](https://platform.openai.com/docs/models/gpt-4o-mini-transcribe)
|
61
|
-
* [gpt-4o-mini-search-preview](https://platform.openai.com/docs/models/gpt-4o-mini-search-preview)
|
62
63
|
|
63
64
|
## Installation
|
64
65
|
|
@@ -329,21 +330,6 @@ API Calls:
|
|
329
330
|
|
330
331
|
* If the Zep user and session isn't created it creates them for 2 API calls (POST)
|
331
332
|
|
332
|
-
### Internet Search
|
333
|
-
|
334
|
-
This mode is great for text output where the default response from OpenAI is enough.
|
335
|
-
|
336
|
-
It is not suitable for audio as the OpenAI search results contain links and markdown.
|
337
|
-
|
338
|
-
Also it may not call tools when they should be called as it thinks the search results answer the user query.
|
339
|
-
|
340
|
-
It is much faster than calling `search_internet` from `sakit` as it saves 2 API calls.
|
341
|
-
|
342
|
-
```python
|
343
|
-
async for response in solana_agent.process("user123", "What is the latest news on Canada?", internet_search=True):
|
344
|
-
print(response, end="")
|
345
|
-
```
|
346
|
-
|
347
333
|
### Customize Speech
|
348
334
|
|
349
335
|
This is an audio to audio example using the `audio_instructions` parameter.
|
@@ -363,16 +349,25 @@ async for response in solana_agent.process("user123", audio_content, output_form
|
|
363
349
|
print(response, end="")
|
364
350
|
```
|
365
351
|
|
352
|
+
### Real-Time Audio Transcription
|
353
|
+
|
354
|
+
It is possible to disable real-time audio transcription responses to save on costs.
|
355
|
+
|
356
|
+
```python
|
357
|
+
async for response in solana_agent.process("user123", "What is the latest news on Canada?", audio_transcription_real_time=False):
|
358
|
+
print(response, end="")
|
359
|
+
```
|
360
|
+
|
366
361
|
## Tools
|
367
362
|
|
368
363
|
Tools can be used from plugins like Solana Agent Kit (sakit) or via inline tools. Tools available via plugins integrate automatically with Solana Agent.
|
369
364
|
|
370
365
|
* Agents can only call one tool per response
|
371
366
|
* Agents choose the best tool for the job
|
372
|
-
*
|
373
|
-
*
|
367
|
+
* Solana Agent doesn't use OpenAI function calling (tools) as they don't support async functions
|
368
|
+
* Solana Agent tools are async functions
|
374
369
|
|
375
|
-
### Plugin
|
370
|
+
### Internet Search (Plugin Example)
|
376
371
|
|
377
372
|
`pip install sakit`
|
378
373
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "solana-agent"
|
3
|
-
version = "23.0.7"
|
3
|
+
version = "24.1.0"
|
4
4
|
description = "Agentic IQ"
|
5
5
|
authors = ["Bevan Hunt <bevan@bevanhunt.com>"]
|
6
6
|
license = "MIT"
|
@@ -23,10 +23,12 @@ python_paths = [".", "tests"]
|
|
23
23
|
|
24
24
|
[tool.poetry.dependencies]
|
25
25
|
python = ">=3.12,<4.0"
|
26
|
-
openai = "^1.
|
26
|
+
openai = "^1.71.0"
|
27
27
|
pydantic = "^2.11.2"
|
28
28
|
pymongo = "^4.11.3"
|
29
29
|
zep-cloud = "^2.9.0"
|
30
|
+
httpx = "^0.28.1"
|
31
|
+
websockets = "^15.0.1"
|
30
32
|
|
31
33
|
[tool.poetry.group.dev.dependencies]
|
32
34
|
pytest = "^8.3.5"
|
@@ -0,0 +1,332 @@
|
|
1
|
+
"""
|
2
|
+
LLM provider adapters for the Solana Agent system.
|
3
|
+
|
4
|
+
These adapters implement the LLMProvider interface for different LLM services.
|
5
|
+
"""
|
6
|
+
import asyncio
|
7
|
+
import json
|
8
|
+
from typing import Any, AsyncGenerator, Callable, Dict, Literal, Optional, Type, TypeVar
|
9
|
+
|
10
|
+
import httpx
|
11
|
+
from openai import AsyncOpenAI
|
12
|
+
from pydantic import BaseModel
|
13
|
+
import websockets
|
14
|
+
|
15
|
+
from solana_agent.interfaces.providers.llm import LLMProvider
|
16
|
+
|
17
|
+
T = TypeVar('T', bound=BaseModel)
|
18
|
+
|
19
|
+
|
20
|
+
class OpenAIAdapter(LLMProvider):
    """OpenAI implementation of LLMProvider.

    Wraps an ``AsyncOpenAI`` client and exposes streaming text generation,
    structured-output parsing, text-to-speech, file transcription and
    realtime (WebSocket) audio transcription.
    """

    def __init__(self, api_key: str):
        """Create the async OpenAI client and pin the model used per task.

        Args:
            api_key: OpenAI API key; also reused for the raw HTTP calls made
                by the realtime methods (read back via ``self.client.api_key``).
        """
        self.client = AsyncOpenAI(api_key=api_key)
        self.parse_model = "gpt-4o-mini"
        self.text_model = "gpt-4o-mini"
        self.transcription_model = "gpt-4o-mini-transcribe"
        self.tts_model = "gpt-4o-mini-tts"

    @staticmethod
    def _log_exception(context: str, error: Exception) -> None:
        """Print ``error`` and the active traceback, labelled with ``context``.

        Must be called from inside an ``except`` block so that
        ``traceback.format_exc()`` can see the exception being handled.
        """
        import traceback

        print(f"Error in {context}: {error}")
        print(traceback.format_exc())

    async def tts(
        self,
        text: str,
        instructions: str = "You speak in a friendly and helpful manner.",
        voice: Literal["alloy", "ash", "ballad", "coral", "echo",
                       "fable", "onyx", "nova", "sage", "shimmer"] = "nova",
        response_format: Literal['mp3', 'opus',
                                 'aac', 'flac', 'wav', 'pcm'] = "aac",
    ) -> AsyncGenerator[bytes, None]:  # pragma: no cover
        """Stream text-to-speech audio from OpenAI models.

        Args:
            text: Text to convert to speech
            instructions: Optional instructions for speech generation
            voice: Voice to use for synthesis
            response_format: Audio format

        Yields:
            Audio bytes as they become available. On any error a single
            empty ``bytes`` object is yielded instead of raising.
        """
        try:
            async with self.client.audio.speech.with_streaming_response.create(
                model=self.tts_model,
                voice=voice,
                instructions=instructions,
                input=text,
                response_format=response_format,
            ) as stream:
                # Stream the bytes in 16KB chunks
                async for chunk in stream.iter_bytes(chunk_size=1024 * 16):
                    yield chunk

        # The original carried this handler twice; the second copy could
        # never run, so only one is kept.
        except Exception as e:
            self._log_exception("text_to_speech", e)
            yield b""  # Return empty bytes on error

    async def transcribe_audio(
        self,
        audio_bytes: bytes,
        input_format: Literal[
            "flac", "mp3", "mp4", "mpeg", "mpga", "m4a", "ogg", "wav", "webm"
        ] = "mp4",
    ) -> AsyncGenerator[str, None]:  # pragma: no cover
        """Stream transcription of an audio file.

        Args:
            audio_bytes: Audio file bytes
            input_format: Format of the input audio file; used as the
                extension of the synthetic file name sent with the upload

        Yields:
            Transcript text chunks as they become available. On any error a
            single apology string containing the error text is yielded.
        """
        try:
            async with self.client.audio.transcriptions.with_streaming_response.create(
                model=self.transcription_model,
                file=(f"file.{input_format}", audio_bytes),
                response_format="text",
            ) as stream:
                # Stream the text in 16KB chunks
                async for chunk in stream.iter_text(chunk_size=1024 * 16):
                    yield chunk

        except Exception as e:
            self._log_exception("transcribe_audio", e)
            yield f"I apologize, but I encountered an error transcribing the audio: {e}"

    async def generate_text(
        self,
        prompt: str,
        system_prompt: str = "",
    ) -> AsyncGenerator[str, None]:  # pragma: no cover
        """Stream generated text from the OpenAI chat completions API.

        Args:
            prompt: User message content
            system_prompt: Optional system message; omitted when empty

        Yields:
            Non-empty content deltas in arrival order. On any error a single
            apology string containing the error text is yielded.
        """
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.append({"role": "user", "content": prompt})

        request_params = {
            "messages": messages,
            "stream": True,
            "model": self.text_model,
        }
        try:
            response = await self.client.chat.completions.create(**request_params)

            async for chunk in response:
                # Skip chunks that carry no choices or an empty delta.
                if chunk.choices and chunk.choices[0].delta.content:
                    yield chunk.choices[0].delta.content

        except Exception as e:
            self._log_exception("generate_text", e)
            yield f"I apologize, but I encountered an error: {e}"

    async def parse_structured_output(
        self,
        prompt: str,
        system_prompt: str,
        model_class: Type[T],
    ) -> T:  # pragma: no cover
        """Generate structured output using Pydantic model parsing.

        Args:
            prompt: User message content
            system_prompt: System message; omitted when empty
            model_class: Pydantic model class passed as the response format

        Returns:
            The parsed message of the first choice. If the request fails the
            error is printed and ``None`` is returned, despite the ``T``
            annotation, so callers must handle a ``None`` result.
        """
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.append({"role": "user", "content": prompt})

        try:
            completion = await self.client.beta.chat.completions.parse(
                model=self.parse_model,
                messages=messages,
                response_format=model_class,
            )
            return completion.choices[0].message.parsed
        except Exception as e:
            print(f"Error with beta.parse method: {e}")
            # NOTE(review): there is no fallback path after the beta parse
            # call; the implicit None of the original is made explicit here.
            return None

    async def create_realtime_session(
        self,
        model: str = "gpt-4o-mini-realtime-preview",
        modalities: Optional[list] = None,
        instructions: str = "You are a helpful assistant.",
        voice: str = "alloy",
        input_audio_format: str = "pcm16",
        output_audio_format: str = "pcm16",
    ) -> Dict[str, Any]:  # pragma: no cover
        """Create a realtime session token for WebSocket communication.

        Args:
            model: Realtime model name sent in the request body
            modalities: Modalities to enable; defaults to
                ``["audio", "text"]`` when omitted
            instructions: Instructions sent in the request body
            voice: Voice name sent in the request body
            input_audio_format: Input audio format sent in the request body
            output_audio_format: Output audio format sent in the request body

        Returns:
            The decoded JSON body of the session-creation response.

        Raises:
            Exception: If the endpoint answers with a non-200 status or the
                request itself fails (the error is printed, then re-raised).
        """
        # A None sentinel replaces the original mutable list default.
        if modalities is None:
            modalities = ["audio", "text"]

        try:
            # Get the API key from the AsyncOpenAI client
            api_key = self.client.api_key

            async with httpx.AsyncClient() as client:
                response = await client.post(
                    "https://api.openai.com/v1/realtime/sessions",
                    json={
                        "model": model,
                        "modalities": modalities,
                        "instructions": instructions,
                        "voice": voice,
                        "input_audio_format": input_audio_format,
                        "output_audio_format": output_audio_format,
                    },
                    headers={
                        "Authorization": f"Bearer {api_key}",
                        "Content-Type": "application/json",
                        "OpenAI-Beta": "realtime=v1",
                    },
                )

            if response.status_code != 200:
                raise Exception(
                    f"Failed to create realtime session: {response.text}")
            return response.json()
        except Exception as e:
            print(f"Error creating realtime session: {e}")
            raise

    async def realtime_audio_transcription(
        self,
        audio_generator: AsyncGenerator[bytes, None],
        transcription_config: Optional[Dict[str, Any]] = None,
        on_event: Optional[Callable[[Dict[str, Any]], Any]] = None,
    ) -> AsyncGenerator[str, None]:  # pragma: no cover
        """Stream real-time audio transcription using the Realtime API.

        A transcription session is created over HTTP, its client secret is
        used to open a WebSocket, audio chunks are sent from a background
        task, and transcription events are read until a "completed" event
        arrives.

        Args:
            audio_generator: Async generator that yields audio chunks
            transcription_config: Optional custom configuration for
                transcription; a pcm16 / server-VAD default is used when
                omitted
            on_event: Optional callback invoked with every raw event; may be
                sync or async (an awaitable result is awaited)

        Yields:
            Transcription deltas as they arrive, then the final transcript.
            On any error a single apology string containing the error text
            is yielded.
        """
        import base64
        import inspect

        # Create default transcription config if none provided
        if transcription_config is None:
            transcription_config = {
                "input_audio_format": "pcm16",
                "input_audio_transcription": {
                    "model": "gpt-4o-mini-transcribe"
                },
                "turn_detection": {
                    "type": "server_vad",
                    "threshold": 0.5,
                    "prefix_padding_ms": 300,
                    "silence_duration_ms": 200
                }
            }

        try:
            # Get the API key from the AsyncOpenAI client
            api_key = self.client.api_key

            # Create transcription session
            async with httpx.AsyncClient() as client:
                response = await client.post(
                    "https://api.openai.com/v1/realtime/transcription_sessions",
                    json=transcription_config,
                    headers={
                        "Authorization": f"Bearer {api_key}",
                        "Content-Type": "application/json",
                        "OpenAI-Beta": "realtime=v1",
                    },
                )

            if response.status_code != 200:
                raise Exception(
                    f"Failed to create transcription session: {response.text}")

            session = response.json()
            client_secret = session["client_secret"]["value"]

            # The WebSocket authenticates with the session's client secret,
            # not the long-lived API key.
            # NOTE(review): the model in the URL is fixed and does not follow
            # a model set in a custom transcription_config — confirm intent.
            url = "wss://api.openai.com/v1/realtime?model=gpt-4o-mini-transcribe"
            headers = {
                "Authorization": f"Bearer {client_secret}",
                "OpenAI-Beta": "realtime=v1",
            }

            async with websockets.connect(url, additional_headers=headers) as websocket:

                async def send_audio():
                    """Send every audio chunk, then commit the input buffer."""
                    try:
                        async for audio_chunk in audio_generator:
                            encoded_audio = base64.b64encode(
                                audio_chunk).decode('utf-8')

                            await websocket.send(json.dumps({
                                "type": "input_audio_buffer.append",
                                "audio": encoded_audio
                            }))

                            # Small delay to prevent flooding
                            await asyncio.sleep(0.05)

                        # Commit the audio buffer when done
                        await websocket.send(json.dumps({
                            "type": "input_audio_buffer.commit"
                        }))
                    except Exception as e:
                        print(f"Error sending audio: {e}")

                # Start sending audio in the background
                audio_task = asyncio.create_task(send_audio())

                try:
                    while True:
                        message = await websocket.recv()
                        event = json.loads(message)

                        if on_event is not None:
                            # Await whatever the callback returns if it is
                            # awaitable; this covers async functions, partials
                            # and async callable objects without relying on
                            # asyncio.iscoroutinefunction (deprecated in 3.14).
                            outcome = on_event(event)
                            if inspect.isawaitable(outcome):
                                await outcome

                        event_type = event.get("type")

                        if event_type == "conversation.item.input_audio_transcription.delta":
                            yield event["delta"]
                        elif event_type == "conversation.item.input_audio_transcription.completed":
                            # The completed event ends the stream.
                            yield event["transcript"]
                            break
                finally:
                    # Clean up audio task if it's still running
                    if not audio_task.done():
                        audio_task.cancel()
                        try:
                            await audio_task
                        except asyncio.CancelledError:
                            pass

        except Exception as e:
            self._log_exception("realtime audio transcription", e)
            yield f"I apologize, but I encountered an error transcribing the audio: {e}"
|
@@ -55,8 +55,8 @@ class SolanaAgent(SolanaAgentInterface):
|
|
55
55
|
audio_input_format: Literal[
|
56
56
|
"flac", "mp3", "mp4", "mpeg", "mpga", "m4a", "ogg", "wav", "webm"
|
57
57
|
] = "mp4",
|
58
|
+
audio_transcription_real_time: bool = True,
|
58
59
|
router: Optional[RoutingInterface] = None,
|
59
|
-
internet_search: bool = False,
|
60
60
|
) -> AsyncGenerator[Union[str, bytes], None]: # pragma: no cover
|
61
61
|
"""Process a user message and return the response stream.
|
62
62
|
|
@@ -69,8 +69,8 @@ class SolanaAgent(SolanaAgentInterface):
|
|
69
69
|
audio_instructions: Audio voice instructions
|
70
70
|
audio_output_format: Audio output format
|
71
71
|
audio_input_format: Audio input format
|
72
|
+
audio_transcription_real_time: Flag for real-time audio transcription
|
72
73
|
router: Optional routing service for processing
|
73
|
-
internet_search: Flag to use OpenAI Internet search
|
74
74
|
|
75
75
|
Returns:
|
76
76
|
Async generator yielding response chunks (text strings or audio bytes)
|
@@ -85,7 +85,7 @@ class SolanaAgent(SolanaAgentInterface):
|
|
85
85
|
audio_input_format=audio_input_format,
|
86
86
|
prompt=prompt,
|
87
87
|
router=router,
|
88
|
-
|
88
|
+
audio_transcription_real_time=audio_transcription_real_time,
|
89
89
|
):
|
90
90
|
yield chunk
|
91
91
|
|
@@ -24,7 +24,7 @@ class SolanaAgent(ABC):
|
|
24
24
|
"flac", "mp3", "mp4", "mpeg", "mpga", "m4a", "ogg", "wav", "webm"
|
25
25
|
] = "mp4",
|
26
26
|
router: Optional[RoutingInterface] = None,
|
27
|
-
|
27
|
+
audio_transcription_real_time: bool = True,
|
28
28
|
) -> AsyncGenerator[Union[str, bytes], None]:
|
29
29
|
"""Process a user message and return the response stream."""
|
30
30
|
pass
|
@@ -1,5 +1,5 @@
|
|
1
1
|
from abc import ABC, abstractmethod
|
2
|
-
from typing import AsyncGenerator, List, Literal, Type, TypeVar, Union
|
2
|
+
from typing import Any, AsyncGenerator, Callable, Dict, List, Literal, Optional, Type, TypeVar, Union
|
3
3
|
|
4
4
|
from pydantic import BaseModel
|
5
5
|
|
@@ -15,7 +15,6 @@ class LLMProvider(ABC):
|
|
15
15
|
self,
|
16
16
|
prompt: str,
|
17
17
|
system_prompt: str = "",
|
18
|
-
internet_search: bool = False,
|
19
18
|
) -> AsyncGenerator[str, None]:
|
20
19
|
"""Generate text from the language model."""
|
21
20
|
pass
|
@@ -50,3 +49,13 @@ class LLMProvider(ABC):
|
|
50
49
|
) -> AsyncGenerator[str, None]:
|
51
50
|
"""Transcribe audio from the language model."""
|
52
51
|
pass
|
52
|
+
|
53
|
+
@abstractmethod
|
54
|
+
async def realtime_audio_transcription(
|
55
|
+
self,
|
56
|
+
audio_generator: AsyncGenerator[bytes, None],
|
57
|
+
transcription_config: Optional[Dict[str, Any]] = None,
|
58
|
+
on_event: Optional[Callable[[Dict[str, Any]], Any]] = None,
|
59
|
+
) -> AsyncGenerator[str, None]:
|
60
|
+
"""Stream real-time audio transcription from the language model."""
|
61
|
+
pass
|
@@ -34,7 +34,7 @@ class AgentService(ABC):
|
|
34
34
|
"flac", "mp3", "mp4", "mpeg", "mpga", "m4a", "ogg", "wav", "webm"
|
35
35
|
] = "mp4",
|
36
36
|
prompt: Optional[str] = None,
|
37
|
-
|
37
|
+
audio_transcription_real_time: bool = True,
|
38
38
|
) -> AsyncGenerator[Union[str, bytes], None]:
|
39
39
|
"""Generate a response from an agent."""
|
40
40
|
pass
|
@@ -1,6 +1,8 @@
|
|
1
1
|
from abc import ABC, abstractmethod
|
2
2
|
from typing import Any, AsyncGenerator, Dict, Literal, Optional, Union
|
3
3
|
|
4
|
+
from solana_agent.interfaces.services.routing import RoutingService as RoutingInterface
|
5
|
+
|
4
6
|
|
5
7
|
class QueryService(ABC):
|
6
8
|
"""Interface for processing user queries."""
|
@@ -20,7 +22,8 @@ class QueryService(ABC):
|
|
20
22
|
"flac", "mp3", "mp4", "mpeg", "mpga", "m4a", "ogg", "wav", "webm"
|
21
23
|
] = "mp4",
|
22
24
|
prompt: Optional[str] = None,
|
23
|
-
|
25
|
+
router: Optional[RoutingInterface] = None,
|
26
|
+
audio_transcription_real_time: bool = True,
|
24
27
|
) -> AsyncGenerator[Union[str, bytes], None]:
|
25
28
|
"""Process the user request and generate a response."""
|
26
29
|
pass
|
@@ -69,8 +69,8 @@ class MemoryRepository(MemoryProvider):
|
|
69
69
|
# Store truncated messages
|
70
70
|
doc = {
|
71
71
|
"user_id": user_id,
|
72
|
-
"user_message":
|
73
|
-
"assistant_message":
|
72
|
+
"user_message": user_msg,
|
73
|
+
"assistant_message": assistant_msg,
|
74
74
|
"timestamp": datetime.now(timezone.utc)
|
75
75
|
}
|
76
76
|
self.mongo.insert_one(self.collection, doc)
|