solana-agent 24.1.0__tar.gz → 24.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. {solana_agent-24.1.0 → solana_agent-24.1.2}/PKG-INFO +1 -12
  2. {solana_agent-24.1.0 → solana_agent-24.1.2}/README.md +0 -9
  3. {solana_agent-24.1.0 → solana_agent-24.1.2}/pyproject.toml +1 -3
  4. solana_agent-24.1.2/solana_agent/adapters/llm_adapter.py +158 -0
  5. {solana_agent-24.1.0 → solana_agent-24.1.2}/solana_agent/client/solana_agent.py +0 -3
  6. {solana_agent-24.1.0 → solana_agent-24.1.2}/solana_agent/interfaces/client/client.py +0 -1
  7. {solana_agent-24.1.0 → solana_agent-24.1.2}/solana_agent/interfaces/providers/llm.py +0 -10
  8. {solana_agent-24.1.0 → solana_agent-24.1.2}/solana_agent/interfaces/services/agent.py +0 -1
  9. {solana_agent-24.1.0 → solana_agent-24.1.2}/solana_agent/interfaces/services/query.py +0 -1
  10. {solana_agent-24.1.0 → solana_agent-24.1.2}/solana_agent/services/agent.py +2 -17
  11. {solana_agent-24.1.0 → solana_agent-24.1.2}/solana_agent/services/query.py +0 -4
  12. solana_agent-24.1.0/solana_agent/adapters/llm_adapter.py +0 -332
  13. {solana_agent-24.1.0 → solana_agent-24.1.2}/LICENSE +0 -0
  14. {solana_agent-24.1.0 → solana_agent-24.1.2}/solana_agent/__init__.py +0 -0
  15. {solana_agent-24.1.0 → solana_agent-24.1.2}/solana_agent/adapters/__init__.py +0 -0
  16. {solana_agent-24.1.0 → solana_agent-24.1.2}/solana_agent/adapters/mongodb_adapter.py +0 -0
  17. {solana_agent-24.1.0 → solana_agent-24.1.2}/solana_agent/client/__init__.py +0 -0
  18. {solana_agent-24.1.0 → solana_agent-24.1.2}/solana_agent/domains/__init__.py +0 -0
  19. {solana_agent-24.1.0 → solana_agent-24.1.2}/solana_agent/domains/agent.py +0 -0
  20. {solana_agent-24.1.0 → solana_agent-24.1.2}/solana_agent/domains/routing.py +0 -0
  21. {solana_agent-24.1.0 → solana_agent-24.1.2}/solana_agent/factories/__init__.py +0 -0
  22. {solana_agent-24.1.0 → solana_agent-24.1.2}/solana_agent/factories/agent_factory.py +0 -0
  23. {solana_agent-24.1.0 → solana_agent-24.1.2}/solana_agent/interfaces/__init__.py +0 -0
  24. {solana_agent-24.1.0 → solana_agent-24.1.2}/solana_agent/interfaces/plugins/plugins.py +0 -0
  25. {solana_agent-24.1.0 → solana_agent-24.1.2}/solana_agent/interfaces/providers/data_storage.py +0 -0
  26. {solana_agent-24.1.0 → solana_agent-24.1.2}/solana_agent/interfaces/providers/memory.py +0 -0
  27. {solana_agent-24.1.0 → solana_agent-24.1.2}/solana_agent/interfaces/services/routing.py +0 -0
  28. {solana_agent-24.1.0 → solana_agent-24.1.2}/solana_agent/plugins/__init__.py +0 -0
  29. {solana_agent-24.1.0 → solana_agent-24.1.2}/solana_agent/plugins/manager.py +0 -0
  30. {solana_agent-24.1.0 → solana_agent-24.1.2}/solana_agent/plugins/registry.py +0 -0
  31. {solana_agent-24.1.0 → solana_agent-24.1.2}/solana_agent/plugins/tools/__init__.py +0 -0
  32. {solana_agent-24.1.0 → solana_agent-24.1.2}/solana_agent/plugins/tools/auto_tool.py +0 -0
  33. {solana_agent-24.1.0 → solana_agent-24.1.2}/solana_agent/repositories/__init__.py +0 -0
  34. {solana_agent-24.1.0 → solana_agent-24.1.2}/solana_agent/repositories/memory.py +0 -0
  35. {solana_agent-24.1.0 → solana_agent-24.1.2}/solana_agent/services/__init__.py +0 -0
  36. {solana_agent-24.1.0 → solana_agent-24.1.2}/solana_agent/services/routing.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: solana-agent
3
- Version: 24.1.0
3
+ Version: 24.1.2
4
4
  Summary: Agentic IQ
5
5
  License: MIT
6
6
  Keywords: ai,openai,ai agents,agi
@@ -14,11 +14,9 @@ Classifier: Programming Language :: Python :: 3
14
14
  Classifier: Programming Language :: Python :: 3.12
15
15
  Classifier: Programming Language :: Python :: 3.13
16
16
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
- Requires-Dist: httpx (>=0.28.1,<0.29.0)
18
17
  Requires-Dist: openai (>=1.71.0,<2.0.0)
19
18
  Requires-Dist: pydantic (>=2.11.2,<3.0.0)
20
19
  Requires-Dist: pymongo (>=4.11.3,<5.0.0)
21
- Requires-Dist: websockets (>=15.0.1,<16.0.0)
22
20
  Requires-Dist: zep-cloud (>=2.9.0,<3.0.0)
23
21
  Project-URL: Documentation, https://docs.solana-agent.com
24
22
  Project-URL: Repository, https://github.com/truemagic-coder/solana-agent
@@ -375,15 +373,6 @@ async for response in solana_agent.process("user123", audio_content, output_form
375
373
  print(response, end="")
376
374
  ```
377
375
 
378
- ### Real-Time Audio Transcription
379
-
380
- It is possible to disable real-time audio transcription responses to save on costs.
381
-
382
- ```python
383
- async for response in solana_agent.process("user123", "What is the latest news on Canada?", audio_transcription_real_time=False):
384
- print(response, end="")
385
- ```
386
-
387
376
  ## Tools
388
377
 
389
378
  Tools can be used from plugins like Solana Agent Kit (sakit) or via inline tools. Tools available via plugins integrate automatically with Solana Agent.
@@ -349,15 +349,6 @@ async for response in solana_agent.process("user123", audio_content, output_form
349
349
  print(response, end="")
350
350
  ```
351
351
 
352
- ### Real-Time Audio Transcription
353
-
354
- It is possible to disable real-time audio transcription responses to save on costs.
355
-
356
- ```python
357
- async for response in solana_agent.process("user123", "What is the latest news on Canada?", audio_transcription_real_time=False):
358
- print(response, end="")
359
- ```
360
-
361
352
  ## Tools
362
353
 
363
354
  Tools can be used from plugins like Solana Agent Kit (sakit) or via inline tools. Tools available via plugins integrate automatically with Solana Agent.
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "solana-agent"
3
- version = "24.1.0"
3
+ version = "24.1.2"
4
4
  description = "Agentic IQ"
5
5
  authors = ["Bevan Hunt <bevan@bevanhunt.com>"]
6
6
  license = "MIT"
@@ -27,8 +27,6 @@ openai = "^1.71.0"
27
27
  pydantic = "^2.11.2"
28
28
  pymongo = "^4.11.3"
29
29
  zep-cloud = "^2.9.0"
30
- httpx = "^0.28.1"
31
- websockets = "^15.0.1"
32
30
 
33
31
  [tool.poetry.group.dev.dependencies]
34
32
  pytest = "^8.3.5"
@@ -0,0 +1,158 @@
1
+ """
2
+ LLM provider adapters for the Solana Agent system.
3
+
4
+ These adapters implement the LLMProvider interface for different LLM services.
5
+ """
6
+ from typing import AsyncGenerator, Literal, Type, TypeVar
7
+
8
+ from openai import AsyncOpenAI
9
+ from pydantic import BaseModel
10
+
11
+ from solana_agent.interfaces.providers.llm import LLMProvider
12
+
13
+ T = TypeVar('T', bound=BaseModel)
14
+
15
+
16
+ class OpenAIAdapter(LLMProvider):
17
+ """OpenAI implementation of LLMProvider with web search capabilities."""
18
+
19
+ def __init__(self, api_key: str):
20
+ self.client = AsyncOpenAI(api_key=api_key)
21
+ self.parse_model = "gpt-4o-mini"
22
+ self.text_model = "gpt-4o-mini"
23
+ self.transcription_model = "gpt-4o-mini-transcribe"
24
+ self.tts_model = "gpt-4o-mini-tts"
25
+
26
+ async def tts(
27
+ self,
28
+ text: str,
29
+ instructions: str = "You speak in a friendly and helpful manner.",
30
+ voice: Literal["alloy", "ash", "ballad", "coral", "echo",
31
+ "fable", "onyx", "nova", "sage", "shimmer"] = "nova",
32
+ response_format: Literal['mp3', 'opus',
33
+ 'aac', 'flac', 'wav', 'pcm'] = "aac",
34
+ ) -> AsyncGenerator[bytes, None]: # pragma: no cover
35
+ """Stream text-to-speech audio from OpenAI models.
36
+
37
+ Args:
38
+ text: Text to convert to speech
39
+ instructions: Optional instructions for speech generation
40
+ voice: Voice to use for synthesis
41
+ response_format: Audio format
42
+
43
+ Yields:
44
+ Audio bytes as they become available
45
+ """
46
+ try:
47
+ async with self.client.audio.speech.with_streaming_response.create(
48
+ model=self.tts_model,
49
+ voice=voice,
50
+ instructions=instructions,
51
+ input=text,
52
+ response_format=response_format
53
+ ) as stream:
54
+ # Stream the bytes in 16KB chunks
55
+ async for chunk in stream.iter_bytes(chunk_size=1024 * 16):
56
+ yield chunk
57
+
58
+ except Exception as e:
59
+ print(f"Error in text_to_speech: {str(e)}")
60
+ import traceback
61
+ print(traceback.format_exc())
62
+ yield b"" # Return empty bytes on error
63
+
64
+ except Exception as e:
65
+ print(f"Error in text_to_speech: {str(e)}")
66
+ import traceback
67
+ print(traceback.format_exc())
68
+ yield b"" # Return empty bytes on error
69
+
70
+ async def transcribe_audio(
71
+ self,
72
+ audio_bytes: bytes,
73
+ input_format: Literal[
74
+ "flac", "mp3", "mp4", "mpeg", "mpga", "m4a", "ogg", "wav", "webm"
75
+ ] = "mp4",
76
+ ) -> AsyncGenerator[str, None]: # pragma: no cover
77
+ """Stream transcription of an audio file.
78
+
79
+ Args:
80
+ audio_bytes: Audio file bytes
81
+ input_format: Format of the input audio file
82
+
83
+ Yields:
84
+ Transcript text chunks as they become available
85
+ """
86
+ try:
87
+ async with self.client.audio.transcriptions.with_streaming_response.create(
88
+ model=self.transcription_model,
89
+ file=(f"file.{input_format}", audio_bytes),
90
+ response_format="text",
91
+ ) as stream:
92
+ # Stream the text in 16KB chunks
93
+ async for chunk in stream.iter_text(chunk_size=1024 * 16):
94
+ yield chunk
95
+
96
+ except Exception as e:
97
+ print(f"Error in transcribe_audio: {str(e)}")
98
+ import traceback
99
+ print(traceback.format_exc())
100
+ yield f"I apologize, but I encountered an error transcribing the audio: {str(e)}"
101
+
102
+ async def generate_text(
103
+ self,
104
+ prompt: str,
105
+ system_prompt: str = "",
106
+ ) -> AsyncGenerator[str, None]: # pragma: no cover
107
+ """Generate text from OpenAI models."""
108
+ messages = []
109
+
110
+ if system_prompt:
111
+ messages.append({"role": "system", "content": system_prompt})
112
+
113
+ messages.append({"role": "user", "content": prompt})
114
+
115
+ # Prepare request parameters
116
+ request_params = {
117
+ "messages": messages,
118
+ "stream": True,
119
+ "model": self.text_model,
120
+ }
121
+ try:
122
+ response = await self.client.chat.completions.create(**request_params)
123
+
124
+ async for chunk in response:
125
+ if chunk.choices:
126
+ if chunk.choices[0].delta.content:
127
+ text = chunk.choices[0].delta.content
128
+ yield text
129
+
130
+ except Exception as e:
131
+ print(f"Error in generate_text: {str(e)}")
132
+ import traceback
133
+ print(traceback.format_exc())
134
+ yield f"I apologize, but I encountered an error: {str(e)}"
135
+
136
+ async def parse_structured_output(
137
+ self,
138
+ prompt: str,
139
+ system_prompt: str,
140
+ model_class: Type[T],
141
+ ) -> T: # pragma: no cover
142
+ """Generate structured output using Pydantic model parsing."""
143
+ messages = []
144
+ if system_prompt:
145
+ messages.append({"role": "system", "content": system_prompt})
146
+
147
+ messages.append({"role": "user", "content": prompt})
148
+
149
+ try:
150
+ # First try the beta parsing API
151
+ completion = await self.client.beta.chat.completions.parse(
152
+ model=self.parse_model,
153
+ messages=messages,
154
+ response_format=model_class,
155
+ )
156
+ return completion.choices[0].message.parsed
157
+ except Exception as e:
158
+ print(f"Error with beta.parse method: {e}")
@@ -55,7 +55,6 @@ class SolanaAgent(SolanaAgentInterface):
55
55
  audio_input_format: Literal[
56
56
  "flac", "mp3", "mp4", "mpeg", "mpga", "m4a", "ogg", "wav", "webm"
57
57
  ] = "mp4",
58
- audio_transcription_real_time: bool = True,
59
58
  router: Optional[RoutingInterface] = None,
60
59
  ) -> AsyncGenerator[Union[str, bytes], None]: # pragma: no cover
61
60
  """Process a user message and return the response stream.
@@ -69,7 +68,6 @@ class SolanaAgent(SolanaAgentInterface):
69
68
  audio_instructions: Audio voice instructions
70
69
  audio_output_format: Audio output format
71
70
  audio_input_format: Audio input format
72
- audio_transcription_real_time: Flag for real-time audio transcription
73
71
  router: Optional routing service for processing
74
72
 
75
73
  Returns:
@@ -85,7 +83,6 @@ class SolanaAgent(SolanaAgentInterface):
85
83
  audio_input_format=audio_input_format,
86
84
  prompt=prompt,
87
85
  router=router,
88
- audio_transcription_real_time=audio_transcription_real_time,
89
86
  ):
90
87
  yield chunk
91
88
 
@@ -24,7 +24,6 @@ class SolanaAgent(ABC):
24
24
  "flac", "mp3", "mp4", "mpeg", "mpga", "m4a", "ogg", "wav", "webm"
25
25
  ] = "mp4",
26
26
  router: Optional[RoutingInterface] = None,
27
- audio_transcription_real_time: bool = True,
28
27
  ) -> AsyncGenerator[Union[str, bytes], None]:
29
28
  """Process a user message and return the response stream."""
30
29
  pass
@@ -49,13 +49,3 @@ class LLMProvider(ABC):
49
49
  ) -> AsyncGenerator[str, None]:
50
50
  """Transcribe audio from the language model."""
51
51
  pass
52
-
53
- @abstractmethod
54
- async def realtime_audio_transcription(
55
- self,
56
- audio_generator: AsyncGenerator[bytes, None],
57
- transcription_config: Optional[Dict[str, Any]] = None,
58
- on_event: Optional[Callable[[Dict[str, Any]], Any]] = None,
59
- ) -> AsyncGenerator[str, None]:
60
- """Stream real-time audio transcription from the language model."""
61
- pass
@@ -34,7 +34,6 @@ class AgentService(ABC):
34
34
  "flac", "mp3", "mp4", "mpeg", "mpga", "m4a", "ogg", "wav", "webm"
35
35
  ] = "mp4",
36
36
  prompt: Optional[str] = None,
37
- audio_transcription_real_time: bool = True,
38
37
  ) -> AsyncGenerator[Union[str, bytes], None]:
39
38
  """Generate a response from an agent."""
40
39
  pass
@@ -23,7 +23,6 @@ class QueryService(ABC):
23
23
  ] = "mp4",
24
24
  prompt: Optional[str] = None,
25
25
  router: Optional[RoutingInterface] = None,
26
- audio_transcription_real_time: bool = True,
27
26
  ) -> AsyncGenerator[Union[str, bytes], None]:
28
27
  """Process the user request and generate a response."""
29
28
  pass
@@ -176,7 +176,6 @@ class AgentService(AgentServiceInterface):
176
176
  audio_input_format: Literal[
177
177
  "flac", "mp3", "mp4", "mpeg", "mpga", "m4a", "ogg", "wav", "webm"
178
178
  ] = "mp4",
179
- audio_transcription_real_time: bool = True,
180
179
  prompt: Optional[str] = None,
181
180
  ) -> AsyncGenerator[Union[str, bytes], None]: # pragma: no cover
182
181
  """Generate a response with support for text/audio input/output."""
@@ -195,22 +194,8 @@ class AgentService(AgentServiceInterface):
195
194
  # Handle audio input if provided - KEEP REAL-TIME AUDIO TRANSCRIPTION
196
195
  query_text = ""
197
196
  if not isinstance(query, str):
198
- if audio_transcription_real_time and hasattr(self.llm_provider, "realtime_audio_transcription"):
199
- # Use realtime transcription for faster processing if available
200
- print("Using realtime audio transcription")
201
- async for transcript in self.llm_provider.realtime_audio_transcription(
202
- audio_generator=self._bytes_to_generator(query),
203
- transcription_config={
204
- "input_audio_format": audio_input_format}
205
- ):
206
- query_text += transcript
207
- else:
208
- # Fall back to standard transcription
209
- print("Using standard audio transcription")
210
- async for transcript in self.llm_provider.transcribe_audio(query, input_format=audio_input_format):
211
- query_text += transcript
212
-
213
- print(f"Transcribed query: {query_text}")
197
+ async for transcript in self.llm_provider.transcribe_audio(query, input_format=audio_input_format):
198
+ query_text += transcript
214
199
  else:
215
200
  query_text = query
216
201
 
@@ -47,7 +47,6 @@ class QueryService(QueryServiceInterface):
47
47
  audio_input_format: Literal[
48
48
  "flac", "mp3", "mp4", "mpeg", "mpga", "m4a", "ogg", "wav", "webm"
49
49
  ] = "mp4",
50
- audio_transcription_real_time: bool = True,
51
50
  prompt: Optional[str] = None,
52
51
  router: Optional[RoutingServiceInterface] = None,
53
52
  ) -> AsyncGenerator[Union[str, bytes], None]: # pragma: no cover
@@ -61,7 +60,6 @@ class QueryService(QueryServiceInterface):
61
60
  audio_instructions: Audio voice instructions
62
61
  audio_output_format: Audio output format
63
62
  audio_input_format: Audio input format
64
- audio_transcription_real_time: Flag for real-time audio transcription
65
63
  prompt: Optional prompt for the agent
66
64
  router: Optional routing service for processing
67
65
 
@@ -122,7 +120,6 @@ class QueryService(QueryServiceInterface):
122
120
  audio_output_format=audio_output_format,
123
121
  audio_instructions=audio_instructions,
124
122
  prompt=prompt,
125
- audio_transcription_real_time=audio_transcription_real_time,
126
123
  ):
127
124
  yield audio_chunk
128
125
 
@@ -141,7 +138,6 @@ class QueryService(QueryServiceInterface):
141
138
  memory_context=memory_context,
142
139
  output_format="text",
143
140
  prompt=prompt,
144
- audio_transcription_real_time=audio_transcription_real_time,
145
141
  ):
146
142
  yield chunk
147
143
  full_text_response += chunk
@@ -1,332 +0,0 @@
1
- """
2
- LLM provider adapters for the Solana Agent system.
3
-
4
- These adapters implement the LLMProvider interface for different LLM services.
5
- """
6
- import asyncio
7
- import json
8
- from typing import Any, AsyncGenerator, Callable, Dict, Literal, Optional, Type, TypeVar
9
-
10
- import httpx
11
- from openai import AsyncOpenAI
12
- from pydantic import BaseModel
13
- import websockets
14
-
15
- from solana_agent.interfaces.providers.llm import LLMProvider
16
-
17
- T = TypeVar('T', bound=BaseModel)
18
-
19
-
20
- class OpenAIAdapter(LLMProvider):
21
- """OpenAI implementation of LLMProvider with web search capabilities."""
22
-
23
- def __init__(self, api_key: str):
24
- self.client = AsyncOpenAI(api_key=api_key)
25
- self.parse_model = "gpt-4o-mini"
26
- self.text_model = "gpt-4o-mini"
27
- self.transcription_model = "gpt-4o-mini-transcribe"
28
- self.tts_model = "gpt-4o-mini-tts"
29
-
30
- async def tts(
31
- self,
32
- text: str,
33
- instructions: str = "You speak in a friendly and helpful manner.",
34
- voice: Literal["alloy", "ash", "ballad", "coral", "echo",
35
- "fable", "onyx", "nova", "sage", "shimmer"] = "nova",
36
- response_format: Literal['mp3', 'opus',
37
- 'aac', 'flac', 'wav', 'pcm'] = "aac",
38
- ) -> AsyncGenerator[bytes, None]: # pragma: no cover
39
- """Stream text-to-speech audio from OpenAI models.
40
-
41
- Args:
42
- text: Text to convert to speech
43
- instructions: Optional instructions for speech generation
44
- voice: Voice to use for synthesis
45
- response_format: Audio format
46
-
47
- Yields:
48
- Audio bytes as they become available
49
- """
50
- try:
51
- async with self.client.audio.speech.with_streaming_response.create(
52
- model=self.tts_model,
53
- voice=voice,
54
- instructions=instructions,
55
- input=text,
56
- response_format=response_format
57
- ) as stream:
58
- # Stream the bytes in 16KB chunks
59
- async for chunk in stream.iter_bytes(chunk_size=1024 * 16):
60
- yield chunk
61
-
62
- except Exception as e:
63
- print(f"Error in text_to_speech: {str(e)}")
64
- import traceback
65
- print(traceback.format_exc())
66
- yield b"" # Return empty bytes on error
67
-
68
- except Exception as e:
69
- print(f"Error in text_to_speech: {str(e)}")
70
- import traceback
71
- print(traceback.format_exc())
72
- yield b"" # Return empty bytes on error
73
-
74
- async def transcribe_audio(
75
- self,
76
- audio_bytes: bytes,
77
- input_format: Literal[
78
- "flac", "mp3", "mp4", "mpeg", "mpga", "m4a", "ogg", "wav", "webm"
79
- ] = "mp4",
80
- ) -> AsyncGenerator[str, None]: # pragma: no cover
81
- """Stream transcription of an audio file.
82
-
83
- Args:
84
- audio_bytes: Audio file bytes
85
- input_format: Format of the input audio file
86
-
87
- Yields:
88
- Transcript text chunks as they become available
89
- """
90
- try:
91
- async with self.client.audio.transcriptions.with_streaming_response.create(
92
- model=self.transcription_model,
93
- file=(f"file.{input_format}", audio_bytes),
94
- response_format="text",
95
- ) as stream:
96
- # Stream the text in 16KB chunks
97
- async for chunk in stream.iter_text(chunk_size=1024 * 16):
98
- yield chunk
99
-
100
- except Exception as e:
101
- print(f"Error in transcribe_audio: {str(e)}")
102
- import traceback
103
- print(traceback.format_exc())
104
- yield f"I apologize, but I encountered an error transcribing the audio: {str(e)}"
105
-
106
- async def generate_text(
107
- self,
108
- prompt: str,
109
- system_prompt: str = "",
110
- ) -> AsyncGenerator[str, None]: # pragma: no cover
111
- """Generate text from OpenAI models."""
112
- messages = []
113
-
114
- if system_prompt:
115
- messages.append({"role": "system", "content": system_prompt})
116
-
117
- messages.append({"role": "user", "content": prompt})
118
-
119
- # Prepare request parameters
120
- request_params = {
121
- "messages": messages,
122
- "stream": True,
123
- "model": self.text_model,
124
- }
125
- try:
126
- response = await self.client.chat.completions.create(**request_params)
127
-
128
- async for chunk in response:
129
- if chunk.choices:
130
- if chunk.choices[0].delta.content:
131
- text = chunk.choices[0].delta.content
132
- yield text
133
-
134
- except Exception as e:
135
- print(f"Error in generate_text: {str(e)}")
136
- import traceback
137
- print(traceback.format_exc())
138
- yield f"I apologize, but I encountered an error: {str(e)}"
139
-
140
- async def parse_structured_output(
141
- self,
142
- prompt: str,
143
- system_prompt: str,
144
- model_class: Type[T],
145
- ) -> T: # pragma: no cover
146
- """Generate structured output using Pydantic model parsing."""
147
- messages = []
148
- if system_prompt:
149
- messages.append({"role": "system", "content": system_prompt})
150
-
151
- messages.append({"role": "user", "content": prompt})
152
-
153
- try:
154
- # First try the beta parsing API
155
- completion = await self.client.beta.chat.completions.parse(
156
- model=self.parse_model,
157
- messages=messages,
158
- response_format=model_class,
159
- )
160
- return completion.choices[0].message.parsed
161
- except Exception as e:
162
- print(f"Error with beta.parse method: {e}")
163
-
164
- async def create_realtime_session(
165
- self,
166
- model: str = "gpt-4o-mini-realtime-preview",
167
- modalities: list = ["audio", "text"],
168
- instructions: str = "You are a helpful assistant.",
169
- voice: str = "alloy",
170
- input_audio_format: str = "pcm16",
171
- output_audio_format: str = "pcm16",
172
- ) -> Dict[str, Any]: # pragma: no cover
173
- """Create a realtime session token for WebSocket communication."""
174
- try:
175
- # Get the API key from the AsyncOpenAI client
176
- api_key = self.client.api_key
177
-
178
- # Create an async HTTP client
179
- async with httpx.AsyncClient() as client:
180
- response = await client.post(
181
- "https://api.openai.com/v1/realtime/sessions",
182
- json={
183
- "model": model,
184
- "modalities": modalities,
185
- "instructions": instructions,
186
- "voice": voice,
187
- "input_audio_format": input_audio_format,
188
- "output_audio_format": output_audio_format,
189
- },
190
- headers={
191
- "Authorization": f"Bearer {api_key}",
192
- "Content-Type": "application/json",
193
- "OpenAI-Beta": "realtime=v1"
194
- }
195
- )
196
-
197
- if response.status_code == 200:
198
- return response.json()
199
- else:
200
- raise Exception(
201
- f"Failed to create realtime session: {response.text}")
202
- except Exception as e:
203
- print(f"Error creating realtime session: {str(e)}")
204
- raise
205
-
206
- async def realtime_audio_transcription(
207
- self,
208
- audio_generator: AsyncGenerator[bytes, None],
209
- transcription_config: Optional[Dict[str, Any]] = None,
210
- on_event: Optional[Callable[[Dict[str, Any]], Any]] = None,
211
- ) -> AsyncGenerator[str, None]: # pragma: no cover
212
- """Stream real-time audio transcription using the Realtime API.
213
-
214
- Args:
215
- audio_generator: Async generator that yields audio chunks
216
- transcription_config: Optional custom configuration for transcription
217
- on_event: Optional callback function for handling raw events
218
-
219
- Yields:
220
- Transcription text as it becomes available
221
- """
222
- # Create default transcription config if none provided
223
- if transcription_config is None:
224
- transcription_config = {
225
- "input_audio_format": "pcm16",
226
- "input_audio_transcription": {
227
- "model": "gpt-4o-mini-transcribe"
228
- },
229
- "turn_detection": {
230
- "type": "server_vad",
231
- "threshold": 0.5,
232
- "prefix_padding_ms": 300,
233
- "silence_duration_ms": 200
234
- }
235
- }
236
-
237
- try:
238
- # Get the API key from the AsyncOpenAI client
239
- api_key = self.client.api_key
240
-
241
- # Create transcription session
242
- async with httpx.AsyncClient() as client:
243
- response = await client.post(
244
- "https://api.openai.com/v1/realtime/transcription_sessions",
245
- json=transcription_config,
246
- headers={
247
- "Authorization": f"Bearer {api_key}",
248
- "Content-Type": "application/json",
249
- "OpenAI-Beta": "realtime=v1"
250
- }
251
- )
252
-
253
- if response.status_code != 200:
254
- raise Exception(
255
- f"Failed to create transcription session: {response.text}")
256
-
257
- session = response.json()
258
- client_secret = session["client_secret"]["value"]
259
-
260
- # Connect to WebSocket with proper headers as dictionary
261
- url = "wss://api.openai.com/v1/realtime?model=gpt-4o-mini-transcribe"
262
- headers = {
263
- "Authorization": f"Bearer {client_secret}",
264
- "OpenAI-Beta": "realtime=v1"
265
- }
266
-
267
- async with websockets.connect(url, additional_headers=headers) as websocket:
268
- # Handle WebSocket communication in the background
269
- audio_task = None
270
-
271
- async def send_audio():
272
- try:
273
- async for audio_chunk in audio_generator:
274
- # Base64 encode the audio
275
- import base64
276
- encoded_audio = base64.b64encode(
277
- audio_chunk).decode('utf-8')
278
-
279
- # Send audio chunk
280
- await websocket.send(json.dumps({
281
- "type": "input_audio_buffer.append",
282
- "audio": encoded_audio
283
- }))
284
-
285
- # Small delay to prevent flooding
286
- await asyncio.sleep(0.05)
287
-
288
- # Commit the audio buffer when done
289
- await websocket.send(json.dumps({
290
- "type": "input_audio_buffer.commit"
291
- }))
292
- except Exception as e:
293
- print(f"Error sending audio: {str(e)}")
294
-
295
- # Start sending audio in the background
296
- audio_task = asyncio.create_task(send_audio())
297
-
298
- # Process transcription events
299
- try:
300
- while True:
301
- message = await websocket.recv()
302
- event = json.loads(message)
303
-
304
- if on_event:
305
- # Check if on_event is a coroutine function and await it if needed
306
- if asyncio.iscoroutinefunction(on_event):
307
- await on_event(event)
308
- else:
309
- on_event(event)
310
-
311
- # Extract transcription deltas
312
- if event["type"] == "conversation.item.input_audio_transcription.delta":
313
- yield event["delta"]
314
-
315
- # Also handle completed transcriptions
316
- elif event["type"] == "conversation.item.input_audio_transcription.completed":
317
- yield event["transcript"]
318
- break
319
- finally:
320
- # Clean up audio task if it's still running
321
- if audio_task and not audio_task.done():
322
- audio_task.cancel()
323
- try:
324
- await audio_task
325
- except asyncio.CancelledError:
326
- pass
327
-
328
- except Exception as e:
329
- print(f"Error in realtime audio transcription: {str(e)}")
330
- import traceback
331
- print(traceback.format_exc())
332
- yield f"I apologize, but I encountered an error transcribing the audio: {str(e)}"
File without changes