isa-model 0.3.91__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between versions as they appear in their respective public registries.
- isa_model/client.py +732 -573
- isa_model/core/cache/redis_cache.py +401 -0
- isa_model/core/config/config_manager.py +53 -10
- isa_model/core/config.py +1 -1
- isa_model/core/database/__init__.py +1 -0
- isa_model/core/database/migrations.py +277 -0
- isa_model/core/database/supabase_client.py +123 -0
- isa_model/core/models/__init__.py +37 -0
- isa_model/core/models/model_billing_tracker.py +60 -88
- isa_model/core/models/model_manager.py +36 -18
- isa_model/core/models/model_repo.py +44 -38
- isa_model/core/models/model_statistics_tracker.py +234 -0
- isa_model/core/models/model_storage.py +0 -1
- isa_model/core/models/model_version_manager.py +959 -0
- isa_model/core/pricing_manager.py +2 -249
- isa_model/core/resilience/circuit_breaker.py +366 -0
- isa_model/core/security/secrets.py +358 -0
- isa_model/core/services/__init__.py +2 -4
- isa_model/core/services/intelligent_model_selector.py +101 -370
- isa_model/core/storage/hf_storage.py +1 -1
- isa_model/core/types.py +7 -0
- isa_model/deployment/cloud/modal/isa_audio_chatTTS_service.py +520 -0
- isa_model/deployment/cloud/modal/isa_audio_fish_service.py +0 -0
- isa_model/deployment/cloud/modal/isa_audio_openvoice_service.py +758 -0
- isa_model/deployment/cloud/modal/isa_audio_service_v2.py +1044 -0
- isa_model/deployment/cloud/modal/isa_embed_rerank_service.py +296 -0
- isa_model/deployment/cloud/modal/isa_video_hunyuan_service.py +423 -0
- isa_model/deployment/cloud/modal/isa_vision_ocr_service.py +519 -0
- isa_model/deployment/cloud/modal/isa_vision_qwen25_service.py +709 -0
- isa_model/deployment/cloud/modal/isa_vision_table_service.py +467 -323
- isa_model/deployment/cloud/modal/isa_vision_ui_service.py +607 -180
- isa_model/deployment/cloud/modal/isa_vision_ui_service_optimized.py +660 -0
- isa_model/deployment/core/deployment_manager.py +6 -4
- isa_model/deployment/services/auto_hf_modal_deployer.py +894 -0
- isa_model/eval/benchmarks/__init__.py +27 -0
- isa_model/eval/benchmarks/multimodal_datasets.py +460 -0
- isa_model/eval/benchmarks.py +244 -12
- isa_model/eval/evaluators/__init__.py +8 -2
- isa_model/eval/evaluators/audio_evaluator.py +727 -0
- isa_model/eval/evaluators/embedding_evaluator.py +742 -0
- isa_model/eval/evaluators/vision_evaluator.py +564 -0
- isa_model/eval/example_evaluation.py +395 -0
- isa_model/eval/factory.py +272 -5
- isa_model/eval/isa_benchmarks.py +700 -0
- isa_model/eval/isa_integration.py +582 -0
- isa_model/eval/metrics.py +159 -6
- isa_model/eval/tests/unit/test_basic.py +396 -0
- isa_model/inference/ai_factory.py +44 -8
- isa_model/inference/services/audio/__init__.py +21 -0
- isa_model/inference/services/audio/base_realtime_service.py +225 -0
- isa_model/inference/services/audio/isa_tts_service.py +0 -0
- isa_model/inference/services/audio/openai_realtime_service.py +320 -124
- isa_model/inference/services/audio/openai_stt_service.py +32 -6
- isa_model/inference/services/base_service.py +17 -1
- isa_model/inference/services/embedding/__init__.py +13 -0
- isa_model/inference/services/embedding/base_embed_service.py +111 -8
- isa_model/inference/services/embedding/isa_embed_service.py +305 -0
- isa_model/inference/services/embedding/openai_embed_service.py +2 -4
- isa_model/inference/services/embedding/tests/test_embedding.py +222 -0
- isa_model/inference/services/img/__init__.py +2 -2
- isa_model/inference/services/img/base_image_gen_service.py +24 -7
- isa_model/inference/services/img/replicate_image_gen_service.py +84 -422
- isa_model/inference/services/img/services/replicate_face_swap.py +193 -0
- isa_model/inference/services/img/services/replicate_flux.py +226 -0
- isa_model/inference/services/img/services/replicate_flux_kontext.py +219 -0
- isa_model/inference/services/img/services/replicate_sticker_maker.py +249 -0
- isa_model/inference/services/img/tests/test_img_client.py +297 -0
- isa_model/inference/services/llm/base_llm_service.py +30 -6
- isa_model/inference/services/llm/helpers/llm_adapter.py +63 -9
- isa_model/inference/services/llm/ollama_llm_service.py +2 -1
- isa_model/inference/services/llm/openai_llm_service.py +652 -55
- isa_model/inference/services/llm/yyds_llm_service.py +2 -1
- isa_model/inference/services/vision/__init__.py +5 -5
- isa_model/inference/services/vision/base_vision_service.py +118 -185
- isa_model/inference/services/vision/helpers/image_utils.py +11 -5
- isa_model/inference/services/vision/isa_vision_service.py +573 -0
- isa_model/inference/services/vision/tests/test_ocr_client.py +284 -0
- isa_model/serving/api/fastapi_server.py +88 -16
- isa_model/serving/api/middleware/auth.py +311 -0
- isa_model/serving/api/middleware/security.py +278 -0
- isa_model/serving/api/routes/analytics.py +486 -0
- isa_model/serving/api/routes/deployments.py +339 -0
- isa_model/serving/api/routes/evaluations.py +579 -0
- isa_model/serving/api/routes/logs.py +430 -0
- isa_model/serving/api/routes/settings.py +582 -0
- isa_model/serving/api/routes/unified.py +324 -165
- isa_model/serving/api/startup.py +304 -0
- isa_model/serving/modal_proxy_server.py +249 -0
- isa_model/training/__init__.py +100 -6
- isa_model/training/core/__init__.py +4 -1
- isa_model/training/examples/intelligent_training_example.py +281 -0
- isa_model/training/intelligent/__init__.py +25 -0
- isa_model/training/intelligent/decision_engine.py +643 -0
- isa_model/training/intelligent/intelligent_factory.py +888 -0
- isa_model/training/intelligent/knowledge_base.py +751 -0
- isa_model/training/intelligent/resource_optimizer.py +839 -0
- isa_model/training/intelligent/task_classifier.py +576 -0
- isa_model/training/storage/__init__.py +24 -0
- isa_model/training/storage/core_integration.py +439 -0
- isa_model/training/storage/training_repository.py +552 -0
- isa_model/training/storage/training_storage.py +628 -0
- {isa_model-0.3.91.dist-info → isa_model-0.4.0.dist-info}/METADATA +13 -1
- isa_model-0.4.0.dist-info/RECORD +182 -0
- isa_model/deployment/cloud/modal/isa_vision_doc_service.py +0 -766
- isa_model/deployment/cloud/modal/register_models.py +0 -321
- isa_model/inference/adapter/unified_api.py +0 -248
- isa_model/inference/services/helpers/stacked_config.py +0 -148
- isa_model/inference/services/img/flux_professional_service.py +0 -603
- isa_model/inference/services/img/helpers/base_stacked_service.py +0 -274
- isa_model/inference/services/others/table_transformer_service.py +0 -61
- isa_model/inference/services/vision/doc_analysis_service.py +0 -640
- isa_model/inference/services/vision/helpers/base_stacked_service.py +0 -274
- isa_model/inference/services/vision/ui_analysis_service.py +0 -823
- isa_model/scripts/inference_tracker.py +0 -283
- isa_model/scripts/mlflow_manager.py +0 -379
- isa_model/scripts/model_registry.py +0 -465
- isa_model/scripts/register_models.py +0 -370
- isa_model/scripts/register_models_with_embeddings.py +0 -510
- isa_model/scripts/start_mlflow.py +0 -95
- isa_model/scripts/training_tracker.py +0 -257
- isa_model-0.3.91.dist-info/RECORD +0 -138
- {isa_model-0.3.91.dist-info → isa_model-0.4.0.dist-info}/WHEEL +0 -0
- {isa_model-0.3.91.dist-info → isa_model-0.4.0.dist-info}/top_level.txt +0 -0
The remainder of the diff covers isa_model/inference/services/audio/openai_realtime_service.py (+320 / -124). Note that the registry viewer truncates the content of most removed lines; truncated removals appear below exactly as the viewer renders them.

```diff
--- isa_model-0.3.91/isa_model/inference/services/audio/openai_realtime_service.py
+++ isa_model-0.4.0/isa_model/inference/services/audio/openai_realtime_service.py
@@ -1,57 +1,67 @@
 import logging
 import json
 import asyncio
+import base64
 from typing import Dict, Any, List, Optional, Callable, AsyncGenerator
 import aiohttp
 from tenacity import retry, stop_after_attempt, wait_exponential
 
-from isa_model.inference.services.
-from isa_model.
-from isa_model.inference.billing_tracker import ServiceType
+from isa_model.inference.services.audio.base_realtime_service import BaseRealtimeService, RealtimeEventType
+from isa_model.core.types import ServiceType
 
 logger = logging.getLogger(__name__)
 
-class OpenAIRealtimeService(
+class OpenAIRealtimeService(BaseRealtimeService):
     """
     OpenAI Realtime API service for real-time audio conversations.
     Uses gpt-4o-mini-realtime-preview model for interactive audio chat.
     """
 
-    def __init__(self,
-        super().__init__(
+    def __init__(self, provider_name: str = "openai", model_name: str = "gpt-4o-realtime-preview-2024-10-01", **kwargs):
+        super().__init__(provider_name, model_name, **kwargs)
 
-
-        self.
+        provider_config = self.get_provider_config()
+        self.api_key = provider_config.get('api_key') or self.get_api_key()
+        self.base_url = provider_config.get('api_base_url', 'https://api.openai.com/v1')
+        self.websocket_url = f"wss://api.openai.com/v1/realtime?model={self.model_name}"
 
-        # Default session configuration
+        # Default session configuration based on latest API
         self.default_config = {
-            "
-            "
+            "modalities": ["text", "audio"],
+            "instructions": "You are a helpful assistant.",
             "voice": "alloy",
             "input_audio_format": "pcm16",
             "output_audio_format": "pcm16",
             "input_audio_transcription": {
                 "model": "whisper-1"
             },
-            "turn_detection":
+            "turn_detection": {
+                "type": "server_vad",
+                "threshold": 0.5,
+                "prefix_padding_ms": 300,
+                "silence_duration_ms": 200
+            },
             "tools": [],
-            "tool_choice": "
-            "temperature": 0.
-            "max_response_output_tokens":
-
-
+            "tool_choice": "auto",
+            "temperature": 0.8,
+            "max_response_output_tokens": "inf"
+        }
+
+        # Session limits based on API documentation
+        self.session_limits = {
+            "max_context_tokens": 128000,
+            "max_session_time_minutes": 15,
+            "audio_tokens_per_minute": 800
         }
 
         logger.info(f"Initialized OpenAIRealtimeService with model '{self.model_name}'")
+
+        # Add default event handlers for common events
+        self._setup_default_handlers()
 
-    @retry(
-        stop=stop_after_attempt(3),
-        wait=wait_exponential(multiplier=1, min=4, max=10),
-        reraise=True
-    )
     async def create_session(
         self,
-        instructions: str = "You are a
+        instructions: str = "You are a helpful assistant.",
         modalities: Optional[List[str]] = None,
         voice: str = "alloy",
         **kwargs
@@ -62,80 +72,102 @@ class OpenAIRealtimeService(BaseService):
             session_config = self.default_config.copy()
             session_config.update({
                 "instructions": instructions,
-                "modalities": modalities if modalities is not None else ["
+                "modalities": modalities if modalities is not None else ["text", "audio"],
                 "voice": voice,
                 **kwargs
             })
 
-            #
-
-            headers = {
-                "Authorization": f"Bearer {self.api_key}",
-                "Content-Type": "application/json"
-            }
+            # Store session config for WebSocket connection
+            self.session_config = session_config
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            # Generate a session ID (WebSocket-based, no REST endpoint)
+            import uuid
+            self.session_id = str(uuid.uuid4())
+
+            # Track session creation for billing
+            await self._track_usage(
+                service_type=ServiceType.AUDIO_REALTIME,
+                operation="create_session",
+                metadata={
+                    "session_id": self.session_id,
+                    "model": self.model_name,
+                    "modalities": session_config["modalities"]
+                }
+            )
+
+            return {
+                "id": self.session_id,
+                "model": self.model_name,
+                "modalities": session_config["modalities"],
+                "instructions": instructions,
+                "voice": voice,
+                "status": "created"
+            }
 
         except Exception as e:
             logger.error(f"Error creating realtime session: {e}")
             raise
 
-    async def connect_websocket(self,
-        """Connect to the realtime WebSocket
+    async def connect_websocket(self, **kwargs) -> bool:
+        """Connect to the realtime WebSocket"""
         try:
-            ws_url = f"wss://api.openai.com/v1/realtime/sessions/{session_id}/ws"
             headers = {
                 "Authorization": f"Bearer {self.api_key}",
                 "OpenAI-Beta": "realtime=v1"
             }
 
-
-
+            self.client_session = aiohttp.ClientSession()
+            self.websocket = await self.client_session.ws_connect(
+                self.websocket_url,
+                headers=headers
+            )
+
+            # Send session.update event to configure the session
+            if hasattr(self, 'session_config'):
+                await self._send_event({
+                    "type": "session.update",
+                    "session": self.session_config
+                })
 
-
-
+            self.is_connected = True
+            logger.info(f"Connected to realtime WebSocket with model {self.model_name}")
+            return True
 
         except Exception as e:
             logger.error(f"Error connecting to WebSocket: {e}")
+            self.is_connected = False
             raise
 
     async def send_audio_message(
         self,
-        ws: aiohttp.ClientWebSocketResponse,
         audio_data: bytes,
-        format: str = "pcm16"
-
+        format: str = "pcm16",
+        **kwargs
+    ) -> Dict[str, Any]:
         """Send audio data to the realtime session"""
         try:
-
-            "
-
-
+            if not self.is_connected or not self.websocket:
+                raise RuntimeError("WebSocket not connected")
+
+            # Convert audio data to base64
+            audio_base64 = base64.b64encode(audio_data).decode('utf-8')
 
-
+            # Send audio buffer append event
+            await self._send_event({
+                "type": RealtimeEventType.INPUT_AUDIO_BUFFER_APPEND.value,
+                "audio": audio_base64
+            })
 
             # Commit the audio buffer
-
-
+            await self._send_event({
+                "type": RealtimeEventType.INPUT_AUDIO_BUFFER_COMMIT.value
+            })
+
+            return {
+                "status": "sent",
+                "audio_size_bytes": len(audio_data),
+                "format": format
+            }
 
         except Exception as e:
             logger.error(f"Error sending audio message: {e}")
@@ -143,13 +175,17 @@ class OpenAIRealtimeService(BaseService):
 
     async def send_text_message(
         self,
-
-
-    ):
+        text: str,
+        **kwargs
+    ) -> Dict[str, Any]:
         """Send text message to the realtime session"""
         try:
-
-            "
+            if not self.is_connected or not self.websocket:
+                raise RuntimeError("WebSocket not connected")
+
+            # Create conversation item
+            await self._send_event({
+                "type": RealtimeEventType.CONVERSATION_ITEM_CREATE.value,
                 "item": {
                     "type": "message",
                     "role": "user",
@@ -160,75 +196,115 @@ class OpenAIRealtimeService(BaseService):
                     }
                 ]
             }
-            }
+            })
 
-
+            # Trigger response creation
+            await self._send_event({
+                "type": RealtimeEventType.RESPONSE_CREATE.value
+            })
 
-
-
-
+            return {
+                "status": "sent",
+                "text": text,
+                "message_length": len(text)
+            }
 
         except Exception as e:
             logger.error(f"Error sending text message: {e}")
             raise
 
     async def listen_for_responses(
-        self,
-
-
+        self,
+        message_handler: Optional[Callable] = None,
+        **kwargs
     ) -> AsyncGenerator[Dict[str, Any], None]:
         """Listen for responses from the realtime session"""
        try:
-
+            if not self.is_connected or not self.websocket:
+                raise RuntimeError("WebSocket not connected")
+
+            async for msg in self.websocket:
                 if msg.type == aiohttp.WSMsgType.TEXT:
                     try:
-
+                        event = json.loads(msg.data)
+                        event_type = event.get("type")
 
-                        # Handle
-
-
+                        # Handle built-in event processing
+                        await self._handle_event(event)
+
+                        # Yield specific response types
+                        if event_type == RealtimeEventType.RESPONSE_AUDIO_DELTA.value:
+                            audio_data = event.get("delta", "")
                             yield {
-                                "type": "
-                                "data":
-                                "format": "pcm16"
+                                "type": "audio_delta",
+                                "data": audio_data,
+                                "format": "pcm16",
+                                "raw_event": event
                             }
-                        elif
-
+                        elif event_type == RealtimeEventType.RESPONSE_TEXT_DELTA.value:
+                            text_data = event.get("delta", "")
                             yield {
-                                "type": "
-                                "data":
+                                "type": "text_delta",
+                                "data": text_data,
+                                "raw_event": event
                             }
-                        elif
+                        elif event_type == RealtimeEventType.RESPONSE_AUDIO_TRANSCRIPT_DELTA.value:
+                            transcript_data = event.get("delta", "")
+                            yield {
+                                "type": "transcript_delta",
+                                "data": transcript_data,
+                                "raw_event": event
+                            }
+                        elif event_type == RealtimeEventType.RESPONSE_DONE.value:
                             # Response completed
-
+                            response = event.get("response", {})
+                            usage = response.get("usage", {})
 
                             # Track usage for billing
-                            self._track_usage(
-                                service_type=ServiceType.
+                            await self._track_usage(
+                                service_type=ServiceType.AUDIO_REALTIME,
                                 operation="realtime_response",
                                 input_tokens=usage.get("input_tokens", 0),
                                 output_tokens=usage.get("output_tokens", 0),
                                 metadata={
-                                    "response_id":
-                                    "model": self.model_name
+                                    "response_id": response.get("id"),
+                                    "model": self.model_name,
+                                    "status": response.get("status")
                                 }
                             )
 
                             yield {
-                                "type": "
-                                "
+                                "type": "response_done",
+                                "response": response,
+                                "usage": usage,
+                                "raw_event": event
+                            }
+                        elif event_type == RealtimeEventType.ERROR.value:
+                            logger.error(f"Realtime API error: {event}")
+                            yield {
+                                "type": "error",
+                                "error": event.get("error", {}),
+                                "raw_event": event
                             }
 
                         # Call custom message handler if provided
                         if message_handler:
-                            await message_handler(
+                            await message_handler(event)
 
                     except json.JSONDecodeError as e:
                         logger.error(f"Error parsing WebSocket message: {e}")
                         continue
 
                 elif msg.type == aiohttp.WSMsgType.ERROR:
-                    logger.error(f"WebSocket error: {
+                    logger.error(f"WebSocket error: {self.websocket.exception()}")
+                    yield {
+                        "type": "websocket_error",
+                        "error": str(self.websocket.exception())
+                    }
+                    break
+                elif msg.type == aiohttp.WSMsgType.CLOSED:
+                    logger.info("WebSocket connection closed")
+                    self.is_connected = False
                     break
 
         except Exception as e:
@@ -239,7 +315,8 @@ class OpenAIRealtimeService(BaseService):
         self,
         audio_data: bytes,
         instructions: str = "You are a helpful assistant. Respond in audio.",
-        voice: str = "alloy"
+        voice: str = "alloy",
+        **kwargs
     ) -> Dict[str, Any]:
         """Simple audio chat - send audio, get audio response"""
         try:
@@ -252,34 +329,42 @@ class OpenAIRealtimeService(BaseService):
             session_id = session["id"]
 
             # Connect to WebSocket
-
+            await self.connect_websocket()
 
             try:
                 # Send audio
-                await self.send_audio_message(
+                await self.send_audio_message(audio_data)
 
                 # Collect response
                 audio_chunks = []
+                transcript_chunks = []
                 usage_info = {}
 
-                async for response in self.listen_for_responses(
-                    if response["type"] == "
+                async for response in self.listen_for_responses():
+                    if response["type"] == "audio_delta":
                         audio_chunks.append(response["data"])
-                    elif response["type"] == "
+                    elif response["type"] == "transcript_delta":
+                        transcript_chunks.append(response["data"])
+                    elif response["type"] == "response_done":
                         usage_info = response["usage"]
                         break
+                    elif response["type"] == "error":
+                        raise Exception(f"Realtime API error: {response['error']}")
 
-                # Combine
+                # Combine chunks
                 full_audio = "".join(audio_chunks)
+                full_transcript = "".join(transcript_chunks)
 
                 return {
                     "audio_response": full_audio,
+                    "transcript": full_transcript,
                     "session_id": session_id,
-                    "usage": usage_info
+                    "usage": usage_info,
+                    "format": "pcm16"
                 }
 
             finally:
-                await
+                await self.disconnect()
 
         except Exception as e:
             logger.error(f"Error in simple audio chat: {e}")
@@ -289,9 +374,10 @@ class OpenAIRealtimeService(BaseService):
         self,
         text: str,
         instructions: str = "You are a helpful assistant.",
-        voice: str = "alloy"
+        voice: str = "alloy",
+        **kwargs
     ) -> Dict[str, Any]:
-        """Simple text chat - send text, get audio response"""
+        """Simple text chat - send text, get audio/text response"""
         try:
             # Create session
             session = await self.create_session(
@@ -302,38 +388,46 @@ class OpenAIRealtimeService(BaseService):
             session_id = session["id"]
 
             # Connect to WebSocket
-
+            await self.connect_websocket()
 
             try:
                 # Send text
-                await self.send_text_message(
+                await self.send_text_message(text)
 
                 # Collect response
                 text_response = ""
                 audio_chunks = []
+                transcript_chunks = []
                 usage_info = {}
 
-                async for response in self.listen_for_responses(
-                    if response["type"] == "
+                async for response in self.listen_for_responses():
+                    if response["type"] == "text_delta":
                         text_response += response["data"]
-                    elif response["type"] == "
+                    elif response["type"] == "audio_delta":
                         audio_chunks.append(response["data"])
-                    elif response["type"] == "
+                    elif response["type"] == "transcript_delta":
+                        transcript_chunks.append(response["data"])
+                    elif response["type"] == "response_done":
                         usage_info = response["usage"]
                         break
+                    elif response["type"] == "error":
+                        raise Exception(f"Realtime API error: {response['error']}")
 
-                # Combine
+                # Combine chunks
                 full_audio = "".join(audio_chunks)
+                full_transcript = "".join(transcript_chunks)
 
                 return {
                     "text_response": text_response,
                     "audio_response": full_audio,
+                    "transcript": full_transcript,
                     "session_id": session_id,
-                    "usage": usage_info
+                    "usage": usage_info,
+                    "format": "pcm16"
                 }
 
             finally:
-                await
+                await self.disconnect()
 
         except Exception as e:
             logger.error(f"Error in simple text chat: {e}")
@@ -347,7 +441,109 @@ class OpenAIRealtimeService(BaseService):
         """Get list of supported audio formats"""
         return ["pcm16", "g711_ulaw", "g711_alaw"]
 
+    def get_session_limits(self) -> Dict[str, Any]:
+        """Get session limits and constraints"""
+        return self.session_limits.copy()
+
+    async def update_session(self, **kwargs) -> Dict[str, Any]:
+        """Update session configuration"""
+        try:
+            if not self.is_connected or not self.websocket:
+                raise RuntimeError("WebSocket not connected")
+
+            # Update session config
+            session_update = {k: v for k, v in kwargs.items() if k in self.default_config}
+
+            if session_update:
+                await self._send_event({
+                    "type": "session.update",
+                    "session": session_update
+                })
+
+                # Update local config
+                if hasattr(self, 'session_config'):
+                    self.session_config.update(session_update)
+
+            return {
+                "status": "updated",
+                "updated_fields": list(session_update.keys())
+            }
+
+        except Exception as e:
+            logger.error(f"Error updating session: {e}")
+            raise
+
+    async def disconnect(self):
+        """Disconnect from the realtime session"""
+        try:
+            if self.websocket and not self.websocket.closed:
+                await self.websocket.close()
+
+            if hasattr(self, 'client_session') and self.client_session:
+                await self.client_session.close()
+
+            self.is_connected = False
+            self.websocket = None
+
+            logger.info("Disconnected from realtime session")
+
+        except Exception as e:
+            logger.error(f"Error disconnecting: {e}")
+
+    async def _send_event(self, event: Dict[str, Any]):
+        """Send an event to the WebSocket"""
+        if not self.websocket or self.websocket.closed:
+            raise RuntimeError("WebSocket not connected")
+
+        event_json = json.dumps(event)
+        await self.websocket.send_str(event_json)
+        logger.debug(f"Sent event: {event.get('type')}")
+
+    def _setup_default_handlers(self):
+        """Setup default event handlers for common events"""
+
+        async def handle_session_created(event):
+            logger.info(f"Session created: {event.get('session', {}).get('id')}")
+
+        async def handle_session_updated(event):
+            logger.info(f"Session updated: {event.get('session', {})}")
+
+        async def handle_input_audio_buffer_committed(event):
+            logger.debug(f"Audio buffer committed: {event.get('item_id', 'unknown')}")
+
+        async def handle_input_audio_buffer_speech_started(event):
+            logger.debug(f"Speech started: {event.get('audio_start_ms', 0)}ms")
+
+        async def handle_input_audio_buffer_speech_stopped(event):
+            logger.debug(f"Speech stopped: {event.get('audio_end_ms', 0)}ms")
+
+        async def handle_conversation_item_created(event):
+            item = event.get('item', {})
+            logger.debug(f"Conversation item created: {item.get('type')} - {item.get('id')}")
+
+        async def handle_response_created(event):
+            response = event.get('response', {})
+            logger.debug(f"Response created: {response.get('id')}")
+
+        async def handle_rate_limits_updated(event):
+            limits = event.get('rate_limits', [])
+            logger.debug(f"Rate limits updated: {limits}")
+
+        async def handle_error(event):
+            error = event.get('error', {})
+            logger.error(f"Realtime API error: {error.get('message')} (Code: {error.get('code')})")
+
+        # Register default handlers
+        self.add_event_handler(RealtimeEventType.SESSION_CREATED, handle_session_created)
+        self.add_event_handler(RealtimeEventType.SESSION_UPDATED, handle_session_updated)
+        self.add_event_handler(RealtimeEventType.INPUT_AUDIO_BUFFER_COMMITTED, handle_input_audio_buffer_committed)
+        self.add_event_handler(RealtimeEventType.INPUT_AUDIO_BUFFER_SPEECH_STARTED, handle_input_audio_buffer_speech_started)
+        self.add_event_handler(RealtimeEventType.INPUT_AUDIO_BUFFER_SPEECH_STOPPED, handle_input_audio_buffer_speech_stopped)
+        self.add_event_handler(RealtimeEventType.CONVERSATION_ITEM_CREATED, handle_conversation_item_created)
+        self.add_event_handler(RealtimeEventType.RESPONSE_CREATED, handle_response_created)
+        self.add_event_handler(RealtimeEventType.RATE_LIMITS_UPDATED, handle_rate_limits_updated)
+        self.add_event_handler(RealtimeEventType.ERROR, handle_error)
+
     async def close(self):
         """Cleanup resources"""
-
-        pass
+        await self.disconnect()
```