isa-model 0.3.91__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (123)
  1. isa_model/client.py +732 -573
  2. isa_model/core/cache/redis_cache.py +401 -0
  3. isa_model/core/config/config_manager.py +53 -10
  4. isa_model/core/config.py +1 -1
  5. isa_model/core/database/__init__.py +1 -0
  6. isa_model/core/database/migrations.py +277 -0
  7. isa_model/core/database/supabase_client.py +123 -0
  8. isa_model/core/models/__init__.py +37 -0
  9. isa_model/core/models/model_billing_tracker.py +60 -88
  10. isa_model/core/models/model_manager.py +36 -18
  11. isa_model/core/models/model_repo.py +44 -38
  12. isa_model/core/models/model_statistics_tracker.py +234 -0
  13. isa_model/core/models/model_storage.py +0 -1
  14. isa_model/core/models/model_version_manager.py +959 -0
  15. isa_model/core/pricing_manager.py +2 -249
  16. isa_model/core/resilience/circuit_breaker.py +366 -0
  17. isa_model/core/security/secrets.py +358 -0
  18. isa_model/core/services/__init__.py +2 -4
  19. isa_model/core/services/intelligent_model_selector.py +101 -370
  20. isa_model/core/storage/hf_storage.py +1 -1
  21. isa_model/core/types.py +7 -0
  22. isa_model/deployment/cloud/modal/isa_audio_chatTTS_service.py +520 -0
  23. isa_model/deployment/cloud/modal/isa_audio_fish_service.py +0 -0
  24. isa_model/deployment/cloud/modal/isa_audio_openvoice_service.py +758 -0
  25. isa_model/deployment/cloud/modal/isa_audio_service_v2.py +1044 -0
  26. isa_model/deployment/cloud/modal/isa_embed_rerank_service.py +296 -0
  27. isa_model/deployment/cloud/modal/isa_video_hunyuan_service.py +423 -0
  28. isa_model/deployment/cloud/modal/isa_vision_ocr_service.py +519 -0
  29. isa_model/deployment/cloud/modal/isa_vision_qwen25_service.py +709 -0
  30. isa_model/deployment/cloud/modal/isa_vision_table_service.py +467 -323
  31. isa_model/deployment/cloud/modal/isa_vision_ui_service.py +607 -180
  32. isa_model/deployment/cloud/modal/isa_vision_ui_service_optimized.py +660 -0
  33. isa_model/deployment/core/deployment_manager.py +6 -4
  34. isa_model/deployment/services/auto_hf_modal_deployer.py +894 -0
  35. isa_model/eval/benchmarks/__init__.py +27 -0
  36. isa_model/eval/benchmarks/multimodal_datasets.py +460 -0
  37. isa_model/eval/benchmarks.py +244 -12
  38. isa_model/eval/evaluators/__init__.py +8 -2
  39. isa_model/eval/evaluators/audio_evaluator.py +727 -0
  40. isa_model/eval/evaluators/embedding_evaluator.py +742 -0
  41. isa_model/eval/evaluators/vision_evaluator.py +564 -0
  42. isa_model/eval/example_evaluation.py +395 -0
  43. isa_model/eval/factory.py +272 -5
  44. isa_model/eval/isa_benchmarks.py +700 -0
  45. isa_model/eval/isa_integration.py +582 -0
  46. isa_model/eval/metrics.py +159 -6
  47. isa_model/eval/tests/unit/test_basic.py +396 -0
  48. isa_model/inference/ai_factory.py +44 -8
  49. isa_model/inference/services/audio/__init__.py +21 -0
  50. isa_model/inference/services/audio/base_realtime_service.py +225 -0
  51. isa_model/inference/services/audio/isa_tts_service.py +0 -0
  52. isa_model/inference/services/audio/openai_realtime_service.py +320 -124
  53. isa_model/inference/services/audio/openai_stt_service.py +32 -6
  54. isa_model/inference/services/base_service.py +17 -1
  55. isa_model/inference/services/embedding/__init__.py +13 -0
  56. isa_model/inference/services/embedding/base_embed_service.py +111 -8
  57. isa_model/inference/services/embedding/isa_embed_service.py +305 -0
  58. isa_model/inference/services/embedding/openai_embed_service.py +2 -4
  59. isa_model/inference/services/embedding/tests/test_embedding.py +222 -0
  60. isa_model/inference/services/img/__init__.py +2 -2
  61. isa_model/inference/services/img/base_image_gen_service.py +24 -7
  62. isa_model/inference/services/img/replicate_image_gen_service.py +84 -422
  63. isa_model/inference/services/img/services/replicate_face_swap.py +193 -0
  64. isa_model/inference/services/img/services/replicate_flux.py +226 -0
  65. isa_model/inference/services/img/services/replicate_flux_kontext.py +219 -0
  66. isa_model/inference/services/img/services/replicate_sticker_maker.py +249 -0
  67. isa_model/inference/services/img/tests/test_img_client.py +297 -0
  68. isa_model/inference/services/llm/base_llm_service.py +30 -6
  69. isa_model/inference/services/llm/helpers/llm_adapter.py +63 -9
  70. isa_model/inference/services/llm/ollama_llm_service.py +2 -1
  71. isa_model/inference/services/llm/openai_llm_service.py +652 -55
  72. isa_model/inference/services/llm/yyds_llm_service.py +2 -1
  73. isa_model/inference/services/vision/__init__.py +5 -5
  74. isa_model/inference/services/vision/base_vision_service.py +118 -185
  75. isa_model/inference/services/vision/helpers/image_utils.py +11 -5
  76. isa_model/inference/services/vision/isa_vision_service.py +573 -0
  77. isa_model/inference/services/vision/tests/test_ocr_client.py +284 -0
  78. isa_model/serving/api/fastapi_server.py +88 -16
  79. isa_model/serving/api/middleware/auth.py +311 -0
  80. isa_model/serving/api/middleware/security.py +278 -0
  81. isa_model/serving/api/routes/analytics.py +486 -0
  82. isa_model/serving/api/routes/deployments.py +339 -0
  83. isa_model/serving/api/routes/evaluations.py +579 -0
  84. isa_model/serving/api/routes/logs.py +430 -0
  85. isa_model/serving/api/routes/settings.py +582 -0
  86. isa_model/serving/api/routes/unified.py +324 -165
  87. isa_model/serving/api/startup.py +304 -0
  88. isa_model/serving/modal_proxy_server.py +249 -0
  89. isa_model/training/__init__.py +100 -6
  90. isa_model/training/core/__init__.py +4 -1
  91. isa_model/training/examples/intelligent_training_example.py +281 -0
  92. isa_model/training/intelligent/__init__.py +25 -0
  93. isa_model/training/intelligent/decision_engine.py +643 -0
  94. isa_model/training/intelligent/intelligent_factory.py +888 -0
  95. isa_model/training/intelligent/knowledge_base.py +751 -0
  96. isa_model/training/intelligent/resource_optimizer.py +839 -0
  97. isa_model/training/intelligent/task_classifier.py +576 -0
  98. isa_model/training/storage/__init__.py +24 -0
  99. isa_model/training/storage/core_integration.py +439 -0
  100. isa_model/training/storage/training_repository.py +552 -0
  101. isa_model/training/storage/training_storage.py +628 -0
  102. {isa_model-0.3.91.dist-info → isa_model-0.4.0.dist-info}/METADATA +13 -1
  103. isa_model-0.4.0.dist-info/RECORD +182 -0
  104. isa_model/deployment/cloud/modal/isa_vision_doc_service.py +0 -766
  105. isa_model/deployment/cloud/modal/register_models.py +0 -321
  106. isa_model/inference/adapter/unified_api.py +0 -248
  107. isa_model/inference/services/helpers/stacked_config.py +0 -148
  108. isa_model/inference/services/img/flux_professional_service.py +0 -603
  109. isa_model/inference/services/img/helpers/base_stacked_service.py +0 -274
  110. isa_model/inference/services/others/table_transformer_service.py +0 -61
  111. isa_model/inference/services/vision/doc_analysis_service.py +0 -640
  112. isa_model/inference/services/vision/helpers/base_stacked_service.py +0 -274
  113. isa_model/inference/services/vision/ui_analysis_service.py +0 -823
  114. isa_model/scripts/inference_tracker.py +0 -283
  115. isa_model/scripts/mlflow_manager.py +0 -379
  116. isa_model/scripts/model_registry.py +0 -465
  117. isa_model/scripts/register_models.py +0 -370
  118. isa_model/scripts/register_models_with_embeddings.py +0 -510
  119. isa_model/scripts/start_mlflow.py +0 -95
  120. isa_model/scripts/training_tracker.py +0 -257
  121. isa_model-0.3.91.dist-info/RECORD +0 -138
  122. {isa_model-0.3.91.dist-info → isa_model-0.4.0.dist-info}/WHEEL +0 -0
  123. {isa_model-0.3.91.dist-info → isa_model-0.4.0.dist-info}/top_level.txt +0 -0
@@ -1,57 +1,67 @@
1
1
  import logging
2
2
  import json
3
3
  import asyncio
4
+ import base64
4
5
  from typing import Dict, Any, List, Optional, Callable, AsyncGenerator
5
6
  import aiohttp
6
7
  from tenacity import retry, stop_after_attempt, wait_exponential
7
8
 
8
- from isa_model.inference.services.base_service import BaseService
9
- from isa_model.inference.providers.base_provider import BaseProvider
10
- from isa_model.inference.billing_tracker import ServiceType
9
+ from isa_model.inference.services.audio.base_realtime_service import BaseRealtimeService, RealtimeEventType
10
+ from isa_model.core.types import ServiceType
11
11
 
12
12
  logger = logging.getLogger(__name__)
13
13
 
14
- class OpenAIRealtimeService(BaseService):
14
+ class OpenAIRealtimeService(BaseRealtimeService):
15
15
  """
16
16
  OpenAI Realtime API service for real-time audio conversations.
17
17
  Uses gpt-4o-mini-realtime-preview model for interactive audio chat.
18
18
  """
19
19
 
20
- def __init__(self, provider: 'BaseProvider', model_name: str = "gpt-4o-mini-realtime-preview"):
21
- super().__init__(provider, model_name)
20
+ def __init__(self, provider_name: str = "openai", model_name: str = "gpt-4o-realtime-preview-2024-10-01", **kwargs):
21
+ super().__init__(provider_name, model_name, **kwargs)
22
22
 
23
- self.api_key = self.config.get('api_key')
24
- self.base_url = self.config.get('api_base', 'https://api.openai.com/v1')
23
+ provider_config = self.get_provider_config()
24
+ self.api_key = provider_config.get('api_key') or self.get_api_key()
25
+ self.base_url = provider_config.get('api_base_url', 'https://api.openai.com/v1')
26
+ self.websocket_url = f"wss://api.openai.com/v1/realtime?model={self.model_name}"
25
27
 
26
- # Default session configuration
28
+ # Default session configuration based on latest API
27
29
  self.default_config = {
28
- "model": self.model_name,
29
- "modalities": ["audio", "text"],
30
+ "modalities": ["text", "audio"],
31
+ "instructions": "You are a helpful assistant.",
30
32
  "voice": "alloy",
31
33
  "input_audio_format": "pcm16",
32
34
  "output_audio_format": "pcm16",
33
35
  "input_audio_transcription": {
34
36
  "model": "whisper-1"
35
37
  },
36
- "turn_detection": None,
38
+ "turn_detection": {
39
+ "type": "server_vad",
40
+ "threshold": 0.5,
41
+ "prefix_padding_ms": 300,
42
+ "silence_duration_ms": 200
43
+ },
37
44
  "tools": [],
38
- "tool_choice": "none",
39
- "temperature": 0.7,
40
- "max_response_output_tokens": 200,
41
- "speed": 1.1,
42
- "tracing": "auto"
45
+ "tool_choice": "auto",
46
+ "temperature": 0.8,
47
+ "max_response_output_tokens": "inf"
48
+ }
49
+
50
+ # Session limits based on API documentation
51
+ self.session_limits = {
52
+ "max_context_tokens": 128000,
53
+ "max_session_time_minutes": 15,
54
+ "audio_tokens_per_minute": 800
43
55
  }
44
56
 
45
57
  logger.info(f"Initialized OpenAIRealtimeService with model '{self.model_name}'")
58
+
59
+ # Add default event handlers for common events
60
+ self._setup_default_handlers()
46
61
 
47
- @retry(
48
- stop=stop_after_attempt(3),
49
- wait=wait_exponential(multiplier=1, min=4, max=10),
50
- reraise=True
51
- )
52
62
  async def create_session(
53
63
  self,
54
- instructions: str = "You are a friendly assistant.",
64
+ instructions: str = "You are a helpful assistant.",
55
65
  modalities: Optional[List[str]] = None,
56
66
  voice: str = "alloy",
57
67
  **kwargs
@@ -62,80 +72,102 @@ class OpenAIRealtimeService(BaseService):
62
72
  session_config = self.default_config.copy()
63
73
  session_config.update({
64
74
  "instructions": instructions,
65
- "modalities": modalities if modalities is not None else ["audio", "text"],
75
+ "modalities": modalities if modalities is not None else ["text", "audio"],
66
76
  "voice": voice,
67
77
  **kwargs
68
78
  })
69
79
 
70
- # Create session via REST API
71
- url = f"{self.base_url}/realtime/sessions"
72
- headers = {
73
- "Authorization": f"Bearer {self.api_key}",
74
- "Content-Type": "application/json"
75
- }
80
+ # Store session config for WebSocket connection
81
+ self.session_config = session_config
76
82
 
77
- async with aiohttp.ClientSession() as session:
78
- async with session.post(url, headers=headers, json=session_config) as response:
79
- if response.status == 200:
80
- result = await response.json()
81
-
82
- # Track usage for billing
83
- self._track_usage(
84
- service_type=ServiceType.AUDIO_STT, # Realtime combines STT/TTS
85
- operation="create_session",
86
- metadata={
87
- "session_id": result.get("id"),
88
- "model": self.model_name,
89
- "modalities": session_config["modalities"]
90
- }
91
- )
92
-
93
- return result
94
- else:
95
- error_text = await response.text()
96
- raise Exception(f"Failed to create session: {response.status} - {error_text}")
83
+ # Generate a session ID (WebSocket-based, no REST endpoint)
84
+ import uuid
85
+ self.session_id = str(uuid.uuid4())
86
+
87
+ # Track session creation for billing
88
+ await self._track_usage(
89
+ service_type=ServiceType.AUDIO_REALTIME,
90
+ operation="create_session",
91
+ metadata={
92
+ "session_id": self.session_id,
93
+ "model": self.model_name,
94
+ "modalities": session_config["modalities"]
95
+ }
96
+ )
97
+
98
+ return {
99
+ "id": self.session_id,
100
+ "model": self.model_name,
101
+ "modalities": session_config["modalities"],
102
+ "instructions": instructions,
103
+ "voice": voice,
104
+ "status": "created"
105
+ }
97
106
 
98
107
  except Exception as e:
99
108
  logger.error(f"Error creating realtime session: {e}")
100
109
  raise
101
110
 
102
- async def connect_websocket(self, session_id: str) -> aiohttp.ClientWebSocketResponse:
103
- """Connect to the realtime WebSocket for a session"""
111
+ async def connect_websocket(self, **kwargs) -> bool:
112
+ """Connect to the realtime WebSocket"""
104
113
  try:
105
- ws_url = f"wss://api.openai.com/v1/realtime/sessions/{session_id}/ws"
106
114
  headers = {
107
115
  "Authorization": f"Bearer {self.api_key}",
108
116
  "OpenAI-Beta": "realtime=v1"
109
117
  }
110
118
 
111
- session = aiohttp.ClientSession()
112
- ws = await session.ws_connect(ws_url, headers=headers)
119
+ self.client_session = aiohttp.ClientSession()
120
+ self.websocket = await self.client_session.ws_connect(
121
+ self.websocket_url,
122
+ headers=headers
123
+ )
124
+
125
+ # Send session.update event to configure the session
126
+ if hasattr(self, 'session_config'):
127
+ await self._send_event({
128
+ "type": "session.update",
129
+ "session": self.session_config
130
+ })
113
131
 
114
- logger.info(f"Connected to realtime WebSocket for session {session_id}")
115
- return ws
132
+ self.is_connected = True
133
+ logger.info(f"Connected to realtime WebSocket with model {self.model_name}")
134
+ return True
116
135
 
117
136
  except Exception as e:
118
137
  logger.error(f"Error connecting to WebSocket: {e}")
138
+ self.is_connected = False
119
139
  raise
120
140
 
121
141
  async def send_audio_message(
122
142
  self,
123
- ws: aiohttp.ClientWebSocketResponse,
124
143
  audio_data: bytes,
125
- format: str = "pcm16"
126
- ):
144
+ format: str = "pcm16",
145
+ **kwargs
146
+ ) -> Dict[str, Any]:
127
147
  """Send audio data to the realtime session"""
128
148
  try:
129
- message = {
130
- "type": "input_audio_buffer.append",
131
- "audio": audio_data.hex() if format == "pcm16" else audio_data
132
- }
149
+ if not self.is_connected or not self.websocket:
150
+ raise RuntimeError("WebSocket not connected")
151
+
152
+ # Convert audio data to base64
153
+ audio_base64 = base64.b64encode(audio_data).decode('utf-8')
133
154
 
134
- await ws.send_str(json.dumps(message))
155
+ # Send audio buffer append event
156
+ await self._send_event({
157
+ "type": RealtimeEventType.INPUT_AUDIO_BUFFER_APPEND.value,
158
+ "audio": audio_base64
159
+ })
135
160
 
136
161
  # Commit the audio buffer
137
- commit_message = {"type": "input_audio_buffer.commit"}
138
- await ws.send_str(json.dumps(commit_message))
162
+ await self._send_event({
163
+ "type": RealtimeEventType.INPUT_AUDIO_BUFFER_COMMIT.value
164
+ })
165
+
166
+ return {
167
+ "status": "sent",
168
+ "audio_size_bytes": len(audio_data),
169
+ "format": format
170
+ }
139
171
 
140
172
  except Exception as e:
141
173
  logger.error(f"Error sending audio message: {e}")
@@ -143,13 +175,17 @@ class OpenAIRealtimeService(BaseService):
143
175
 
144
176
  async def send_text_message(
145
177
  self,
146
- ws: aiohttp.ClientWebSocketResponse,
147
- text: str
148
- ):
178
+ text: str,
179
+ **kwargs
180
+ ) -> Dict[str, Any]:
149
181
  """Send text message to the realtime session"""
150
182
  try:
151
- message = {
152
- "type": "conversation.item.create",
183
+ if not self.is_connected or not self.websocket:
184
+ raise RuntimeError("WebSocket not connected")
185
+
186
+ # Create conversation item
187
+ await self._send_event({
188
+ "type": RealtimeEventType.CONVERSATION_ITEM_CREATE.value,
153
189
  "item": {
154
190
  "type": "message",
155
191
  "role": "user",
@@ -160,75 +196,115 @@ class OpenAIRealtimeService(BaseService):
160
196
  }
161
197
  ]
162
198
  }
163
- }
199
+ })
164
200
 
165
- await ws.send_str(json.dumps(message))
201
+ # Trigger response creation
202
+ await self._send_event({
203
+ "type": RealtimeEventType.RESPONSE_CREATE.value
204
+ })
166
205
 
167
- # Trigger response
168
- response_message = {"type": "response.create"}
169
- await ws.send_str(json.dumps(response_message))
206
+ return {
207
+ "status": "sent",
208
+ "text": text,
209
+ "message_length": len(text)
210
+ }
170
211
 
171
212
  except Exception as e:
172
213
  logger.error(f"Error sending text message: {e}")
173
214
  raise
174
215
 
175
216
  async def listen_for_responses(
176
- self,
177
- ws: aiohttp.ClientWebSocketResponse,
178
- message_handler: Optional[Callable] = None
217
+ self,
218
+ message_handler: Optional[Callable] = None,
219
+ **kwargs
179
220
  ) -> AsyncGenerator[Dict[str, Any], None]:
180
221
  """Listen for responses from the realtime session"""
181
222
  try:
182
- async for msg in ws:
223
+ if not self.is_connected or not self.websocket:
224
+ raise RuntimeError("WebSocket not connected")
225
+
226
+ async for msg in self.websocket:
183
227
  if msg.type == aiohttp.WSMsgType.TEXT:
184
228
  try:
185
- data = json.loads(msg.data)
229
+ event = json.loads(msg.data)
230
+ event_type = event.get("type")
186
231
 
187
- # Handle different message types
188
- if data.get("type") == "response.audio.delta":
189
- # Audio response chunk
232
+ # Handle built-in event processing
233
+ await self._handle_event(event)
234
+
235
+ # Yield specific response types
236
+ if event_type == RealtimeEventType.RESPONSE_AUDIO_DELTA.value:
237
+ audio_data = event.get("delta", "")
190
238
  yield {
191
- "type": "audio",
192
- "data": data.get("delta", ""),
193
- "format": "pcm16"
239
+ "type": "audio_delta",
240
+ "data": audio_data,
241
+ "format": "pcm16",
242
+ "raw_event": event
194
243
  }
195
- elif data.get("type") == "response.text.delta":
196
- # Text response chunk
244
+ elif event_type == RealtimeEventType.RESPONSE_TEXT_DELTA.value:
245
+ text_data = event.get("delta", "")
197
246
  yield {
198
- "type": "text",
199
- "data": data.get("delta", "")
247
+ "type": "text_delta",
248
+ "data": text_data,
249
+ "raw_event": event
200
250
  }
201
- elif data.get("type") == "response.done":
251
+ elif event_type == RealtimeEventType.RESPONSE_AUDIO_TRANSCRIPT_DELTA.value:
252
+ transcript_data = event.get("delta", "")
253
+ yield {
254
+ "type": "transcript_delta",
255
+ "data": transcript_data,
256
+ "raw_event": event
257
+ }
258
+ elif event_type == RealtimeEventType.RESPONSE_DONE.value:
202
259
  # Response completed
203
- usage = data.get("response", {}).get("usage", {})
260
+ response = event.get("response", {})
261
+ usage = response.get("usage", {})
204
262
 
205
263
  # Track usage for billing
206
- self._track_usage(
207
- service_type=ServiceType.AUDIO_STT,
264
+ await self._track_usage(
265
+ service_type=ServiceType.AUDIO_REALTIME,
208
266
  operation="realtime_response",
209
267
  input_tokens=usage.get("input_tokens", 0),
210
268
  output_tokens=usage.get("output_tokens", 0),
211
269
  metadata={
212
- "response_id": data.get("response", {}).get("id"),
213
- "model": self.model_name
270
+ "response_id": response.get("id"),
271
+ "model": self.model_name,
272
+ "status": response.get("status")
214
273
  }
215
274
  )
216
275
 
217
276
  yield {
218
- "type": "done",
219
- "usage": usage
277
+ "type": "response_done",
278
+ "response": response,
279
+ "usage": usage,
280
+ "raw_event": event
281
+ }
282
+ elif event_type == RealtimeEventType.ERROR.value:
283
+ logger.error(f"Realtime API error: {event}")
284
+ yield {
285
+ "type": "error",
286
+ "error": event.get("error", {}),
287
+ "raw_event": event
220
288
  }
221
289
 
222
290
  # Call custom message handler if provided
223
291
  if message_handler:
224
- await message_handler(data)
292
+ await message_handler(event)
225
293
 
226
294
  except json.JSONDecodeError as e:
227
295
  logger.error(f"Error parsing WebSocket message: {e}")
228
296
  continue
229
297
 
230
298
  elif msg.type == aiohttp.WSMsgType.ERROR:
231
- logger.error(f"WebSocket error: {ws.exception()}")
299
+ logger.error(f"WebSocket error: {self.websocket.exception()}")
300
+ yield {
301
+ "type": "websocket_error",
302
+ "error": str(self.websocket.exception())
303
+ }
304
+ break
305
+ elif msg.type == aiohttp.WSMsgType.CLOSED:
306
+ logger.info("WebSocket connection closed")
307
+ self.is_connected = False
232
308
  break
233
309
 
234
310
  except Exception as e:
@@ -239,7 +315,8 @@ class OpenAIRealtimeService(BaseService):
239
315
  self,
240
316
  audio_data: bytes,
241
317
  instructions: str = "You are a helpful assistant. Respond in audio.",
242
- voice: str = "alloy"
318
+ voice: str = "alloy",
319
+ **kwargs
243
320
  ) -> Dict[str, Any]:
244
321
  """Simple audio chat - send audio, get audio response"""
245
322
  try:
@@ -252,34 +329,42 @@ class OpenAIRealtimeService(BaseService):
252
329
  session_id = session["id"]
253
330
 
254
331
  # Connect to WebSocket
255
- ws = await self.connect_websocket(session_id)
332
+ await self.connect_websocket()
256
333
 
257
334
  try:
258
335
  # Send audio
259
- await self.send_audio_message(ws, audio_data)
336
+ await self.send_audio_message(audio_data)
260
337
 
261
338
  # Collect response
262
339
  audio_chunks = []
340
+ transcript_chunks = []
263
341
  usage_info = {}
264
342
 
265
- async for response in self.listen_for_responses(ws):
266
- if response["type"] == "audio":
343
+ async for response in self.listen_for_responses():
344
+ if response["type"] == "audio_delta":
267
345
  audio_chunks.append(response["data"])
268
- elif response["type"] == "done":
346
+ elif response["type"] == "transcript_delta":
347
+ transcript_chunks.append(response["data"])
348
+ elif response["type"] == "response_done":
269
349
  usage_info = response["usage"]
270
350
  break
351
+ elif response["type"] == "error":
352
+ raise Exception(f"Realtime API error: {response['error']}")
271
353
 
272
- # Combine audio chunks
354
+ # Combine chunks
273
355
  full_audio = "".join(audio_chunks)
356
+ full_transcript = "".join(transcript_chunks)
274
357
 
275
358
  return {
276
359
  "audio_response": full_audio,
360
+ "transcript": full_transcript,
277
361
  "session_id": session_id,
278
- "usage": usage_info
362
+ "usage": usage_info,
363
+ "format": "pcm16"
279
364
  }
280
365
 
281
366
  finally:
282
- await ws.close()
367
+ await self.disconnect()
283
368
 
284
369
  except Exception as e:
285
370
  logger.error(f"Error in simple audio chat: {e}")
@@ -289,9 +374,10 @@ class OpenAIRealtimeService(BaseService):
289
374
  self,
290
375
  text: str,
291
376
  instructions: str = "You are a helpful assistant.",
292
- voice: str = "alloy"
377
+ voice: str = "alloy",
378
+ **kwargs
293
379
  ) -> Dict[str, Any]:
294
- """Simple text chat - send text, get audio response"""
380
+ """Simple text chat - send text, get audio/text response"""
295
381
  try:
296
382
  # Create session
297
383
  session = await self.create_session(
@@ -302,38 +388,46 @@ class OpenAIRealtimeService(BaseService):
302
388
  session_id = session["id"]
303
389
 
304
390
  # Connect to WebSocket
305
- ws = await self.connect_websocket(session_id)
391
+ await self.connect_websocket()
306
392
 
307
393
  try:
308
394
  # Send text
309
- await self.send_text_message(ws, text)
395
+ await self.send_text_message(text)
310
396
 
311
397
  # Collect response
312
398
  text_response = ""
313
399
  audio_chunks = []
400
+ transcript_chunks = []
314
401
  usage_info = {}
315
402
 
316
- async for response in self.listen_for_responses(ws):
317
- if response["type"] == "text":
403
+ async for response in self.listen_for_responses():
404
+ if response["type"] == "text_delta":
318
405
  text_response += response["data"]
319
- elif response["type"] == "audio":
406
+ elif response["type"] == "audio_delta":
320
407
  audio_chunks.append(response["data"])
321
- elif response["type"] == "done":
408
+ elif response["type"] == "transcript_delta":
409
+ transcript_chunks.append(response["data"])
410
+ elif response["type"] == "response_done":
322
411
  usage_info = response["usage"]
323
412
  break
413
+ elif response["type"] == "error":
414
+ raise Exception(f"Realtime API error: {response['error']}")
324
415
 
325
- # Combine audio chunks
416
+ # Combine chunks
326
417
  full_audio = "".join(audio_chunks)
418
+ full_transcript = "".join(transcript_chunks)
327
419
 
328
420
  return {
329
421
  "text_response": text_response,
330
422
  "audio_response": full_audio,
423
+ "transcript": full_transcript,
331
424
  "session_id": session_id,
332
- "usage": usage_info
425
+ "usage": usage_info,
426
+ "format": "pcm16"
333
427
  }
334
428
 
335
429
  finally:
336
- await ws.close()
430
+ await self.disconnect()
337
431
 
338
432
  except Exception as e:
339
433
  logger.error(f"Error in simple text chat: {e}")
@@ -347,7 +441,109 @@ class OpenAIRealtimeService(BaseService):
347
441
  """Get list of supported audio formats"""
348
442
  return ["pcm16", "g711_ulaw", "g711_alaw"]
349
443
 
444
+ def get_session_limits(self) -> Dict[str, Any]:
445
+ """Get session limits and constraints"""
446
+ return self.session_limits.copy()
447
+
448
+ async def update_session(self, **kwargs) -> Dict[str, Any]:
449
+ """Update session configuration"""
450
+ try:
451
+ if not self.is_connected or not self.websocket:
452
+ raise RuntimeError("WebSocket not connected")
453
+
454
+ # Update session config
455
+ session_update = {k: v for k, v in kwargs.items() if k in self.default_config}
456
+
457
+ if session_update:
458
+ await self._send_event({
459
+ "type": "session.update",
460
+ "session": session_update
461
+ })
462
+
463
+ # Update local config
464
+ if hasattr(self, 'session_config'):
465
+ self.session_config.update(session_update)
466
+
467
+ return {
468
+ "status": "updated",
469
+ "updated_fields": list(session_update.keys())
470
+ }
471
+
472
+ except Exception as e:
473
+ logger.error(f"Error updating session: {e}")
474
+ raise
475
+
476
+ async def disconnect(self):
477
+ """Disconnect from the realtime session"""
478
+ try:
479
+ if self.websocket and not self.websocket.closed:
480
+ await self.websocket.close()
481
+
482
+ if hasattr(self, 'client_session') and self.client_session:
483
+ await self.client_session.close()
484
+
485
+ self.is_connected = False
486
+ self.websocket = None
487
+
488
+ logger.info("Disconnected from realtime session")
489
+
490
+ except Exception as e:
491
+ logger.error(f"Error disconnecting: {e}")
492
+
493
+ async def _send_event(self, event: Dict[str, Any]):
494
+ """Send an event to the WebSocket"""
495
+ if not self.websocket or self.websocket.closed:
496
+ raise RuntimeError("WebSocket not connected")
497
+
498
+ event_json = json.dumps(event)
499
+ await self.websocket.send_str(event_json)
500
+ logger.debug(f"Sent event: {event.get('type')}")
501
+
502
+ def _setup_default_handlers(self):
503
+ """Setup default event handlers for common events"""
504
+
505
+ async def handle_session_created(event):
506
+ logger.info(f"Session created: {event.get('session', {}).get('id')}")
507
+
508
+ async def handle_session_updated(event):
509
+ logger.info(f"Session updated: {event.get('session', {})}")
510
+
511
+ async def handle_input_audio_buffer_committed(event):
512
+ logger.debug(f"Audio buffer committed: {event.get('item_id', 'unknown')}")
513
+
514
+ async def handle_input_audio_buffer_speech_started(event):
515
+ logger.debug(f"Speech started: {event.get('audio_start_ms', 0)}ms")
516
+
517
+ async def handle_input_audio_buffer_speech_stopped(event):
518
+ logger.debug(f"Speech stopped: {event.get('audio_end_ms', 0)}ms")
519
+
520
+ async def handle_conversation_item_created(event):
521
+ item = event.get('item', {})
522
+ logger.debug(f"Conversation item created: {item.get('type')} - {item.get('id')}")
523
+
524
+ async def handle_response_created(event):
525
+ response = event.get('response', {})
526
+ logger.debug(f"Response created: {response.get('id')}")
527
+
528
+ async def handle_rate_limits_updated(event):
529
+ limits = event.get('rate_limits', [])
530
+ logger.debug(f"Rate limits updated: {limits}")
531
+
532
+ async def handle_error(event):
533
+ error = event.get('error', {})
534
+ logger.error(f"Realtime API error: {error.get('message')} (Code: {error.get('code')})")
535
+
536
+ # Register default handlers
537
+ self.add_event_handler(RealtimeEventType.SESSION_CREATED, handle_session_created)
538
+ self.add_event_handler(RealtimeEventType.SESSION_UPDATED, handle_session_updated)
539
+ self.add_event_handler(RealtimeEventType.INPUT_AUDIO_BUFFER_COMMITTED, handle_input_audio_buffer_committed)
540
+ self.add_event_handler(RealtimeEventType.INPUT_AUDIO_BUFFER_SPEECH_STARTED, handle_input_audio_buffer_speech_started)
541
+ self.add_event_handler(RealtimeEventType.INPUT_AUDIO_BUFFER_SPEECH_STOPPED, handle_input_audio_buffer_speech_stopped)
542
+ self.add_event_handler(RealtimeEventType.CONVERSATION_ITEM_CREATED, handle_conversation_item_created)
543
+ self.add_event_handler(RealtimeEventType.RESPONSE_CREATED, handle_response_created)
544
+ self.add_event_handler(RealtimeEventType.RATE_LIMITS_UPDATED, handle_rate_limits_updated)
545
+ self.add_event_handler(RealtimeEventType.ERROR, handle_error)
546
+
350
547
  async def close(self):
351
548
  """Cleanup resources"""
352
- # No persistent connections to close for REST API
353
- pass
549
+ await self.disconnect()