isa-model 0.3.5__py3-none-any.whl → 0.3.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isa_model/__init__.py +30 -1
- isa_model/client.py +770 -0
- isa_model/core/config/__init__.py +16 -0
- isa_model/core/config/config_manager.py +514 -0
- isa_model/core/config.py +426 -0
- isa_model/core/models/model_billing_tracker.py +476 -0
- isa_model/core/models/model_manager.py +399 -0
- isa_model/core/{storage/supabase_storage.py → models/model_repo.py} +72 -73
- isa_model/core/pricing_manager.py +426 -0
- isa_model/core/services/__init__.py +19 -0
- isa_model/core/services/intelligent_model_selector.py +547 -0
- isa_model/core/types.py +291 -0
- isa_model/deployment/__init__.py +2 -0
- isa_model/deployment/cloud/modal/isa_vision_doc_service.py +157 -3
- isa_model/deployment/cloud/modal/isa_vision_table_service.py +532 -0
- isa_model/deployment/cloud/modal/isa_vision_ui_service.py +104 -3
- isa_model/deployment/cloud/modal/register_models.py +321 -0
- isa_model/deployment/runtime/deployed_service.py +338 -0
- isa_model/deployment/services/__init__.py +9 -0
- isa_model/deployment/services/auto_deploy_vision_service.py +537 -0
- isa_model/deployment/services/model_service.py +332 -0
- isa_model/deployment/services/service_monitor.py +356 -0
- isa_model/deployment/services/service_registry.py +527 -0
- isa_model/eval/__init__.py +80 -44
- isa_model/eval/config/__init__.py +10 -0
- isa_model/eval/config/evaluation_config.py +108 -0
- isa_model/eval/evaluators/__init__.py +18 -0
- isa_model/eval/evaluators/base_evaluator.py +503 -0
- isa_model/eval/evaluators/llm_evaluator.py +472 -0
- isa_model/eval/factory.py +417 -709
- isa_model/eval/infrastructure/__init__.py +24 -0
- isa_model/eval/infrastructure/experiment_tracker.py +466 -0
- isa_model/eval/metrics.py +191 -21
- isa_model/inference/ai_factory.py +181 -605
- isa_model/inference/services/audio/base_stt_service.py +65 -1
- isa_model/inference/services/audio/base_tts_service.py +75 -1
- isa_model/inference/services/audio/openai_stt_service.py +189 -151
- isa_model/inference/services/audio/openai_tts_service.py +12 -10
- isa_model/inference/services/audio/replicate_tts_service.py +61 -56
- isa_model/inference/services/base_service.py +55 -17
- isa_model/inference/services/embedding/base_embed_service.py +65 -1
- isa_model/inference/services/embedding/ollama_embed_service.py +103 -43
- isa_model/inference/services/embedding/openai_embed_service.py +8 -10
- isa_model/inference/services/helpers/stacked_config.py +148 -0
- isa_model/inference/services/img/__init__.py +18 -0
- isa_model/inference/services/{vision → img}/base_image_gen_service.py +80 -1
- isa_model/inference/services/{stacked → img}/flux_professional_service.py +25 -1
- isa_model/inference/services/{stacked → img/helpers}/base_stacked_service.py +40 -35
- isa_model/inference/services/{vision → img}/replicate_image_gen_service.py +44 -31
- isa_model/inference/services/llm/__init__.py +3 -3
- isa_model/inference/services/llm/base_llm_service.py +492 -40
- isa_model/inference/services/llm/helpers/llm_prompts.py +258 -0
- isa_model/inference/services/llm/helpers/llm_utils.py +280 -0
- isa_model/inference/services/llm/ollama_llm_service.py +51 -17
- isa_model/inference/services/llm/openai_llm_service.py +70 -19
- isa_model/inference/services/llm/yyds_llm_service.py +24 -23
- isa_model/inference/services/vision/__init__.py +38 -4
- isa_model/inference/services/vision/base_vision_service.py +218 -117
- isa_model/inference/services/vision/{isA_vision_service.py → disabled/isA_vision_service.py} +98 -0
- isa_model/inference/services/{stacked → vision}/doc_analysis_service.py +1 -1
- isa_model/inference/services/vision/helpers/base_stacked_service.py +274 -0
- isa_model/inference/services/vision/helpers/image_utils.py +272 -3
- isa_model/inference/services/vision/helpers/vision_prompts.py +297 -0
- isa_model/inference/services/vision/openai_vision_service.py +104 -307
- isa_model/inference/services/vision/replicate_vision_service.py +140 -325
- isa_model/inference/services/{stacked → vision}/ui_analysis_service.py +2 -498
- isa_model/scripts/register_models.py +370 -0
- isa_model/scripts/register_models_with_embeddings.py +510 -0
- isa_model/serving/api/fastapi_server.py +6 -1
- isa_model/serving/api/routes/unified.py +202 -0
- {isa_model-0.3.5.dist-info → isa_model-0.3.6.dist-info}/METADATA +4 -1
- {isa_model-0.3.5.dist-info → isa_model-0.3.6.dist-info}/RECORD +77 -53
- isa_model/config/__init__.py +0 -9
- isa_model/config/config_manager.py +0 -213
- isa_model/core/model_manager.py +0 -213
- isa_model/core/model_registry.py +0 -375
- isa_model/core/vision_models_init.py +0 -116
- isa_model/inference/billing_tracker.py +0 -406
- isa_model/inference/services/llm/triton_llm_service.py +0 -481
- isa_model/inference/services/stacked/__init__.py +0 -26
- isa_model/inference/services/stacked/config.py +0 -426
- isa_model/inference/services/vision/ollama_vision_service.py +0 -194
- /isa_model/core/{model_storage.py → models/model_storage.py} +0 -0
- /isa_model/inference/services/{vision → embedding}/helpers/text_splitter.py +0 -0
- /isa_model/inference/services/llm/{llm_adapter.py → helpers/llm_adapter.py} +0 -0
- {isa_model-0.3.5.dist-info → isa_model-0.3.6.dist-info}/WHEEL +0 -0
- {isa_model-0.3.5.dist-info → isa_model-0.3.6.dist-info}/top_level.txt +0 -0
```diff
--- a/isa_model/inference/services/audio/base_stt_service.py
+++ b/isa_model/inference/services/audio/base_stt_service.py
@@ -3,7 +3,71 @@ from typing import Dict, Any, List, Union, Optional, BinaryIO
 from isa_model.inference.services.base_service import BaseService
 
 class BaseSTTService(BaseService):
-    """Base class for Speech-to-Text services"""
+    """Base class for Speech-to-Text services with unified task dispatch"""
+
+    async def invoke(
+        self,
+        audio_input: Union[str, BinaryIO, List[Union[str, BinaryIO]]],
+        task: Optional[str] = None,
+        **kwargs
+    ) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
+        """
+        Unified task dispatch method - generic implementation provided by the base class
+
+        Args:
+            audio_input: Audio input; one of:
+                - str: path to an audio file
+                - BinaryIO: audio file object
+                - List: multiple audio files (batch processing)
+            task: Task type; multiple STT tasks are supported
+            **kwargs: Additional task-specific parameters
+
+        Returns:
+            Dict or List[Dict] containing task results
+        """
+        task = task or "transcribe"
+
+        # ==================== Speech-to-text tasks ====================
+        if task == "transcribe":
+            if isinstance(audio_input, list):
+                return await self.transcribe_batch(
+                    audio_input,
+                    kwargs.get("language"),
+                    kwargs.get("prompt")
+                )
+            else:
+                return await self.transcribe(
+                    audio_input,
+                    kwargs.get("language"),
+                    kwargs.get("prompt")
+                )
+        elif task == "translate":
+            if isinstance(audio_input, list):
+                raise ValueError("translate task requires single audio input")
+            return await self.translate(audio_input)
+        elif task == "batch_transcribe":
+            if not isinstance(audio_input, list):
+                audio_input = [audio_input]
+            return await self.transcribe_batch(
+                audio_input,
+                kwargs.get("language"),
+                kwargs.get("prompt")
+            )
+        elif task == "detect_language":
+            if isinstance(audio_input, list):
+                raise ValueError("detect_language task requires single audio input")
+            return await self.detect_language(audio_input)
+        else:
+            raise NotImplementedError(f"{self.__class__.__name__} does not support task: {task}")
+
+    def get_supported_tasks(self) -> List[str]:
+        """
+        Get the list of supported tasks
+
+        Returns:
+            List of supported task names
+        """
+        return ["transcribe", "translate", "batch_transcribe", "detect_language"]
 
     @abstractmethod
     async def transcribe(
```
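The dispatch above gives callers one entry point, with behaviour selected by `task` and keyword arguments. A minimal usage sketch, assuming an already-constructed STT service from this package (for example `OpenAISTTService`); the audio file names are placeholders:

```python
import asyncio

async def run_stt_tasks(stt_service):
    # Default task is "transcribe": a single path goes to transcribe(),
    # a list of paths is routed to transcribe_batch().
    single = await stt_service.invoke("meeting.wav", language="en")
    batch = await stt_service.invoke(["a.wav", "b.wav"], task="batch_transcribe")

    # "translate" and "detect_language" only accept a single input.
    english = await stt_service.invoke("interview_fr.wav", task="translate")
    detected = await stt_service.invoke("clip.wav", task="detect_language")

    return single["text"], len(batch), english["text"], detected["language"]

# asyncio.run(run_stt_tasks(stt_service))  # with a concrete service instance
```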
```diff
--- a/isa_model/inference/services/audio/base_tts_service.py
+++ b/isa_model/inference/services/audio/base_tts_service.py
@@ -3,7 +3,81 @@ from typing import Dict, Any, List, Union, Optional, BinaryIO
 from isa_model.inference.services.base_service import BaseService
 
 class BaseTTSService(BaseService):
-    """Base class for Text-to-Speech services"""
+    """Base class for Text-to-Speech services with unified task dispatch"""
+
+    async def invoke(
+        self,
+        text: Union[str, List[str]],
+        task: Optional[str] = None,
+        **kwargs
+    ) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
+        """
+        Unified task dispatch method - generic implementation provided by the base class
+
+        Args:
+            text: Input text; one of:
+                - str: a single text
+                - List[str]: multiple texts (batch processing)
+            task: Task type; multiple TTS tasks are supported
+            **kwargs: Additional task-specific parameters
+
+        Returns:
+            Dict or List[Dict] containing task results
+        """
+        task = task or "synthesize"
+
+        # ==================== Speech-synthesis tasks ====================
+        if task == "synthesize":
+            if isinstance(text, list):
+                return await self.synthesize_speech_batch(
+                    text,
+                    kwargs.get("voice"),
+                    kwargs.get("speed", 1.0),
+                    kwargs.get("pitch", 1.0),
+                    kwargs.get("format", "mp3")
+                )
+            else:
+                return await self.synthesize_speech(
+                    text,
+                    kwargs.get("voice"),
+                    kwargs.get("speed", 1.0),
+                    kwargs.get("pitch", 1.0),
+                    kwargs.get("format", "mp3")
+                )
+        elif task == "synthesize_to_file":
+            if not kwargs.get("output_path"):
+                raise ValueError("output_path is required for synthesize_to_file task")
+            if isinstance(text, list):
+                raise ValueError("synthesize_to_file task requires single text input")
+            return await self.synthesize_speech_to_file(
+                text,
+                kwargs["output_path"],
+                kwargs.get("voice"),
+                kwargs.get("speed", 1.0),
+                kwargs.get("pitch", 1.0),
+                kwargs.get("format", "mp3")
+            )
+        elif task == "batch_synthesize":
+            if not isinstance(text, list):
+                text = [text]
+            return await self.synthesize_speech_batch(
+                text,
+                kwargs.get("voice"),
+                kwargs.get("speed", 1.0),
+                kwargs.get("pitch", 1.0),
+                kwargs.get("format", "mp3")
+            )
+        else:
+            raise NotImplementedError(f"{self.__class__.__name__} does not support task: {task}")
+
+    def get_supported_tasks(self) -> List[str]:
+        """
+        Get the list of supported tasks
+
+        Returns:
+            List of supported task names
+        """
+        return ["synthesize", "synthesize_to_file", "batch_synthesize"]
 
     @abstractmethod
     async def synthesize_speech(
```
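The TTS base class gains the same pattern, with voice, speed, pitch, and format passed through `**kwargs`. A minimal sketch, assuming a constructed TTS service such as `OpenAITTSService`; the voice name and file path are illustrative values, not package defaults:

```python
import asyncio

async def run_tts_tasks(tts_service):
    # Default task is "synthesize"; kwargs map onto voice/speed/pitch/format.
    clip = await tts_service.invoke("Hello from isa_model", voice="alloy", speed=1.1)

    # "synthesize_to_file" needs an output_path and a single text input.
    await tts_service.invoke("Saved to disk", task="synthesize_to_file", output_path="out.mp3")

    # A list of texts (or task="batch_synthesize") fans out to synthesize_speech_batch().
    clips = await tts_service.invoke(["one", "two"], task="batch_synthesize", format="mp3")
    return clip, clips

# asyncio.run(run_tts_tasks(tts_service))  # with a concrete service instance
```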
```diff
--- a/isa_model/inference/services/audio/openai_stt_service.py
+++ b/isa_model/inference/services/audio/openai_stt_service.py
@@ -5,8 +5,6 @@ from openai import AsyncOpenAI
 from tenacity import retry, stop_after_attempt, wait_exponential
 
 from isa_model.inference.services.audio.base_stt_service import BaseSTTService
-from isa_model.inference.providers.base_provider import BaseProvider
-from isa_model.inference.billing_tracker import ServiceType
 
 logger = logging.getLogger(__name__)
 
@@ -14,22 +12,22 @@ class OpenAISTTService(BaseSTTService):
     """
     OpenAI Speech-to-Text service using whisper-1 model.
     Supports transcription and translation to English.
+    Uses the new unified architecture with centralized config management.
     """
 
-    def __init__(self,
-        super().__init__(
+    def __init__(self, provider_name: str, model_name: str = "whisper-1", **kwargs):
+        super().__init__(provider_name, model_name, **kwargs)
 
-        # Get
-        provider_config =
+        # Get provider configuration from centralized config manager
+        provider_config = self.get_provider_config()
 
         # Initialize AsyncOpenAI client with provider configuration
         try:
-
-            raise ValueError("OpenAI API key not found in provider configuration")
+            api_key = self.get_api_key()
 
             self.client = AsyncOpenAI(
-                api_key=
-                base_url=provider_config.get("
+                api_key=api_key,
+                base_url=provider_config.get("api_base_url", "https://api.openai.com/v1"),
                 organization=provider_config.get("organization")
             )
 
```
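The constructor now takes a provider name and model name and pulls credentials from the centralized config manager (`get_provider_config()` / `get_api_key()`) instead of receiving a provider object. A sketch of direct construction; the `"openai"` provider string is an assumption, and in practice the unified client/AIFactory added in this release would typically build the service for you:

```python
from isa_model.inference.services.audio.openai_stt_service import OpenAISTTService

# Assumes the OpenAI API key is resolvable through the centralized config
# manager (e.g. from the environment); provider/model names are illustrative.
stt = OpenAISTTService(provider_name="openai", model_name="whisper-1")
```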
```diff
--- a/isa_model/inference/services/audio/openai_stt_service.py
+++ b/isa_model/inference/services/audio/openai_stt_service.py
@@ -48,205 +46,245 @@ class OpenAISTTService(BaseSTTService):
         wait=wait_exponential(multiplier=1, min=4, max=10),
         reraise=True
     )
-    async def
-        """
-
-
-
-
-
-
-    async def transcribe(
-        self,
-        audio_file: Union[str, BinaryIO],
-        language: Optional[str] = None,
-        prompt: Optional[str] = None
-    ) -> Dict[str, Any]:
-        """Transcribe audio file to text using whisper-1"""
-        try:
-            # Prepare the audio file
-            if isinstance(audio_file, str):
-                if audio_file.startswith(('http://', 'https://')):
-                    # Download audio from URL
-                    audio_data = await self._download_audio(audio_file)
-                    filename = audio_file.split('/')[-1] or 'audio.wav'
-                else:
-                    # Local file path
-                    with open(audio_file, 'rb') as f:
-                        audio_data = f.read()
-                    filename = audio_file
-            else:
-                audio_data = audio_file.read()
-                filename = getattr(audio_file, 'name', 'audio.wav')
-
-            # Check file size
-            if len(audio_data) > self.max_file_size:
-                raise ValueError(f"Audio file size ({len(audio_data)} bytes) exceeds maximum ({self.max_file_size} bytes)")
+    async def transcribe(self, audio_file: Union[str, BinaryIO], language: Optional[str] = None, prompt: Optional[str] = None) -> Dict[str, Any]:
+        """
+        Transcribe audio file to text using OpenAI's Whisper model.
+
+        Args:
+            audio_file: Path to audio file or file-like object
+            language: Optional language code for better accuracy
+            **kwargs: Additional parameters for the transcription API
 
-
-
+        Returns:
+            Dict containing transcription result and metadata
+        """
+        try:
+            # Prepare request parameters
+            transcription_params = {
                 "model": self.model_name,
-                "file": (filename, audio_data),
                 "response_format": "verbose_json"
             }
 
             if language:
-
-            if prompt:
-                kwargs["prompt"] = prompt
+                transcription_params["language"] = language
 
-            #
-
+            # Add optional parameters
+            if prompt:
+                transcription_params["prompt"] = prompt
 
-            #
-
-
-
+            # Handle file input
+            if isinstance(audio_file, str):
+                with open(audio_file, "rb") as f:
+                    transcription = await self.client.audio.transcriptions.create(
+                        file=f,
+                        **transcription_params
+                    )
+            else:
+                transcription = await self.client.audio.transcriptions.create(
+                    file=audio_file,
+                    **transcription_params
+                )
 
-            #
-
+            # Extract usage information for billing
+            result = {
+                "text": transcription.text,
+                "language": getattr(transcription, 'language', language),
+                "duration": getattr(transcription, 'duration', None),
+                "segments": getattr(transcription, 'segments', []),
+                "usage": {
+                    "input_units": getattr(transcription, 'duration', 1),  # Duration in seconds
+                    "output_tokens": len(transcription.text.split()) if transcription.text else 0
+                }
+            }
 
-
-
+            # Track usage for billing
+            await self._track_usage(
+                service_type="audio_stt",
                 operation="transcribe",
-
-                output_tokens=output_tokens,
-                input_units=duration_minutes,  # Duration in minutes
+                input_units=result["usage"]["input_units"],
+                output_tokens=result["usage"]["output_tokens"],
                 metadata={
-                    "language": language,
-                    "
-                    "
+                    "language": result.get("language"),
+                    "model_name": self.model_name,
+                    "provider": self.provider_name
                 }
             )
 
-            # Format response
-            result = {
-                "text": response.text,
-                "language": getattr(response, 'language', language or 'unknown'),
-                "duration": getattr(response, 'duration', None),
-                "segments": getattr(response, 'segments', []),
-                "confidence": None,  # whisper-1 doesn't provide confidence scores
-                "usage": usage  # Include usage information
-            }
-
             return result
 
         except Exception as e:
-            logger.error(f"
+            logger.error(f"Transcription failed: {e}")
             raise
-
+
     @retry(
         stop=stop_after_attempt(3),
         wait=wait_exponential(multiplier=1, min=4, max=10),
         reraise=True
     )
-    async def translate(
-
-
-
-
+    async def translate(self, audio_file: Union[str, BinaryIO]) -> Dict[str, Any]:
+        """
+        Translate audio file to English text using OpenAI's Whisper model.
+
+        Args:
+            audio_file: Path to audio file or file-like object
+            **kwargs: Additional parameters for the translation API
+
+        Returns:
+            Dict containing translation result and metadata
+        """
         try:
-            # Prepare
-
-
-
-
-            else:
-                audio_data = audio_file.read()
-                filename = getattr(audio_file, 'name', 'audio.wav')
+            # Prepare request parameters
+            translation_params = {
+                "model": self.model_name,
+                "response_format": "verbose_json"
+            }
 
-            #
-            if len(audio_data) > self.max_file_size:
-                raise ValueError(f"Audio file size ({len(audio_data)} bytes) exceeds maximum ({self.max_file_size} bytes)")
+            # No additional parameters for translation
 
-            #
-
-
-
-
-
+            # Handle file input
+            if isinstance(audio_file, str):
+                with open(audio_file, "rb") as f:
+                    translation = await self.client.audio.translations.create(
+                        file=f,
+                        **translation_params
+                    )
+            else:
+                translation = await self.client.audio.translations.create(
+                    file=audio_file,
+                    **translation_params
+                )
 
-            #
+            # Extract usage information for billing
             result = {
-                "text":
-                "
-                "duration": getattr(
-                "segments": getattr(
-                "
+                "text": translation.text,
+                "language": "en",  # Translation is always to English
+                "duration": getattr(translation, 'duration', None),
+                "segments": getattr(translation, 'segments', []),
+                "usage": {
+                    "input_units": getattr(translation, 'duration', 1),  # Duration in seconds
+                    "output_tokens": len(translation.text.split()) if translation.text else 0
+                }
             }
 
+            # Track usage for billing
+            await self._track_usage(
+                service_type="audio_stt",
+                operation="translate",
+                input_units=result["usage"]["input_units"],
+                output_tokens=result["usage"]["output_tokens"],
+                metadata={
+                    "target_language": "en",
+                    "model_name": self.model_name,
+                    "provider": self.provider_name
+                }
+            )
+
             return result
 
         except Exception as e:
-            logger.error(f"
+            logger.error(f"Translation failed: {e}")
             raise
-
-    async def transcribe_batch(
-
-
-        language: Optional[str] = None,
-        prompt: Optional[str] = None
-    ) -> List[Dict[str, Any]]:
-        """Transcribe multiple audio files"""
-        results = []
+
+    async def transcribe_batch(self, audio_files: List[Union[str, BinaryIO]], language: Optional[str] = None, prompt: Optional[str] = None) -> List[Dict[str, Any]]:
+        """
+        Transcribe multiple audio files in batch.
 
+        Args:
+            audio_files: List of audio file paths or file-like objects
+            language: Optional language code for better accuracy
+            **kwargs: Additional parameters for the transcription API
+
+        Returns:
+            List of transcription results
+        """
+        results = []
         for audio_file in audio_files:
             try:
                 result = await self.transcribe(audio_file, language, prompt)
                 results.append(result)
             except Exception as e:
-                logger.error(f"
+                logger.error(f"Failed to transcribe {audio_file}: {e}")
                 results.append({
-                    "
-                    "
-                    "
-                    "segments": [],
-                    "confidence": None,
-                    "error": str(e)
+                    "error": str(e),
+                    "file": str(audio_file),
+                    "text": None
                 })
 
         return results
-
+
     async def detect_language(self, audio_file: Union[str, BinaryIO]) -> Dict[str, Any]:
-        """
+        """
+        Detect the language of an audio file.
+
+        Args:
+            audio_file: Path to audio file or file-like object
+            **kwargs: Additional parameters
+
+        Returns:
+            Dict containing detected language and confidence
+        """
         try:
-            #
-
+            # Use transcription with language detection - need to access client directly
+            transcription = await self.client.audio.transcriptions.create(
+                file=audio_file if not isinstance(audio_file, str) else open(audio_file, "rb"),
+                model=self.model_name,
+                response_format="verbose_json"
+            )
+
+            result = {
+                "text": transcription.text,
+                "language": getattr(transcription, 'language', "unknown")
+            }
 
             return {
-                "language": result
-                "confidence": 1.0,  #
-                "
+                "language": result.get("language", "unknown"),
+                "confidence": 1.0,  # OpenAI doesn't provide confidence scores
+                "text_sample": result.get("text", "")[:100] if result.get("text") else ""
             }
 
         except Exception as e:
-            logger.error(f"
-
-
+            logger.error(f"Language detection failed: {e}")
+            return {
+                "language": "unknown",
+                "confidence": 0.0,
+                "error": str(e)
+            }
+
     def get_supported_formats(self) -> List[str]:
-        """
-
+        """
+        Get list of supported audio formats.
+
+        Returns:
+            List of supported file extensions
+        """
+        return self.supported_formats
 
     def get_supported_languages(self) -> List[str]:
-        """
-
+        """
+        Get list of supported language codes for OpenAI Whisper.
+
+        Returns:
+            List of supported language codes
+        """
         return [
-            'af', '
-            '
-            '
-            '
-            '
-            'oc', 'pa', 'pl', 'ps', 'pt', 'ro', 'ru', 'sa', 'sd', 'si', 'sk', 'sl', 'sn',
-            'so', 'sq', 'sr', 'su', 'sv', 'sw', 'ta', 'te', 'tg', 'th', 'tk', 'tl', 'tr',
-            'tt', 'uk', 'ur', 'uz', 'vi', 'yi', 'yo', 'zh'
+            'af', 'ar', 'hy', 'az', 'be', 'bs', 'bg', 'ca', 'zh', 'hr', 'cs', 'da',
+            'nl', 'en', 'et', 'fi', 'fr', 'gl', 'de', 'el', 'he', 'hi', 'hu', 'is',
+            'id', 'it', 'ja', 'kn', 'kk', 'ko', 'lv', 'lt', 'mk', 'ms', 'mr', 'mi',
+            'ne', 'no', 'fa', 'pl', 'pt', 'ro', 'ru', 'sr', 'sk', 'sl', 'es', 'sw',
+            'sv', 'tl', 'ta', 'th', 'tr', 'uk', 'ur', 'vi', 'cy'
+        ]
-
+
     def get_max_file_size(self) -> int:
-        """
+        """
+        Get maximum file size limit in bytes.
+
+        Returns:
+            Maximum file size in bytes
+        """
         return self.max_file_size
-
+
     async def close(self):
         """Cleanup resources"""
-
-
+        if hasattr(self.client, 'close'):
+            await self.client.close()
+        logger.info("OpenAI STT service closed")
```
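The rewritten methods return a dictionary whose `usage` block (duration-based input units, word-count output tokens) is the same data fed to `_track_usage` for billing. A short consumption sketch, assuming an instantiated service and a placeholder local file:

```python
import asyncio

async def demo(stt):
    result = await stt.transcribe("sample.wav", language="en")
    print(result["text"])
    print(result["usage"]["input_units"], "seconds of audio billed")

    detected = await stt.detect_language("sample.wav")
    print(detected["language"], detected.get("text_sample", ""))

# asyncio.run(demo(stt))  # with a concrete OpenAISTTService instance
```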
```diff
--- a/isa_model/inference/services/audio/openai_tts_service.py
+++ b/isa_model/inference/services/audio/openai_tts_service.py
@@ -4,20 +4,18 @@ import os
 from openai import AsyncOpenAI
 from tenacity import retry, stop_after_attempt, wait_exponential
 from isa_model.inference.services.audio.base_tts_service import BaseTTSService
-from isa_model.inference.providers.base_provider import BaseProvider
-from isa_model.inference.billing_tracker import ServiceType
 import logging
 
 logger = logging.getLogger(__name__)
 
 class OpenAITTSService(BaseTTSService):
-    """
+    """OpenAI TTS service with unified architecture"""
 
-    def __init__(self,
-        super().__init__(
+    def __init__(self, provider_name: str, model_name: str = "tts-1", **kwargs):
+        super().__init__(provider_name, model_name, **kwargs)
 
-        # Get
-        provider_config =
+        # Get configuration from centralized config manager
+        provider_config = self.get_provider_config()
 
         # Initialize AsyncOpenAI client with provider configuration
         try:
@@ -113,8 +111,8 @@ class OpenAITTSService(BaseTTSService):
         estimated_duration_seconds = (words / 150.0) * 60.0 / speed
 
         # Track usage for billing (OpenAI TTS is token-based: $15 per 1M characters)
-        self._track_usage(
-            service_type=
+        await self._track_usage(
+            service_type="audio_tts",
             operation="synthesize_speech",
             input_tokens=len(text),  # Characters as input tokens
             output_tokens=0,
@@ -130,8 +128,12 @@ class OpenAITTSService(BaseTTSService):
             }
         )
 
+        # For HTTP API compatibility, encode audio data as base64
+        import base64
+        audio_base64 = base64.b64encode(audio_data).decode('utf-8')
+
         return {
-            "
+            "audio_data_base64": audio_base64,  # Base64 encoded for JSON compatibility
             "format": format,
             "duration": estimated_duration_seconds,
             "sample_rate": 24000  # Default for OpenAI TTS
```