PyPI - isa-model - Versions diffs - 0.0.2__py3-none-any.whl → 0.3.1__py3-none-any.whl - Mend

isa-model 0.0.2__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (93) hide show

isa_model/__init__.py +1 -1
isa_model/core/model_manager.py +69 -4
isa_model/core/model_registry.py +273 -46
isa_model/core/storage/hf_storage.py +419 -0
isa_model/deployment/__init__.py +52 -0
isa_model/deployment/core/__init__.py +34 -0
isa_model/deployment/core/deployment_config.py +356 -0
isa_model/deployment/core/deployment_manager.py +549 -0
isa_model/deployment/core/isa_deployment_service.py +401 -0
isa_model/eval/factory.py +381 -140
isa_model/inference/ai_factory.py +427 -236
isa_model/inference/billing_tracker.py +406 -0
isa_model/inference/providers/base_provider.py +51 -4
isa_model/inference/providers/ml_provider.py +50 -0
isa_model/inference/providers/ollama_provider.py +37 -18
isa_model/inference/providers/openai_provider.py +65 -36
isa_model/inference/providers/replicate_provider.py +42 -30
isa_model/inference/services/audio/base_stt_service.py +21 -2
isa_model/inference/services/audio/openai_realtime_service.py +353 -0
isa_model/inference/services/audio/openai_stt_service.py +252 -0
isa_model/inference/services/audio/openai_tts_service.py +149 -9
isa_model/inference/services/audio/replicate_tts_service.py +239 -0
isa_model/inference/services/base_service.py +36 -1
isa_model/inference/services/embedding/base_embed_service.py +112 -0
isa_model/inference/services/embedding/ollama_embed_service.py +28 -2
isa_model/inference/services/embedding/openai_embed_service.py +223 -0
isa_model/inference/services/llm/__init__.py +2 -0
isa_model/inference/services/llm/base_llm_service.py +158 -86
isa_model/inference/services/llm/llm_adapter.py +414 -0
isa_model/inference/services/llm/ollama_llm_service.py +252 -63
isa_model/inference/services/llm/openai_llm_service.py +231 -93
isa_model/inference/services/llm/triton_llm_service.py +481 -0
isa_model/inference/services/ml/base_ml_service.py +78 -0
isa_model/inference/services/ml/sklearn_ml_service.py +140 -0
isa_model/inference/services/vision/__init__.py +3 -3
isa_model/inference/services/vision/base_image_gen_service.py +161 -0
isa_model/inference/services/vision/base_vision_service.py +177 -0
isa_model/inference/services/vision/helpers/image_utils.py +4 -3
isa_model/inference/services/vision/ollama_vision_service.py +151 -17
isa_model/inference/services/vision/openai_vision_service.py +275 -41
isa_model/inference/services/vision/replicate_image_gen_service.py +278 -118
isa_model/training/__init__.py +62 -32
isa_model/training/cloud/__init__.py +22 -0
isa_model/training/cloud/job_orchestrator.py +402 -0
isa_model/training/cloud/runpod_trainer.py +454 -0
isa_model/training/cloud/storage_manager.py +482 -0
isa_model/training/core/__init__.py +23 -0
isa_model/training/core/config.py +181 -0
isa_model/training/core/dataset.py +222 -0
isa_model/training/core/trainer.py +720 -0
isa_model/training/core/utils.py +213 -0
isa_model/training/factory.py +229 -198
isa_model-0.3.1.dist-info/METADATA +465 -0
isa_model-0.3.1.dist-info/RECORD +91 -0
isa_model/core/model_router.py +0 -226
isa_model/core/model_version.py +0 -0
isa_model/core/resource_manager.py +0 -202
isa_model/deployment/gpu_fp16_ds8/models/deepseek_r1/1/model.py +0 -120
isa_model/deployment/gpu_fp16_ds8/scripts/download_model.py +0 -18
isa_model/training/engine/llama_factory/__init__.py +0 -39
isa_model/training/engine/llama_factory/config.py +0 -115
isa_model/training/engine/llama_factory/data_adapter.py +0 -284
isa_model/training/engine/llama_factory/examples/__init__.py +0 -6
isa_model/training/engine/llama_factory/examples/finetune_with_tracking.py +0 -185
isa_model/training/engine/llama_factory/examples/rlhf_with_tracking.py +0 -163
isa_model/training/engine/llama_factory/factory.py +0 -331
isa_model/training/engine/llama_factory/rl.py +0 -254
isa_model/training/engine/llama_factory/trainer.py +0 -171
isa_model/training/image_model/configs/create_config.py +0 -37
isa_model/training/image_model/configs/create_flux_config.py +0 -26
isa_model/training/image_model/configs/create_lora_config.py +0 -21
isa_model/training/image_model/prepare_massed_compute.py +0 -97
isa_model/training/image_model/prepare_upload.py +0 -17
isa_model/training/image_model/raw_data/create_captions.py +0 -16
isa_model/training/image_model/raw_data/create_lora_captions.py +0 -20
isa_model/training/image_model/raw_data/pre_processing.py +0 -200
isa_model/training/image_model/train/train.py +0 -42
isa_model/training/image_model/train/train_flux.py +0 -41
isa_model/training/image_model/train/train_lora.py +0 -57
isa_model/training/image_model/train_main.py +0 -25
isa_model-0.0.2.dist-info/METADATA +0 -327
isa_model-0.0.2.dist-info/RECORD +0 -92
isa_model-0.0.2.dist-info/licenses/LICENSE +0 -21
/isa_model/training/{llm_model/annotation → annotation}/annotation_schema.py +0 -0
/isa_model/training/{llm_model/annotation → annotation}/processors/annotation_processor.py +0 -0
/isa_model/training/{llm_model/annotation → annotation}/storage/dataset_manager.py +0 -0
/isa_model/training/{llm_model/annotation → annotation}/storage/dataset_schema.py +0 -0
/isa_model/training/{llm_model/annotation → annotation}/tests/test_annotation_flow.py +0 -0
/isa_model/training/{llm_model/annotation → annotation}/tests/test_minio copy.py +0 -0
/isa_model/training/{llm_model/annotation → annotation}/tests/test_minio_upload.py +0 -0
/isa_model/training/{llm_model/annotation → annotation}/views/annotation_controller.py +0 -0
{isa_model-0.0.2.dist-info → isa_model-0.3.1.dist-info}/WHEEL +0 -0
{isa_model-0.0.2.dist-info → isa_model-0.3.1.dist-info}/top_level.txt +0 -0

isa_model/inference/services/audio/openai_stt_service.py ADDED Viewed

@@ -0,0 +1,252 @@
+import logging
+import aiohttp
+from typing import Dict, Any, List, Union, Optional, BinaryIO
+from openai import AsyncOpenAI
+from tenacity import retry, stop_after_attempt, wait_exponential
+from isa_model.inference.services.audio.base_stt_service import BaseSTTService
+from isa_model.inference.providers.base_provider import BaseProvider
+from isa_model.inference.billing_tracker import ServiceType
+logger = logging.getLogger(__name__)
+class OpenAISTTService(BaseSTTService):
+    """
+    OpenAI Speech-to-Text service using whisper-1 model.
+    Supports transcription and translation to English.
+    """
+    def __init__(self, provider: 'BaseProvider', model_name: str = "whisper-1"):
+        """Create the service and its AsyncOpenAI client from the provider's configuration.
+
+        Args:
+            provider: Provider whose ``get_full_config()`` supplies "api_key" and,
+                optionally, "base_url", "organization" and "max_file_size".
+            model_name: Model identifier, passed on to the base class.
+
+        Raises:
+            ValueError: If the API key is missing or the client cannot be built.
+        """
+        super().__init__(provider, model_name)
+        # The full config is required here because it carries the API key.
+        settings = provider.get_full_config()
+        try:
+            if not settings.get("api_key"):
+                raise ValueError("OpenAI API key not found in provider configuration")
+            client_options = {
+                "api_key": settings["api_key"],
+                "base_url": settings.get("base_url", "https://api.openai.com/v1"),
+                "organization": settings.get("organization"),
+            }
+            self.client = AsyncOpenAI(**client_options)
+            logger.info(f"Initialized OpenAISTTService with model '{self.model_name}'")
+        except Exception as e:
+            # Every failure, including the missing-key check above, surfaces as ValueError.
+            logger.error(f"Failed to initialize OpenAI client: {e}")
+            raise ValueError(f"Failed to initialize OpenAI client. Check your API key configuration: {e}") from e
+        # Upload limit in bytes; 25 MiB unless the provider configuration overrides it.
+        default_limit = 25 * 1024 * 1024
+        self.max_file_size = settings.get('max_file_size', default_limit)
+        self.supported_formats = ['mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'wav', 'webm']
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, min=4, max=10),
+        reraise=True
+    )
+    async def _download_audio(self, audio_url: str) -> bytes:
+        """Fetch the raw bytes behind ``audio_url``.
+
+        The decorator allows up to three attempts with exponential waits and
+        re-raises the last error.
+
+        Raises:
+            ValueError: If the server answers with any status other than 200.
+        """
+        async with aiohttp.ClientSession() as http:
+            async with http.get(audio_url) as reply:
+                # Guard clause: anything but a plain 200 counts as a failed download.
+                if reply.status != 200:
+                    raise ValueError(f"Failed to download audio from {audio_url}: {reply.status}")
+                return await reply.read()
+    async def transcribe(
+        self,
+        audio_file: Union[str, BinaryIO],
+        language: Optional[str] = None,
+        prompt: Optional[str] = None
+    ) -> Dict[str, Any]:
+        """Transcribe audio to text with the configured model.
+
+        Args:
+            audio_file: An http(s) URL, a local file path, or an open binary
+                file-like object.
+            language: Optional language code forwarded to the API.
+            prompt: Optional prompt forwarded to the API.
+
+        Returns:
+            Dict with the keys "text", "language", "duration", "segments",
+            "confidence" (always None) and "usage" (the raw usage payload).
+
+        Raises:
+            ValueError: If the audio is larger than ``self.max_file_size``.
+            Exception: Download, file and API errors are logged and re-raised.
+        """
+        # Function-scope imports: only this method needs them.
+        import os
+        from urllib.parse import urlparse
+
+        def usage_count(usage_obj: Any, field: str) -> int:
+            """Read a token count from a dict-style or attribute-style usage payload."""
+            if not usage_obj:
+                return 0
+            if isinstance(usage_obj, dict):
+                return usage_obj.get(field, 0) or 0
+            return getattr(usage_obj, field, 0) or 0
+
+        try:
+            # Prepare the audio bytes and an upload name
+            if isinstance(audio_file, str):
+                if audio_file.startswith(('http://', 'https://')):
+                    audio_data = await self._download_audio(audio_file)
+                    # Name the upload after the URL path only, so a query string or
+                    # fragment cannot end up in (or replace) the file extension.
+                    filename = urlparse(audio_file).path.rsplit('/', 1)[-1] or 'audio.wav'
+                else:
+                    with open(audio_file, 'rb') as f:
+                        audio_data = f.read()
+                    # Send the base name only; the local directory layout is of no
+                    # use to the remote API.
+                    filename = os.path.basename(audio_file) or audio_file
+            else:
+                audio_data = audio_file.read()
+                filename = getattr(audio_file, 'name', 'audio.wav')
+            # Check file size
+            if len(audio_data) > self.max_file_size:
+                raise ValueError(f"Audio file size ({len(audio_data)} bytes) exceeds maximum ({self.max_file_size} bytes)")
+            # Prepare transcription parameters
+            kwargs = {
+                "model": self.model_name,
+                "file": (filename, audio_data),
+                "response_format": "verbose_json"
+            }
+            if language:
+                kwargs["language"] = language
+            if prompt:
+                kwargs["prompt"] = prompt
+            response = await self.client.audio.transcriptions.create(**kwargs)
+            # The usage payload may be a dict or an SDK object without .get();
+            # usage_count handles both instead of raising AttributeError.
+            usage = getattr(response, 'usage', {})
+            input_tokens = usage_count(usage, 'input_tokens')
+            output_tokens = usage_count(usage, 'output_tokens')
+            # Duration is billed in minutes; coerce defensively in case the SDK
+            # hands the value back as a string.
+            raw_duration = getattr(response, 'duration', None)
+            try:
+                duration_minutes = float(raw_duration) / 60.0 if raw_duration else 0.0
+            except (TypeError, ValueError):
+                duration_minutes = 0.0
+            self._track_usage(
+                service_type=ServiceType.AUDIO_STT,
+                operation="transcribe",
+                input_tokens=input_tokens,
+                output_tokens=output_tokens,
+                input_units=duration_minutes,  # Duration in minutes
+                metadata={
+                    "language": language,
+                    "model": self.model_name,
+                    "file_size": len(audio_data)
+                }
+            )
+            # Format response
+            result = {
+                "text": response.text,
+                "language": getattr(response, 'language', language or 'unknown'),
+                "duration": raw_duration,
+                "segments": getattr(response, 'segments', []),
+                "confidence": None,  # whisper-1 doesn't provide confidence scores
+                "usage": usage  # Include usage information
+            }
+            return result
+        except Exception as e:
+            logger.error(f"Error transcribing audio: {e}")
+            raise
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, min=4, max=10),
+        reraise=True
+    )
+    async def translate(
+        self,
+        audio_file: Union[str, BinaryIO]
+    ) -> Dict[str, Any]:
+        """Translate spoken audio into English text.
+
+        Args:
+            audio_file: A local file path or an open binary file-like object.
+
+        Returns:
+            Dict with the keys "text", "detected_language", "duration",
+            "segments" and "confidence" (always None).
+
+        Raises:
+            ValueError: If the audio is larger than ``self.max_file_size``.
+            Exception: File and API errors are logged and re-raised.
+        """
+        # NOTE(review): the retry decorator has no exception filter, so the
+        # size-limit ValueError below is retried as well; and unlike transcribe(),
+        # this path records no usage via _track_usage. Confirm both are intended.
+        try:
+            # Prepare the audio file
+            if isinstance(audio_file, str):
+                with open(audio_file, 'rb') as f:
+                    audio_data = f.read()
+                    filename = audio_file
+            else:
+                audio_data = audio_file.read()
+                # A retry re-enters with the same stream, which the previous
+                # attempt already consumed; rewind so it is not uploaded empty.
+                seekable = getattr(audio_file, 'seekable', None)
+                if not audio_data and callable(seekable) and seekable():
+                    audio_file.seek(0)
+                    audio_data = audio_file.read()
+                filename = getattr(audio_file, 'name', 'audio.wav')
+            # Check file size
+            if len(audio_data) > self.max_file_size:
+                raise ValueError(f"Audio file size ({len(audio_data)} bytes) exceeds maximum ({self.max_file_size} bytes)")
+            # Translate audio to English
+            response = await self.client.audio.translations.create(
+                model=self.model_name,
+                file=(filename, audio_data),
+                response_format="verbose_json"
+            )
+            # Format response
+            result = {
+                "text": response.text,
+                "detected_language": getattr(response, 'language', 'unknown'),
+                "duration": getattr(response, 'duration', None),
+                "segments": getattr(response, 'segments', []),
+                "confidence": None  # Whisper doesn't provide confidence scores
+            }
+            return result
+        except Exception as e:
+            logger.error(f"Error translating audio: {e}")
+            raise
+    async def transcribe_batch(
+        self,
+        audio_files: List[Union[str, BinaryIO]],
+        language: Optional[str] = None,
+        prompt: Optional[str] = None
+    ) -> List[Dict[str, Any]]:
+        """Transcribe several inputs one after another.
+
+        A failing item does not abort the batch: its slot holds a placeholder
+        dict with empty fields plus an "error" message.
+
+        Returns:
+            One result dict per input, in input order.
+        """
+        outcomes: List[Dict[str, Any]] = []
+        for item in audio_files:
+            try:
+                outcomes.append(await self.transcribe(item, language, prompt))
+            except Exception as exc:
+                logger.error(f"Error transcribing audio file: {exc}")
+                failure = {
+                    "text": "",
+                    "language": "unknown",
+                    "duration": None,
+                    "segments": [],
+                    "confidence": None,
+                    "error": str(exc)
+                }
+                outcomes.append(failure)
+        return outcomes
+    async def detect_language(self, audio_file: Union[str, BinaryIO]) -> Dict[str, Any]:
+        """Report the language of the audio by running a full transcription.
+
+        Returns:
+            Dict with "language" (taken from the transcription result),
+            "confidence" (a fixed 1.0, not a measured score) and an empty
+            "alternatives" list.
+
+        Raises:
+            Exception: Any transcription error is logged and re-raised.
+        """
+        try:
+            # No language hint is passed, so the reported language comes from the API.
+            transcription = await self.transcribe(audio_file, language=None)
+            detected = transcription["language"]
+        except Exception as e:
+            logger.error(f"Error detecting language: {e}")
+            raise
+        return {
+            "language": detected,
+            "confidence": 1.0,  # Whisper is generally confident
+            "alternatives": []  # Whisper doesn't provide alternatives
+        }
+    def get_supported_formats(self) -> List[str]:
+        """Return a fresh copy of the supported audio format names."""
+        # A new list is returned so callers cannot mutate the service's own list.
+        return list(self.supported_formats)
+    def get_supported_languages(self) -> List[str]:
+        """Return the language codes this service advertises as supported."""
+        # Kept as one whitespace-separated table; split() turns it into the list.
+        codes = (
+            "af am ar as az ba be bg bn bo br bs ca "
+            "cs cy da de el en es et eu fa fi fo fr "
+            "gl gu ha haw he hi hr ht hu hy id is it "
+            "ja jw ka kk km kn ko la lb ln lo lt lv "
+            "mg mi mk ml mn mr ms mt my ne nl nn no "
+            "oc pa pl ps pt ro ru sa sd si sk sl sn "
+            "so sq sr su sv sw ta te tg th tk tl tr "
+            "tt uk ur uz vi yi yo zh"
+        )
+        return codes.split()
+    def get_max_file_size(self) -> int:
+        """Return the configured upload size limit, in bytes."""
+        limit_bytes = self.max_file_size
+        return limit_bytes
+    async def close(self):
+        """Close the underlying AsyncOpenAI client and log the shutdown."""
+        openai_client = self.client
+        await openai_client.close()
+        logger.info("OpenAISTTService client has been closed.")

isa_model/inference/services/audio/openai_tts_service.py CHANGED Viewed

@@ -1,25 +1,42 @@
-from typing import Dict, Any
+from typing import Dict, Any, List, Optional
 import tempfile
 import os
 from openai import AsyncOpenAI
 from tenacity import retry, stop_after_attempt, wait_exponential
-from isa_model.inference.services.base_service import BaseService
+from isa_model.inference.services.audio.base_tts_service import BaseTTSService
 from isa_model.inference.providers.base_provider import BaseProvider
+from isa_model.inference.billing_tracker import ServiceType
 import logging
 logger = logging.getLogger(__name__)
-class YYDSAudioService(BaseService):
+class OpenAITTSService(BaseTTSService):
     """Audio model service wrapper for YYDS"""
     def __init__(self, provider: 'BaseProvider', model_name: str):
         super().__init__(provider, model_name)
-        # 初始化 AsyncOpenAI 客户端
-        self._client = AsyncOpenAI(
-            api_key=self.config.get('api_key'),
-            base_url=self.config.get('base_url')
-        )
-        self.language = self.config.get('language', None)
+        # Get full configuration from provider (including sensitive data)
+        provider_config = provider.get_full_config()
+        # Initialize AsyncOpenAI client with provider configuration
+        try:
+            if not provider_config.get("api_key"):
+                raise ValueError("OpenAI API key not found in provider configuration")
+            self._client = AsyncOpenAI(
+                api_key=provider_config["api_key"],
+                base_url=provider_config.get("base_url", "https://api.openai.com/v1"),
+                organization=provider_config.get("organization")
+            )
+            logger.info(f"Initialized OpenAITTSService with model '{self.model_name}'")
+        except Exception as e:
+            logger.error(f"Failed to initialize OpenAI client: {e}")
+            raise ValueError(f"Failed to initialize OpenAI client. Check your API key configuration: {e}") from e
+        self.language = provider_config.get('language', None)
     @property
     def client(self) -> AsyncOpenAI:
@@ -69,3 +86,126 @@ class YYDSAudioService(BaseService):
         except Exception as e:
             logger.error(f"Error in audio transcription: {e}")
             raise
+    # Implement the abstract methods of BaseTTSService
+    async def synthesize_speech(
+        self,
+        text: str,
+        voice: Optional[str] = None,
+        speed: float = 1.0,
+        pitch: float = 1.0,
+        format: str = "mp3"
+    ) -> Dict[str, Any]:
+        """Synthesize speech from text using OpenAI TTS.
+
+        Args:
+            text: Text to speak.
+            voice: Voice id; "alloy" is used when omitted.
+            speed: Playback speed forwarded to the API.
+            pitch: Accepted for interface compatibility; not sent to the API.
+            format: Audio container/codec requested from the API.
+
+        Returns:
+            Dict with "audio_data" (bytes), "format", "duration" (an estimate
+            in seconds, not a measurement) and "sample_rate".
+
+        Raises:
+            Exception: Any API error is logged and re-raised.
+        """
+        try:
+            # Request the configured model so the call matches the model recorded
+            # in the billing metadata below; "tts-1" remains the fallback.
+            model = self.model_name or "tts-1"
+            selected_voice = voice or "alloy"
+            response = await self._client.audio.speech.create(
+                model=model,
+                voice=selected_voice,  # type: ignore
+                input=text,
+                response_format=format,  # type: ignore
+                speed=speed
+            )
+            audio_data = response.content
+            # Estimate audio duration for billing (rough estimation: ~150 words per minute)
+            words = len(text.split())
+            estimated_duration_seconds = (words / 150.0) * 60.0 / speed
+            # Track usage for billing (OpenAI TTS is token-based: $15 per 1M characters)
+            self._track_usage(
+                service_type=ServiceType.AUDIO_TTS,
+                operation="synthesize_speech",
+                input_tokens=len(text),  # Characters as input tokens
+                output_tokens=0,
+                input_units=len(text),  # Text length
+                output_units=estimated_duration_seconds,  # Audio duration in seconds
+                metadata={
+                    "model": self.model_name,
+                    "voice": selected_voice,
+                    "speed": speed,
+                    "format": format,
+                    "text_length": len(text),
+                    "estimated_duration_seconds": estimated_duration_seconds
+                }
+            )
+            return {
+                "audio_data": audio_data,
+                "format": format,
+                "duration": estimated_duration_seconds,
+                "sample_rate": 24000  # Default for OpenAI TTS
+            }
+        except Exception as e:
+            logger.error(f"Error in speech synthesis: {e}")
+            raise
+    async def synthesize_speech_to_file(
+        self,
+        text: str,
+        output_path: str,
+        voice: Optional[str] = None,
+        speed: float = 1.0,
+        pitch: float = 1.0,
+        format: str = "mp3"
+    ) -> Dict[str, Any]:
+        """Synthesize ``text`` and write the resulting audio bytes to ``output_path``.
+
+        Returns:
+            Dict with "file_path", "duration" and "sample_rate"; the last two are
+            copied from the ``synthesize_speech`` result.
+        """
+        synthesized = await self.synthesize_speech(text, voice, speed, pitch, format)
+        with open(output_path, 'wb') as out_file:
+            out_file.write(synthesized["audio_data"])
+        summary: Dict[str, Any] = {"file_path": output_path}
+        for key in ("duration", "sample_rate"):
+            summary[key] = synthesized[key]
+        return summary
+    async def synthesize_speech_batch(
+        self,
+        texts: List[str],
+        voice: Optional[str] = None,
+        speed: float = 1.0,
+        pitch: float = 1.0,
+        format: str = "mp3"
+    ) -> List[Dict[str, Any]]:
+        """Synthesize each text in turn and return the results in input order.
+
+        The first failing item raises, which aborts the rest of the batch.
+        """
+        # Awaiting inside the comprehension keeps the calls strictly sequential.
+        return [
+            await self.synthesize_speech(item, voice, speed, pitch, format)
+            for item in texts
+        ]
+    def get_available_voices(self) -> List[Dict[str, Any]]:
+        """Return the built-in voice catalogue as a list of descriptor dicts."""
+        # (voice id, gender) pairs; the display name is the capitalised id.
+        catalogue = (
+            ("alloy", "neutral"),
+            ("echo", "male"),
+            ("fable", "neutral"),
+            ("onyx", "male"),
+            ("nova", "female"),
+            ("shimmer", "female"),
+        )
+        return [
+            {"id": voice_id, "name": voice_id.capitalize(), "language": "en-US", "gender": gender, "age": "adult"}
+            for voice_id, gender in catalogue
+        ]
+    def get_supported_formats(self) -> List[str]:
+        """Return the output formats that can be requested via ``format``.
+
+        The original four entries keep their order; "wav" and "pcm" are appended
+        because the speech endpoint accepts them as well, so callers validating
+        against this list no longer reject them.
+        """
+        return ["mp3", "opus", "aac", "flac", "wav", "pcm"]
+    def get_voice_info(self, voice_id: str) -> Dict[str, Any]:
+        """Return the descriptor for ``voice_id`` with extra detail fields.
+
+        Returns:
+            The matching voice dict plus "description" and "sample_rate", or an
+            empty dict when the id is unknown.
+        """
+        for entry in self.get_available_voices():
+            if entry["id"] != voice_id:
+                continue
+            # The catalogue is rebuilt on every call, so mutating the entry is safe.
+            entry["description"] = f"OpenAI {entry['name']} voice"
+            entry["sample_rate"] = 24000
+            return entry
+        return {}
+    async def close(self):
+        """Close the OpenAI client if it exposes a ``close`` method."""
+        if not hasattr(self._client, 'close'):
+            return
+        await self._client.close()

isa_model/inference/services/audio/replicate_tts_service.py ADDED Viewed

@@ -0,0 +1,239 @@
+import logging
+from typing import Dict, Any, List, Optional, BinaryIO
+import replicate
+from tenacity import retry, stop_after_attempt, wait_exponential
+from isa_model.inference.services.audio.base_tts_service import BaseTTSService
+from isa_model.inference.providers.base_provider import BaseProvider
+from isa_model.inference.billing_tracker import ServiceType
+logger = logging.getLogger(__name__)
+class ReplicateTTSService(BaseTTSService):
+    """
+    Replicate Text-to-Speech service using Kokoro model.
+    High-quality voice synthesis with multiple voice options.
+    """
+    def __init__(self, provider: 'BaseProvider', model_name: str = "jaaari/kokoro-82m:f559560eb822dc509045f3921a1921234918b91739db4bf3daab2169b71c7a13"):
+        """Configure the service from the provider's full configuration.
+
+        Args:
+            provider: Provider whose ``get_full_config()`` supplies "api_token"
+                or, failing that, "replicate_api_token".
+            model_name: Replicate model reference, passed on to the base class.
+
+        Raises:
+            ValueError: If neither token key holds a value.
+        """
+        super().__init__(provider, model_name)
+        settings = provider.get_full_config()
+        # Prefer "api_token"; fall back to the longer key name.
+        token = settings.get('api_token')
+        if not token:
+            token = settings.get('replicate_api_token')
+        self.api_token = token
+        if not self.api_token:
+            raise ValueError("Replicate API token not found in provider configuration")
+        # The token is exported process-wide through the environment.
+        import os
+        os.environ['REPLICATE_API_TOKEN'] = self.api_token
+        # Voices accepted by synthesize_speech; anything else falls back to the default.
+        self.available_voices = [
+            "af_bella",
+            "af_nicole",
+            "af_sarah",
+            "af_sky",
+            "am_adam",
+            "am_michael",
+        ]
+        self.default_voice = "af_nicole"
+        self.default_speed = 1.0
+        logger.info(f"Initialized ReplicateTTSService with model '{self.model_name}'")
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, min=4, max=10),
+        reraise=True
+    )
+    async def synthesize_speech(
+        self,
+        text: str,
+        voice: Optional[str] = None,
+        speed: float = 1.0,
+        pitch: Optional[float] = None,
+        volume: Optional[float] = None
+    ) -> Dict[str, Any]:
+        """Synthesize speech from text using the configured Replicate model.
+
+        Args:
+            text: Text to speak.
+            voice: Voice id; unknown or missing ids fall back to the default voice.
+            speed: Requested speed; the model receives it clamped to [0.5, 2.0].
+            pitch: Accepted for interface compatibility; not used.
+            volume: Accepted for interface compatibility; not used.
+
+        Returns:
+            Dict with "audio_url", "text", "voice", "speed" (as requested),
+            "duration_seconds" (an estimate) and "metadata".
+
+        Raises:
+            Exception: Any error is logged and re-raised; the decorator allows
+                up to three attempts.
+        """
+        try:
+            # Validate and set voice
+            selected_voice = voice or self.default_voice
+            if selected_voice not in self.available_voices:
+                logger.warning(f"Voice '{selected_voice}' not available, using default '{self.default_voice}'")
+                selected_voice = self.default_voice
+            # The model only ever sees the clamped speed, so every later
+            # calculation uses this value too.
+            effective_speed = max(0.5, min(2.0, speed))
+            input_params = {
+                "text": text,
+                "voice": selected_voice,
+                "speed": effective_speed
+            }
+            logger.info(f"Synthesizing speech with voice '{selected_voice}' and speed {speed}")
+            # Run the model
+            output = await replicate.async_run(self.model_name, input=input_params)
+            # Normalise the different output shapes to a URL string
+            try:
+                if isinstance(output, str):
+                    audio_url = output
+                elif hasattr(output, 'url'):
+                    # Handle FileOutput object
+                    audio_url = str(getattr(output, 'url', output))
+                elif isinstance(output, list) and len(output) > 0:
+                    first_output = output[0]
+                    if hasattr(first_output, 'url'):
+                        audio_url = str(getattr(first_output, 'url', first_output))
+                    else:
+                        audio_url = str(first_output)
+                else:
+                    # Convert to string as fallback
+                    audio_url = str(output)
+            except Exception:
+                # Safe fallback
+                audio_url = str(output)
+            # Estimate audio duration for billing (rough estimation: ~150 words per minute).
+            # Dividing by the clamped speed matches the audio actually produced and
+            # cannot hit zero; dividing by the raw value raised ZeroDivisionError for
+            # speed=0 after the model run, and the retry then repeated that run.
+            words = len(text.split())
+            estimated_duration_seconds = (words / 150.0) * 60.0 / effective_speed
+            # Track usage for billing
+            self._track_usage(
+                service_type=ServiceType.AUDIO_TTS,
+                operation="synthesize_speech",
+                input_tokens=0,
+                output_tokens=0,
+                input_units=len(text),  # Text length
+                output_units=estimated_duration_seconds,  # Audio duration in seconds
+                metadata={
+                    "model": self.model_name,
+                    "voice": selected_voice,
+                    "speed": speed,
+                    "text_length": len(text),
+                    "estimated_duration_seconds": estimated_duration_seconds
+                }
+            )
+            result = {
+                "audio_url": audio_url,
+                "text": text,
+                "voice": selected_voice,
+                "speed": speed,
+                "duration_seconds": estimated_duration_seconds,
+                "metadata": {
+                    "model": self.model_name,
+                    "provider": "replicate",
+                    "voice_options": self.available_voices
+                }
+            }
+            logger.info(f"Speech synthesis completed: {audio_url}")
+            return result
+        except Exception as e:
+            logger.error(f"Error synthesizing speech: {e}")
+            raise
+    async def synthesize_speech_to_file(
+        self,
+        text: str,
+        output_path: str,
+        voice: Optional[str] = None,
+        speed: float = 1.0,
+        pitch: Optional[float] = None,
+        volume: Optional[float] = None
+    ) -> Dict[str, Any]:
+        """Synthesize ``text``, download the generated audio and store it at ``output_path``.
+
+        Returns:
+            The ``synthesize_speech`` result extended with "output_path" and
+            "file_size" (number of bytes written).
+
+        Raises:
+            Exception: Synthesis, download and write errors are logged and re-raised.
+        """
+        try:
+            synthesis = await self.synthesize_speech(text, voice, speed, pitch, volume)
+            source_url = synthesis["audio_url"]
+            # Imported here, as in the rest of this method's history, so the
+            # packages are only needed when saving to a file.
+            import aiohttp
+            import aiofiles
+            async with aiohttp.ClientSession() as http:
+                async with http.get(source_url) as reply:
+                    reply.raise_for_status()
+                    payload = await reply.read()
+                    async with aiofiles.open(output_path, 'wb') as sink:
+                        await sink.write(payload)
+            synthesis.update(output_path=output_path, file_size=len(payload))
+            logger.info(f"Audio saved to: {output_path}")
+            return synthesis
+        except Exception as e:
+            logger.error(f"Error saving audio to file: {e}")
+            raise
+    async def synthesize_speech_batch(
+        self,
+        texts: List[str],
+        voice: Optional[str] = None,
+        speed: float = 1.0,
+        pitch: float = 1.0,
+        format: str = "wav"
+    ) -> List[Dict[str, Any]]:
+        """Synthesize several texts one after another.
+
+        Only ``voice`` and ``speed`` are forwarded; ``pitch`` and ``format`` are
+        accepted but not used. A failing item does not abort the batch: its
+        slot holds a placeholder dict with an "error" message.
+
+        Returns:
+            One result dict per input text, in input order.
+        """
+        outcomes: List[Dict[str, Any]] = []
+        for item in texts:
+            try:
+                outcomes.append(await self.synthesize_speech(item, voice, speed))
+            except Exception as e:
+                logger.error(f"Error synthesizing text '{item[:50]}...': {e}")
+                failure = {
+                    "audio_url": None,
+                    "text": item,
+                    "voice": voice or self.default_voice,
+                    "speed": speed,
+                    "error": str(e)
+                }
+                outcomes.append(failure)
+        return outcomes
+    def get_available_voices(self) -> List[Dict[str, Any]]:
+        """Return a descriptor dict for every voice in ``self.available_voices``.
+
+        Each entry has "id", "name", "language", "gender" and "age". The name
+        comes from ``get_voice_info`` when it provides one; otherwise it is
+        derived from the id by replacing underscores and title-casing.
+        """
+        voices = []
+        for voice in self.available_voices:
+            voice_info = self.get_voice_info(voice)
+            # Prefer the curated display name ("Bella") over the id-derived
+            # one ("Af Bella"), which exposed the internal prefix.
+            display_name = voice_info.get("name") or voice.replace("_", " ").title()
+            voices.append({
+                "id": voice,
+                "name": display_name,
+                "language": "en-US",
+                "gender": voice_info.get("gender", "unknown"),
+                "age": "adult"
+            })
+        return voices
+    def get_supported_formats(self) -> List[str]:
+        """Return the audio formats this service reports as supported."""
+        formats = ["wav"]  # Kokoro typically outputs WAV
+        formats.append("mp3")
+        return formats
+    def get_voice_info(self, voice_id: str) -> Dict[str, Any]:
+        """Return descriptive metadata for ``voice_id``.
+
+        Returns:
+            An {"error": ...} dict when the id is not in ``self.available_voices``;
+            the curated entry when one exists; otherwise a generic entry with
+            gender "unknown".
+        """
+        if voice_id not in self.available_voices:
+            return {"error": f"Voice '{voice_id}' not available"}
+        # Curated rows: (id, display name, gender, description).
+        catalogue = (
+            ("af_bella", "Bella", "female", "Warm, friendly female voice"),
+            ("af_nicole", "Nicole", "female", "Clear, professional female voice"),
+            ("af_sarah", "Sarah", "female", "Gentle, expressive female voice"),
+            ("af_sky", "Sky", "female", "Bright, energetic female voice"),
+            ("am_adam", "Adam", "male", "Deep, authoritative male voice"),
+            ("am_michael", "Michael", "male", "Smooth, conversational male voice"),
+        )
+        for key, display_name, gender, description in catalogue:
+            if key == voice_id:
+                return {"id": key, "name": display_name, "gender": gender, "language": "en-US", "description": description, "sample_rate": 22050}
+        # Listed as available but without a curated row.
+        return {"id": voice_id, "gender": "unknown", "language": "en-US", "description": "Voice information not available", "sample_rate": 22050}
+    async def close(self):
+        """Log that the service was cleaned up; nothing else is released here."""
+        message = "ReplicateTTSService resources cleaned up"
+        logger.info(message)

isa-model 0.0.2__py3-none-any.whl → 0.3.1__py3-none-any.whl

isa-model 0.0.2__py3-none-any.whl → 0.3.1__py3-none-any.whl