PyPI - isa-model - Versions diffs - 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl - Mend

isa-model 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (76)

isa_model/__init__.py +1 -1
isa_model/core/model_registry.py +273 -46
isa_model/deployment/gpu_fp16_ds8/models/deepseek_r1/1/model.py +120 -0
isa_model/deployment/gpu_fp16_ds8/scripts/download_model.py +18 -0
isa_model/deployment/gpu_int8_ds8/app/server.py +66 -0
isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +43 -0
isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +35 -0
isa_model/eval/__init__.py +56 -0
isa_model/eval/benchmarks.py +469 -0
isa_model/eval/factory.py +582 -0
isa_model/eval/metrics.py +628 -0
isa_model/inference/ai_factory.py +98 -93
isa_model/inference/providers/openai_provider.py +21 -7
isa_model/inference/providers/replicate_provider.py +18 -5
isa_model/inference/providers/triton_provider.py +1 -1
isa_model/inference/services/audio/base_stt_service.py +91 -0
isa_model/inference/services/audio/base_tts_service.py +136 -0
isa_model/inference/services/audio/{yyds_audio_service.py → openai_tts_service.py} +4 -4
isa_model/inference/services/embedding/ollama_embed_service.py +48 -36
isa_model/inference/services/llm/__init__.py +0 -4
isa_model/inference/services/llm/base_llm_service.py +134 -0
isa_model/inference/services/llm/ollama_llm_service.py +1 -10
isa_model/inference/services/llm/openai_llm_service.py +70 -61
isa_model/inference/services/vision/__init__.py +1 -1
isa_model/inference/services/vision/ollama_vision_service.py +4 -4
isa_model/inference/services/vision/{yyds_vision_service.py → openai_vision_service.py} +5 -5
isa_model/inference/services/vision/replicate_image_gen_service.py +185 -0
isa_model/training/__init__.py +44 -0
isa_model/training/factory.py +393 -0
isa_model-0.2.0.dist-info/METADATA +327 -0
{isa_model-0.1.0.dist-info → isa_model-0.2.0.dist-info}/RECORD +35 -60
isa_model/deployment/mlflow_gateway/__init__.py +0 -8
isa_model/deployment/mlflow_gateway/start_gateway.py +0 -65
isa_model/deployment/unified_multimodal_client.py +0 -341
isa_model/inference/adapter/triton_adapter.py +0 -453
isa_model/inference/backends/Pytorch/bge_embed_backend.py +0 -188
isa_model/inference/backends/Pytorch/gemma_backend.py +0 -167
isa_model/inference/backends/Pytorch/llama_backend.py +0 -166
isa_model/inference/backends/Pytorch/whisper_backend.py +0 -194
isa_model/inference/backends/__init__.py +0 -53
isa_model/inference/backends/base_backend_client.py +0 -26
isa_model/inference/backends/container_services.py +0 -104
isa_model/inference/backends/local_services.py +0 -72
isa_model/inference/backends/openai_client.py +0 -130
isa_model/inference/backends/replicate_client.py +0 -197
isa_model/inference/backends/third_party_services.py +0 -239
isa_model/inference/backends/triton_client.py +0 -97
isa_model/inference/client_sdk/client.py +0 -134
isa_model/inference/client_sdk/client_data_std.py +0 -34
isa_model/inference/client_sdk/client_sdk_schema.py +0 -16
isa_model/inference/client_sdk/exceptions.py +0 -0
isa_model/inference/engine/triton/model_repository/bge/1/model.py +0 -174
isa_model/inference/engine/triton/model_repository/gemma/1/model.py +0 -250
isa_model/inference/engine/triton/model_repository/llama/1/model.py +0 -76
isa_model/inference/engine/triton/model_repository/whisper/1/model.py +0 -195
isa_model/inference/providers/vllm_provider.py +0 -0
isa_model/inference/providers/yyds_provider.py +0 -83
isa_model/inference/services/audio/fish_speech/handler.py +0 -215
isa_model/inference/services/audio/runpod_tts_fish_service.py +0 -212
isa_model/inference/services/audio/triton_speech_service.py +0 -138
isa_model/inference/services/audio/whisper_service.py +0 -186
isa_model/inference/services/base_tts_service.py +0 -66
isa_model/inference/services/embedding/bge_service.py +0 -183
isa_model/inference/services/embedding/ollama_rerank_service.py +0 -118
isa_model/inference/services/embedding/onnx_rerank_service.py +0 -73
isa_model/inference/services/llm/gemma_service.py +0 -143
isa_model/inference/services/llm/llama_service.py +0 -143
isa_model/inference/services/llm/replicate_llm_service.py +0 -179
isa_model/inference/services/llm/triton_llm_service.py +0 -230
isa_model/inference/services/vision/replicate_vision_service.py +0 -241
isa_model/inference/services/vision/triton_vision_service.py +0 -199
isa_model-0.1.0.dist-info/METADATA +0 -116
isa_model/inference/{client_sdk/__init__.py → services/embedding/openai_embed_service.py} +0 -0
{isa_model-0.1.0.dist-info → isa_model-0.2.0.dist-info}/WHEEL +0 -0
{isa_model-0.1.0.dist-info → isa_model-0.2.0.dist-info}/licenses/LICENSE +0 -0
{isa_model-0.1.0.dist-info → isa_model-0.2.0.dist-info}/top_level.txt +0 -0

isa_model/inference/services/audio/whisper_service.py DELETED Viewed

@@ -1,186 +0,0 @@
-import logging
-import asyncio
-import io
-import numpy as np
-from typing import Dict, Any, Optional, Union, BinaryIO
-from isa_model.inference.services.base_service import BaseService
-from isa_model.inference.backends.triton_client import TritonClient
-logger = logging.getLogger(__name__)
-class WhisperService(BaseService):
-    """
-    Speech-to-text service that sends audio to a Whisper model hosted on
-    Triton Inference Server.
-    The Triton connection is created lazily: ``transcribe`` calls ``load`` on
-    first use, and ``unload`` drops the client again.
-    """
-    def __init__(self, triton_url: str = "localhost:8001", model_name: str = "whisper"):
-        """
-        Initialize the Whisper service.
-        Args:
-            triton_url: URL of the Triton Inference Server
-            model_name: Name of the model in Triton
-        """
-        super().__init__()
-        self.triton_url = triton_url
-        self.model_name = model_name
-        self.client = None
-        # Defaults applied to every request unless overridden per call
-        self.default_config = {
-            "language": "en",
-            "sampling_rate": 16000
-        }
-        self.logger = logger
-    async def load(self) -> None:
-        """
-        Connect to Triton and verify that the model is ready.
-        Does nothing when the service is already loaded.
-        Raises:
-            RuntimeError: If the model is not ready on the Triton server
-            Exception: Any other failure is logged and re-raised unchanged
-        """
-        if self.is_loaded():
-            return
-        try:
-            # NOTE(review): this name is never used below; presumably the import
-            # only acts as an early "is tritonclient installed?" probe - confirm.
-            from tritonclient.http import InferenceServerClient
-            self.logger.info(f"Connecting to Triton server at {self.triton_url}")
-            client = TritonClient(self.triton_url)
-            if not await client.is_model_ready(self.model_name):
-                self.logger.error(f"Model {self.model_name} is not ready on Triton server")
-                raise RuntimeError(f"Model {self.model_name} is not ready on Triton server")
-        except Exception as e:
-            self.logger.error(f"Failed to connect to Triton: {str(e)}")
-            raise
-        # Publish the client only after the readiness check passed, so a failed
-        # load never leaves a not-ready client on the instance.
-        self.client = client
-        self._loaded = True
-        self.logger.info(f"Connected to Triton for model {self.model_name}")
-    async def unload(self) -> None:
-        """
-        Drop the Triton client and mark the service as not loaded.
-        Does nothing when the service is not loaded.
-        """
-        if not self.is_loaded():
-            return
-        self.client = None
-        self._loaded = False
-        self.logger.info("Triton client connection closed")
-    async def transcribe(self,
-                       audio: Union[str, BinaryIO, bytes, np.ndarray],
-                       language: Optional[str] = None,
-                       config: Optional[Dict[str, Any]] = None) -> str:
-        """
-        Transcribe audio to text using Triton.
-        Settings are resolved in this order (later wins): ``default_config``,
-        then ``config``, then an explicit ``language`` argument.
-        Args:
-            audio: Audio input (file path, file-like object, bytes, or numpy array)
-            language: Language code (e.g., "en", "fr"); when omitted, the value
-                from ``config`` or the service default is used
-            config: Additional configuration parameters; a "sampling_rate"
-                entry also controls how non-array audio is decoded
-        Returns:
-            Transcribed text
-        Raises:
-            Exception: Any loading, decoding or inference error is logged and
-                re-raised unchanged
-        """
-        if not self.is_loaded():
-            await self.load()
-        # Resolve the effective settings first so that a caller-supplied
-        # "sampling_rate" also governs the audio decoding below.
-        merged_config = self.default_config.copy()
-        if config:
-            merged_config.update(config)
-        # The former default of "en" was always truthy and therefore silently
-        # discarded config["language"]; only an explicit argument overrides now.
-        if language:
-            merged_config["language"] = language
-        audio_array = await self._process_audio_input(
-            audio,
-            sampling_rate=merged_config["sampling_rate"]
-        )
-        try:
-            inputs = {
-                "audio_input": audio_array,
-                "language": np.array([merged_config["language"]], dtype=np.object_)
-            }
-            result = await self.client.infer(
-                model_name=self.model_name,
-                inputs=inputs,
-                outputs=["text_output"]
-            )
-            # The first element of "text_output" is decoded as UTF-8 bytes
-            transcription = result["text_output"][0].decode('utf-8')
-            return transcription
-        except Exception as e:
-            self.logger.error(f"Error during Whisper transcription: {str(e)}")
-            raise
-    async def _process_audio_input(self,
-                                   audio: Union[str, BinaryIO, bytes, np.ndarray],
-                                   sampling_rate: Optional[int] = None) -> np.ndarray:
-        """
-        Convert the supported audio input types into a numpy array.
-        Numpy arrays are returned untouched (no resampling or dtype change);
-        every other input is decoded with librosa and cast to float32.
-        Args:
-            audio: Audio input (file path, file-like object, bytes, or numpy array)
-            sampling_rate: Rate passed to librosa; falls back to
-                ``default_config["sampling_rate"]`` when None
-        Returns:
-            Numpy array of the audio
-        Raises:
-            ImportError: If librosa is not installed
-            ValueError: If the input type is not supported
-        """
-        if isinstance(audio, np.ndarray):
-            return audio
-        try:
-            import librosa
-        except ImportError:
-            self.logger.error("librosa not installed. Please install with: pip install librosa")
-            raise
-        if sampling_rate is None:
-            sampling_rate = self.default_config["sampling_rate"]
-        try:
-            if isinstance(audio, bytes):
-                source = io.BytesIO(audio)
-            elif isinstance(audio, str):
-                source = audio
-            elif hasattr(audio, "read"):
-                # Duck-typed check: real streams are never instances of
-                # typing.BinaryIO, so the old isinstance test only matched
-                # io.IOBase subclasses.
-                seekable = getattr(audio, "seekable", None)
-                if callable(seekable) and seekable():
-                    audio.seek(0)
-                source = audio
-            else:
-                raise ValueError(f"Unsupported audio type: {type(audio)}")
-            y, _ = librosa.load(source, sr=sampling_rate)
-            return y.astype(np.float32)
-        except Exception as e:
-            self.logger.error(f"Error processing audio: {str(e)}")
-            raise
-    def get_model_info(self) -> Dict[str, Any]:
-        """
-        Get information about the model.
-        Returns:
-            Dictionary containing model information
-        """
-        return {
-            "name": self.model_name,
-            "type": "speech",
-            "backend": "triton",
-            "url": self.triton_url,
-            "loaded": self.is_loaded(),
-            # Copy so callers cannot mutate the service defaults through the result
-            "config": dict(self.default_config)
-        }

isa_model/inference/services/base_tts_service.py DELETED Viewed

@@ -1,66 +0,0 @@
-from abc import abstractmethod
-from typing import Dict, Any, Optional, Union, BinaryIO
-from .base_service import BaseService
-class BaseTTSService(BaseService):
-    """Abstract interface that every Text-to-Speech service implements"""
-    @abstractmethod
-    async def generate_speech(
-        self,
-        text: str,
-        voice_id: Optional[str] = None,
-        language: Optional[str] = None,
-        speed: float = 1.0,
-        options: Optional[Dict[str, Any]] = None
-    ) -> bytes:
-        """
-        Synthesize speech for the given text
-        Args:
-            text: Text to be spoken
-            voice_id: Identifier of the voice to use, if any
-            language: Language code, if any
-            speed: Playback speed multiplier (1.0 means normal speed)
-            options: Extra options specific to the underlying model
-        Returns:
-            The synthesized audio as bytes
-        """
-    @abstractmethod
-    async def save_to_file(
-        self,
-        text: str,
-        output_file: Union[str, BinaryIO],
-        voice_id: Optional[str] = None,
-        language: Optional[str] = None,
-        speed: float = 1.0,
-        options: Optional[Dict[str, Any]] = None
-    ) -> str:
-        """
-        Synthesize speech for the given text and write it to a file
-        Args:
-            text: Text to be spoken
-            output_file: Destination path or file-like object
-            voice_id: Identifier of the voice to use, if any
-            language: Language code, if any
-            speed: Playback speed multiplier (1.0 means normal speed)
-            options: Extra options specific to the underlying model
-        Returns:
-            Path of the file that was written
-        """
-    @abstractmethod
-    async def get_available_voices(self) -> Dict[str, Any]:
-        """
-        List the voices this TTS service offers
-        Returns:
-            Dictionary describing the available voices
-        """

isa_model/inference/services/embedding/bge_service.py DELETED Viewed

@@ -1,183 +0,0 @@
-import logging
-import asyncio
-import numpy as np
-from typing import Dict, List, Any, Optional, Union
-from isa_model.inference.services.base_service import BaseService
-from isa_model.inference.backends.triton_client import TritonClient
-logger = logging.getLogger(__name__)
-class BgeEmbeddingService(BaseService):
-    """
-    Text-embedding service backed by a BGE model served from Triton Inference Server.
-    """
-    def __init__(self, triton_url: str = "localhost:8001", model_name: str = "bge_embed"):
-        """
-        Set up the service; no connection is opened until ``load`` runs.
-        Args:
-            triton_url: URL of the Triton Inference Server
-            model_name: Name of the model in Triton
-        """
-        super().__init__()
-        self.logger = logger
-        self.client = None
-        self.triton_url = triton_url
-        self.model_name = model_name
-        # Per-request defaults
-        self.default_config = {"normalize": True}
-    async def load(self) -> None:
-        """
-        Create the Triton client and confirm the model is ready.
-        Returns immediately when the service is already loaded; any failure is
-        logged and re-raised.
-        """
-        if self.is_loaded():
-            return
-        try:
-            self.logger.info(f"Connecting to Triton server at {self.triton_url}")
-            self.client = TritonClient(self.triton_url)
-            model_ready = await self.client.is_model_ready(self.model_name)
-            if not model_ready:
-                self.logger.error(f"Model {self.model_name} is not ready on Triton server")
-                raise RuntimeError(f"Model {self.model_name} is not ready on Triton server")
-            self._loaded = True
-            self.logger.info(f"Connected to Triton for model {self.model_name}")
-        except Exception as exc:
-            self.logger.error(f"Failed to connect to Triton: {str(exc)}")
-            raise
-    async def unload(self) -> None:
-        """
-        Release the Triton client if one is currently loaded.
-        """
-        if self.is_loaded():
-            self.client = None
-            self._loaded = False
-            self.logger.info("Triton client connection closed")
-    async def embed(self,
-                   texts: Union[str, List[str]],
-                   normalize: Optional[bool] = None) -> np.ndarray:
-        """
-        Embed one or more texts through Triton, loading the client on demand.
-        Args:
-            texts: A single text or a list of texts
-            normalize: Normalization flag sent to the model; None selects the
-                service default
-        Returns:
-            The "embedding" output of the inference call
-        """
-        if not self.is_loaded():
-            await self.load()
-        # A bare string is treated as a one-element batch
-        text_batch = [texts] if isinstance(texts, str) else texts
-        use_normalize = self.default_config["normalize"] if normalize is None else normalize
-        try:
-            response = await self.client.infer(
-                model_name=self.model_name,
-                inputs={
-                    "text_input": text_batch,
-                    "normalize": np.array([use_normalize], dtype=bool)
-                },
-                outputs=["embedding"]
-            )
-            return response["embedding"]
-        except Exception as exc:
-            self.logger.error(f"Error during embedding generation: {str(exc)}")
-            raise
-    async def similarity(self,
-                        text1: str,
-                        text2: str,
-                        normalize: Optional[bool] = None) -> float:
-        """
-        Compute the cosine similarity of two texts.
-        Args:
-            text1: First text
-            text2: Second text
-            normalize: Normalization flag forwarded to ``embed``
-        Returns:
-            Cosine similarity of the two embeddings as a float
-        """
-        # Both texts go through a single embed call
-        pair_vectors = await self.embed([text1, text2], normalize=normalize)
-        from sklearn.metrics.pairwise import cosine_similarity
-        score_matrix = cosine_similarity(pair_vectors[0:1], pair_vectors[1:2])
-        return float(score_matrix[0][0])
-    async def batch_similarity(self,
-                              queries: List[str],
-                              documents: List[str],
-                              normalize: Optional[bool] = None) -> np.ndarray:
-        """
-        Compute the cosine similarity of every query against every document.
-        Args:
-            queries: List of query texts
-            documents: List of document texts
-            normalize: Normalization flag forwarded to ``embed``
-        Returns:
-            Numpy array of similarity scores, shape [len(queries), len(documents)]
-        """
-        # Queries are embedded first, then documents, in two separate calls
-        query_vectors = await self.embed(queries, normalize=normalize)
-        document_vectors = await self.embed(documents, normalize=normalize)
-        from sklearn.metrics.pairwise import cosine_similarity
-        return cosine_similarity(query_vectors, document_vectors)
-    def get_model_info(self) -> Dict[str, Any]:
-        """
-        Describe this service and its current state.
-        Returns:
-            Dictionary containing model information
-        """
-        info = {
-            "name": self.model_name,
-            "type": "embedding",
-            "backend": "triton",
-            "url": self.triton_url,
-            "loaded": self.is_loaded(),
-            "embedding_dim": 1024,  # Typical for BGE models
-            "config": self.default_config
-        }
-        return info

isa_model/inference/services/embedding/ollama_rerank_service.py DELETED Viewed

@@ -1,118 +0,0 @@
-from typing import Dict, Any, List, Optional
-from ollama import AsyncClient
-from ...base_service import BaseRerankService
-from ...base_provider import BaseProvider
-from app.config.config_manager import config_manager
-import httpx
-import asyncio
-from functools import wraps
-logger = config_manager.get_logger(__name__)
-def retry_on_connection_error(max_retries=3, delay=1):
-    """
-    Decorator factory that retries an async callable on httpx connection errors.
-    Only ``httpx.RemoteProtocolError`` and ``httpx.ConnectError`` trigger a
-    retry; any other exception propagates immediately.
-    Args:
-        max_retries: Total number of attempts; values below 1 are treated as 1
-        delay: Seconds to sleep between attempts
-    Returns:
-        A decorator for coroutine functions
-    """
-    # Always make at least one attempt: with max_retries <= 0 the loop used to be
-    # skipped entirely and the wrapper ended in ``raise None`` (a TypeError).
-    attempts = max(1, max_retries)
-    def decorator(func):
-        @wraps(func)
-        async def wrapper(*args, **kwargs):
-            for attempt in range(1, attempts + 1):
-                try:
-                    return await func(*args, **kwargs)
-                except (httpx.RemoteProtocolError, httpx.ConnectError) as e:
-                    if attempt == attempts:
-                        # Out of attempts: re-raise in place so the original
-                        # traceback is preserved.
-                        raise
-                    logger.warning(f"Connection error on attempt {attempt}, retrying in {delay}s: {str(e)}")
-                    await asyncio.sleep(delay)
-        return wrapper
-    return decorator
-class OllamaRerankService(BaseRerankService):
-    """
-    Reranking service wrapper around Ollama.
-    Each document is scored by asking the model to rate its relevance to the
-    query on a 0-100 scale; the reply is parsed as a number and scaled to 0-1.
-    """
-    def __init__(self, provider: 'BaseProvider', model_name: str):
-        """
-        Create the service and its Ollama client.
-        Args:
-            provider: Provider passed through to the base class
-            model_name: Name of the Ollama model used for scoring
-        """
-        super().__init__(provider, model_name)
-        # The host comes from the service config, falling back to the local default
-        self.client = AsyncClient(
-            host=self.config.get('base_url', 'http://localhost:11434')
-        )
-        self.model_name = model_name
-    @retry_on_connection_error()
-    async def rerank(
-        self,
-        query: str,
-        documents: List[Dict],
-        top_k: int = 5
-    ) -> List[Dict]:
-        """
-        Rerank documents based on query relevance.
-        Args:
-            query: Query text; must not be empty
-            documents: Documents to score; each needs a 'content' field and may
-                carry a prior 'score' (treated as 1.0 when absent)
-            top_k: Maximum number of results to return
-        Returns:
-            Copies of the documents with 'rerank_score' and 'final_score'
-            added, sorted by 'final_score' descending and cut to ``top_k``;
-            an empty list when ``documents`` is empty
-        Raises:
-            ValueError: If the query is empty or a document lacks 'content'
-        """
-        try:
-            if not query:
-                raise ValueError("Query cannot be empty")
-            if not documents:
-                return []
-            # Validate everything up front so a malformed entry fails before
-            # any model call has been spent on earlier documents.
-            for doc in documents:
-                if "content" not in doc:
-                    raise ValueError("Each document must have a 'content' field")
-            results = []
-            for doc in documents:
-                # Format prompt for relevance scoring
-                prompt = f"""Rate the relevance of the following text to the query on a scale of 0-100.
-Query: {query}
-Text: {doc['content']}
-Only respond with a number between 0 and 100."""
-                response = await self.client.generate(
-                    model=self.model_name,
-                    prompt=prompt,
-                    stream=False
-                )
-                try:
-                    score = float(response.response.strip())
-                    score = max(0.0, min(100.0, score)) / 100.0  # Normalize to 0-1
-                except ValueError:
-                    # An unparseable reply counts as "not relevant"
-                    logger.warning(f"Could not parse score from response: {response.response}")
-                    score = 0.0
-                # Work on a copy so the caller's documents are left untouched
-                doc_copy = doc.copy()
-                doc_copy["rerank_score"] = score
-                doc_copy["final_score"] = doc.get("score", 1.0) * score
-                results.append(doc_copy)
-            results.sort(key=lambda x: x["final_score"], reverse=True)
-            return results[:top_k]
-        except Exception as e:
-            logger.error(f"Error in rerank: {e}")
-            raise
-    # Not decorated with retry_on_connection_error: ``rerank`` already retries,
-    # and stacking both decorators multiplied the attempts (3 x 3).
-    async def rerank_texts(
-        self,
-        query: str,
-        texts: List[str],
-        top_k: int = 5
-    ) -> List[Dict]:
-        """
-        Rerank raw texts based on query relevance.
-        Args:
-            query: Query text; must not be empty
-            texts: Texts to score
-            top_k: Maximum number of results to return; the default matches
-                the limit that ``rerank`` applies
-        Returns:
-            Same structure as ``rerank``; an empty list when ``texts`` is empty
-        Raises:
-            ValueError: If the query is empty
-        """
-        try:
-            if not query:
-                raise ValueError("Query cannot be empty")
-            if not texts:
-                return []
-            # Every text starts with a neutral prior score of 1.0
-            documents = [{"content": text, "score": 1.0} for text in texts]
-            return await self.rerank(query, documents, top_k=top_k)
-        except Exception as e:
-            logger.error(f"Error in rerank_texts: {str(e)}")
-            raise
-    async def close(self):
-        """Cleanup resources"""
-        # NOTE(review): confirm that the installed ollama AsyncClient exposes aclose()
-        await self.client.aclose()

isa_model/inference/services/embedding/onnx_rerank_service.py DELETED Viewed

@@ -1,73 +0,0 @@
-from typing import Dict, Any, List, Union, Optional
-from ...base_service import BaseService
-from ...base_provider import BaseProvider
-from transformers import AutoTokenizer
-import onnxruntime as ort
-import numpy as np
-import torch
-import os
-from pathlib import Path
-class ONNXRerankService(BaseService):
-    """ONNX Reranker service for BGE models"""
-    def __init__(self, provider: 'BaseProvider', model_name: str):
-        """
-        Resolve the ONNX model file, obtain an inference session from the
-        provider and load the tokenizer.
-        Args:
-            provider: Provider that supplies the session via ``get_session``
-            model_name: Name of the converted model directory
-        Raises:
-            FileNotFoundError: If the ONNX model file does not exist
-        """
-        super().__init__(provider, model_name)
-        self.model_path = self._get_model_path(model_name)
-        self.session = provider.get_session(self.model_path)
-        # The tokenizer is fixed to this checkpoint regardless of model_name
-        self.tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-reranker-v2-m3')
-        self.max_length = 512
-    def _get_model_path(self, model_name: str) -> str:
-        """
-        Return the path of ``model_converted/<model_name>/model.onnx`` next to
-        this source file.
-        Raises:
-            FileNotFoundError: If that file does not exist
-        """
-        base_dir = Path(__file__).parent
-        model_path = base_dir / "model_converted" / model_name / "model.onnx"
-        if not model_path.exists():
-            raise FileNotFoundError(f"ONNX model not found at {model_path}. Please run the conversion script first.")
-        return str(model_path)
-    async def compute_score(self,
-                          pairs: Union[List[str], List[List[str]]],
-                          normalize: bool = False) -> Union[float, List[float]]:
-        """
-        Compute reranking scores for query-passage pairs.
-        Args:
-            pairs: Either one pair given as a flat list of strings, or a list
-                of such pairs
-            normalize: When True, pass each raw score through a sigmoid
-        Returns:
-            A single float when exactly one score is produced, otherwise a
-            list of floats; an empty list for empty input
-        Raises:
-            RuntimeError: If tokenization or inference fails; the original
-                exception is attached as the cause
-        """
-        # Nothing to score. Previously ``pairs[0]`` raised IndexError here and
-        # surfaced as an opaque RuntimeError.
-        if not pairs:
-            return []
-        try:
-            # A flat list of strings is a single pair
-            if isinstance(pairs[0], str):
-                pairs = [pairs]
-            inputs = self.tokenizer(
-                pairs,
-                padding=True,
-                truncation=True,
-                return_tensors='np',
-                max_length=self.max_length
-            )
-            ort_inputs = {
-                'input_ids': inputs['input_ids'],
-                'attention_mask': inputs['attention_mask']
-            }
-            scores = self.session.run(
-                None,  # output names, None means all
-                ort_inputs
-            )[0]
-            scores = scores.flatten().tolist()
-            if normalize:
-                scores = [self._sigmoid(score) for score in scores]
-            # Return single score for single pair
-            return scores[0] if len(scores) == 1 else scores
-        except Exception as e:
-            # Chain the cause so the underlying traceback is not lost
-            raise RuntimeError(f"ONNX reranking failed: {e}") from e
-    def _sigmoid(self, x: float) -> float:
-        """Apply a numerically stable sigmoid and return a plain Python float"""
-        # Branch on the sign so exp() is only ever taken of a non-positive
-        # value; the naive form overflows for large negative inputs.
-        if x >= 0:
-            return float(1.0 / (1.0 + np.exp(-x)))
-        exp_x = np.exp(x)
-        return float(exp_x / (1.0 + exp_x))

isa-model 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

isa-model 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl