isa-model 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isa_model/__init__.py +1 -1
- isa_model/core/model_registry.py +273 -46
- isa_model/deployment/gpu_fp16_ds8/models/deepseek_r1/1/model.py +120 -0
- isa_model/deployment/gpu_fp16_ds8/scripts/download_model.py +18 -0
- isa_model/deployment/gpu_int8_ds8/app/server.py +66 -0
- isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +43 -0
- isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +35 -0
- isa_model/eval/__init__.py +56 -0
- isa_model/eval/benchmarks.py +469 -0
- isa_model/eval/factory.py +582 -0
- isa_model/eval/metrics.py +628 -0
- isa_model/inference/ai_factory.py +98 -93
- isa_model/inference/providers/openai_provider.py +21 -7
- isa_model/inference/providers/replicate_provider.py +18 -5
- isa_model/inference/providers/triton_provider.py +1 -1
- isa_model/inference/services/audio/base_stt_service.py +91 -0
- isa_model/inference/services/audio/base_tts_service.py +136 -0
- isa_model/inference/services/audio/{yyds_audio_service.py → openai_tts_service.py} +4 -4
- isa_model/inference/services/embedding/ollama_embed_service.py +48 -36
- isa_model/inference/services/llm/__init__.py +0 -4
- isa_model/inference/services/llm/base_llm_service.py +134 -0
- isa_model/inference/services/llm/ollama_llm_service.py +1 -10
- isa_model/inference/services/llm/openai_llm_service.py +70 -61
- isa_model/inference/services/vision/__init__.py +1 -1
- isa_model/inference/services/vision/ollama_vision_service.py +4 -4
- isa_model/inference/services/vision/{yyds_vision_service.py → openai_vision_service.py} +5 -5
- isa_model/inference/services/vision/replicate_image_gen_service.py +185 -0
- isa_model/training/__init__.py +44 -0
- isa_model/training/factory.py +393 -0
- isa_model-0.1.1.dist-info/METADATA +327 -0
- {isa_model-0.1.0.dist-info → isa_model-0.1.1.dist-info}/RECORD +35 -60
- isa_model/deployment/mlflow_gateway/__init__.py +0 -8
- isa_model/deployment/mlflow_gateway/start_gateway.py +0 -65
- isa_model/deployment/unified_multimodal_client.py +0 -341
- isa_model/inference/adapter/triton_adapter.py +0 -453
- isa_model/inference/backends/Pytorch/bge_embed_backend.py +0 -188
- isa_model/inference/backends/Pytorch/gemma_backend.py +0 -167
- isa_model/inference/backends/Pytorch/llama_backend.py +0 -166
- isa_model/inference/backends/Pytorch/whisper_backend.py +0 -194
- isa_model/inference/backends/__init__.py +0 -53
- isa_model/inference/backends/base_backend_client.py +0 -26
- isa_model/inference/backends/container_services.py +0 -104
- isa_model/inference/backends/local_services.py +0 -72
- isa_model/inference/backends/openai_client.py +0 -130
- isa_model/inference/backends/replicate_client.py +0 -197
- isa_model/inference/backends/third_party_services.py +0 -239
- isa_model/inference/backends/triton_client.py +0 -97
- isa_model/inference/client_sdk/client.py +0 -134
- isa_model/inference/client_sdk/client_data_std.py +0 -34
- isa_model/inference/client_sdk/client_sdk_schema.py +0 -16
- isa_model/inference/client_sdk/exceptions.py +0 -0
- isa_model/inference/engine/triton/model_repository/bge/1/model.py +0 -174
- isa_model/inference/engine/triton/model_repository/gemma/1/model.py +0 -250
- isa_model/inference/engine/triton/model_repository/llama/1/model.py +0 -76
- isa_model/inference/engine/triton/model_repository/whisper/1/model.py +0 -195
- isa_model/inference/providers/vllm_provider.py +0 -0
- isa_model/inference/providers/yyds_provider.py +0 -83
- isa_model/inference/services/audio/fish_speech/handler.py +0 -215
- isa_model/inference/services/audio/runpod_tts_fish_service.py +0 -212
- isa_model/inference/services/audio/triton_speech_service.py +0 -138
- isa_model/inference/services/audio/whisper_service.py +0 -186
- isa_model/inference/services/base_tts_service.py +0 -66
- isa_model/inference/services/embedding/bge_service.py +0 -183
- isa_model/inference/services/embedding/ollama_rerank_service.py +0 -118
- isa_model/inference/services/embedding/onnx_rerank_service.py +0 -73
- isa_model/inference/services/llm/gemma_service.py +0 -143
- isa_model/inference/services/llm/llama_service.py +0 -143
- isa_model/inference/services/llm/replicate_llm_service.py +0 -179
- isa_model/inference/services/llm/triton_llm_service.py +0 -230
- isa_model/inference/services/vision/replicate_vision_service.py +0 -241
- isa_model/inference/services/vision/triton_vision_service.py +0 -199
- isa_model-0.1.0.dist-info/METADATA +0 -116
- /isa_model/inference/{client_sdk/__init__.py → services/embedding/openai_embed_service.py} +0 -0
- {isa_model-0.1.0.dist-info → isa_model-0.1.1.dist-info}/WHEEL +0 -0
- {isa_model-0.1.0.dist-info → isa_model-0.1.1.dist-info}/licenses/LICENSE +0 -0
- {isa_model-0.1.0.dist-info → isa_model-0.1.1.dist-info}/top_level.txt +0 -0
isa_model/inference/backends/Pytorch/gemma_backend.py
@@ -1,167 +0,0 @@
-import os
-import logging
-import torch
-from typing import Dict, List, Any, Optional, Union
-
-logger = logging.getLogger(__name__)
-
-
-class GemmaBackend:
-    """
-    PyTorch backend for the Gemma LLM model.
-    """
-
-    def __init__(self, model_path: Optional[str] = None, device: str = "auto"):
-        """
-        Initialize the Gemma backend.
-
-        Args:
-            model_path: Path to the model
-            device: Device to run the model on ("cpu", "cuda", or "auto")
-        """
-        self.model_path = model_path or os.environ.get("GEMMA_MODEL_PATH", "/models/Gemma3-4B")
-        self.device = device if device != "auto" else ("cuda" if torch.cuda.is_available() else "cpu")
-        self.model = None
-        self.tokenizer = None
-        self._loaded = False
-
-        # Default generation config
-        self.default_config = {
-            "max_new_tokens": 512,
-            "temperature": 0.7,
-            "top_p": 0.9,
-            "top_k": 50,
-            "repetition_penalty": 1.1,
-            "do_sample": True
-        }
-
-        self.logger = logger
-
-    def load(self) -> None:
-        """
-        Load the model and tokenizer.
-        """
-        if self._loaded:
-            return
-
-        try:
-            from transformers import AutoModelForCausalLM, AutoTokenizer
-
-            # Load tokenizer
-            self.logger.info(f"Loading Gemma tokenizer from {self.model_path}")
-            self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
-
-            # Load model
-            self.logger.info(f"Loading Gemma model on {self.device}")
-            if self.device == "cpu":
-                self.model = AutoModelForCausalLM.from_pretrained(
-                    self.model_path,
-                    torch_dtype=torch.float32,
-                    low_cpu_mem_usage=True,
-                    device_map="auto"
-                )
-            else:  # cuda
-                self.model = AutoModelForCausalLM.from_pretrained(
-                    self.model_path,
-                    torch_dtype=torch.float16,  # Use half precision on GPU
-                    device_map="auto"
-                )
-
-            self.model.eval()
-            self._loaded = True
-            self.logger.info("Gemma model loaded successfully")
-
-        except Exception as e:
-            self.logger.error(f"Failed to load Gemma model: {str(e)}")
-            raise
-
-    def unload(self) -> None:
-        """
-        Unload the model and tokenizer.
-        """
-        if not self._loaded:
-            return
-
-        self.model = None
-        self.tokenizer = None
-        self._loaded = False
-
-        # Force garbage collection
-        import gc
-        gc.collect()
-
-        if self.device == "cuda":
-            torch.cuda.empty_cache()
-
-        self.logger.info("Gemma model unloaded")
-
-    def generate(self,
-                 prompt: str,
-                 system_prompt: Optional[str] = None,
-                 generation_config: Optional[Dict[str, Any]] = None) -> str:
-        """
-        Generate text from a prompt.
-
-        Args:
-            prompt: User prompt
-            system_prompt: System prompt to control model behavior
-            generation_config: Configuration for text generation
-
-        Returns:
-            Generated text
-        """
-        if not self._loaded:
-            self.load()
-
-        # Get generation config
-        config = self.default_config.copy()
-        if generation_config:
-            config.update(generation_config)
-
-        try:
-            # Format the prompt with system prompt if provided
-            if system_prompt:
-                # Gemma uses a specific format for system prompts
-                formatted_prompt = f"<bos><start_of_turn>system\n{system_prompt}<end_of_turn>\n<start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model"
-            else:
-                formatted_prompt = f"<bos><start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model"
-
-            # Tokenize the prompt
-            inputs = self.tokenizer(formatted_prompt, return_tensors="pt").to(self.device)
-
-            # Generate text
-            with torch.no_grad():
-                outputs = self.model.generate(
-                    inputs.input_ids,
-                    attention_mask=inputs.attention_mask,
-                    pad_token_id=self.tokenizer.eos_token_id,
-                    **config
-                )
-
-            # Decode the generated text
-            generated_text = self.tokenizer.decode(
-                outputs[0][inputs.input_ids.shape[1]:],
-                skip_special_tokens=True
-            )
-
-            return generated_text.strip()
-
-        except Exception as e:
-            self.logger.error(f"Error during Gemma text generation: {str(e)}")
-            raise
-
-    def get_model_info(self) -> Dict[str, Any]:
-        """
-        Get information about the model.
-
-        Returns:
-            Dictionary containing model information
-        """
-        return {
-            "name": "gemma3-4b",
-            "type": "llm",
-            "device": self.device,
-            "path": self.model_path,
-            "loaded": self._loaded,
-            "default_config": self.default_config
-        }
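For reference, the removed GemmaBackend followed a simple load → generate → unload lifecycle. The sketch below shows roughly how it could be driven in 0.1.0; the direct module import mirrors the file path above, and the prompt text and generation overrides are illustrative only, not taken from the package.

# Illustrative only: import path mirrors the removed file's location; prompts are made up.
from isa_model.inference.backends.Pytorch.gemma_backend import GemmaBackend

backend = GemmaBackend(device="auto")  # falls back to GEMMA_MODEL_PATH or /models/Gemma3-4B
backend.load()                         # loads tokenizer and model via transformers
reply = backend.generate(
    prompt="Summarize unified diffs in one sentence.",
    system_prompt="You are a concise assistant.",
    generation_config={"max_new_tokens": 128, "temperature": 0.2},  # overrides default_config
)
print(reply)
print(backend.get_model_info())        # name, type, device, path, loaded flag, default config
backend.unload()                       # drops references and empties the CUDA cache on GPU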
isa_model/inference/backends/Pytorch/llama_backend.py
@@ -1,166 +0,0 @@
-import os
-import logging
-import torch
-from typing import Dict, List, Any, Optional, Union
-
-logger = logging.getLogger(__name__)
-
-
-class LlamaBackend:
-    """
-    PyTorch backend for the Llama LLM model.
-    """
-
-    def __init__(self, model_path: Optional[str] = None, device: str = "auto"):
-        """
-        Initialize the Llama backend.
-
-        Args:
-            model_path: Path to the model
-            device: Device to run the model on ("cpu", "cuda", or "auto")
-        """
-        self.model_path = model_path or os.environ.get("LLAMA_MODEL_PATH", "/models/Llama3-8B")
-        self.device = device if device != "auto" else ("cuda" if torch.cuda.is_available() else "cpu")
-        self.model = None
-        self.tokenizer = None
-        self._loaded = False
-
-        # Default generation config
-        self.default_config = {
-            "max_new_tokens": 512,
-            "temperature": 0.7,
-            "top_p": 0.9,
-            "top_k": 50,
-            "repetition_penalty": 1.1,
-            "do_sample": True
-        }
-
-        self.logger = logger
-
-    def load(self) -> None:
-        """
-        Load the model and tokenizer.
-        """
-        if self._loaded:
-            return
-
-        try:
-            from transformers import AutoModelForCausalLM, AutoTokenizer
-
-            # Load tokenizer
-            self.logger.info(f"Loading Llama tokenizer from {self.model_path}")
-            self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
-
-            # Load model
-            self.logger.info(f"Loading Llama model on {self.device}")
-            if self.device == "cpu":
-                self.model = AutoModelForCausalLM.from_pretrained(
-                    self.model_path,
-                    torch_dtype=torch.float32,
-                    low_cpu_mem_usage=True,
-                    device_map="auto"
-                )
-            else:  # cuda
-                self.model = AutoModelForCausalLM.from_pretrained(
-                    self.model_path,
-                    torch_dtype=torch.float16,  # Use half precision on GPU
-                    device_map="auto"
-                )
-
-            self.model.eval()
-            self._loaded = True
-            self.logger.info("Llama model loaded successfully")
-
-        except Exception as e:
-            self.logger.error(f"Failed to load Llama model: {str(e)}")
-            raise
-
-    def unload(self) -> None:
-        """
-        Unload the model and tokenizer.
-        """
-        if not self._loaded:
-            return
-
-        self.model = None
-        self.tokenizer = None
-        self._loaded = False
-
-        # Force garbage collection
-        import gc
-        gc.collect()
-
-        if self.device == "cuda":
-            torch.cuda.empty_cache()
-
-        self.logger.info("Llama model unloaded")
-
-    def generate(self,
-                 prompt: str,
-                 system_prompt: Optional[str] = None,
-                 generation_config: Optional[Dict[str, Any]] = None) -> str:
-        """
-        Generate text from a prompt.
-
-        Args:
-            prompt: User prompt
-            system_prompt: System prompt to control model behavior
-            generation_config: Configuration for text generation
-
-        Returns:
-            Generated text
-        """
-        if not self._loaded:
-            self.load()
-
-        # Get generation config
-        config = self.default_config.copy()
-        if generation_config:
-            config.update(generation_config)
-
-        try:
-            # Format the prompt with system prompt if provided
-            if system_prompt:
-                formatted_prompt = f"<|system|>\n{system_prompt}\n<|user|>\n{prompt}\n<|assistant|>"
-            else:
-                formatted_prompt = f"<|user|>\n{prompt}\n<|assistant|>"
-
-            # Tokenize the prompt
-            inputs = self.tokenizer(formatted_prompt, return_tensors="pt").to(self.device)
-
-            # Generate text
-            with torch.no_grad():
-                outputs = self.model.generate(
-                    inputs.input_ids,
-                    attention_mask=inputs.attention_mask,
-                    pad_token_id=self.tokenizer.eos_token_id,
-                    **config
-                )
-
-            # Decode the generated text
-            generated_text = self.tokenizer.decode(
-                outputs[0][inputs.input_ids.shape[1]:],
-                skip_special_tokens=True
-            )
-
-            return generated_text.strip()
-
-        except Exception as e:
-            self.logger.error(f"Error during Llama text generation: {str(e)}")
-            raise
-
-    def get_model_info(self) -> Dict[str, Any]:
-        """
-        Get information about the model.
-
-        Returns:
-            Dictionary containing model information
-        """
-        return {
-            "name": "llama3-8b",
-            "type": "llm",
-            "device": self.device,
-            "path": self.model_path,
-            "loaded": self._loaded,
-            "default_config": self.default_config
-        }
isa_model/inference/backends/Pytorch/whisper_backend.py
@@ -1,194 +0,0 @@
-import os
-import io
-import torch
-import logging
-import numpy as np
-from typing import Dict, Any, Optional, Union, BinaryIO
-
-logger = logging.getLogger(__name__)
-
-
-class WhisperBackend:
-    """
-    PyTorch backend for the Whisper speech-to-text model.
-    """
-
-    def __init__(self, model_path: Optional[str] = None, device: str = "auto"):
-        """
-        Initialize the Whisper backend.
-
-        Args:
-            model_path: Path to the model
-            device: Device to run the model on ("cpu", "cuda", or "auto")
-        """
-        self.model_path = model_path or os.environ.get("WHISPER_MODEL_PATH", "/models/Whisper-tiny")
-        self.device = device if device != "auto" else ("cuda" if torch.cuda.is_available() else "cpu")
-        self.model = None
-        self.processor = None
-        self._loaded = False
-
-        # Default configuration
-        self.config = {
-            "language": "en",
-            "task": "transcribe",
-            "sampling_rate": 16000,
-            "chunk_length_s": 30,
-            "batch_size": 16
-        }
-
-        self.logger = logger
-
-    def load(self) -> None:
-        """
-        Load the model and processor.
-        """
-        if self._loaded:
-            return
-
-        try:
-            from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
-
-            # Load processor
-            self.logger.info(f"Loading Whisper processor from {self.model_path}")
-            self.processor = AutoProcessor.from_pretrained(self.model_path)
-
-            # Load model
-            self.logger.info(f"Loading Whisper model on {self.device}")
-            if self.device == "cpu":
-                self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
-                    self.model_path,
-                    torch_dtype=torch.float32,
-                    low_cpu_mem_usage=True,
-                    device_map="auto"
-                )
-            else:  # cuda
-                self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
-                    self.model_path,
-                    torch_dtype=torch.float16,  # Use half precision on GPU
-                    device_map="auto"
-                )
-
-            self.model.eval()
-            self._loaded = True
-            self.logger.info("Whisper model loaded successfully")
-
-        except Exception as e:
-            self.logger.error(f"Failed to load Whisper model: {str(e)}")
-            raise
-
-    def unload(self) -> None:
-        """
-        Unload the model and processor.
-        """
-        if not self._loaded:
-            return
-
-        self.model = None
-        self.processor = None
-        self._loaded = False
-
-        # Force garbage collection
-        import gc
-        gc.collect()
-
-        if self.device == "cuda":
-            torch.cuda.empty_cache()
-
-        self.logger.info("Whisper model unloaded")
-
-    def transcribe(self,
-                   audio: Union[np.ndarray, str, BinaryIO, bytes],
-                   language: str = "en",
-                   **kwargs) -> str:
-        """
-        Transcribe audio to text.
-
-        Args:
-            audio: Audio input (numpy array, file path, file-like object, or bytes)
-            language: Language code (e.g., "en", "fr")
-            kwargs: Additional keyword arguments to override config
-
-        Returns:
-            Transcribed text
-        """
-        if not self._loaded:
-            self.load()
-
-        # Process audio to get numpy array
-        audio_array = self._process_audio_input(audio)
-
-        # Update config with kwargs
-        config = self.config.copy()
-        config.update(kwargs)
-        config["language"] = language
-
-        try:
-            # Process audio with processor
-            inputs = self.processor(
-                audio_array,
-                sampling_rate=config["sampling_rate"],
-                return_tensors="pt"
-            ).to(self.device)
-
-            # Generate transcription
-            with torch.no_grad():
-                output = self.model.generate(
-                    **inputs,
-                    language=config["language"],
-                    task=config["task"]
-                )
-
-            # Decode the output
-            transcription = self.processor.batch_decode(
-                output,
-                skip_special_tokens=True
-            )[0]
-
-            return transcription
-
-        except Exception as e:
-            self.logger.error(f"Error during Whisper transcription: {str(e)}")
-            raise
-
-    def _process_audio_input(self, audio: Union[np.ndarray, str, BinaryIO, bytes]) -> np.ndarray:
-        """
-        Process different types of audio inputs into a numpy array.
-
-        Args:
-            audio: Audio input (numpy array, file path, file-like object, or bytes)
-
-        Returns:
-            Numpy array of the audio
-        """
-        if isinstance(audio, np.ndarray):
-            return audio
-
-        try:
-            import librosa
-
-            if isinstance(audio, str):
-                # File path
-                y, sr = librosa.load(audio, sr=self.config["sampling_rate"])
-                return y
-
-            elif isinstance(audio, (io.IOBase, BinaryIO)):
-                # File-like object
-                audio.seek(0)
-                y, sr = librosa.load(audio, sr=self.config["sampling_rate"])
-                return y
-
-            elif isinstance(audio, bytes):
-                # Bytes
-                with io.BytesIO(audio) as audio_bytes:
-                    y, sr = librosa.load(audio_bytes, sr=self.config["sampling_rate"])
-                    return y
-
-            else:
-                raise ValueError(f"Unsupported audio type: {type(audio)}")
-
-        except ImportError:
-            self.logger.error("librosa not installed. Please install with: pip install librosa")
-            raise
-        except Exception as e:
-            self.logger.error(f"Error processing audio: {str(e)}")
-            raise
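The removed WhisperBackend accepted a numpy array, a file path, a file-like object, or raw bytes, and normalized them through librosa before transcription. A minimal usage sketch follows; the audio file name and the direct module import are assumptions for illustration, not part of the package.

# Illustrative only: the audio file and import path are assumptions.
from isa_model.inference.backends.Pytorch.whisper_backend import WhisperBackend

stt = WhisperBackend(device="auto")    # falls back to WHISPER_MODEL_PATH or /models/Whisper-tiny
stt.load()

# A file path is decoded with librosa at the configured 16 kHz sampling rate.
text = stt.transcribe("meeting.wav", language="en")

# Raw bytes work too; extra kwargs override the default config.
with open("meeting.wav", "rb") as f:
    text_again = stt.transcribe(f.read(), language="en", task="transcribe")

print(text)
stt.unload()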
isa_model/inference/backends/__init__.py
@@ -1,53 +0,0 @@
-"""
-Backend services for isa_model inference.
-
-Three types of backend services:
-1. Local Services: Services running locally (e.g., Ollama)
-2. Container Services: Docker/K8s deployed services (e.g., Triton, vLLM)
-3. Third-party Services: External API services with wrappers
-"""
-
-from .base_backend_client import BaseBackendClient
-from .triton_client import TritonBackendClient, TritonClient
-
-# Local Services
-from .local_services import OllamaBackendClient, LocalModelServerClient
-
-# Container Services
-from .container_services import (
-    VLLMBackendClient,
-    TensorFlowServingClient,
-    KubernetesServiceClient
-)
-
-# Third-party Services
-from .third_party_services import (
-    OpenAIClient,
-    AnthropicClient,
-    CohereClient,
-    AzureOpenAIClient,
-    GoogleAIClient
-)
-
-__all__ = [
-    # Base
-    "BaseBackendClient",
-    "TritonBackendClient",
-    "TritonClient",  # Backward compatibility
-
-    # Local Services
-    "OllamaBackendClient",
-    "LocalModelServerClient",
-
-    # Container Services
-    "VLLMBackendClient",
-    "TensorFlowServingClient",
-    "KubernetesServiceClient",
-
-    # Third-party Services
-    "OpenAIClient",
-    "AnthropicClient",
-    "CohereClient",
-    "AzureOpenAIClient",
-    "GoogleAIClient",
-]
isa_model/inference/backends/base_backend_client.py
@@ -1,26 +0,0 @@
-"""
-Base backend client interface for all AI service backends.
-Defines the common interface that all backend clients must implement.
-"""
-
-from abc import ABC, abstractmethod
-from typing import Dict, Any, AsyncGenerator, Optional
-
-
-class BaseBackendClient(ABC):
-    """Abstract base class for all backend clients"""
-
-    def __init__(self, *args, **kwargs):
-        """Initialize backend client"""
-        pass
-
-    @abstractmethod
-    async def health_check(self) -> bool:
-        """Check if the backend service is healthy"""
-        pass
-
-    async def close(self):
-        """Close any open connections"""
-        pass
-
-
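BaseBackendClient only required an async health_check; close was an optional no-op hook. A minimal conforming subclass might have looked like the sketch below; the HTTP health endpoint and the use of aiohttp are assumptions for illustration, not part of the removed code.

# Hypothetical subclass illustrating the removed interface; the endpoint and aiohttp are assumptions.
import asyncio

import aiohttp

from isa_model.inference.backends.base_backend_client import BaseBackendClient


class HttpBackendClient(BaseBackendClient):
    """Client for a hypothetical HTTP inference service."""

    def __init__(self, base_url: str):
        super().__init__()
        self.base_url = base_url.rstrip("/")

    async def health_check(self) -> bool:
        # Healthy when GET /health answers 200; network errors count as unhealthy.
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(f"{self.base_url}/health") as resp:
                    return resp.status == 200
        except aiohttp.ClientError:
            return False


# Example: asyncio.run(HttpBackendClient("http://localhost:8000").health_check())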