isa-model 0.1.0-py3-none-any.whl → 0.1.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the changes between the two package versions as they appear in the public registry.
- isa_model/__init__.py +1 -1
- isa_model/core/model_registry.py +273 -46
- isa_model/deployment/gpu_fp16_ds8/models/deepseek_r1/1/model.py +120 -0
- isa_model/deployment/gpu_fp16_ds8/scripts/download_model.py +18 -0
- isa_model/deployment/gpu_int8_ds8/app/server.py +66 -0
- isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +43 -0
- isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +35 -0
- isa_model/eval/__init__.py +56 -0
- isa_model/eval/benchmarks.py +469 -0
- isa_model/eval/factory.py +582 -0
- isa_model/eval/metrics.py +628 -0
- isa_model/inference/ai_factory.py +98 -93
- isa_model/inference/providers/openai_provider.py +21 -7
- isa_model/inference/providers/replicate_provider.py +18 -5
- isa_model/inference/providers/triton_provider.py +1 -1
- isa_model/inference/services/audio/base_stt_service.py +91 -0
- isa_model/inference/services/audio/base_tts_service.py +136 -0
- isa_model/inference/services/audio/{yyds_audio_service.py → openai_tts_service.py} +4 -4
- isa_model/inference/services/embedding/ollama_embed_service.py +48 -36
- isa_model/inference/services/llm/__init__.py +0 -4
- isa_model/inference/services/llm/base_llm_service.py +134 -0
- isa_model/inference/services/llm/ollama_llm_service.py +1 -10
- isa_model/inference/services/llm/openai_llm_service.py +70 -61
- isa_model/inference/services/vision/__init__.py +1 -1
- isa_model/inference/services/vision/ollama_vision_service.py +4 -4
- isa_model/inference/services/vision/{yyds_vision_service.py → openai_vision_service.py} +5 -5
- isa_model/inference/services/vision/replicate_image_gen_service.py +185 -0
- isa_model/training/__init__.py +44 -0
- isa_model/training/factory.py +393 -0
- isa_model-0.1.1.dist-info/METADATA +327 -0
- {isa_model-0.1.0.dist-info → isa_model-0.1.1.dist-info}/RECORD +35 -60
- isa_model/deployment/mlflow_gateway/__init__.py +0 -8
- isa_model/deployment/mlflow_gateway/start_gateway.py +0 -65
- isa_model/deployment/unified_multimodal_client.py +0 -341
- isa_model/inference/adapter/triton_adapter.py +0 -453
- isa_model/inference/backends/Pytorch/bge_embed_backend.py +0 -188
- isa_model/inference/backends/Pytorch/gemma_backend.py +0 -167
- isa_model/inference/backends/Pytorch/llama_backend.py +0 -166
- isa_model/inference/backends/Pytorch/whisper_backend.py +0 -194
- isa_model/inference/backends/__init__.py +0 -53
- isa_model/inference/backends/base_backend_client.py +0 -26
- isa_model/inference/backends/container_services.py +0 -104
- isa_model/inference/backends/local_services.py +0 -72
- isa_model/inference/backends/openai_client.py +0 -130
- isa_model/inference/backends/replicate_client.py +0 -197
- isa_model/inference/backends/third_party_services.py +0 -239
- isa_model/inference/backends/triton_client.py +0 -97
- isa_model/inference/client_sdk/client.py +0 -134
- isa_model/inference/client_sdk/client_data_std.py +0 -34
- isa_model/inference/client_sdk/client_sdk_schema.py +0 -16
- isa_model/inference/client_sdk/exceptions.py +0 -0
- isa_model/inference/engine/triton/model_repository/bge/1/model.py +0 -174
- isa_model/inference/engine/triton/model_repository/gemma/1/model.py +0 -250
- isa_model/inference/engine/triton/model_repository/llama/1/model.py +0 -76
- isa_model/inference/engine/triton/model_repository/whisper/1/model.py +0 -195
- isa_model/inference/providers/vllm_provider.py +0 -0
- isa_model/inference/providers/yyds_provider.py +0 -83
- isa_model/inference/services/audio/fish_speech/handler.py +0 -215
- isa_model/inference/services/audio/runpod_tts_fish_service.py +0 -212
- isa_model/inference/services/audio/triton_speech_service.py +0 -138
- isa_model/inference/services/audio/whisper_service.py +0 -186
- isa_model/inference/services/base_tts_service.py +0 -66
- isa_model/inference/services/embedding/bge_service.py +0 -183
- isa_model/inference/services/embedding/ollama_rerank_service.py +0 -118
- isa_model/inference/services/embedding/onnx_rerank_service.py +0 -73
- isa_model/inference/services/llm/gemma_service.py +0 -143
- isa_model/inference/services/llm/llama_service.py +0 -143
- isa_model/inference/services/llm/replicate_llm_service.py +0 -179
- isa_model/inference/services/llm/triton_llm_service.py +0 -230
- isa_model/inference/services/vision/replicate_vision_service.py +0 -241
- isa_model/inference/services/vision/triton_vision_service.py +0 -199
- isa_model-0.1.0.dist-info/METADATA +0 -116
- /isa_model/inference/{client_sdk/__init__.py → services/embedding/openai_embed_service.py} +0 -0
- {isa_model-0.1.0.dist-info → isa_model-0.1.1.dist-info}/WHEEL +0 -0
- {isa_model-0.1.0.dist-info → isa_model-0.1.1.dist-info}/licenses/LICENSE +0 -0
- {isa_model-0.1.0.dist-info → isa_model-0.1.1.dist-info}/top_level.txt +0 -0
isa_model/inference/adapter/triton_adapter.py
@@ -1,453 +0,0 @@
-#!/usr/bin/env python3
-"""
-Multimodal OpenAI-Compatible Adapter for Triton Inference Server
-
-This adapter translates between OpenAI API format and Triton Inference Server format.
-It supports multiple modalities (text, image, voice) through a unified API.
-
-Features:
-- Chat completions API (text)
-- Image generation API
-- Audio transcription API
-- Embeddings API
-
-The adapter routes requests to the appropriate Triton model based on the task.
-"""
-
-import os
-import json
-import time
-import base64
-import logging
-import requests
-import tempfile
-import uvicorn
-import uuid
-from typing import List, Dict, Any, Optional, Union
-from fastapi import FastAPI, HTTPException, UploadFile, File, Form, Body, BackgroundTasks
-from fastapi.responses import StreamingResponse, FileResponse
-from fastapi.middleware.cors import CORSMiddleware
-from pydantic import BaseModel, Field
-import numpy as np
-from datetime import datetime
-
-# Configure logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-# Initialize FastAPI app
-app = FastAPI(title="Multimodal OpenAI-Compatible Adapter")
-
-# Add CORS middleware
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-
-# Constants
-TRITON_URL = os.environ.get("TRITON_URL", "http://localhost:8000")
-DEFAULT_TEXT_MODEL = os.environ.get("DEFAULT_TEXT_MODEL", "llama3_cpu")
-DEFAULT_IMAGE_MODEL = os.environ.get("DEFAULT_IMAGE_MODEL", "stable_diffusion")
-DEFAULT_AUDIO_MODEL = os.environ.get("DEFAULT_AUDIO_MODEL", "whisper_tiny")
-DEFAULT_EMBEDDING_MODEL = os.environ.get("DEFAULT_EMBEDDING_MODEL", "bge_m3")
-DEFAULT_VISION_MODEL = os.environ.get("DEFAULT_VISION_MODEL", "gemma3_4b")
-
-# ===== Schema Definitions =====
-
-class ChatMessage(BaseModel):
-    role: str
-    content: Union[str, List[Dict[str, Any]]]
-    name: Optional[str] = None
-
-class ChatCompletionRequest(BaseModel):
-    model: str
-    messages: List[ChatMessage]
-    temperature: Optional[float] = 0.7
-    top_p: Optional[float] = 1.0
-    max_tokens: Optional[int] = 100
-    stream: Optional[bool] = False
-    stop: Optional[Union[str, List[str]]] = None
-
-class ImageGenerationRequest(BaseModel):
-    model: str
-    prompt: str
-    n: Optional[int] = 1
-    size: Optional[str] = "1024x1024"
-    response_format: Optional[str] = "url"
-
-class AudioTranscriptionRequest(BaseModel):
-    model: str
-    file: str  # Base64 encoded audio
-    response_format: Optional[str] = "text"
-    language: Optional[str] = "en"
-
-class EmbeddingRequest(BaseModel):
-    model: str
-    input: Union[str, List[str]]
-    encoding_format: Optional[str] = "float"
-
-# ===== Helper Functions =====
-
-def generate_response_id(prefix: str = "res") -> str:
-    """Generate a unique response ID."""
-    return f"{prefix}-{uuid.uuid4()}"
-
-def format_chat_response(content: str, model: str) -> Dict[str, Any]:
-    """Format chat completion response in OpenAI format."""
-    return {
-        "id": generate_response_id("chatcmpl"),
-        "object": "chat.completion",
-        "created": int(time.time()),
-        "model": model,
-        "choices": [
-            {
-                "index": 0,
-                "message": {
-                    "role": "assistant",
-                    "content": content
-                },
-                "finish_reason": "stop"
-            }
-        ],
-        "usage": {
-            "prompt_tokens": 0,  # We don't track these yet
-            "completion_tokens": 0,
-            "total_tokens": 0
-        }
-    }
-
-def format_image_response(image_data: str, model: str) -> Dict[str, Any]:
-    """Format image generation response in OpenAI format."""
-    return {
-        "created": int(time.time()),
-        "data": [
-            {
-                "url": f"data:image/png;base64,{image_data}"
-            }
-        ]
-    }
-
-def format_audio_response(text: str, model: str) -> Dict[str, Any]:
-    """Format audio transcription response in OpenAI format."""
-    return {
-        "text": text
-    }
-
-def format_embedding_response(embeddings: List[List[float]], model: str) -> Dict[str, Any]:
-    """Format embedding response in OpenAI format."""
-    data = []
-    for i, embedding in enumerate(embeddings):
-        data.append({
-            "object": "embedding",
-            "embedding": embedding,
-            "index": i
-        })
-
-    return {
-        "object": "list",
-        "data": data,
-        "model": model,
-        "usage": {
-            "prompt_tokens": 0,
-            "total_tokens": 0
-        }
-    }
-
-def extract_content_from_messages(messages: List[ChatMessage]) -> Dict[str, Any]:
-    """Extract content from messages for Triton input."""
-    formatted_content = ""
-    image_data = None
-
-    for msg in messages:
-        # Handle both string content and list of content parts
-        if isinstance(msg.content, str):
-            content = msg.content
-            formatted_content += f"{msg.role.capitalize()}: {content}\n"
-        else:
-            # For multimodal content, extract text and image parts
-            text_parts = []
-            for part in msg.content:
-                if part.get("type") == "text":
-                    text_parts.append(part.get("text", ""))
-                elif part.get("type") == "image_url":
-                    # Extract image from URL (assuming base64 encoded)
-                    image_url = part.get("image_url", {}).get("url", "")
-                    if image_url.startswith("data:image/"):
-                        # Extract the base64 part
-                        image_data = image_url.split(",")[1]
-
-            # Add text parts to formatted content
-            content = " ".join(text_parts)
-            formatted_content += f"{msg.role.capitalize()}: {content}\n"
-
-    formatted_content += "Assistant:"
-    return {"text": formatted_content, "image": image_data}
-
-# ===== API Routes =====
-
-@app.post("/v1/chat/completions")
-async def chat_completions(request: ChatCompletionRequest):
-    """Handle chat completion requests."""
-    logger.info(f"Received request: {request.dict()}")
-
-    # Extract the formatted content from messages
-    content = extract_content_from_messages(request.messages)
-    input_text = content["text"]
-    image_data = content["image"]
-
-    # Use requested model or default
-    model = request.model if request.model != "default" else DEFAULT_TEXT_MODEL
-
-    # Prepare request for Triton
-    triton_request = {
-        "inputs": [
-            {
-                "name": "text_input",
-                "shape": [1, 1],
-                "datatype": "BYTES",
-                "data": [input_text]
-            },
-            {
-                "name": "max_tokens",
-                "shape": [1, 1],
-                "datatype": "INT32",
-                "data": [request.max_tokens]
-            },
-            {
-                "name": "temperature",
-                "shape": [1, 1],
-                "datatype": "FP32",
-                "data": [request.temperature]
-            }
-        ]
-    }
-
-    # Add image input if available and using vision model
-    if image_data is not None and model == "gemma3_4b":
-        try:
-            # Decode base64 image
-            from PIL import Image
-            import io
-            import numpy as np
-
-            # Decode and preprocess image
-            image_bytes = base64.b64decode(image_data)
-            image = Image.open(io.BytesIO(image_bytes))
-
-            # Resize to expected size (224x224 for most vision models)
-            image = image.resize((224, 224))
-
-            # Convert to RGB if not already
-            if image.mode != "RGB":
-                image = image.convert("RGB")
-
-            # Convert to numpy array and normalize
-            image_array = np.array(image).astype(np.float32) / 255.0
-
-            # Reorder from HWC to CHW format
-            image_array = np.transpose(image_array, (2, 0, 1))
-
-            # Add image input to Triton request
-            triton_request["inputs"].append({
-                "name": "image_input",
-                "shape": list(image_array.shape),
-                "datatype": "FP32",
-                "data": image_array.flatten().tolist()
-            })
-
-            logger.info("Added image input to request")
-        except Exception as e:
-            logger.error(f"Error processing image: {str(e)}")
-
-    logger.info(f"Sending to Triton: {triton_request}")
-
-    # Send to Triton
-    try:
-        response = requests.post(
-            f"{TRITON_URL}/v2/models/{model}/infer",
-            json=triton_request
-        )
-        response.raise_for_status()
-        triton_response = response.json()
-        logger.info(f"Triton response status: {response.status_code}")
-        logger.info(f"Triton response: {triton_response}")
-
-        # Extract text output
-        output_data = triton_response["outputs"][0]["data"][0]
-
-        # Format response
-        return format_chat_response(output_data, model)
-
-    except Exception as e:
-        logger.error(f"Error calling Triton: {str(e)}")
-        raise HTTPException(status_code=500, detail=f"Error calling model: {str(e)}")
-
-@app.post("/v1/images/generations")
-async def generate_images(request: ImageGenerationRequest):
-    """Handle image generation requests."""
-    logger.info(f"Received image generation request: {request.dict()}")
-
-    # Use requested model or default
-    model = request.model if request.model != "default" else DEFAULT_IMAGE_MODEL
-
-    # For demo purposes - in a real implementation, this would call the Triton image model
-    # Here we'll just simulate image generation with a placeholder
-    try:
-        # Simulate Triton call (replace with actual call to Triton when image model is available)
-        # Return a placeholder image for demonstration
-        with open("placeholder.png", "rb") as f:
-            image_data = base64.b64encode(f.read()).decode("utf-8")
-
-        return format_image_response(image_data, model)
-
-    except Exception as e:
-        logger.error(f"Error generating image: {str(e)}")
-        raise HTTPException(status_code=500, detail=f"Error generating image: {str(e)}")
-
-@app.post("/v1/audio/transcriptions")
-async def transcribe_audio(request: AudioTranscriptionRequest):
-    """Handle audio transcription requests."""
-    logger.info(f"Received audio transcription request: {request.dict()}")
-
-    # Use requested model or default
-    model = request.model if request.model != "default" else DEFAULT_AUDIO_MODEL
-
-    try:
-        # Decode the base64 audio
-        audio_data = base64.b64decode(request.file)
-
-        # Save to temporary file
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
-            temp_file.write(audio_data)
-            temp_file_path = temp_file.name
-
-        # Load and preprocess audio for Whisper
-        import librosa
-        import numpy as np
-
-        # Load audio file and resample to 16kHz for Whisper
-        audio_array, _ = librosa.load(temp_file_path, sr=16000, mono=True)
-
-        # Prepare request for Triton
-        triton_request = {
-            "inputs": [
-                {
-                    "name": "audio_input",
-                    "shape": [len(audio_array)],
-                    "datatype": "FP32",
-                    "data": audio_array.tolist()
-                }
-            ]
-        }
-
-        # Add language if provided
-        if hasattr(request, 'language') and request.language:
-            triton_request["inputs"].append({
-                "name": "language",
-                "shape": [1, 1],
-                "datatype": "BYTES",
-                "data": [request.language]
-            })
-
-        # Clean up temp file
-        os.unlink(temp_file_path)
-
-        # Send to Triton
-        response = requests.post(
-            f"{TRITON_URL}/v2/models/{model}/infer",
-            json=triton_request
-        )
-        response.raise_for_status()
-        triton_response = response.json()
-
-        # Extract text output
-        transcription = triton_response["outputs"][0]["data"][0]
-
-        return format_audio_response(transcription, model)
-
-    except Exception as e:
-        logger.error(f"Error transcribing audio: {str(e)}")
-        # Fallback response
-        return format_audio_response(
-            "This is a placeholder transcription. In production, this would be generated by the Whisper model.",
-            model
-        )
-
-@app.post("/v1/embeddings")
-async def create_embeddings(request: EmbeddingRequest):
-    """Handle embedding requests."""
-    logger.info(f"Received embedding request: {request.dict()}")
-
-    # Use requested model or default
-    model = request.model if request.model != "default" else DEFAULT_EMBEDDING_MODEL
-
-    # Convert input to list if it's a single string
-    inputs = request.input if isinstance(request.input, list) else [request.input]
-
-    try:
-        # Process each input text
-        all_embeddings = []
-
-        for text in inputs:
-            # Prepare request for Triton
-            triton_request = {
-                "inputs": [
-                    {
-                        "name": "text_input",
-                        "shape": [1, 1],
-                        "datatype": "BYTES",
-                        "data": [text]
-                    }
-                ]
-            }
-
-            # Send to Triton
-            response = requests.post(
-                f"{TRITON_URL}/v2/models/{model}/infer",
-                json=triton_request
-            )
-            response.raise_for_status()
-            triton_response = response.json()
-
-            # Extract embedding
-            embedding = triton_response["outputs"][0]["data"]
-            all_embeddings.append(embedding)
-
-        return format_embedding_response(all_embeddings, model)
-
-    except Exception as e:
-        logger.error(f"Error creating embeddings: {str(e)}")
-
-        # Fallback - return random embeddings
-        embeddings = []
-        for _ in inputs:
-            # Generate a random embedding vector of dimension 1024 (BGE-M3)
-            embedding = np.random.normal(0, 1, 1024).tolist()
-            embeddings.append(embedding)
-
-        return format_embedding_response(embeddings, model)
-
-@app.get("/health")
-async def health_check():
-    """Health check endpoint."""
-    return {"status": "healthy"}
-
-# ===== Main =====
-
-if __name__ == "__main__":
-    # Create placeholder image for demo
-    try:
-        if not os.path.exists("placeholder.png"):
-            # Create a simple 256x256 black image
-            import numpy as np
-            from PIL import Image
-            img = Image.fromarray(np.zeros((256, 256, 3), dtype=np.uint8))
-            img.save("placeholder.png")
-    except ImportError:
-        logger.warning("PIL not installed. Cannot create placeholder image.")
-
-    # Start server
-    uvicorn.run(app, host="0.0.0.0", port=8300)
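Note: the removed adapter above served OpenAI-style routes from a local uvicorn process (0.0.0.0:8300, per its __main__ block) and proxied them to Triton. As a rough illustration only, not part of the package, a client call against that adapter's chat endpoint would have looked something like the following sketch; it assumes the adapter and a Triton server were running locally with the default model names shown in the file:

    import requests

    # Hypothetical local deployment; the removed adapter listened on port 8300.
    ADAPTER_URL = "http://localhost:8300"

    payload = {
        "model": "default",  # "default" resolved to DEFAULT_TEXT_MODEL ("llama3_cpu")
        "messages": [{"role": "user", "content": "Hello, Triton!"}],
        "max_tokens": 64,
        "temperature": 0.7,
    }

    resp = requests.post(f"{ADAPTER_URL}/v1/chat/completions", json=payload, timeout=60)
    resp.raise_for_status()
    # Response follows the OpenAI chat.completion shape built by format_chat_response()
    print(resp.json()["choices"][0]["message"]["content"])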
isa_model/inference/backends/Pytorch/bge_embed_backend.py
@@ -1,188 +0,0 @@
-import os
-import logging
-import torch
-import numpy as np
-from typing import Dict, List, Any, Optional, Union
-
-logger = logging.getLogger(__name__)
-
-
-class BgeEmbedBackend:
-    """
-    PyTorch backend for the BGE embedding model.
-    """
-
-    def __init__(self, model_path: Optional[str] = None, device: str = "auto"):
-        """
-        Initialize the BGE embedding backend.
-
-        Args:
-            model_path: Path to the model
-            device: Device to run the model on ("cpu", "cuda", or "auto")
-        """
-        self.model_path = model_path or os.environ.get("BGE_MODEL_PATH", "/models/Bge-m3")
-        self.device = device if device != "auto" else ("cuda" if torch.cuda.is_available() else "cpu")
-        self.model = None
-        self.tokenizer = None
-        self._loaded = False
-
-        # Default configuration
-        self.config = {
-            "normalize": True,
-            "max_length": 512,
-            "pooling_method": "cls"  # Use CLS token for sentence embedding
-        }
-
-        self.logger = logger
-
-    def load(self) -> None:
-        """
-        Load the model and tokenizer.
-        """
-        if self._loaded:
-            return
-
-        try:
-            from transformers import AutoModel, AutoTokenizer
-
-            # Load tokenizer
-            self.logger.info(f"Loading BGE tokenizer from {self.model_path}")
-            self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
-
-            # Load model
-            self.logger.info(f"Loading BGE model on {self.device}")
-            if self.device == "cpu":
-                self.model = AutoModel.from_pretrained(
-                    self.model_path,
-                    torch_dtype=torch.float32,
-                    device_map="auto"
-                )
-            else:  # cuda
-                self.model = AutoModel.from_pretrained(
-                    self.model_path,
-                    torch_dtype=torch.float16,  # Use half precision on GPU
-                    device_map="auto"
-                )
-
-            self.model.eval()
-            self._loaded = True
-            self.logger.info("BGE model loaded successfully")
-
-        except Exception as e:
-            self.logger.error(f"Failed to load BGE model: {str(e)}")
-            raise
-
-    def unload(self) -> None:
-        """
-        Unload the model and tokenizer.
-        """
-        if not self._loaded:
-            return
-
-        self.model = None
-        self.tokenizer = None
-        self._loaded = False
-
-        # Force garbage collection
-        import gc
-        gc.collect()
-
-        if self.device == "cuda":
-            torch.cuda.empty_cache()
-
-        self.logger.info("BGE model unloaded")
-
-    def embed(self,
-              texts: Union[str, List[str]],
-              normalize: Optional[bool] = None) -> np.ndarray:
-        """
-        Generate embeddings for texts.
-
-        Args:
-            texts: Single text or list of texts to embed
-            normalize: Whether to normalize embeddings (if None, use default)
-
-        Returns:
-            Numpy array of embeddings, shape [batch_size, embedding_dim]
-        """
-        if not self._loaded:
-            self.load()
-
-        # Handle single text input
-        if isinstance(texts, str):
-            texts = [texts]
-
-        # Use default normalize setting if not specified
-        if normalize is None:
-            normalize = self.config["normalize"]
-
-        try:
-            # Tokenize the texts
-            inputs = self.tokenizer(
-                texts,
-                padding=True,
-                truncation=True,
-                max_length=self.config["max_length"],
-                return_tensors="pt"
-            ).to(self.device)
-
-            # Generate embeddings
-            with torch.no_grad():
-                outputs = self.model(**inputs)
-
-                # Use [CLS] token embedding as the sentence embedding
-                embeddings = outputs.last_hidden_state[:, 0, :]
-
-                # Normalize embeddings if required
-                if normalize:
-                    embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
-
-            # Convert to numpy array
-            embeddings_np = embeddings.cpu().numpy()
-
-            return embeddings_np
-
-        except Exception as e:
-            self.logger.error(f"Error during BGE embedding generation: {str(e)}")
-            raise
-
-    def get_model_info(self) -> Dict[str, Any]:
-        """
-        Get information about the model.
-
-        Returns:
-            Dictionary containing model information
-        """
-        return {
-            "name": "bge-m3",
-            "type": "embedding",
-            "device": self.device,
-            "path": self.model_path,
-            "loaded": self._loaded,
-            "embedding_dim": 1024,  # Typical for BGE models
-            "config": self.config
-        }
-
-    def similarity(self, embedding1: np.ndarray, embedding2: np.ndarray) -> float:
-        """
-        Calculate cosine similarity between two embeddings.
-
-        Args:
-            embedding1: First embedding vector
-            embedding2: Second embedding vector
-
-        Returns:
-            Cosine similarity score (float between -1 and 1)
-        """
-        from sklearn.metrics.pairwise import cosine_similarity
-
-        # Reshape if needed
-        if embedding1.ndim == 1:
-            embedding1 = embedding1.reshape(1, -1)
-        if embedding2.ndim == 1:
-            embedding2 = embedding2.reshape(1, -1)
-
-        # Calculate cosine similarity
-        similarity = cosine_similarity(embedding1, embedding2)[0][0]
-
-        return float(similarity)
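Note: the removed backend above was a self-contained class. A minimal sketch of how it was driven under 0.1.0 (the module is gone in 0.1.1), assuming torch, transformers, and scikit-learn are installed and a BGE-M3 checkpoint exists at the default path or at BGE_MODEL_PATH:

    # Only importable from isa-model 0.1.0; this module is removed in 0.1.1.
    from isa_model.inference.backends.Pytorch.bge_embed_backend import BgeEmbedBackend

    backend = BgeEmbedBackend(device="auto")
    backend.load()

    vecs = backend.embed(["hello world", "greetings, world"])  # shape [2, 1024]
    score = backend.similarity(vecs[0], vecs[1])               # cosine similarity
    print(backend.get_model_info()["name"], score)

    backend.unload()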