isa-model 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isa_model/__init__.py +5 -0
- isa_model/core/model_manager.py +143 -0
- isa_model/core/model_registry.py +115 -0
- isa_model/core/model_router.py +226 -0
- isa_model/core/model_storage.py +133 -0
- isa_model/core/model_version.py +0 -0
- isa_model/core/resource_manager.py +202 -0
- isa_model/core/storage/hf_storage.py +0 -0
- isa_model/core/storage/local_storage.py +0 -0
- isa_model/core/storage/minio_storage.py +0 -0
- isa_model/deployment/mlflow_gateway/__init__.py +8 -0
- isa_model/deployment/mlflow_gateway/start_gateway.py +65 -0
- isa_model/deployment/unified_multimodal_client.py +341 -0
- isa_model/inference/__init__.py +11 -0
- isa_model/inference/adapter/triton_adapter.py +453 -0
- isa_model/inference/adapter/unified_api.py +248 -0
- isa_model/inference/ai_factory.py +354 -0
- isa_model/inference/backends/Pytorch/bge_embed_backend.py +188 -0
- isa_model/inference/backends/Pytorch/gemma_backend.py +167 -0
- isa_model/inference/backends/Pytorch/llama_backend.py +166 -0
- isa_model/inference/backends/Pytorch/whisper_backend.py +194 -0
- isa_model/inference/backends/__init__.py +53 -0
- isa_model/inference/backends/base_backend_client.py +26 -0
- isa_model/inference/backends/container_services.py +104 -0
- isa_model/inference/backends/local_services.py +72 -0
- isa_model/inference/backends/openai_client.py +130 -0
- isa_model/inference/backends/replicate_client.py +197 -0
- isa_model/inference/backends/third_party_services.py +239 -0
- isa_model/inference/backends/triton_client.py +97 -0
- isa_model/inference/base.py +46 -0
- isa_model/inference/client_sdk/__init__.py +0 -0
- isa_model/inference/client_sdk/client.py +134 -0
- isa_model/inference/client_sdk/client_data_std.py +34 -0
- isa_model/inference/client_sdk/client_sdk_schema.py +16 -0
- isa_model/inference/client_sdk/exceptions.py +0 -0
- isa_model/inference/engine/triton/model_repository/bge/1/model.py +174 -0
- isa_model/inference/engine/triton/model_repository/gemma/1/model.py +250 -0
- isa_model/inference/engine/triton/model_repository/llama/1/model.py +76 -0
- isa_model/inference/engine/triton/model_repository/whisper/1/model.py +195 -0
- isa_model/inference/providers/__init__.py +19 -0
- isa_model/inference/providers/base_provider.py +30 -0
- isa_model/inference/providers/model_cache_manager.py +341 -0
- isa_model/inference/providers/ollama_provider.py +73 -0
- isa_model/inference/providers/openai_provider.py +87 -0
- isa_model/inference/providers/replicate_provider.py +94 -0
- isa_model/inference/providers/triton_provider.py +439 -0
- isa_model/inference/providers/vllm_provider.py +0 -0
- isa_model/inference/providers/yyds_provider.py +83 -0
- isa_model/inference/services/__init__.py +14 -0
- isa_model/inference/services/audio/fish_speech/handler.py +215 -0
- isa_model/inference/services/audio/runpod_tts_fish_service.py +212 -0
- isa_model/inference/services/audio/triton_speech_service.py +138 -0
- isa_model/inference/services/audio/whisper_service.py +186 -0
- isa_model/inference/services/audio/yyds_audio_service.py +71 -0
- isa_model/inference/services/base_service.py +106 -0
- isa_model/inference/services/base_tts_service.py +66 -0
- isa_model/inference/services/embedding/bge_service.py +183 -0
- isa_model/inference/services/embedding/ollama_embed_service.py +85 -0
- isa_model/inference/services/embedding/ollama_rerank_service.py +118 -0
- isa_model/inference/services/embedding/onnx_rerank_service.py +73 -0
- isa_model/inference/services/llm/__init__.py +16 -0
- isa_model/inference/services/llm/gemma_service.py +143 -0
- isa_model/inference/services/llm/llama_service.py +143 -0
- isa_model/inference/services/llm/ollama_llm_service.py +108 -0
- isa_model/inference/services/llm/openai_llm_service.py +129 -0
- isa_model/inference/services/llm/replicate_llm_service.py +179 -0
- isa_model/inference/services/llm/triton_llm_service.py +230 -0
- isa_model/inference/services/others/table_transformer_service.py +61 -0
- isa_model/inference/services/vision/__init__.py +12 -0
- isa_model/inference/services/vision/helpers/image_utils.py +58 -0
- isa_model/inference/services/vision/helpers/text_splitter.py +46 -0
- isa_model/inference/services/vision/ollama_vision_service.py +60 -0
- isa_model/inference/services/vision/replicate_vision_service.py +241 -0
- isa_model/inference/services/vision/triton_vision_service.py +199 -0
- isa_model/inference/services/vision/yyds_vision_service.py +80 -0
- isa_model/inference/utils/conversion/bge_rerank_convert.py +73 -0
- isa_model/inference/utils/conversion/onnx_converter.py +0 -0
- isa_model/inference/utils/conversion/torch_converter.py +0 -0
- isa_model/scripts/inference_tracker.py +283 -0
- isa_model/scripts/mlflow_manager.py +379 -0
- isa_model/scripts/model_registry.py +465 -0
- isa_model/scripts/start_mlflow.py +95 -0
- isa_model/scripts/training_tracker.py +257 -0
- isa_model/training/engine/llama_factory/__init__.py +39 -0
- isa_model/training/engine/llama_factory/config.py +115 -0
- isa_model/training/engine/llama_factory/data_adapter.py +284 -0
- isa_model/training/engine/llama_factory/examples/__init__.py +6 -0
- isa_model/training/engine/llama_factory/examples/finetune_with_tracking.py +185 -0
- isa_model/training/engine/llama_factory/examples/rlhf_with_tracking.py +163 -0
- isa_model/training/engine/llama_factory/factory.py +331 -0
- isa_model/training/engine/llama_factory/rl.py +254 -0
- isa_model/training/engine/llama_factory/trainer.py +171 -0
- isa_model/training/image_model/configs/create_config.py +37 -0
- isa_model/training/image_model/configs/create_flux_config.py +26 -0
- isa_model/training/image_model/configs/create_lora_config.py +21 -0
- isa_model/training/image_model/prepare_massed_compute.py +97 -0
- isa_model/training/image_model/prepare_upload.py +17 -0
- isa_model/training/image_model/raw_data/create_captions.py +16 -0
- isa_model/training/image_model/raw_data/create_lora_captions.py +20 -0
- isa_model/training/image_model/raw_data/pre_processing.py +200 -0
- isa_model/training/image_model/train/train.py +42 -0
- isa_model/training/image_model/train/train_flux.py +41 -0
- isa_model/training/image_model/train/train_lora.py +57 -0
- isa_model/training/image_model/train_main.py +25 -0
- isa_model/training/llm_model/annotation/annotation_schema.py +47 -0
- isa_model/training/llm_model/annotation/processors/annotation_processor.py +126 -0
- isa_model/training/llm_model/annotation/storage/dataset_manager.py +131 -0
- isa_model/training/llm_model/annotation/storage/dataset_schema.py +44 -0
- isa_model/training/llm_model/annotation/tests/test_annotation_flow.py +109 -0
- isa_model/training/llm_model/annotation/tests/test_minio copy.py +113 -0
- isa_model/training/llm_model/annotation/tests/test_minio_upload.py +43 -0
- isa_model/training/llm_model/annotation/views/annotation_controller.py +158 -0
- isa_model-0.1.0.dist-info/METADATA +116 -0
- isa_model-0.1.0.dist-info/RECORD +117 -0
- isa_model-0.1.0.dist-info/WHEEL +5 -0
- isa_model-0.1.0.dist-info/licenses/LICENSE +21 -0
- isa_model-0.1.0.dist-info/top_level.txt +1 -0
isa_model/inference/adapter/triton_adapter.py
@@ -0,0 +1,453 @@
#!/usr/bin/env python3
"""
Multimodal OpenAI-Compatible Adapter for Triton Inference Server

This adapter translates between OpenAI API format and Triton Inference Server format.
It supports multiple modalities (text, image, voice) through a unified API.

Features:
- Chat completions API (text)
- Image generation API
- Audio transcription API
- Embeddings API

The adapter routes requests to the appropriate Triton model based on the task.
"""

import os
import json
import time
import base64
import logging
import requests
import tempfile
import uvicorn
import uuid
from typing import List, Dict, Any, Optional, Union
from fastapi import FastAPI, HTTPException, UploadFile, File, Form, Body, BackgroundTasks
from fastapi.responses import StreamingResponse, FileResponse
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field
import numpy as np
from datetime import datetime

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize FastAPI app
app = FastAPI(title="Multimodal OpenAI-Compatible Adapter")

# Add CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Constants
TRITON_URL = os.environ.get("TRITON_URL", "http://localhost:8000")
DEFAULT_TEXT_MODEL = os.environ.get("DEFAULT_TEXT_MODEL", "llama3_cpu")
DEFAULT_IMAGE_MODEL = os.environ.get("DEFAULT_IMAGE_MODEL", "stable_diffusion")
DEFAULT_AUDIO_MODEL = os.environ.get("DEFAULT_AUDIO_MODEL", "whisper_tiny")
DEFAULT_EMBEDDING_MODEL = os.environ.get("DEFAULT_EMBEDDING_MODEL", "bge_m3")
DEFAULT_VISION_MODEL = os.environ.get("DEFAULT_VISION_MODEL", "gemma3_4b")

# ===== Schema Definitions =====

class ChatMessage(BaseModel):
    role: str
    content: Union[str, List[Dict[str, Any]]]
    name: Optional[str] = None

class ChatCompletionRequest(BaseModel):
    model: str
    messages: List[ChatMessage]
    temperature: Optional[float] = 0.7
    top_p: Optional[float] = 1.0
    max_tokens: Optional[int] = 100
    stream: Optional[bool] = False
    stop: Optional[Union[str, List[str]]] = None

class ImageGenerationRequest(BaseModel):
    model: str
    prompt: str
    n: Optional[int] = 1
    size: Optional[str] = "1024x1024"
    response_format: Optional[str] = "url"

class AudioTranscriptionRequest(BaseModel):
    model: str
    file: str  # Base64 encoded audio
    response_format: Optional[str] = "text"
    language: Optional[str] = "en"

class EmbeddingRequest(BaseModel):
    model: str
    input: Union[str, List[str]]
    encoding_format: Optional[str] = "float"

# ===== Helper Functions =====

def generate_response_id(prefix: str = "res") -> str:
    """Generate a unique response ID."""
    return f"{prefix}-{uuid.uuid4()}"

def format_chat_response(content: str, model: str) -> Dict[str, Any]:
    """Format chat completion response in OpenAI format."""
    return {
        "id": generate_response_id("chatcmpl"),
        "object": "chat.completion",
        "created": int(time.time()),
        "model": model,
        "choices": [
            {
                "index": 0,
                "message": {
                    "role": "assistant",
                    "content": content
                },
                "finish_reason": "stop"
            }
        ],
        "usage": {
            "prompt_tokens": 0,  # We don't track these yet
            "completion_tokens": 0,
            "total_tokens": 0
        }
    }

def format_image_response(image_data: str, model: str) -> Dict[str, Any]:
    """Format image generation response in OpenAI format."""
    return {
        "created": int(time.time()),
        "data": [
            {
                "url": f"data:image/png;base64,{image_data}"
            }
        ]
    }

def format_audio_response(text: str, model: str) -> Dict[str, Any]:
    """Format audio transcription response in OpenAI format."""
    return {
        "text": text
    }

def format_embedding_response(embeddings: List[List[float]], model: str) -> Dict[str, Any]:
    """Format embedding response in OpenAI format."""
    data = []
    for i, embedding in enumerate(embeddings):
        data.append({
            "object": "embedding",
            "embedding": embedding,
            "index": i
        })

    return {
        "object": "list",
        "data": data,
        "model": model,
        "usage": {
            "prompt_tokens": 0,
            "total_tokens": 0
        }
    }

def extract_content_from_messages(messages: List[ChatMessage]) -> Dict[str, Any]:
    """Extract content from messages for Triton input."""
    formatted_content = ""
    image_data = None

    for msg in messages:
        # Handle both string content and list of content parts
        if isinstance(msg.content, str):
            content = msg.content
            formatted_content += f"{msg.role.capitalize()}: {content}\n"
        else:
            # For multimodal content, extract text and image parts
            text_parts = []
            for part in msg.content:
                if part.get("type") == "text":
                    text_parts.append(part.get("text", ""))
                elif part.get("type") == "image_url":
                    # Extract image from URL (assuming base64 encoded)
                    image_url = part.get("image_url", {}).get("url", "")
                    if image_url.startswith("data:image/"):
                        # Extract the base64 part
                        image_data = image_url.split(",")[1]

            # Add text parts to formatted content
            content = " ".join(text_parts)
            formatted_content += f"{msg.role.capitalize()}: {content}\n"

    formatted_content += "Assistant:"
    return {"text": formatted_content, "image": image_data}

# ===== API Routes =====

@app.post("/v1/chat/completions")
async def chat_completions(request: ChatCompletionRequest):
    """Handle chat completion requests."""
    logger.info(f"Received request: {request.dict()}")

    # Extract the formatted content from messages
    content = extract_content_from_messages(request.messages)
    input_text = content["text"]
    image_data = content["image"]

    # Use requested model or default
    model = request.model if request.model != "default" else DEFAULT_TEXT_MODEL

    # Prepare request for Triton
    triton_request = {
        "inputs": [
            {
                "name": "text_input",
                "shape": [1, 1],
                "datatype": "BYTES",
                "data": [input_text]
            },
            {
                "name": "max_tokens",
                "shape": [1, 1],
                "datatype": "INT32",
                "data": [request.max_tokens]
            },
            {
                "name": "temperature",
                "shape": [1, 1],
                "datatype": "FP32",
                "data": [request.temperature]
            }
        ]
    }

    # Add image input if available and using vision model
    if image_data is not None and model == "gemma3_4b":
        try:
            # Decode base64 image
            from PIL import Image
            import io
            import numpy as np

            # Decode and preprocess image
            image_bytes = base64.b64decode(image_data)
            image = Image.open(io.BytesIO(image_bytes))

            # Resize to expected size (224x224 for most vision models)
            image = image.resize((224, 224))

            # Convert to RGB if not already
            if image.mode != "RGB":
                image = image.convert("RGB")

            # Convert to numpy array and normalize
            image_array = np.array(image).astype(np.float32) / 255.0

            # Reorder from HWC to CHW format
            image_array = np.transpose(image_array, (2, 0, 1))

            # Add image input to Triton request
            triton_request["inputs"].append({
                "name": "image_input",
                "shape": list(image_array.shape),
                "datatype": "FP32",
                "data": image_array.flatten().tolist()
            })

            logger.info("Added image input to request")
        except Exception as e:
            logger.error(f"Error processing image: {str(e)}")

    logger.info(f"Sending to Triton: {triton_request}")

    # Send to Triton
    try:
        response = requests.post(
            f"{TRITON_URL}/v2/models/{model}/infer",
            json=triton_request
        )
        response.raise_for_status()
        triton_response = response.json()
        logger.info(f"Triton response status: {response.status_code}")
        logger.info(f"Triton response: {triton_response}")

        # Extract text output
        output_data = triton_response["outputs"][0]["data"][0]

        # Format response
        return format_chat_response(output_data, model)

    except Exception as e:
        logger.error(f"Error calling Triton: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Error calling model: {str(e)}")

@app.post("/v1/images/generations")
async def generate_images(request: ImageGenerationRequest):
    """Handle image generation requests."""
    logger.info(f"Received image generation request: {request.dict()}")

    # Use requested model or default
    model = request.model if request.model != "default" else DEFAULT_IMAGE_MODEL

    # For demo purposes - in a real implementation, this would call the Triton image model
    # Here we'll just simulate image generation with a placeholder
    try:
        # Simulate Triton call (replace with actual call to Triton when image model is available)
        # Return a placeholder image for demonstration
        with open("placeholder.png", "rb") as f:
            image_data = base64.b64encode(f.read()).decode("utf-8")

        return format_image_response(image_data, model)

    except Exception as e:
        logger.error(f"Error generating image: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Error generating image: {str(e)}")

@app.post("/v1/audio/transcriptions")
async def transcribe_audio(request: AudioTranscriptionRequest):
    """Handle audio transcription requests."""
    logger.info(f"Received audio transcription request: {request.dict()}")

    # Use requested model or default
    model = request.model if request.model != "default" else DEFAULT_AUDIO_MODEL

    try:
        # Decode the base64 audio
        audio_data = base64.b64decode(request.file)

        # Save to temporary file
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
            temp_file.write(audio_data)
            temp_file_path = temp_file.name

        # Load and preprocess audio for Whisper
        import librosa
        import numpy as np

        # Load audio file and resample to 16kHz for Whisper
        audio_array, _ = librosa.load(temp_file_path, sr=16000, mono=True)

        # Prepare request for Triton
        triton_request = {
            "inputs": [
                {
                    "name": "audio_input",
                    "shape": [len(audio_array)],
                    "datatype": "FP32",
                    "data": audio_array.tolist()
                }
            ]
        }

        # Add language if provided
        if hasattr(request, 'language') and request.language:
            triton_request["inputs"].append({
                "name": "language",
                "shape": [1, 1],
                "datatype": "BYTES",
                "data": [request.language]
            })

        # Clean up temp file
        os.unlink(temp_file_path)

        # Send to Triton
        response = requests.post(
            f"{TRITON_URL}/v2/models/{model}/infer",
            json=triton_request
        )
        response.raise_for_status()
        triton_response = response.json()

        # Extract text output
        transcription = triton_response["outputs"][0]["data"][0]

        return format_audio_response(transcription, model)

    except Exception as e:
        logger.error(f"Error transcribing audio: {str(e)}")
        # Fallback response
        return format_audio_response(
            "This is a placeholder transcription. In production, this would be generated by the Whisper model.",
            model
        )

@app.post("/v1/embeddings")
async def create_embeddings(request: EmbeddingRequest):
    """Handle embedding requests."""
    logger.info(f"Received embedding request: {request.dict()}")

    # Use requested model or default
    model = request.model if request.model != "default" else DEFAULT_EMBEDDING_MODEL

    # Convert input to list if it's a single string
    inputs = request.input if isinstance(request.input, list) else [request.input]

    try:
        # Process each input text
        all_embeddings = []

        for text in inputs:
            # Prepare request for Triton
            triton_request = {
                "inputs": [
                    {
                        "name": "text_input",
                        "shape": [1, 1],
                        "datatype": "BYTES",
                        "data": [text]
                    }
                ]
            }

            # Send to Triton
            response = requests.post(
                f"{TRITON_URL}/v2/models/{model}/infer",
                json=triton_request
            )
            response.raise_for_status()
            triton_response = response.json()

            # Extract embedding
            embedding = triton_response["outputs"][0]["data"]
            all_embeddings.append(embedding)

        return format_embedding_response(all_embeddings, model)

    except Exception as e:
        logger.error(f"Error creating embeddings: {str(e)}")

        # Fallback - return random embeddings
        embeddings = []
        for _ in inputs:
            # Generate a random embedding vector of dimension 1024 (BGE-M3)
            embedding = np.random.normal(0, 1, 1024).tolist()
            embeddings.append(embedding)

        return format_embedding_response(embeddings, model)

@app.get("/health")
async def health_check():
    """Health check endpoint."""
    return {"status": "healthy"}

# ===== Main =====

if __name__ == "__main__":
    # Create placeholder image for demo
    try:
        if not os.path.exists("placeholder.png"):
            # Create a simple 256x256 black image
            import numpy as np
            from PIL import Image
            img = Image.fromarray(np.zeros((256, 256, 3), dtype=np.uint8))
            img.save("placeholder.png")
    except ImportError:
        logger.warning("PIL not installed. Cannot create placeholder image.")

    # Start server
    uvicorn.run(app, host="0.0.0.0", port=8300)
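For reference, a client call against this adapter follows the standard OpenAI chat-completions shape. The sketch below is illustrative only and not part of the package; it assumes the adapter above is running locally on port 8300 and passes "default" so the adapter falls back to DEFAULT_TEXT_MODEL.

# Hypothetical client-side sketch; not part of isa_model.
import requests

payload = {
    "model": "default",  # resolved to DEFAULT_TEXT_MODEL ("llama3_cpu") by the adapter
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello!"}
    ],
    "max_tokens": 64,
    "temperature": 0.7
}

resp = requests.post("http://localhost:8300/v1/chat/completions", json=payload)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])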
isa_model/inference/adapter/unified_api.py
@@ -0,0 +1,248 @@
import os
import json
import logging
from typing import Dict, List, Any, Optional, Union
from fastapi import FastAPI, HTTPException, Depends, Request
from pydantic import BaseModel, Field

from isa_model.inference.ai_factory import AIFactory

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("unified_api")

# Create FastAPI app
app = FastAPI(
    title="Unified AI Model API",
    description="API for inference with Llama3-8B, Gemma3-4B, Whisper, and BGE-M3 models",
    version="1.0.0"
)

# Models
class ChatMessage(BaseModel):
    role: str = Field(..., description="Role of the message sender (system, user, assistant)")
    content: str = Field(..., description="Content of the message")

class ChatCompletionRequest(BaseModel):
    model: str = Field(..., description="Model ID to use (llama, gemma)")
    messages: List[ChatMessage] = Field(..., description="List of messages in the conversation")
    temperature: Optional[float] = Field(0.7, description="Sampling temperature")
    max_tokens: Optional[int] = Field(512, description="Maximum number of tokens to generate")
    top_p: Optional[float] = Field(0.9, description="Top-p sampling parameter")
    top_k: Optional[int] = Field(50, description="Top-k sampling parameter")

class ChatCompletionResponse(BaseModel):
    model: str = Field(..., description="Model used for completion")
    choices: List[Dict[str, Any]] = Field(..., description="Generated completions")
    usage: Dict[str, int] = Field(..., description="Token usage statistics")

class EmbeddingRequest(BaseModel):
    model: str = Field(..., description="Model ID to use (bge_embed)")
    input: Union[str, List[str]] = Field(..., description="Text to embed")
    normalize: Optional[bool] = Field(True, description="Whether to normalize embeddings")

class TranscriptionRequest(BaseModel):
    model: str = Field(..., description="Model ID to use (whisper)")
    audio: str = Field(..., description="Base64-encoded audio data or URL")
    language: Optional[str] = Field("en", description="Language code")

# Factory for creating services
ai_factory = AIFactory()

# Dependency to get LLM service
async def get_llm_service(model: str):
    if model == "llama":
        return await ai_factory.get_llm_service("llama")
    elif model == "gemma":
        return await ai_factory.get_llm_service("gemma")
    else:
        raise HTTPException(status_code=400, detail=f"Unsupported model: {model}")

# Dependency to get embedding service
async def get_embedding_service(model: str):
    if model == "bge_embed":
        return await ai_factory.get_embedding_service("bge_embed")
    else:
        raise HTTPException(status_code=400, detail=f"Unsupported model: {model}")

# Dependency to get speech service
async def get_speech_service(model: str):
    if model == "whisper":
        return await ai_factory.get_speech_service("whisper")
    else:
        raise HTTPException(status_code=400, detail=f"Unsupported model: {model}")

# Endpoints
@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def chat_completion(request: ChatCompletionRequest):
    """Generate chat completion"""
    try:
        # Get the appropriate service
        service = await get_llm_service(request.model)

        # Format messages
        formatted_messages = [{"role": msg.role, "content": msg.content} for msg in request.messages]

        # Extract system prompt if present
        system_prompt = None
        if formatted_messages and formatted_messages[0]["role"] == "system":
            system_prompt = formatted_messages[0]["content"]
            formatted_messages = formatted_messages[1:]

        # Get user prompt (last user message)
        user_prompt = ""
        for msg in reversed(formatted_messages):
            if msg["role"] == "user":
                user_prompt = msg["content"]
                break

        if not user_prompt:
            raise HTTPException(status_code=400, detail="No user message found")

        # Set generation config
        generation_config = {
            "temperature": request.temperature,
            "max_new_tokens": request.max_tokens,
            "top_p": request.top_p,
            "top_k": request.top_k
        }

        # Generate completion
        completion = await service.generate(
            prompt=user_prompt,
            system_prompt=system_prompt,
            generation_config=generation_config
        )

        # Format response
        response = {
            "model": request.model,
            "choices": [
                {
                    "message": {
                        "role": "assistant",
                        "content": completion
                    },
                    "finish_reason": "stop",
                    "index": 0
                }
            ],
            "usage": {
                "prompt_tokens": len(user_prompt.split()),
                "completion_tokens": len(completion.split()),
                "total_tokens": len(user_prompt.split()) + len(completion.split())
            }
        }

        return response

    except Exception as e:
        logger.error(f"Error in chat completion: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))

@app.post("/v1/embeddings")
async def create_embedding(request: EmbeddingRequest):
    """Generate embeddings for text"""
    try:
        # Get the embedding service
        service = await get_embedding_service("bge_embed")

        # Generate embeddings
        if isinstance(request.input, str):
            embeddings = await service.embed(request.input, normalize=request.normalize)
            data = [{"embedding": embeddings[0].tolist(), "index": 0}]
        else:
            embeddings = await service.embed(request.input, normalize=request.normalize)
            data = [{"embedding": emb.tolist(), "index": i} for i, emb in enumerate(embeddings)]

        # Format response
        response = {
            "model": request.model,
            "data": data,
            "usage": {
                "prompt_tokens": sum(len(text.split()) for text in (request.input if isinstance(request.input, list) else [request.input])),
                "total_tokens": sum(len(text.split()) for text in (request.input if isinstance(request.input, list) else [request.input]))
            }
        }

        return response

    except Exception as e:
        logger.error(f"Error in embedding generation: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))

@app.post("/v1/audio/transcriptions")
async def transcribe_audio(request: TranscriptionRequest):
    """Transcribe audio to text"""
    try:
        import base64

        # Get the speech service
        service = await get_speech_service("whisper")

        # Process audio
        if request.audio.startswith(("http://", "https://")):
            # URL - download audio
            import requests
            audio_data = requests.get(request.audio).content
        else:
            # Base64 - decode
            audio_data = base64.b64decode(request.audio)

        # Transcribe
        transcription = await service.transcribe(
            audio=audio_data,
            language=request.language
        )

        # Format response
        response = {
            "model": request.model,
            "text": transcription
        }

        return response

    except Exception as e:
        logger.error(f"Error in audio transcription: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))

# Health check endpoint
@app.get("/health")
async def health_check():
    """Health check endpoint"""
    return {"status": "healthy"}

# Model info endpoint
@app.get("/v1/models")
async def list_models():
    """List available models"""
    models = [
        {
            "id": "llama",
            "type": "llm",
            "description": "Llama3-8B language model"
        },
        {
            "id": "gemma",
            "type": "llm",
            "description": "Gemma3-4B language model"
        },
        {
            "id": "whisper",
            "type": "speech",
            "description": "Whisper-tiny speech-to-text model"
        },
        {
            "id": "bge_embed",
            "type": "embedding",
            "description": "BGE-M3 text embedding model"
        }
    ]

    return {"data": models}

# Main entry point
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8080)
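As with the Triton adapter, the sketch below is an illustrative client call and not part of the package; it assumes the unified API above is running locally on port 8080 as configured in its main entry point.

# Hypothetical client-side sketch; not part of isa_model.
import requests

# List the registered models
print(requests.get("http://localhost:8080/v1/models").json())

# Request embeddings from the BGE-M3 service
payload = {"model": "bge_embed", "input": ["hello world", "isa_model"], "normalize": True}
resp = requests.post("http://localhost:8080/v1/embeddings", json=payload)
resp.raise_for_status()
print(len(resp.json()["data"]))  # one embedding per input string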