fount-vlm-nell-02 0.3.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (258)
  1. fount_vlm_nell_02-0.3.11.dist-info/METADATA +418 -0
  2. fount_vlm_nell_02-0.3.11.dist-info/RECORD +258 -0
  3. fount_vlm_nell_02-0.3.11.dist-info/WHEEL +5 -0
  4. fount_vlm_nell_02-0.3.11.dist-info/entry_points.txt +5 -0
  5. fount_vlm_nell_02-0.3.11.dist-info/licenses/LICENSE +21 -0
  6. fount_vlm_nell_02-0.3.11.dist-info/top_level.txt +1 -0
  7. mlx_vlm/__init__.py +16 -0
  8. mlx_vlm/__main__.py +24 -0
  9. mlx_vlm/chat.py +234 -0
  10. mlx_vlm/chat_ui.py +508 -0
  11. mlx_vlm/convert.py +284 -0
  12. mlx_vlm/deprecation.py +52 -0
  13. mlx_vlm/evals/__init__.py +0 -0
  14. mlx_vlm/evals/math_vista.py +565 -0
  15. mlx_vlm/evals/mmmu.py +528 -0
  16. mlx_vlm/evals/mmstar.py +343 -0
  17. mlx_vlm/evals/ocrbench.py +453 -0
  18. mlx_vlm/evals/utils.py +37 -0
  19. mlx_vlm/generate.py +1457 -0
  20. mlx_vlm/lora.py +207 -0
  21. mlx_vlm/models/__init__.py +0 -0
  22. mlx_vlm/models/aya_vision/__init__.py +2 -0
  23. mlx_vlm/models/aya_vision/aya_vision.py +188 -0
  24. mlx_vlm/models/aya_vision/config.py +52 -0
  25. mlx_vlm/models/aya_vision/language.py +202 -0
  26. mlx_vlm/models/aya_vision/vision.py +340 -0
  27. mlx_vlm/models/base.py +356 -0
  28. mlx_vlm/models/cache.py +238 -0
  29. mlx_vlm/models/deepseek_vl_v2/__init__.py +2 -0
  30. mlx_vlm/models/deepseek_vl_v2/config.py +159 -0
  31. mlx_vlm/models/deepseek_vl_v2/conversation.py +264 -0
  32. mlx_vlm/models/deepseek_vl_v2/deepseek_vl_v2.py +418 -0
  33. mlx_vlm/models/deepseek_vl_v2/language.py +539 -0
  34. mlx_vlm/models/deepseek_vl_v2/processing_deepsek_vl_v2.py +536 -0
  35. mlx_vlm/models/deepseek_vl_v2/vision.py +322 -0
  36. mlx_vlm/models/deepseekocr/__init__.py +2 -0
  37. mlx_vlm/models/deepseekocr/config.py +173 -0
  38. mlx_vlm/models/deepseekocr/conversation.py +264 -0
  39. mlx_vlm/models/deepseekocr/deepseekocr.py +371 -0
  40. mlx_vlm/models/deepseekocr/language.py +547 -0
  41. mlx_vlm/models/deepseekocr/processing_deepseekocr.py +655 -0
  42. mlx_vlm/models/deepseekocr/sam.py +489 -0
  43. mlx_vlm/models/deepseekocr/vision.py +263 -0
  44. mlx_vlm/models/deepseekocr_2/__init__.py +12 -0
  45. mlx_vlm/models/deepseekocr_2/config.py +216 -0
  46. mlx_vlm/models/deepseekocr_2/deepseekocr_2.py +297 -0
  47. mlx_vlm/models/deepseekocr_2/processing_deepseekocr.py +624 -0
  48. mlx_vlm/models/deepseekocr_2/vision.py +439 -0
  49. mlx_vlm/models/ernie4_5_moe_vl/__init__.py +5 -0
  50. mlx_vlm/models/ernie4_5_moe_vl/config.py +139 -0
  51. mlx_vlm/models/ernie4_5_moe_vl/ernie4_5_moe_vl.py +337 -0
  52. mlx_vlm/models/ernie4_5_moe_vl/language.py +770 -0
  53. mlx_vlm/models/ernie4_5_moe_vl/processor.py +686 -0
  54. mlx_vlm/models/ernie4_5_moe_vl/vision.py +322 -0
  55. mlx_vlm/models/fastvlm/__init__.py +2 -0
  56. mlx_vlm/models/fastvlm/config.py +79 -0
  57. mlx_vlm/models/fastvlm/fastvlm.py +198 -0
  58. mlx_vlm/models/fastvlm/language.py +49 -0
  59. mlx_vlm/models/fastvlm/vision.py +692 -0
  60. mlx_vlm/models/florence2/__init__.py +2 -0
  61. mlx_vlm/models/florence2/config.py +84 -0
  62. mlx_vlm/models/florence2/florence2.py +383 -0
  63. mlx_vlm/models/florence2/language.py +452 -0
  64. mlx_vlm/models/florence2/processing_florence2.py +30 -0
  65. mlx_vlm/models/florence2/vision.py +552 -0
  66. mlx_vlm/models/gemma3/__init__.py +2 -0
  67. mlx_vlm/models/gemma3/config.py +52 -0
  68. mlx_vlm/models/gemma3/gemma3.py +194 -0
  69. mlx_vlm/models/gemma3/language.py +293 -0
  70. mlx_vlm/models/gemma3/vision.py +215 -0
  71. mlx_vlm/models/gemma3n/__init__.py +2 -0
  72. mlx_vlm/models/gemma3n/audio.py +1038 -0
  73. mlx_vlm/models/gemma3n/config.py +130 -0
  74. mlx_vlm/models/gemma3n/gemma3n.py +322 -0
  75. mlx_vlm/models/gemma3n/language.py +631 -0
  76. mlx_vlm/models/gemma3n/vision.py +994 -0
  77. mlx_vlm/models/glm4v/__init__.py +3 -0
  78. mlx_vlm/models/glm4v/config.py +79 -0
  79. mlx_vlm/models/glm4v/glm4v.py +188 -0
  80. mlx_vlm/models/glm4v/language.py +574 -0
  81. mlx_vlm/models/glm4v/processing.py +220 -0
  82. mlx_vlm/models/glm4v/vision.py +406 -0
  83. mlx_vlm/models/glm4v_moe/__init__.py +3 -0
  84. mlx_vlm/models/glm4v_moe/config.py +81 -0
  85. mlx_vlm/models/glm4v_moe/glm4v_moe.py +176 -0
  86. mlx_vlm/models/glm4v_moe/language.py +674 -0
  87. mlx_vlm/models/glm4v_moe/processing.py +229 -0
  88. mlx_vlm/models/glm4v_moe/vision.py +405 -0
  89. mlx_vlm/models/glm_ocr/__init__.py +3 -0
  90. mlx_vlm/models/glm_ocr/config.py +93 -0
  91. mlx_vlm/models/glm_ocr/glm_ocr.py +180 -0
  92. mlx_vlm/models/glm_ocr/language.py +585 -0
  93. mlx_vlm/models/glm_ocr/processing.py +208 -0
  94. mlx_vlm/models/glm_ocr/vision.py +342 -0
  95. mlx_vlm/models/hunyuan_vl/__init__.py +7 -0
  96. mlx_vlm/models/hunyuan_vl/config.py +136 -0
  97. mlx_vlm/models/hunyuan_vl/hunyuan_vl.py +181 -0
  98. mlx_vlm/models/hunyuan_vl/language.py +509 -0
  99. mlx_vlm/models/hunyuan_vl/processing_hunyuan_vl.py +607 -0
  100. mlx_vlm/models/hunyuan_vl/vision.py +322 -0
  101. mlx_vlm/models/idefics2/__init__.py +2 -0
  102. mlx_vlm/models/idefics2/config.py +65 -0
  103. mlx_vlm/models/idefics2/idefics2.py +321 -0
  104. mlx_vlm/models/idefics2/language.py +161 -0
  105. mlx_vlm/models/idefics2/vision.py +244 -0
  106. mlx_vlm/models/idefics3/__init__.py +4 -0
  107. mlx_vlm/models/idefics3/config.py +54 -0
  108. mlx_vlm/models/idefics3/idefics3.py +221 -0
  109. mlx_vlm/models/idefics3/language.py +157 -0
  110. mlx_vlm/models/idefics3/vision.py +265 -0
  111. mlx_vlm/models/internvl_chat/__init__.py +3 -0
  112. mlx_vlm/models/internvl_chat/config.py +89 -0
  113. mlx_vlm/models/internvl_chat/internvl_chat.py +115 -0
  114. mlx_vlm/models/internvl_chat/language.py +187 -0
  115. mlx_vlm/models/internvl_chat/processor.py +395 -0
  116. mlx_vlm/models/internvl_chat/vision.py +265 -0
  117. mlx_vlm/models/interpolate.py +183 -0
  118. mlx_vlm/models/jina_vlm/__init__.py +3 -0
  119. mlx_vlm/models/jina_vlm/config.py +142 -0
  120. mlx_vlm/models/jina_vlm/image_processor.py +430 -0
  121. mlx_vlm/models/jina_vlm/jina_vlm.py +280 -0
  122. mlx_vlm/models/jina_vlm/language.py +272 -0
  123. mlx_vlm/models/jina_vlm/processing_jinavlm.py +266 -0
  124. mlx_vlm/models/jina_vlm/vision.py +202 -0
  125. mlx_vlm/models/kernels.py +447 -0
  126. mlx_vlm/models/kimi_vl/__init__.py +4 -0
  127. mlx_vlm/models/kimi_vl/config.py +84 -0
  128. mlx_vlm/models/kimi_vl/kimi_vl.py +127 -0
  129. mlx_vlm/models/kimi_vl/language.py +460 -0
  130. mlx_vlm/models/kimi_vl/processing_kimi_vl.py +560 -0
  131. mlx_vlm/models/kimi_vl/vision.py +485 -0
  132. mlx_vlm/models/lfm2_vl/__init__.py +2 -0
  133. mlx_vlm/models/lfm2_vl/config.py +94 -0
  134. mlx_vlm/models/lfm2_vl/language.py +49 -0
  135. mlx_vlm/models/lfm2_vl/lfm2_vl.py +223 -0
  136. mlx_vlm/models/lfm2_vl/processing_lfm2_vl.py +320 -0
  137. mlx_vlm/models/lfm2_vl/vision.py +223 -0
  138. mlx_vlm/models/llama4/__init__.py +2 -0
  139. mlx_vlm/models/llama4/config.py +83 -0
  140. mlx_vlm/models/llama4/language.py +334 -0
  141. mlx_vlm/models/llama4/llama4.py +146 -0
  142. mlx_vlm/models/llama4/vision.py +526 -0
  143. mlx_vlm/models/llava/__init__.py +2 -0
  144. mlx_vlm/models/llava/config.py +61 -0
  145. mlx_vlm/models/llava/language.py +200 -0
  146. mlx_vlm/models/llava/llava.py +132 -0
  147. mlx_vlm/models/llava/vision.py +233 -0
  148. mlx_vlm/models/llava_bunny/__init__.py +2 -0
  149. mlx_vlm/models/llava_bunny/config.py +85 -0
  150. mlx_vlm/models/llava_bunny/language.py +194 -0
  151. mlx_vlm/models/llava_bunny/llava_bunny.py +217 -0
  152. mlx_vlm/models/llava_bunny/vision.py +278 -0
  153. mlx_vlm/models/llava_next/__init__.py +2 -0
  154. mlx_vlm/models/llava_next/config.py +60 -0
  155. mlx_vlm/models/llava_next/language.py +192 -0
  156. mlx_vlm/models/llava_next/llava_next.py +138 -0
  157. mlx_vlm/models/llava_next/vision.py +217 -0
  158. mlx_vlm/models/mistral3/__init__.py +2 -0
  159. mlx_vlm/models/mistral3/config.py +59 -0
  160. mlx_vlm/models/mistral3/language.py +269 -0
  161. mlx_vlm/models/mistral3/mistral3.py +383 -0
  162. mlx_vlm/models/mllama/__init__.py +4 -0
  163. mlx_vlm/models/mllama/config.py +74 -0
  164. mlx_vlm/models/mllama/language.py +377 -0
  165. mlx_vlm/models/mllama/mllama.py +210 -0
  166. mlx_vlm/models/mllama/vision.py +458 -0
  167. mlx_vlm/models/molmo/__init__.py +5 -0
  168. mlx_vlm/models/molmo/config.py +93 -0
  169. mlx_vlm/models/molmo/language.py +208 -0
  170. mlx_vlm/models/molmo/molmo.py +108 -0
  171. mlx_vlm/models/molmo/processing_molmo.py +763 -0
  172. mlx_vlm/models/molmo/vision.py +408 -0
  173. mlx_vlm/models/molmo2/__init__.py +6 -0
  174. mlx_vlm/models/molmo2/config.py +137 -0
  175. mlx_vlm/models/molmo2/language.py +206 -0
  176. mlx_vlm/models/molmo2/molmo2.py +330 -0
  177. mlx_vlm/models/molmo2/processing.py +773 -0
  178. mlx_vlm/models/molmo2/vision.py +286 -0
  179. mlx_vlm/models/moondream2/__init__.py +11 -0
  180. mlx_vlm/models/moondream2/config.py +92 -0
  181. mlx_vlm/models/moondream2/image_crops.py +269 -0
  182. mlx_vlm/models/moondream2/language.py +267 -0
  183. mlx_vlm/models/moondream2/moondream2.py +522 -0
  184. mlx_vlm/models/moondream2/processing_moondream.py +144 -0
  185. mlx_vlm/models/moondream2/vision.py +200 -0
  186. mlx_vlm/models/multi_modality/__init__.py +4 -0
  187. mlx_vlm/models/multi_modality/config.py +108 -0
  188. mlx_vlm/models/multi_modality/language.py +191 -0
  189. mlx_vlm/models/multi_modality/multi_modality.py +338 -0
  190. mlx_vlm/models/multi_modality/sam.py +543 -0
  191. mlx_vlm/models/multi_modality/vision.py +450 -0
  192. mlx_vlm/models/paddleocr_vl/__init__.py +3 -0
  193. mlx_vlm/models/paddleocr_vl/config.py +93 -0
  194. mlx_vlm/models/paddleocr_vl/language.py +522 -0
  195. mlx_vlm/models/paddleocr_vl/paddleocr_vl.py +207 -0
  196. mlx_vlm/models/paddleocr_vl/processing_paddleocr_vl.py +425 -0
  197. mlx_vlm/models/paddleocr_vl/vision.py +358 -0
  198. mlx_vlm/models/paligemma/__init__.py +4 -0
  199. mlx_vlm/models/paligemma/config.py +50 -0
  200. mlx_vlm/models/paligemma/language.py +253 -0
  201. mlx_vlm/models/paligemma/paligemma.py +140 -0
  202. mlx_vlm/models/paligemma/vision.py +218 -0
  203. mlx_vlm/models/phi3_v/__init__.py +5 -0
  204. mlx_vlm/models/phi3_v/config.py +55 -0
  205. mlx_vlm/models/phi3_v/language.py +2 -0
  206. mlx_vlm/models/phi3_v/phi3_v.py +239 -0
  207. mlx_vlm/models/phi3_v/processing_phi3_v.py +704 -0
  208. mlx_vlm/models/phi3_v/vision.py +294 -0
  209. mlx_vlm/models/pixtral/__init__.py +4 -0
  210. mlx_vlm/models/pixtral/config.py +69 -0
  211. mlx_vlm/models/pixtral/language.py +195 -0
  212. mlx_vlm/models/pixtral/pixtral.py +208 -0
  213. mlx_vlm/models/pixtral/vision.py +293 -0
  214. mlx_vlm/models/qwen2_5_vl/__init__.py +2 -0
  215. mlx_vlm/models/qwen2_5_vl/config.py +90 -0
  216. mlx_vlm/models/qwen2_5_vl/language.py +541 -0
  217. mlx_vlm/models/qwen2_5_vl/qwen2_5_vl.py +184 -0
  218. mlx_vlm/models/qwen2_5_vl/vision.py +414 -0
  219. mlx_vlm/models/qwen2_vl/__init__.py +2 -0
  220. mlx_vlm/models/qwen2_vl/config.py +86 -0
  221. mlx_vlm/models/qwen2_vl/language.py +539 -0
  222. mlx_vlm/models/qwen2_vl/qwen2_vl.py +180 -0
  223. mlx_vlm/models/qwen2_vl/vision.py +308 -0
  224. mlx_vlm/models/qwen3_omni_moe/__init__.py +29 -0
  225. mlx_vlm/models/qwen3_omni_moe/audio.py +317 -0
  226. mlx_vlm/models/qwen3_omni_moe/code2wav.py +542 -0
  227. mlx_vlm/models/qwen3_omni_moe/config.py +264 -0
  228. mlx_vlm/models/qwen3_omni_moe/language.py +622 -0
  229. mlx_vlm/models/qwen3_omni_moe/omni_utils.py +69 -0
  230. mlx_vlm/models/qwen3_omni_moe/qwen3_omni_moe.py +706 -0
  231. mlx_vlm/models/qwen3_omni_moe/talker.py +873 -0
  232. mlx_vlm/models/qwen3_omni_moe/thinker.py +366 -0
  233. mlx_vlm/models/qwen3_omni_moe/vision.py +419 -0
  234. mlx_vlm/models/qwen3_vl/__init__.py +2 -0
  235. mlx_vlm/models/qwen3_vl/config.py +103 -0
  236. mlx_vlm/models/qwen3_vl/language.py +596 -0
  237. mlx_vlm/models/qwen3_vl/qwen3_vl.py +166 -0
  238. mlx_vlm/models/qwen3_vl/vision.py +441 -0
  239. mlx_vlm/models/qwen3_vl_moe/__init__.py +2 -0
  240. mlx_vlm/models/qwen3_vl_moe/config.py +108 -0
  241. mlx_vlm/models/qwen3_vl_moe/language.py +656 -0
  242. mlx_vlm/models/qwen3_vl_moe/qwen3_vl_moe.py +184 -0
  243. mlx_vlm/models/qwen3_vl_moe/vision.py +442 -0
  244. mlx_vlm/models/smolvlm/__init__.py +4 -0
  245. mlx_vlm/models/smolvlm/config.py +59 -0
  246. mlx_vlm/models/smolvlm/smolvlm.py +60 -0
  247. mlx_vlm/prompt_utils.py +565 -0
  248. mlx_vlm/sample_utils.py +39 -0
  249. mlx_vlm/server.py +1107 -0
  250. mlx_vlm/smolvlm_video_generate.py +109 -0
  251. mlx_vlm/tokenizer_utils.py +371 -0
  252. mlx_vlm/trainer/__init__.py +9 -0
  253. mlx_vlm/trainer/lora.py +70 -0
  254. mlx_vlm/trainer/trainer.py +299 -0
  255. mlx_vlm/trainer/utils.py +160 -0
  256. mlx_vlm/utils.py +1339 -0
  257. mlx_vlm/version.py +1 -0
  258. mlx_vlm/video_generate.py +611 -0
mlx_vlm/server.py ADDED
@@ -0,0 +1,1107 @@
1
+ import argparse
2
+ import asyncio
3
+ import gc
4
+ import json
5
+ import os
6
+ import traceback
7
+ import uuid
8
+ from datetime import datetime
9
+ from typing import Any, List, Literal, Optional, Tuple, Union
10
+
11
+ import mlx.core as mx
12
+ import uvicorn
13
+ from fastapi import FastAPI, HTTPException, Request
14
+ from fastapi.responses import StreamingResponse
15
+ from huggingface_hub import scan_cache_dir
16
+ from pydantic import BaseModel, ConfigDict, Field
17
+ from typing_extensions import Required, TypeAlias, TypedDict
18
+
19
+ from .generate import (
20
+ DEFAULT_MAX_TOKENS,
21
+ DEFAULT_MODEL_PATH,
22
+ DEFAULT_SEED,
23
+ DEFAULT_TEMPERATURE,
24
+ DEFAULT_TOP_P,
25
+ generate,
26
+ stream_generate,
27
+ )
28
+ from .prompt_utils import apply_chat_template
29
+ from .utils import load
30
+ from .version import __version__
31
+
32
+ app = FastAPI(
33
+ title="MLX-VLM Inference API",
34
+ description="API for using Vision Language Models (VLMs) and Omni Models (Vision, Audio and Video support) with MLX.",
35
+ version=__version__,
36
+ )
37
+
38
+ MAX_IMAGES = 10 # Maximum number of images to process at once
39
+
40
+ # Loading/unloading utilities
41
+
42
+ model_cache = {}
43
+
44
+
45
+ class FlexibleBaseModel(BaseModel):
46
+ """Base model that ignores/accepts any unknown OpenAI SDK fields."""
47
+
48
+ model_config = ConfigDict(extra="allow")
49
+
50
+
51
+ def load_model_resources(model_path: str, adapter_path: Optional[str]):
52
+ """
53
+ Loads model, processor, and config based on paths.
54
+ Handles potential loading errors.
55
+ """
56
+ try:
57
+ print(f"Loading model from: {model_path}")
58
+ if adapter_path:
59
+ print(f"Loading adapter from: {adapter_path}")
60
+ # Use the load function from utils.py which handles path resolution and loading
61
+ trust_remote_code = (
62
+ os.environ.get("MLX_TRUST_REMOTE_CODE", "false").lower() == "true"
63
+ )
64
+ model, processor = load(
65
+ model_path, adapter_path, trust_remote_code=trust_remote_code
66
+ )
67
+ config = model.config
68
+ print("Model and processor loaded successfully.")
69
+ return model, processor, config
70
+ except Exception as e:
71
+ print(f"Error loading model {model_path}: {e}")
72
+ traceback.print_exc() # Print detailed traceback for debugging
73
+ raise HTTPException(status_code=500, detail=f"Failed to load model: {e}")
74
+
75
+
76
+ def get_cached_model(model_path: str, adapter_path: Optional[str] = None):
77
+ """
78
+ Return the model, processor, and config from the cache, loading them first if necessary.
79
+ """
80
+ global model_cache
81
+
82
+ cache_key = (model_path, adapter_path)
83
+
84
+ # Return from cache if already loaded and matches the requested paths
85
+ if model_cache.get("cache_key") == cache_key:
86
+ print(f"Using cached model: {model_path}, Adapter: {adapter_path}")
87
+ return model_cache["model"], model_cache["processor"], model_cache["config"]
88
+
89
+ # If cache exists but doesn't match, clear it
90
+ if model_cache:
91
+ print("New model request, clearing existing cache...")
92
+ unload_model_sync() # Use a synchronous version for internal call
93
+
94
+ # Load the model resources
95
+ model, processor, config = load_model_resources(model_path, adapter_path)
96
+
97
+ model_cache = {
98
+ "cache_key": cache_key,
99
+ "model_path": model_path,
100
+ "adapter_path": adapter_path,
101
+ "model": model,
102
+ "processor": processor,
103
+ "config": config,
104
+ }
105
+
106
+ return model, processor, config
107
+
108
+
109
+ # Synchronous unload function for internal use
110
+ def unload_model_sync():
111
+ global model_cache
112
+ if not model_cache:
113
+ return False
114
+
115
+ print(
116
+ f"Unloading model: {model_cache.get('model_path')}, Adapter: {model_cache.get('adapter_path')}"
117
+ )
118
+ # Clear references
119
+ model_cache = {}
120
+ # Force garbage collection
121
+ gc.collect()
122
+ mx.clear_cache()
123
+ print("Model unloaded and cache cleared.")
124
+ return True
125
+
126
+
127
+ # OpenAI API Models
128
+
129
+ # Models for /responses endpoint
130
+
131
+
132
+ class ResponseInputTextParam(TypedDict, total=False):
133
+ text: Required[str]
134
+ type: Required[
135
+ Literal["input_text", "text"]
136
+ ] # The type of the input item. Always `input_text`.
137
+
138
+
139
+ class ResponseInputImageParam(TypedDict, total=False):
140
+ detail: Literal["high", "low", "auto"] = Field(
141
+ "auto", description="The detail level of the image to be sent to the model."
142
+ )
143
+ """The detail level of the image to be sent to the model.
144
+
145
+ One of `high`, `low`, or `auto`. Defaults to `auto`.
146
+ """
147
+ type: Required[
148
+ Literal["input_image"]
149
+ ] # The type of the input item. Always `input_image`.
150
+ image_url: Required[str]
151
+ file_id: Optional[str]
152
+ """The ID of the file to be sent to the model.
153
+ NOTE: would it help if the file_id were also passed through to the VLM?
154
+ """
155
+
156
+
157
+ class InputAudio(TypedDict, total=False):
158
+ data: Required[str]
159
+ format: Required[str]
160
+
161
+
162
+ class ResponseInputAudioParam(TypedDict, total=False):
163
+ type: Required[
164
+ Literal["input_audio"]
165
+ ] # The type of the input item. Always `input_audio`.
166
+ input_audio: Required[InputAudio]
167
+
168
+
169
+ class ImageUrl(TypedDict, total=False):
170
+ url: Required[str]
171
+
172
+
173
+ class ResponseImageUrlParam(TypedDict, total=False):
174
+ type: Required[
175
+ Literal["image_url"]
176
+ ] # The type of the input item. Always `image_url`.
177
+ image_url: Required[ImageUrl]
178
+
179
+
180
+ ResponseInputContentParam: TypeAlias = Union[
181
+ ResponseInputTextParam,
182
+ ResponseInputImageParam,
183
+ ResponseImageUrlParam,
184
+ ResponseInputAudioParam,
185
+ ]
186
+
187
+ ResponseInputMessageContentListParam: TypeAlias = List[ResponseInputContentParam]
188
+
189
+
190
+ class ResponseOutputText(TypedDict, total=False):
191
+ text: Required[str]
192
+ type: Required[
193
+ Literal["output_text"]
194
+ ] # The type of the output item. Always `output_text`
195
+
196
+
197
+ ResponseOutputMessageContentList: TypeAlias = List[ResponseOutputText]
198
+
199
+
200
+ class ChatMessage(FlexibleBaseModel):
201
+ role: Literal["user", "assistant", "system", "developer"] = Field(
202
+ ...,
203
+ description="Role of the message sender (e.g., 'system', 'user', 'assistant').",
204
+ )
205
+ content: Union[
206
+ str, ResponseInputMessageContentListParam, ResponseOutputMessageContentList
207
+ ] = Field(..., description="Content of the message.")
208
+
209
+
210
+ class OpenAIRequest(FlexibleBaseModel):
211
+ """
212
+ OpenAI-compatible request structure.
213
+ Using this structure: https://github.com/openai/openai-python/blob/main/src/openai/resources/responses/responses.py
214
+ """
215
+
216
+ input: Union[str, List[ChatMessage]] = Field(
217
+ ..., description="Input text or list of chat messages."
218
+ )
219
+ model: str = Field(..., description="The model to use for generation.")
220
+ max_output_tokens: int = Field(
221
+ DEFAULT_MAX_TOKENS, description="Maximum number of tokens to generate."
222
+ )
223
+ temperature: float = Field(
224
+ DEFAULT_TEMPERATURE, description="Temperature for sampling."
225
+ )
226
+ top_p: float = Field(DEFAULT_TOP_P, description="Top-p sampling.")
227
+ stream: bool = Field(
228
+ False, description="Whether to stream the response chunk by chunk."
229
+ )
230
+
231
+
232
+ class OpenAIUsage(BaseModel):
233
+ """Token usage details including input tokens, output tokens, breakdown, and total tokens used."""
234
+
235
+ input_tokens: int
236
+ output_tokens: int
237
+ total_tokens: int
238
+
239
+
240
+ class OpenAIErrorObject(BaseModel):
241
+ """Error object returned when the model fails to generate a Response."""
242
+
243
+ code: Optional[str] = None
244
+ message: Optional[str] = None
245
+ param: Optional[str] = None
246
+ type: Optional[str] = None
247
+
248
+
249
+ class OpenAIResponse(BaseModel):
250
+ id: str = Field(..., description="Unique identifier for this Response")
251
+ object: Literal["response"] = Field(
252
+ ..., description="The object type of this resource - always set to response"
253
+ )
254
+ created_at: int = Field(
255
+ ..., description="Unix timestamp (in seconds) of when this Response was created"
256
+ )
257
+ status: Literal["completed", "failed", "in_progress", "incomplete"] = Field(
258
+ ..., description="The status of the response generation"
259
+ )
260
+ error: Optional[OpenAIErrorObject] = Field(
261
+ None,
262
+ description="An error object returned when the model fails to generate a Response",
263
+ )
264
+ instructions: Optional[str] = Field(
265
+ None,
266
+ description="Inserts a system (or developer) message as the first item in the model's context",
267
+ )
268
+ max_output_tokens: Optional[int] = Field(
269
+ None,
270
+ description="An upper bound for the number of tokens that can be generated for a response",
271
+ )
272
+ model: str = Field(..., description="Model ID used to generate the response")
273
+ output: List[Union[ChatMessage, Any]] = Field(
274
+ ..., description="An array of content items generated by the model"
275
+ )
276
+ output_text: Optional[str] = Field(
277
+ None,
278
+ description="SDK-only convenience property containing aggregated text output",
279
+ )
280
+ temperature: Optional[float] = Field(
281
+ None, ge=0, le=2, description="Sampling temperature between 0 and 2"
282
+ )
283
+ top_p: Optional[float] = Field(
284
+ None, ge=0, le=1, description="Nucleus sampling probability mass"
285
+ )
286
+ truncation: Union[Literal["auto", "disabled"], str] = Field(
287
+ "disabled", description="The truncation strategy to use"
288
+ )
289
+ usage: OpenAIUsage = Field(
290
+ ..., description="Token usage details"
291
+ ) # we need the model to return stats
292
+ user: Optional[str] = Field(
293
+ None, description="A unique identifier representing your end-user"
294
+ )
295
+
296
+
297
+ class BaseStreamEvent(BaseModel):
298
+ type: str
299
+
300
+
301
+ class ContentPartOutputText(BaseModel):
302
+ type: Literal["output_text"]
303
+ text: str
304
+ annotations: List[str] = []
305
+
306
+
307
+ class MessageItem(BaseModel):
308
+ id: str
309
+ type: Literal["message"]
310
+ status: Literal["in_progress", "completed"]
311
+ role: str
312
+ content: List[ContentPartOutputText] = []
313
+
314
+
315
+ class ResponseCreatedEvent(BaseStreamEvent):
316
+ type: Literal["response.created"]
317
+ response: OpenAIResponse
318
+
319
+
320
+ class ResponseInProgressEvent(BaseStreamEvent):
321
+ type: Literal["response.in_progress"]
322
+ response: OpenAIResponse
323
+
324
+
325
+ class ResponseOutputItemAddedEvent(BaseStreamEvent):
326
+ type: Literal["response.output_item.added"]
327
+ output_index: int
328
+ item: MessageItem
329
+
330
+
331
+ class ResponseContentPartAddedEvent(BaseStreamEvent):
332
+ type: Literal["response.content_part.added"]
333
+ item_id: str
334
+ output_index: int
335
+ content_index: int
336
+ part: ContentPartOutputText
337
+
338
+
339
+ class ResponseOutputTextDeltaEvent(BaseStreamEvent):
340
+ type: Literal["response.output_text.delta"]
341
+ item_id: str
342
+ output_index: int
343
+ content_index: int
344
+ delta: str
345
+
346
+
347
+ class ResponseOutputTextDoneEvent(BaseStreamEvent):
348
+ type: Literal["response.output_text.done"]
349
+ item_id: str
350
+ output_index: int
351
+ content_index: int
352
+ text: str
353
+
354
+
355
+ class ResponseContentPartDoneEvent(BaseStreamEvent):
356
+ type: Literal["response.content_part.done"]
357
+ item_id: str
358
+ output_index: int
359
+ content_index: int
360
+ part: ContentPartOutputText
361
+
362
+
363
+ class ResponseOutputItemDoneEvent(BaseStreamEvent):
364
+ type: Literal["response.output_item.done"]
365
+ output_index: int
366
+ item: MessageItem
367
+
368
+
369
+ class ResponseCompletedEvent(BaseStreamEvent):
370
+ type: Literal["response.completed"]
371
+ response: OpenAIResponse
372
+
373
+
374
+ StreamEvent = Union[
375
+ ResponseCreatedEvent,
376
+ ResponseInProgressEvent,
377
+ ResponseOutputItemAddedEvent,
378
+ ResponseContentPartAddedEvent,
379
+ ResponseOutputTextDeltaEvent,
380
+ ResponseOutputTextDoneEvent,
381
+ ResponseContentPartDoneEvent,
382
+ ResponseOutputItemDoneEvent,
383
+ ResponseCompletedEvent,
384
+ ]
385
+
386
+ # Models for /chat/completions endpoint
387
+
388
+
389
+ class VLMRequest(FlexibleBaseModel):
390
+ model: str = Field(
391
+ DEFAULT_MODEL_PATH,
392
+ description="The path to the local model directory or Hugging Face repo.",
393
+ )
394
+ adapter_path: Optional[str] = Field(
395
+ None, description="The path to the adapter weights."
396
+ )
397
+ max_tokens: int = Field(
398
+ DEFAULT_MAX_TOKENS, description="Maximum number of tokens to generate."
399
+ )
400
+ temperature: float = Field(
401
+ DEFAULT_TEMPERATURE, description="Temperature for sampling."
402
+ )
403
+ top_p: float = Field(DEFAULT_TOP_P, description="Top-p sampling.")
404
+ seed: int = Field(DEFAULT_SEED, description="Seed for random generation.")
405
+ resize_shape: Optional[Tuple[int, int]] = Field(
406
+ None,
407
+ description="Resize shape for the image (height, width). Provide two integers.",
408
+ )
409
+
410
+
411
+ class GenerationRequest(VLMRequest):
412
+ """
413
+ Inherits from VLMRequest and adds the streaming flag for the generation request.
414
+ """
415
+
416
+ stream: bool = Field(
417
+ False, description="Whether to stream the response chunk by chunk."
418
+ )
419
+
420
+
421
+ class UsageStats(OpenAIUsage):
422
+ """
423
+ Inherits from OpenAIUsage and adds throughput and memory statistics.
424
+ """
425
+
426
+ prompt_tps: float = Field(..., description="Tokens per second for the prompt.")
427
+ generation_tps: float = Field(
428
+ ..., description="Tokens per second for the generation."
429
+ )
430
+ peak_memory: float = Field(
431
+ ..., description="Peak memory usage during the generation."
432
+ )
433
+
434
+
435
+ class ChatRequest(GenerationRequest):
436
+ messages: List[ChatMessage]
437
+
438
+
439
+ class ChatChoice(BaseModel):
440
+ finish_reason: str
441
+ message: ChatMessage
442
+
443
+
444
+ class ChatResponse(BaseModel):
445
+ model: str
446
+ choices: List[ChatChoice]
447
+ usage: Optional[UsageStats]
448
+
449
+
450
+ class ChatStreamChoice(BaseModel):
451
+ finish_reason: Optional[str] = None
452
+ delta: ChatMessage
453
+
454
+
455
+ class ChatStreamChunk(BaseModel):
456
+ model: str
457
+ choices: List[ChatStreamChoice]
458
+ usage: Optional[UsageStats]
459
+
460
+
461
+ # Models for /models endpoint
462
+
463
+
464
+ class ModelInfo(BaseModel):
465
+ id: str
466
+ object: str
467
+ created: int
468
+
469
+
470
+ class ModelsResponse(BaseModel):
471
+ object: Literal["list"]
472
+ data: List[ModelInfo]
473
+
474
+
475
+ # OpenAI-compatible endpoints
476
+
477
+
478
+ @app.post("/responses")
479
+ async def responses_endpoint(request: Request):
480
+ """
481
+ OpenAI-compatible endpoint for generating text based on a prompt and optional images.
482
+
483
+ Uses the client.responses.create method.
484
+
485
+ example:
486
+
487
+ from openai import OpenAI
488
+
489
+ API_URL = "http://0.0.0.0:8000"
490
+ API_KEY = 'any'
491
+
492
+ def run_openai(prompt, img_url,system, stream=False, max_output_tokens=512, model="mlx-community/Qwen2.5-VL-3B-Instruct-8bit"):
493
+ ''' Calls the OpenAI API
494
+ '''
495
+
496
+ client = OpenAI(base_url=f"{API_URL}", api_key=API_KEY)
497
+
498
+ try:
499
+ response = client.responses.create(
500
+ model=model,
501
+ input=[
502
+ {"role":"system",
503
+ "content": f"{system}"
504
+ },
505
+ {
506
+ "role": "user",
507
+ "content": [
508
+ {"type": "input_text", "text": prompt},
509
+ {"type": "input_image", "image_url": f"{img_url}"},
510
+ ],
511
+ }
512
+ ],
513
+ max_output_tokens=max_output_tokens,
514
+ stream=stream
515
+ )
516
+ if not stream:
517
+ print(response.output[0].content[0].text)
518
+ print(response.usage)
519
+ else:
520
+ for event in response:
521
+ # Process different event types if needed
522
+ if hasattr(event, 'delta') and event.delta:
523
+ print(event.delta, end="", flush=True)
524
+ elif event.type == 'response.completed':
525
+ print("\n--- Usage ---")
526
+ print(event.response.usage)
527
+
528
+ except Exception as e:
529
+ # building a response object to match the one returned when request is successful so that it can be processed in the same way
530
+ return {"model - error":str(e),"content":{}, "model":model}
531
+
532
+ """
533
+
534
+ body = await request.json()
535
+ openai_request = OpenAIRequest(**body)
536
+
537
+ try:
538
+ # Get model, processor, config - loading if necessary
539
+ model, processor, config = get_cached_model(openai_request.model)
540
+
541
+ kwargs = {}
542
+
543
+ chat_messages = []
544
+ images = []
545
+ instructions = None
546
+ if openai_request.input:
547
+ if isinstance(openai_request.input, str):
548
+ # If input is a string, treat it as a single text message
549
+ chat_messages.append({"role": "user", "content": openai_request.input})
550
+ elif isinstance(openai_request.input, list):
551
+ # If input is a list, treat it as a series of chat messages
552
+ for message in openai_request.input:
553
+ if isinstance(message, ChatMessage):
554
+ if isinstance(message.content, str):
555
+ chat_messages.append(
556
+ {"role": message.role, "content": message.content}
557
+ )
558
+ if message.role == "system":
559
+ instructions = message.content
560
+ elif isinstance(message.content, list):
561
+ # Handle list of content items
562
+ for item in message.content:
563
+ if isinstance(item, dict):
564
+ if item["type"] == "input_text":
565
+ chat_messages.append(
566
+ {
567
+ "role": message.role,
568
+ "content": item["text"],
569
+ }
570
+ )
571
+ if message.role == "system":
572
+ instructions = item["text"]
573
+ # examples for multiple images (https://platform.openai.com/docs/guides/images?api-mode=responses)
574
+ elif item["type"] == "input_image":
575
+ images.append(item["image_url"])
576
+ else:
577
+ print(
578
+ f"invalid input item type: {item['type']}"
579
+ )
580
+ raise HTTPException(
581
+ status_code=400,
582
+ detail="Invalid input item type.",
583
+ )
584
+ else:
585
+ print(
586
+ f"Invalid message content item format: {item}"
587
+ )
588
+ raise HTTPException(
589
+ status_code=400,
590
+ detail="Missing type in input item.",
591
+ )
592
+ else:
593
+ print("Invalid message content format.")
594
+ raise HTTPException(
595
+ status_code=400, detail="Invalid input format."
596
+ )
597
+ else:
598
+ print("not a ChatMessage")
599
+ raise HTTPException(
600
+ status_code=400, detail="Invalid input format."
601
+ )
602
+ else:
603
+ print("neither string not list")
604
+ raise HTTPException(status_code=400, detail="Invalid input format.")
605
+
606
+ else:
607
+ print("no input")
608
+ raise HTTPException(status_code=400, detail="Missing input.")
609
+
610
+ formatted_prompt = apply_chat_template(
611
+ processor, config, chat_messages, num_images=len(images)
612
+ )
613
+
614
+ generated_at = datetime.now().timestamp()
615
+ response_id = f"resp_{uuid.uuid4().hex}"
616
+ message_id = f"msg_{uuid.uuid4().hex}"
617
+
618
+ if openai_request.stream:
619
+ # Streaming response
620
+ async def stream_generator():
621
+ token_iterator = None
622
+ try:
623
+ # Create base response object (to match the openai pipeline)
624
+ base_response = OpenAIResponse(
625
+ id=response_id,
626
+ object="response",
627
+ created_at=int(generated_at),
628
+ status="in_progress",
629
+ instructions=instructions,
630
+ max_output_tokens=openai_request.max_output_tokens,
631
+ model=openai_request.model,
632
+ output=[],
633
+ output_text="",
634
+ temperature=openai_request.temperature,
635
+ top_p=openai_request.top_p,
636
+ usage={
637
+ "input_tokens": 0, # get prompt tokens
638
+ "output_tokens": 0,
639
+ "total_tokens": 0,
640
+ },
641
+ )
642
+
643
+ # Send response.created event (to match the openai pipeline)
644
+ yield f"event: response.created\ndata: {ResponseCreatedEvent(type='response.created', response=base_response).model_dump_json()}\n\n"
645
+
646
+ # Send response.in_progress event (to match the openai pipeline)
647
+ yield f"event: response.in_progress\ndata: {ResponseInProgressEvent(type='response.in_progress', response=base_response).model_dump_json()}\n\n"
648
+
649
+ # Send response.output_item.added event (to match the openai pipeline)
650
+ message_item = MessageItem(
651
+ id=message_id,
652
+ type="message",
653
+ status="in_progress",
654
+ role="assistant",
655
+ content=[],
656
+ )
657
+ yield f"event: response.output_item.added\ndata: {ResponseOutputItemAddedEvent(type='response.output_item.added', output_index=0, item=message_item).model_dump_json()}\n\n"
658
+
659
+ # Send response.content_part.added event
660
+ content_part = ContentPartOutputText(
661
+ type="output_text", text="", annotations=[]
662
+ )
663
+ yield f"event: response.content_part.added\ndata: {ResponseContentPartAddedEvent(type='response.content_part.added', item_id=message_id, output_index=0, content_index=0, part=content_part).model_dump_json()}\n\n"
664
+
665
+ # Stream text deltas
666
+ token_iterator = stream_generate(
667
+ model=model,
668
+ processor=processor,
669
+ prompt=formatted_prompt,
670
+ image=images,
671
+ temperature=openai_request.temperature,
672
+ max_tokens=openai_request.max_output_tokens,
673
+ top_p=openai_request.top_p,
674
+ **kwargs,
675
+ )
676
+
677
+ full_text = ""
678
+ for chunk in token_iterator:
679
+ if chunk is None or not hasattr(chunk, "text"):
680
+ continue
681
+
682
+ delta = chunk.text
683
+ full_text += delta
684
+
685
+ usage_stats = {
686
+ "input_tokens": chunk.prompt_tokens,
687
+ "output_tokens": chunk.generation_tokens,
688
+ }
689
+
690
+ # Send response.output_text.delta event
691
+ yield f"event: response.output_text.delta\ndata: {ResponseOutputTextDeltaEvent(type='response.output_text.delta', item_id=message_id, output_index=0, content_index=0, delta=delta).model_dump_json()}\n\n"
692
+ await asyncio.sleep(0.01)
693
+
694
+ # Send response.output_text.done event (to match the openai pipeline)
695
+ yield f"event: response.output_text.done\ndata: {ResponseOutputTextDoneEvent(type='response.output_text.done', item_id=message_id, output_index=0, content_index=0, text=full_text).model_dump_json()}\n\n"
696
+
697
+ # Send response.content_part.done event (to match the openai pipeline)
698
+ final_content_part = ContentPartOutputText(
699
+ type="output_text", text=full_text, annotations=[]
700
+ )
701
+ yield f"event: response.content_part.done\ndata: {ResponseContentPartDoneEvent(type='response.content_part.done', item_id=message_id, output_index=0, content_index=0, part=final_content_part).model_dump_json()}\n\n"
702
+
703
+ # Send response.output_item.done event (to match the openai pipeline)
704
+ final_message_item = MessageItem(
705
+ id=message_id,
706
+ type="message",
707
+ status="completed",
708
+ role="assistant",
709
+ content=[final_content_part],
710
+ )
711
+ yield f"event: response.output_item.done\ndata: {ResponseOutputItemDoneEvent(type='response.output_item.done', output_index=0, item=final_message_item).model_dump_json()}\n\n"
712
+
713
+ # Send response.completed event (to match the openai pipeline)
714
+ completed_response = base_response.model_copy(
715
+ update={
716
+ "status": "completed",
717
+ "output": [final_message_item],
718
+ "usage": {
719
+ "input_tokens": usage_stats["input_tokens"],
720
+ "output_tokens": usage_stats["output_tokens"],
721
+ "total_tokens": usage_stats["input_tokens"]
722
+ + usage_stats["output_tokens"],
723
+ },
724
+ }
725
+ )
726
+ yield f"event: response.completed\ndata: {ResponseCompletedEvent(type='response.completed', response=completed_response).model_dump_json()}\n\n"
727
+
728
+ except Exception as e:
729
+ print(f"Error during stream generation: {e}")
730
+ traceback.print_exc()
731
+ error_data = json.dumps({"error": str(e)})
732
+ yield f"data: {error_data}\n\n"
733
+
734
+ finally:
735
+ mx.clear_cache()
736
+ gc.collect()
737
+ print("Stream finished, cleared cache.")
738
+
739
+ return StreamingResponse(stream_generator(), media_type="text/event-stream")
740
+
741
+ else:
742
+ # Non-streaming response
743
+ try:
744
+ # Use generate from generate.py
745
+ result = generate(
746
+ model=model,
747
+ processor=processor,
748
+ prompt=formatted_prompt,
749
+ image=images,
750
+ temperature=openai_request.temperature,
751
+ max_tokens=openai_request.max_output_tokens,
752
+ top_p=openai_request.top_p,
753
+ verbose=False, # stats are passed in the response
754
+ **kwargs,
755
+ )
756
+ # Clean up resources
757
+ mx.clear_cache()
758
+ gc.collect()
759
+ print("Generation finished, cleared cache.")
760
+
761
+ response = OpenAIResponse(
762
+ id=response_id,
763
+ object="response",
764
+ created_at=int(generated_at),
765
+ status="completed",
766
+ instructions=instructions,
767
+ max_output_tokens=openai_request.max_output_tokens,
768
+ model=openai_request.model,
769
+ output=[
770
+ {
771
+ "role": "assistant",
772
+ "content": [
773
+ {
774
+ "type": "output_text",
775
+ "text": result.text,
776
+ }
777
+ ],
778
+ }
779
+ ],
780
+ output_text=result.text,
781
+ temperature=openai_request.temperature,
782
+ top_p=openai_request.top_p,
783
+ usage={
784
+ "input_tokens": result.prompt_tokens,
785
+ "output_tokens": result.generation_tokens,
786
+ "total_tokens": result.total_tokens,
787
+ },
788
+ )
789
+ return response
790
+
791
+ except Exception as e:
792
+ print(f"Error during generation: {e}")
793
+ traceback.print_exc()
794
+ mx.clear_cache()
795
+ gc.collect()
796
+ raise HTTPException(status_code=500, detail=f"Generation failed: {e}")
797
+
798
+ except HTTPException as http_exc:
799
+ # Re-raise HTTP exceptions (like model loading failure)
800
+ raise http_exc
801
+ except Exception as e:
802
+ # Catch unexpected errors
803
+ print(f"Unexpected error in /responses endpoint: {e}")
804
+ traceback.print_exc()
805
+ mx.clear_cache()
806
+ gc.collect()
807
+ raise HTTPException(
808
+ status_code=500, detail=f"An unexpected error occurred: {e}"
809
+ )
810
+
811
+
812
+ @app.post(
813
+ "/chat/completions", response_model=None
814
+ ) # Response model handled dynamically based on stream flag
815
+ async def chat_completions_endpoint(request: ChatRequest):
816
+ """
817
+ Generate text based on a prompt and optional images.
818
+ Prompt must be a list of chat messages, including system, user, and assistant messages.
819
+ A system message is applied only if it is already included in the messages.
820
+ Can operate in streaming or non-streaming mode.
821
+ """
822
+
823
+ try:
824
+ # Get model, processor, config - loading if necessary
825
+ model, processor, config = get_cached_model(request.model, request.adapter_path)
826
+
827
+ kwargs = {}
828
+
829
+ if request.resize_shape is not None:
830
+ if len(request.resize_shape) not in [1, 2]:
831
+ raise HTTPException(
832
+ status_code=400,
833
+ detail="resize_shape must contain exactly two integers (height, width)",
834
+ )
835
+ kwargs["resize_shape"] = (
836
+ (request.resize_shape[0],) * 2
837
+ if len(request.resize_shape) == 1
838
+ else tuple(request.resize_shape)
839
+ )
840
+
841
+ chat_messages = request.messages
842
+
843
+ images = []
844
+ audio = []
845
+ processed_messages = []
846
+ for message in request.messages:
847
+ if isinstance(message.content, str):
848
+ processed_messages.append(
849
+ {"role": message.role, "content": message.content}
850
+ )
851
+ elif isinstance(message.content, list):
852
+ text_content = ""
853
+ for item in message.content:
854
+ if isinstance(item, dict):
855
+ # Only extract images/audio from user messages
856
+ if message.role == "user":
857
+ if item["type"] == "input_image":
858
+ images.append(item["image_url"])
859
+ elif item["type"] == "image_url":
860
+ images.append(item["image_url"]["url"])
861
+ elif item["type"] == "input_audio":
862
+ audio.append(item["input_audio"]["data"])
863
+ if item["type"] in ("text", "input_text"):
864
+ text_content = item.get("text", "")
865
+ processed_messages.append(
866
+ {"role": message.role, "content": text_content}
867
+ )
868
+
869
+ formatted_prompt = apply_chat_template(
870
+ processor,
871
+ config,
872
+ processed_messages,
873
+ num_images=len(images),
874
+ num_audios=len(audio),
875
+ )
876
+
877
+ if request.stream:
878
+ # Streaming response
879
+ async def stream_generator():
880
+ token_iterator = None
881
+ try:
882
+ # Use stream_generate from utils
883
+ token_iterator = stream_generate(
884
+ model=model,
885
+ processor=processor,
886
+ prompt=formatted_prompt,
887
+ image=images,
888
+ audio=audio,
889
+ temperature=request.temperature,
890
+ max_tokens=request.max_tokens,
891
+ top_p=request.top_p,
892
+ **kwargs,
893
+ )
894
+
895
+ for chunk in token_iterator:
896
+ if chunk is None or not hasattr(chunk, "text"):
897
+ print("Warning: Received unexpected chunk format:", chunk)
898
+ continue
899
+
900
+ # Yield chunks in Server-Sent Events (SSE) format
901
+ usage_stats = {
902
+ "input_tokens": chunk.prompt_tokens,
903
+ "output_tokens": chunk.generation_tokens,
904
+ "total_tokens": chunk.prompt_tokens
905
+ + chunk.generation_tokens,
906
+ "prompt_tps": chunk.prompt_tps,
907
+ "generation_tps": chunk.generation_tps,
908
+ "peak_memory": chunk.peak_memory,
909
+ }
910
+
911
+ choices = [
912
+ ChatStreamChoice(
913
+ delta=ChatMessage(role="assistant", content=chunk.text)
914
+ )
915
+ ]
916
+ chunk_data = ChatStreamChunk(
917
+ model=request.model, usage=usage_stats, choices=choices
918
+ )
919
+
920
+ yield f"data: {chunk_data.model_dump_json()}\n\n"
921
+ await asyncio.sleep(
922
+ 0.01
923
+ ) # Small sleep to prevent blocking event loop entirely
924
+
925
+ # Signal stream end
926
+ choices = [
927
+ ChatStreamChoice(
928
+ finish_reason="stop",
929
+ delta=ChatMessage(role="assistant", content=""),
930
+ )
931
+ ]
932
+ chunk_data = ChatStreamChunk(
933
+ model=request.model, usage=usage_stats, choices=choices
934
+ )
935
+ yield f"data: {chunk_data.model_dump_json()}\n\n"
936
+
937
+ except Exception as e:
938
+ print(f"Error during stream generation: {e}")
939
+ traceback.print_exc()
940
+ error_data = json.dumps({"error": str(e)})
941
+ yield f"data: {error_data}\n\n"
942
+
943
+ finally:
944
+ mx.clear_cache()
945
+ gc.collect()
946
+ print("Stream finished, cleared cache.")
947
+
948
+ return StreamingResponse(stream_generator(), media_type="text/event-stream")
949
+
950
+ else:
951
+ # Non-streaming response
952
+ try:
953
+ # Use generate from generate.py
954
+ gen_result = generate(
955
+ model=model,
956
+ processor=processor,
957
+ prompt=formatted_prompt,
958
+ image=images,
959
+ audio=audio,
960
+ temperature=request.temperature,
961
+ max_tokens=request.max_tokens,
962
+ top_p=request.top_p,
963
+ verbose=False, # Keep API output clean
964
+ **kwargs,
965
+ )
966
+ # Clean up resources
967
+ mx.clear_cache()
968
+ gc.collect()
969
+ print("Generation finished, cleared cache.")
970
+
971
+ usage_stats = UsageStats(
972
+ input_tokens=gen_result.prompt_tokens,
973
+ output_tokens=gen_result.generation_tokens,
974
+ total_tokens=gen_result.total_tokens,
975
+ prompt_tps=gen_result.prompt_tps,
976
+ generation_tps=gen_result.generation_tps,
977
+ peak_memory=gen_result.peak_memory,
978
+ )
979
+
980
+ choices = [
981
+ ChatChoice(
982
+ finish_reason="stop",
983
+ message=ChatMessage(role="assistant", content=gen_result.text),
984
+ )
985
+ ]
986
+ result = ChatResponse(
987
+ model=request.model, usage=usage_stats, choices=choices
988
+ )
989
+
990
+ return result
991
+
992
+ except Exception as e:
993
+ print(f"Error during generation: {e}")
994
+ traceback.print_exc()
995
+ mx.clear_cache()
996
+ gc.collect()
997
+ raise HTTPException(status_code=500, detail=f"Generation failed: {e}")
998
+
999
+ except HTTPException as http_exc:
1000
+ # Re-raise HTTP exceptions (like model loading failure)
1001
+ raise http_exc
1002
+ except Exception as e:
1003
+ # Catch unexpected errors
1004
+ print(f"Unexpected error in /generate endpoint: {e}")
1005
+ traceback.print_exc()
1006
+ mx.clear_cache()
1007
+ gc.collect()
1008
+ raise HTTPException(
1009
+ status_code=500, detail=f"An unexpected error occurred: {e}"
1010
+ )
1011
+
1012
+
1013
+ @app.get("/models", response_model=ModelsResponse)
1014
+ def models_endpoint():
1015
+ """
1016
+ Return a list of locally downloaded MLX models.
1017
+ """
1018
+
1019
+ files = ["config.json", "model.safetensors.index.json", "tokenizer_config.json"]
1020
+
1021
+ def probably_mlx_lm(repo):
1022
+ if repo.repo_type != "model":
1023
+ return False
1024
+ if "main" not in repo.refs:
1025
+ return False
1026
+ file_names = {f.file_path.name for f in repo.refs["main"].files}
1027
+ return all(f in file_names for f in files)
1028
+
1029
+ # Scan the cache directory for downloaded mlx models
1030
+ hf_cache_info = scan_cache_dir()
1031
+ downloaded_models = [repo for repo in hf_cache_info.repos if probably_mlx_lm(repo)]
1032
+
1033
+ # Create a list of available models
1034
+ models = [
1035
+ {"id": repo.repo_id, "object": "model", "created": int(repo.last_modified)}
1036
+ for repo in downloaded_models
1037
+ ]
1038
+
1039
+ response = {"object": "list", "data": models}
1040
+
1041
+ return response
1042
+
1043
+
1044
+ # MLX_VLM API endpoints
1045
+
1046
+
1047
+ @app.get("/health")
1048
+ async def health_check():
1049
+ """
1050
+ Check if the server is healthy and what model is loaded.
1051
+ """
1052
+ return {
1053
+ "status": "healthy",
1054
+ "loaded_model": model_cache.get("model_path", None),
1055
+ "loaded_adapter": model_cache.get("adapter_path", None),
1056
+ }
1057
+
1058
+
1059
+ @app.post("/unload")
1060
+ async def unload_model_endpoint():
1061
+ """
1062
+ Unload the currently loaded model from memory.
1063
+ """
1064
+ unloaded_info = {
1065
+ "model_name": model_cache.get("model_path", None),
1066
+ "adapter_name": model_cache.get("adapter_path", None),
1067
+ }
1068
+
1069
+ if not unload_model_sync(): # Use the synchronous unload function
1070
+ return {"status": "no_model_loaded", "message": "No model is currently loaded"}
1071
+
1072
+ return {
1073
+ "status": "success",
1074
+ "message": f"Model unloaded successfully",
1075
+ "unloaded": unloaded_info,
1076
+ }
1077
+
1078
+
1079
+ def main():
1080
+ parser = argparse.ArgumentParser(description="MLX VLM HTTP server.")
1081
+ parser.add_argument(
1082
+ "--host",
1083
+ type=str,
1084
+ default="0.0.0.0",
1085
+ help="Host for the HTTP server (default:0.0.0.0)",
1086
+ )
1087
+ parser.add_argument(
1088
+ "--port",
1089
+ type=int,
1090
+ default=8080,
1091
+ help="Port for the HTTP server (default: 8080)",
1092
+ )
1093
+ parser.add_argument(
1094
+ "--trust-remote-code",
1095
+ action="store_true",
1096
+ help="Trust remote code when loading models from Hugging Face Hub.",
1097
+ )
1098
+ args = parser.parse_args()
1099
+ if args.trust_remote_code:
1100
+ os.environ["MLX_TRUST_REMOTE_CODE"] = "true"
1101
+ uvicorn.run(
1102
+ "mlx_vlm.server:app", host=args.host, port=args.port, workers=1, reload=True
1103
+ ) # reload=True for development to automatically restart on code changes.
1104
+
1105
+
1106
+ if __name__ == "__main__":
1107
+ main()
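
A minimal client sketch for the /chat/completions endpoint defined above, assuming the server is running locally on its default port (for example via `python -m mlx_vlm.server`, which serves on 0.0.0.0:8080) and that the requests library is available; the model id and image URL below are placeholders, not values shipped with the package:

import requests

BASE_URL = "http://localhost:8080"  # default host/port from main()

# Check server health and which model (if any) is currently cached.
print(requests.get(f"{BASE_URL}/health").json())

# Non-streaming chat completion with one image; the model is loaded on demand
# and cached for subsequent requests.
payload = {
    "model": "mlx-community/Qwen2.5-VL-3B-Instruct-8bit",  # placeholder model id
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "input_text", "text": "Describe this image."},
                {"type": "input_image", "image_url": "https://example.com/cat.png"},
            ],
        }
    ],
    "max_tokens": 256,
    "temperature": 0.0,
    "stream": False,
}
resp = requests.post(f"{BASE_URL}/chat/completions", json=payload, timeout=600)
resp.raise_for_status()
data = resp.json()
print(data["choices"][0]["message"]["content"])
print(data["usage"])  # token counts plus prompt_tps, generation_tps, peak_memory

Setting "stream": True instead returns Server-Sent Events, one data: chunk per generated piece of text, ending with a chunk whose finish_reason is "stop".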