nexaai-1.0.29-cp310-cp310-macosx_14_0_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (580)
  1. nexaai/__init__.py +99 -0
  2. nexaai/_stub.cpython-310-darwin.so +0 -0
  3. nexaai/_version.py +4 -0
  4. nexaai/asr.py +68 -0
  5. nexaai/asr_impl/__init__.py +0 -0
  6. nexaai/asr_impl/mlx_asr_impl.py +93 -0
  7. nexaai/asr_impl/pybind_asr_impl.py +127 -0
  8. nexaai/base.py +39 -0
  9. nexaai/binds/__init__.py +7 -0
  10. nexaai/binds/asr_bind.cpython-310-darwin.so +0 -0
  11. nexaai/binds/common_bind.cpython-310-darwin.so +0 -0
  12. nexaai/binds/cpu_gpu/libggml-base.dylib +0 -0
  13. nexaai/binds/cpu_gpu/libggml-cpu.so +0 -0
  14. nexaai/binds/cpu_gpu/libggml-metal.so +0 -0
  15. nexaai/binds/cpu_gpu/libggml.dylib +0 -0
  16. nexaai/binds/cpu_gpu/libmtmd.dylib +0 -0
  17. nexaai/binds/cpu_gpu/libnexa_cpu_gpu.dylib +0 -0
  18. nexaai/binds/cpu_gpu/libnexa_plugin.dylib +0 -0
  19. nexaai/binds/cv_bind.cpython-310-darwin.so +0 -0
  20. nexaai/binds/diarize_bind.cpython-310-darwin.so +0 -0
  21. nexaai/binds/embedder_bind.cpython-310-darwin.so +0 -0
  22. nexaai/binds/libnexa_bridge.dylib +0 -0
  23. nexaai/binds/llm_bind.cpython-310-darwin.so +0 -0
  24. nexaai/binds/metal/libnexa_plugin.dylib +0 -0
  25. nexaai/binds/metal/py-lib/ml.py +888 -0
  26. nexaai/binds/metal/py-lib/mlx_audio/__init__.py +0 -0
  27. nexaai/binds/metal/py-lib/mlx_audio/codec/__init__.py +1 -0
  28. nexaai/binds/metal/py-lib/mlx_audio/codec/models/__init__.py +5 -0
  29. nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/__init__.py +1 -0
  30. nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/activation.py +51 -0
  31. nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/amp.py +96 -0
  32. nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/bigvgan.py +149 -0
  33. nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/conv.py +114 -0
  34. nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/resample.py +177 -0
  35. nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/__init__.py +1 -0
  36. nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/base.py +228 -0
  37. nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/dac.py +285 -0
  38. nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/nn/__init__.py +1 -0
  39. nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/nn/layers.py +129 -0
  40. nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/nn/quantize.py +149 -0
  41. nexaai/binds/metal/py-lib/mlx_audio/codec/models/encodec/__init__.py +1 -0
  42. nexaai/binds/metal/py-lib/mlx_audio/codec/models/encodec/encodec.py +777 -0
  43. nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/__init__.py +1 -0
  44. nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/mimi.py +286 -0
  45. nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/__init__.py +20 -0
  46. nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/conv.py +398 -0
  47. nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/kv_cache.py +199 -0
  48. nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/quantization.py +179 -0
  49. nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/seanet.py +314 -0
  50. nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/transformer.py +256 -0
  51. nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/__init__.py +1 -0
  52. nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/model.py +260 -0
  53. nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/model_v2.py +383 -0
  54. nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/utils.py +122 -0
  55. nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/__init__.py +1 -0
  56. nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/attention.py +97 -0
  57. nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/layers.py +306 -0
  58. nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/snac.py +154 -0
  59. nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/vq.py +135 -0
  60. nexaai/binds/metal/py-lib/mlx_audio/codec/models/vocos/__init__.py +1 -0
  61. nexaai/binds/metal/py-lib/mlx_audio/codec/models/vocos/mel.py +33 -0
  62. nexaai/binds/metal/py-lib/mlx_audio/codec/models/vocos/vocos.py +359 -0
  63. nexaai/binds/metal/py-lib/mlx_audio/codec/tests/__init__.py +0 -0
  64. nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_bigvgan.py +54 -0
  65. nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_descript.py +109 -0
  66. nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_encodec.py +58 -0
  67. nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_mimi.py +22 -0
  68. nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_s3.py +25 -0
  69. nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_snac.py +40 -0
  70. nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_vocos.py +93 -0
  71. nexaai/binds/metal/py-lib/mlx_audio/server.py +525 -0
  72. nexaai/binds/metal/py-lib/mlx_audio/sts/__init__.py +0 -0
  73. nexaai/binds/metal/py-lib/mlx_audio/sts/tests/test_voice_pipeline.py +156 -0
  74. nexaai/binds/metal/py-lib/mlx_audio/sts/voice_pipeline.py +327 -0
  75. nexaai/binds/metal/py-lib/mlx_audio/stt/__init__.py +0 -0
  76. nexaai/binds/metal/py-lib/mlx_audio/stt/generate.py +174 -0
  77. nexaai/binds/metal/py-lib/mlx_audio/stt/models/__init__.py +0 -0
  78. nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/__init__.py +1 -0
  79. nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/alignment.py +248 -0
  80. nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/attention.py +187 -0
  81. nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/audio.py +76 -0
  82. nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/conformer.py +331 -0
  83. nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/ctc.py +34 -0
  84. nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/parakeet.py +604 -0
  85. nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/rnnt.py +157 -0
  86. nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/tokenizer.py +2 -0
  87. nexaai/binds/metal/py-lib/mlx_audio/stt/models/wav2vec/feature_extractor.py +757 -0
  88. nexaai/binds/metal/py-lib/mlx_audio/stt/models/wav2vec/wav2vec.py +738 -0
  89. nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/__init__.py +1 -0
  90. nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/audio.py +82 -0
  91. nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/decoding.py +742 -0
  92. nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/timing.py +329 -0
  93. nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/tokenizer.py +398 -0
  94. nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/whisper.py +862 -0
  95. nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/writers.py +268 -0
  96. nexaai/binds/metal/py-lib/mlx_audio/stt/tests/test_models.py +381 -0
  97. nexaai/binds/metal/py-lib/mlx_audio/stt/utils.py +195 -0
  98. nexaai/binds/metal/py-lib/mlx_audio/tts/__init__.py +1 -0
  99. nexaai/binds/metal/py-lib/mlx_audio/tts/audio_player.py +120 -0
  100. nexaai/binds/metal/py-lib/mlx_audio/tts/convert.py +71 -0
  101. nexaai/binds/metal/py-lib/mlx_audio/tts/generate.py +449 -0
  102. nexaai/binds/metal/py-lib/mlx_audio/tts/models/__init__.py +0 -0
  103. nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/__init__.py +4 -0
  104. nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/bark.py +528 -0
  105. nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/isftnet.py +12 -0
  106. nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/pipeline.py +442 -0
  107. nexaai/binds/metal/py-lib/mlx_audio/tts/models/base.py +84 -0
  108. nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/__init__.py +1 -0
  109. nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/audio.py +287 -0
  110. nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/config.py +256 -0
  111. nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/dia.py +592 -0
  112. nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/layers.py +870 -0
  113. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/__init__.py +3 -0
  114. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/attention.py +180 -0
  115. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/bigvgan.py +124 -0
  116. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/conformer.py +247 -0
  117. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/__init__.py +0 -0
  118. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/asp.py +59 -0
  119. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/ecapa_tdnn.py +91 -0
  120. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/se_res2net.py +132 -0
  121. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/tdnn.py +42 -0
  122. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/gpt2.py +38 -0
  123. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/indextts.py +412 -0
  124. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/mel.py +37 -0
  125. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/normalize.py +294 -0
  126. nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/perceiver.py +62 -0
  127. nexaai/binds/metal/py-lib/mlx_audio/tts/models/interpolate.py +108 -0
  128. nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/__init__.py +4 -0
  129. nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/istftnet.py +979 -0
  130. nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/kokoro.py +331 -0
  131. nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/modules.py +659 -0
  132. nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/pipeline.py +453 -0
  133. nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/voice.py +113 -0
  134. nexaai/binds/metal/py-lib/mlx_audio/tts/models/llama/__init__.py +3 -0
  135. nexaai/binds/metal/py-lib/mlx_audio/tts/models/llama/llama.py +324 -0
  136. nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/__init__.py +1 -0
  137. nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/audio_processor.py +351 -0
  138. nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/dac_interface.py +162 -0
  139. nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/outetts.py +255 -0
  140. nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/prompt_processor.py +181 -0
  141. nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/tokens.py +36 -0
  142. nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/__init__.py +3 -0
  143. nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/attention.py +195 -0
  144. nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/sesame.py +633 -0
  145. nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/watermarking.py +105 -0
  146. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/__init__.py +1 -0
  147. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/audio_tokenizer.py +138 -0
  148. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/bicodec.py +269 -0
  149. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/__init__.py +0 -0
  150. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/blocks/__init__.py +0 -0
  151. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/blocks/sampler.py +111 -0
  152. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/__init__.py +0 -0
  153. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_decoder.py +120 -0
  154. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_encoder.py +136 -0
  155. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/wave_generator.py +113 -0
  156. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/finite_scalar_quantization.py +238 -0
  157. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/residual.py +209 -0
  158. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/residual_fsq.py +309 -0
  159. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/__init__.py +1 -0
  160. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/ecapa_tdnn.py +283 -0
  161. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/perceiver_encoder.py +326 -0
  162. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/pooling_layers.py +297 -0
  163. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/speaker_encoder.py +155 -0
  164. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/spark.py +382 -0
  165. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/utils/audio.py +220 -0
  166. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/utils/file.py +221 -0
  167. nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/utils/token_parser.py +181 -0
  168. nexaai/binds/metal/py-lib/mlx_audio/tts/tests/__init__.py +0 -0
  169. nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_base.py +66 -0
  170. nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_convert.py +173 -0
  171. nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_interpolate.py +88 -0
  172. nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_models.py +974 -0
  173. nexaai/binds/metal/py-lib/mlx_audio/tts/utils.py +337 -0
  174. nexaai/binds/metal/py-lib/mlx_audio/utils.py +237 -0
  175. nexaai/binds/metal/py-lib/mlx_audio/version.py +1 -0
  176. nexaai/binds/metal/py-lib/profiling.py +239 -0
  177. nexaai/binds/nexaml/libfftw3.3.dylib +0 -0
  178. nexaai/binds/nexaml/libfftw3f.3.dylib +0 -0
  179. nexaai/binds/nexaml/libggml-base.dylib +0 -0
  180. nexaai/binds/nexaml/libggml-cpu.so +0 -0
  181. nexaai/binds/nexaml/libggml-metal.so +0 -0
  182. nexaai/binds/nexaml/libggml.dylib +0 -0
  183. nexaai/binds/nexaml/libmp3lame.0.dylib +0 -0
  184. nexaai/binds/nexaml/libmpg123.0.dylib +0 -0
  185. nexaai/binds/nexaml/libnexa-mm-process.dylib +0 -0
  186. nexaai/binds/nexaml/libnexa-sampling.dylib +0 -0
  187. nexaai/binds/nexaml/libnexa_plugin.dylib +0 -0
  188. nexaai/binds/nexaml/libnexaproc.dylib +0 -0
  189. nexaai/binds/nexaml/libomp.dylib +0 -0
  190. nexaai/binds/nexaml/libqwen3-vl.dylib +0 -0
  191. nexaai/binds/nexaml/libqwen3vl-vision.dylib +0 -0
  192. nexaai/binds/rerank_bind.cpython-310-darwin.so +0 -0
  193. nexaai/binds/vlm_bind.cpython-310-darwin.so +0 -0
  194. nexaai/common.py +106 -0
  195. nexaai/cv.py +95 -0
  196. nexaai/cv_impl/__init__.py +0 -0
  197. nexaai/cv_impl/mlx_cv_impl.py +91 -0
  198. nexaai/cv_impl/pybind_cv_impl.py +124 -0
  199. nexaai/diarize.py +80 -0
  200. nexaai/diarize_impl/__init__.py +1 -0
  201. nexaai/diarize_impl/pybind_diarize_impl.py +125 -0
  202. nexaai/embedder.py +73 -0
  203. nexaai/embedder_impl/__init__.py +0 -0
  204. nexaai/embedder_impl/mlx_embedder_impl.py +118 -0
  205. nexaai/embedder_impl/pybind_embedder_impl.py +96 -0
  206. nexaai/image_gen.py +141 -0
  207. nexaai/image_gen_impl/__init__.py +0 -0
  208. nexaai/image_gen_impl/mlx_image_gen_impl.py +292 -0
  209. nexaai/image_gen_impl/pybind_image_gen_impl.py +85 -0
  210. nexaai/llm.py +98 -0
  211. nexaai/llm_impl/__init__.py +0 -0
  212. nexaai/llm_impl/mlx_llm_impl.py +271 -0
  213. nexaai/llm_impl/pybind_llm_impl.py +238 -0
  214. nexaai/log.py +92 -0
  215. nexaai/mlx_backend/asr/__init__.py +12 -0
  216. nexaai/mlx_backend/asr/interface.py +122 -0
  217. nexaai/mlx_backend/common/__init__.py +0 -0
  218. nexaai/mlx_backend/common/utils.py +25 -0
  219. nexaai/mlx_backend/cv/__init__.py +0 -0
  220. nexaai/mlx_backend/cv/generate.py +195 -0
  221. nexaai/mlx_backend/cv/interface.py +162 -0
  222. nexaai/mlx_backend/cv/main.py +81 -0
  223. nexaai/mlx_backend/cv/modeling/pp_ocr_v4.py +1736 -0
  224. nexaai/mlx_backend/embedding/__init__.py +0 -0
  225. nexaai/mlx_backend/embedding/generate.py +333 -0
  226. nexaai/mlx_backend/embedding/interface.py +617 -0
  227. nexaai/mlx_backend/embedding/main.py +173 -0
  228. nexaai/mlx_backend/embedding/modeling/__init__.py +0 -0
  229. nexaai/mlx_backend/embedding/modeling/nexa_jina_v2.py +399 -0
  230. nexaai/mlx_backend/image_gen/__init__.py +1 -0
  231. nexaai/mlx_backend/image_gen/generate_sd.py +244 -0
  232. nexaai/mlx_backend/image_gen/interface.py +82 -0
  233. nexaai/mlx_backend/image_gen/main.py +281 -0
  234. nexaai/mlx_backend/image_gen/stable_diffusion/__init__.py +306 -0
  235. nexaai/mlx_backend/image_gen/stable_diffusion/clip.py +116 -0
  236. nexaai/mlx_backend/image_gen/stable_diffusion/config.py +65 -0
  237. nexaai/mlx_backend/image_gen/stable_diffusion/model_io.py +386 -0
  238. nexaai/mlx_backend/image_gen/stable_diffusion/sampler.py +105 -0
  239. nexaai/mlx_backend/image_gen/stable_diffusion/tokenizer.py +100 -0
  240. nexaai/mlx_backend/image_gen/stable_diffusion/unet.py +460 -0
  241. nexaai/mlx_backend/image_gen/stable_diffusion/vae.py +274 -0
  242. nexaai/mlx_backend/llm/__init__.py +0 -0
  243. nexaai/mlx_backend/llm/generate.py +149 -0
  244. nexaai/mlx_backend/llm/interface.py +764 -0
  245. nexaai/mlx_backend/llm/main.py +68 -0
  246. nexaai/mlx_backend/ml.py +888 -0
  247. nexaai/mlx_backend/mlx_audio/__init__.py +0 -0
  248. nexaai/mlx_backend/mlx_audio/codec/__init__.py +1 -0
  249. nexaai/mlx_backend/mlx_audio/codec/models/__init__.py +5 -0
  250. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/__init__.py +1 -0
  251. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/activation.py +51 -0
  252. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/amp.py +96 -0
  253. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/bigvgan.py +149 -0
  254. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/conv.py +114 -0
  255. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/resample.py +177 -0
  256. nexaai/mlx_backend/mlx_audio/codec/models/descript/__init__.py +1 -0
  257. nexaai/mlx_backend/mlx_audio/codec/models/descript/base.py +228 -0
  258. nexaai/mlx_backend/mlx_audio/codec/models/descript/dac.py +285 -0
  259. nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/__init__.py +1 -0
  260. nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/layers.py +129 -0
  261. nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/quantize.py +149 -0
  262. nexaai/mlx_backend/mlx_audio/codec/models/encodec/__init__.py +1 -0
  263. nexaai/mlx_backend/mlx_audio/codec/models/encodec/encodec.py +777 -0
  264. nexaai/mlx_backend/mlx_audio/codec/models/mimi/__init__.py +1 -0
  265. nexaai/mlx_backend/mlx_audio/codec/models/mimi/mimi.py +286 -0
  266. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/__init__.py +20 -0
  267. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/conv.py +398 -0
  268. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/kv_cache.py +199 -0
  269. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/quantization.py +179 -0
  270. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/seanet.py +314 -0
  271. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/transformer.py +256 -0
  272. nexaai/mlx_backend/mlx_audio/codec/models/s3/__init__.py +1 -0
  273. nexaai/mlx_backend/mlx_audio/codec/models/s3/model.py +260 -0
  274. nexaai/mlx_backend/mlx_audio/codec/models/s3/model_v2.py +383 -0
  275. nexaai/mlx_backend/mlx_audio/codec/models/s3/utils.py +122 -0
  276. nexaai/mlx_backend/mlx_audio/codec/models/snac/__init__.py +1 -0
  277. nexaai/mlx_backend/mlx_audio/codec/models/snac/attention.py +97 -0
  278. nexaai/mlx_backend/mlx_audio/codec/models/snac/layers.py +306 -0
  279. nexaai/mlx_backend/mlx_audio/codec/models/snac/snac.py +154 -0
  280. nexaai/mlx_backend/mlx_audio/codec/models/snac/vq.py +135 -0
  281. nexaai/mlx_backend/mlx_audio/codec/models/vocos/__init__.py +1 -0
  282. nexaai/mlx_backend/mlx_audio/codec/models/vocos/mel.py +33 -0
  283. nexaai/mlx_backend/mlx_audio/codec/models/vocos/vocos.py +359 -0
  284. nexaai/mlx_backend/mlx_audio/codec/tests/__init__.py +0 -0
  285. nexaai/mlx_backend/mlx_audio/codec/tests/test_bigvgan.py +54 -0
  286. nexaai/mlx_backend/mlx_audio/codec/tests/test_descript.py +109 -0
  287. nexaai/mlx_backend/mlx_audio/codec/tests/test_encodec.py +58 -0
  288. nexaai/mlx_backend/mlx_audio/codec/tests/test_mimi.py +22 -0
  289. nexaai/mlx_backend/mlx_audio/codec/tests/test_s3.py +25 -0
  290. nexaai/mlx_backend/mlx_audio/codec/tests/test_snac.py +40 -0
  291. nexaai/mlx_backend/mlx_audio/codec/tests/test_vocos.py +93 -0
  292. nexaai/mlx_backend/mlx_audio/server.py +525 -0
  293. nexaai/mlx_backend/mlx_audio/sts/__init__.py +0 -0
  294. nexaai/mlx_backend/mlx_audio/sts/tests/test_voice_pipeline.py +156 -0
  295. nexaai/mlx_backend/mlx_audio/sts/voice_pipeline.py +327 -0
  296. nexaai/mlx_backend/mlx_audio/stt/__init__.py +0 -0
  297. nexaai/mlx_backend/mlx_audio/stt/generate.py +174 -0
  298. nexaai/mlx_backend/mlx_audio/stt/models/__init__.py +0 -0
  299. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/__init__.py +1 -0
  300. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/alignment.py +248 -0
  301. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/attention.py +187 -0
  302. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/audio.py +76 -0
  303. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/conformer.py +331 -0
  304. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/ctc.py +34 -0
  305. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/parakeet.py +604 -0
  306. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/rnnt.py +157 -0
  307. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/tokenizer.py +2 -0
  308. nexaai/mlx_backend/mlx_audio/stt/models/wav2vec/feature_extractor.py +757 -0
  309. nexaai/mlx_backend/mlx_audio/stt/models/wav2vec/wav2vec.py +738 -0
  310. nexaai/mlx_backend/mlx_audio/stt/models/whisper/__init__.py +1 -0
  311. nexaai/mlx_backend/mlx_audio/stt/models/whisper/audio.py +82 -0
  312. nexaai/mlx_backend/mlx_audio/stt/models/whisper/decoding.py +742 -0
  313. nexaai/mlx_backend/mlx_audio/stt/models/whisper/timing.py +329 -0
  314. nexaai/mlx_backend/mlx_audio/stt/models/whisper/tokenizer.py +398 -0
  315. nexaai/mlx_backend/mlx_audio/stt/models/whisper/whisper.py +862 -0
  316. nexaai/mlx_backend/mlx_audio/stt/models/whisper/writers.py +268 -0
  317. nexaai/mlx_backend/mlx_audio/stt/tests/test_models.py +381 -0
  318. nexaai/mlx_backend/mlx_audio/stt/utils.py +195 -0
  319. nexaai/mlx_backend/mlx_audio/tts/__init__.py +1 -0
  320. nexaai/mlx_backend/mlx_audio/tts/audio_player.py +120 -0
  321. nexaai/mlx_backend/mlx_audio/tts/convert.py +71 -0
  322. nexaai/mlx_backend/mlx_audio/tts/generate.py +449 -0
  323. nexaai/mlx_backend/mlx_audio/tts/models/__init__.py +0 -0
  324. nexaai/mlx_backend/mlx_audio/tts/models/bark/__init__.py +4 -0
  325. nexaai/mlx_backend/mlx_audio/tts/models/bark/bark.py +528 -0
  326. nexaai/mlx_backend/mlx_audio/tts/models/bark/isftnet.py +12 -0
  327. nexaai/mlx_backend/mlx_audio/tts/models/bark/pipeline.py +442 -0
  328. nexaai/mlx_backend/mlx_audio/tts/models/base.py +84 -0
  329. nexaai/mlx_backend/mlx_audio/tts/models/dia/__init__.py +1 -0
  330. nexaai/mlx_backend/mlx_audio/tts/models/dia/audio.py +287 -0
  331. nexaai/mlx_backend/mlx_audio/tts/models/dia/config.py +256 -0
  332. nexaai/mlx_backend/mlx_audio/tts/models/dia/dia.py +592 -0
  333. nexaai/mlx_backend/mlx_audio/tts/models/dia/layers.py +870 -0
  334. nexaai/mlx_backend/mlx_audio/tts/models/indextts/__init__.py +3 -0
  335. nexaai/mlx_backend/mlx_audio/tts/models/indextts/attention.py +180 -0
  336. nexaai/mlx_backend/mlx_audio/tts/models/indextts/bigvgan.py +124 -0
  337. nexaai/mlx_backend/mlx_audio/tts/models/indextts/conformer.py +247 -0
  338. nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/__init__.py +0 -0
  339. nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/asp.py +59 -0
  340. nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/ecapa_tdnn.py +91 -0
  341. nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/se_res2net.py +132 -0
  342. nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/tdnn.py +42 -0
  343. nexaai/mlx_backend/mlx_audio/tts/models/indextts/gpt2.py +38 -0
  344. nexaai/mlx_backend/mlx_audio/tts/models/indextts/indextts.py +412 -0
  345. nexaai/mlx_backend/mlx_audio/tts/models/indextts/mel.py +37 -0
  346. nexaai/mlx_backend/mlx_audio/tts/models/indextts/normalize.py +294 -0
  347. nexaai/mlx_backend/mlx_audio/tts/models/indextts/perceiver.py +62 -0
  348. nexaai/mlx_backend/mlx_audio/tts/models/interpolate.py +108 -0
  349. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/__init__.py +4 -0
  350. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/istftnet.py +979 -0
  351. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/kokoro.py +331 -0
  352. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/modules.py +659 -0
  353. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/pipeline.py +453 -0
  354. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/voice.py +113 -0
  355. nexaai/mlx_backend/mlx_audio/tts/models/llama/__init__.py +3 -0
  356. nexaai/mlx_backend/mlx_audio/tts/models/llama/llama.py +324 -0
  357. nexaai/mlx_backend/mlx_audio/tts/models/outetts/__init__.py +1 -0
  358. nexaai/mlx_backend/mlx_audio/tts/models/outetts/audio_processor.py +351 -0
  359. nexaai/mlx_backend/mlx_audio/tts/models/outetts/dac_interface.py +162 -0
  360. nexaai/mlx_backend/mlx_audio/tts/models/outetts/default_speaker.json +461 -0
  361. nexaai/mlx_backend/mlx_audio/tts/models/outetts/outetts.py +255 -0
  362. nexaai/mlx_backend/mlx_audio/tts/models/outetts/prompt_processor.py +181 -0
  363. nexaai/mlx_backend/mlx_audio/tts/models/outetts/tokens.py +36 -0
  364. nexaai/mlx_backend/mlx_audio/tts/models/sesame/__init__.py +3 -0
  365. nexaai/mlx_backend/mlx_audio/tts/models/sesame/attention.py +195 -0
  366. nexaai/mlx_backend/mlx_audio/tts/models/sesame/sesame.py +633 -0
  367. nexaai/mlx_backend/mlx_audio/tts/models/sesame/watermarking.py +105 -0
  368. nexaai/mlx_backend/mlx_audio/tts/models/spark/__init__.py +1 -0
  369. nexaai/mlx_backend/mlx_audio/tts/models/spark/audio_tokenizer.py +138 -0
  370. nexaai/mlx_backend/mlx_audio/tts/models/spark/bicodec.py +269 -0
  371. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/__init__.py +0 -0
  372. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/blocks/__init__.py +0 -0
  373. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/blocks/sampler.py +111 -0
  374. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/__init__.py +0 -0
  375. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_decoder.py +120 -0
  376. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_encoder.py +136 -0
  377. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/wave_generator.py +113 -0
  378. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/finite_scalar_quantization.py +238 -0
  379. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/residual.py +209 -0
  380. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/residual_fsq.py +309 -0
  381. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/__init__.py +1 -0
  382. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/ecapa_tdnn.py +283 -0
  383. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/perceiver_encoder.py +326 -0
  384. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/pooling_layers.py +297 -0
  385. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/speaker_encoder.py +155 -0
  386. nexaai/mlx_backend/mlx_audio/tts/models/spark/spark.py +382 -0
  387. nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/audio.py +220 -0
  388. nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/file.py +221 -0
  389. nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/token_parser.py +181 -0
  390. nexaai/mlx_backend/mlx_audio/tts/tests/__init__.py +0 -0
  391. nexaai/mlx_backend/mlx_audio/tts/tests/test_base.py +66 -0
  392. nexaai/mlx_backend/mlx_audio/tts/tests/test_convert.py +173 -0
  393. nexaai/mlx_backend/mlx_audio/tts/tests/test_interpolate.py +88 -0
  394. nexaai/mlx_backend/mlx_audio/tts/tests/test_models.py +974 -0
  395. nexaai/mlx_backend/mlx_audio/tts/utils.py +337 -0
  396. nexaai/mlx_backend/mlx_audio/utils.py +237 -0
  397. nexaai/mlx_backend/mlx_audio/version.py +1 -0
  398. nexaai/mlx_backend/profiling.py +239 -0
  399. nexaai/mlx_backend/rerank/__init__.py +0 -0
  400. nexaai/mlx_backend/rerank/generate.py +174 -0
  401. nexaai/mlx_backend/rerank/interface.py +287 -0
  402. nexaai/mlx_backend/rerank/main.py +127 -0
  403. nexaai/mlx_backend/rerank/modeling/__init__.py +0 -0
  404. nexaai/mlx_backend/rerank/modeling/nexa_jina_rerank.py +330 -0
  405. nexaai/mlx_backend/sd/__init__.py +1 -0
  406. nexaai/mlx_backend/sd/interface.py +362 -0
  407. nexaai/mlx_backend/sd/main.py +286 -0
  408. nexaai/mlx_backend/sd/modeling/__init__.py +306 -0
  409. nexaai/mlx_backend/sd/modeling/clip.py +116 -0
  410. nexaai/mlx_backend/sd/modeling/config.py +65 -0
  411. nexaai/mlx_backend/sd/modeling/model_io.py +385 -0
  412. nexaai/mlx_backend/sd/modeling/sampler.py +105 -0
  413. nexaai/mlx_backend/sd/modeling/tokenizer.py +100 -0
  414. nexaai/mlx_backend/sd/modeling/unet.py +460 -0
  415. nexaai/mlx_backend/sd/modeling/vae.py +274 -0
  416. nexaai/mlx_backend/tts/__init__.py +12 -0
  417. nexaai/mlx_backend/tts/interface.py +276 -0
  418. nexaai/mlx_backend/vlm/__init__.py +3 -0
  419. nexaai/mlx_backend/vlm/generate.py +572 -0
  420. nexaai/mlx_backend/vlm/generate_qwen3_vl.py +374 -0
  421. nexaai/mlx_backend/vlm/generate_qwen3_vl_moe.py +259 -0
  422. nexaai/mlx_backend/vlm/interface.py +559 -0
  423. nexaai/mlx_backend/vlm/main.py +365 -0
  424. nexaai/mlx_backend/vlm/modeling/__init__.py +0 -0
  425. nexaai/mlx_backend/vlm/modeling/convert.py +68 -0
  426. nexaai/mlx_backend/vlm/modeling/models/__init__.py +0 -0
  427. nexaai/mlx_backend/vlm/modeling/models/aya_vision/__init__.py +8 -0
  428. nexaai/mlx_backend/vlm/modeling/models/aya_vision/aya_vision.py +193 -0
  429. nexaai/mlx_backend/vlm/modeling/models/aya_vision/interpolate.py +186 -0
  430. nexaai/mlx_backend/vlm/modeling/models/aya_vision/language.py +233 -0
  431. nexaai/mlx_backend/vlm/modeling/models/aya_vision/vision.py +503 -0
  432. nexaai/mlx_backend/vlm/modeling/models/base.py +202 -0
  433. nexaai/mlx_backend/vlm/modeling/models/cache.py +230 -0
  434. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/__init__.py +10 -0
  435. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/conversation.py +264 -0
  436. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/deepseek_vl_v2.py +472 -0
  437. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/language.py +591 -0
  438. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/processing_deepsek_vl_v2.py +526 -0
  439. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/vision.py +356 -0
  440. nexaai/mlx_backend/vlm/modeling/models/florence2/__init__.py +8 -0
  441. nexaai/mlx_backend/vlm/modeling/models/florence2/florence2.py +366 -0
  442. nexaai/mlx_backend/vlm/modeling/models/florence2/language.py +488 -0
  443. nexaai/mlx_backend/vlm/modeling/models/florence2/vision.py +591 -0
  444. nexaai/mlx_backend/vlm/modeling/models/gemma3/__init__.py +8 -0
  445. nexaai/mlx_backend/vlm/modeling/models/gemma3/gemma3.py +213 -0
  446. nexaai/mlx_backend/vlm/modeling/models/gemma3/language.py +315 -0
  447. nexaai/mlx_backend/vlm/modeling/models/gemma3/vision.py +238 -0
  448. nexaai/mlx_backend/vlm/modeling/models/gemma3n/__init__.py +2 -0
  449. nexaai/mlx_backend/vlm/modeling/models/gemma3n/audio.py +1038 -0
  450. nexaai/mlx_backend/vlm/modeling/models/gemma3n/config.py +139 -0
  451. nexaai/mlx_backend/vlm/modeling/models/gemma3n/gemma3n.py +322 -0
  452. nexaai/mlx_backend/vlm/modeling/models/gemma3n/language.py +629 -0
  453. nexaai/mlx_backend/vlm/modeling/models/gemma3n/vision.py +1022 -0
  454. nexaai/mlx_backend/vlm/modeling/models/idefics2/__init__.py +9 -0
  455. nexaai/mlx_backend/vlm/modeling/models/idefics2/idefics2.py +294 -0
  456. nexaai/mlx_backend/vlm/modeling/models/idefics2/language.py +191 -0
  457. nexaai/mlx_backend/vlm/modeling/models/idefics2/vision.py +267 -0
  458. nexaai/mlx_backend/vlm/modeling/models/idefics3/__init__.py +8 -0
  459. nexaai/mlx_backend/vlm/modeling/models/idefics3/idefics3.py +175 -0
  460. nexaai/mlx_backend/vlm/modeling/models/idefics3/language.py +192 -0
  461. nexaai/mlx_backend/vlm/modeling/models/idefics3/vision.py +233 -0
  462. nexaai/mlx_backend/vlm/modeling/models/internvl_chat/__init__.py +9 -0
  463. nexaai/mlx_backend/vlm/modeling/models/internvl_chat/internvl_chat.py +140 -0
  464. nexaai/mlx_backend/vlm/modeling/models/internvl_chat/language.py +220 -0
  465. nexaai/mlx_backend/vlm/modeling/models/internvl_chat/processor.py +393 -0
  466. nexaai/mlx_backend/vlm/modeling/models/internvl_chat/vision.py +293 -0
  467. nexaai/mlx_backend/vlm/modeling/models/kernels.py +307 -0
  468. nexaai/mlx_backend/vlm/modeling/models/kimi_vl/__init__.py +8 -0
  469. nexaai/mlx_backend/vlm/modeling/models/kimi_vl/kimi_vl.py +143 -0
  470. nexaai/mlx_backend/vlm/modeling/models/kimi_vl/language.py +509 -0
  471. nexaai/mlx_backend/vlm/modeling/models/kimi_vl/vision.py +522 -0
  472. nexaai/mlx_backend/vlm/modeling/models/llama4/__init__.py +8 -0
  473. nexaai/mlx_backend/vlm/modeling/models/llama4/language.py +386 -0
  474. nexaai/mlx_backend/vlm/modeling/models/llama4/llama4.py +138 -0
  475. nexaai/mlx_backend/vlm/modeling/models/llama4/vision.py +560 -0
  476. nexaai/mlx_backend/vlm/modeling/models/llava/__init__.py +8 -0
  477. nexaai/mlx_backend/vlm/modeling/models/llava/language.py +240 -0
  478. nexaai/mlx_backend/vlm/modeling/models/llava/llava.py +153 -0
  479. nexaai/mlx_backend/vlm/modeling/models/llava/vision.py +259 -0
  480. nexaai/mlx_backend/vlm/modeling/models/llava_bunny/__init__.py +9 -0
  481. nexaai/mlx_backend/vlm/modeling/models/llava_bunny/language.py +236 -0
  482. nexaai/mlx_backend/vlm/modeling/models/llava_bunny/llava_bunny.py +256 -0
  483. nexaai/mlx_backend/vlm/modeling/models/llava_bunny/vision.py +303 -0
  484. nexaai/mlx_backend/vlm/modeling/models/llava_next/__init__.py +8 -0
  485. nexaai/mlx_backend/vlm/modeling/models/llava_next/language.py +230 -0
  486. nexaai/mlx_backend/vlm/modeling/models/llava_next/llava_next.py +160 -0
  487. nexaai/mlx_backend/vlm/modeling/models/llava_next/vision.py +243 -0
  488. nexaai/mlx_backend/vlm/modeling/models/mistral3/__init__.py +8 -0
  489. nexaai/mlx_backend/vlm/modeling/models/mistral3/mistral3.py +283 -0
  490. nexaai/mlx_backend/vlm/modeling/models/mllama/__init__.py +8 -0
  491. nexaai/mlx_backend/vlm/modeling/models/mllama/language.py +416 -0
  492. nexaai/mlx_backend/vlm/modeling/models/mllama/mllama.py +172 -0
  493. nexaai/mlx_backend/vlm/modeling/models/mllama/vision.py +499 -0
  494. nexaai/mlx_backend/vlm/modeling/models/molmo/__init__.py +8 -0
  495. nexaai/mlx_backend/vlm/modeling/models/molmo/language.py +243 -0
  496. nexaai/mlx_backend/vlm/modeling/models/molmo/molmo.py +133 -0
  497. nexaai/mlx_backend/vlm/modeling/models/molmo/vision.py +465 -0
  498. nexaai/mlx_backend/vlm/modeling/models/multi_modality/__init__.py +10 -0
  499. nexaai/mlx_backend/vlm/modeling/models/multi_modality/language.py +230 -0
  500. nexaai/mlx_backend/vlm/modeling/models/multi_modality/multi_modality.py +385 -0
  501. nexaai/mlx_backend/vlm/modeling/models/multi_modality/sam.py +557 -0
  502. nexaai/mlx_backend/vlm/modeling/models/multi_modality/vision.py +526 -0
  503. nexaai/mlx_backend/vlm/modeling/models/paligemma/__init__.py +8 -0
  504. nexaai/mlx_backend/vlm/modeling/models/paligemma/language.py +282 -0
  505. nexaai/mlx_backend/vlm/modeling/models/paligemma/paligemma.py +160 -0
  506. nexaai/mlx_backend/vlm/modeling/models/paligemma/vision.py +242 -0
  507. nexaai/mlx_backend/vlm/modeling/models/phi3_v/__init__.py +8 -0
  508. nexaai/mlx_backend/vlm/modeling/models/phi3_v/language.py +21 -0
  509. nexaai/mlx_backend/vlm/modeling/models/phi3_v/phi3_v.py +243 -0
  510. nexaai/mlx_backend/vlm/modeling/models/phi3_v/su_rope.py +71 -0
  511. nexaai/mlx_backend/vlm/modeling/models/phi3_v/vision.py +324 -0
  512. nexaai/mlx_backend/vlm/modeling/models/pixtral/__init__.py +8 -0
  513. nexaai/mlx_backend/vlm/modeling/models/pixtral/language.py +229 -0
  514. nexaai/mlx_backend/vlm/modeling/models/pixtral/pixtral.py +161 -0
  515. nexaai/mlx_backend/vlm/modeling/models/pixtral/vision.py +320 -0
  516. nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/__init__.py +2 -0
  517. nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/config.py +108 -0
  518. nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/language.py +490 -0
  519. nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/qwen2_5_vl.py +168 -0
  520. nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/vision.py +414 -0
  521. nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/__init__.py +2 -0
  522. nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/config.py +104 -0
  523. nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/language.py +490 -0
  524. nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/qwen2_vl.py +167 -0
  525. nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/vision.py +312 -0
  526. nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/__init__.py +0 -0
  527. nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/base.py +117 -0
  528. nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/cache.py +531 -0
  529. nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/generate.py +701 -0
  530. nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/rope_utils.py +255 -0
  531. nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/sample_utils.py +303 -0
  532. nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/tokenizer_utils.py +407 -0
  533. nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/processor.py +476 -0
  534. nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/qwen3vl.py +1262 -0
  535. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/__init__.py +0 -0
  536. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/base.py +117 -0
  537. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/cache.py +531 -0
  538. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/generate.py +701 -0
  539. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/rope_utils.py +255 -0
  540. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/sample_utils.py +303 -0
  541. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/tokenizer_utils.py +407 -0
  542. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/processor.py +476 -0
  543. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/qwen3vl_moe.py +1308 -0
  544. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/switch_layers.py +210 -0
  545. nexaai/mlx_backend/vlm/modeling/models/smolvlm/__init__.py +8 -0
  546. nexaai/mlx_backend/vlm/modeling/models/smolvlm/smolvlm.py +62 -0
  547. nexaai/mlx_backend/vlm/modeling/processing_qwen2_5_vl.py +209 -0
  548. nexaai/mlx_backend/vlm/modeling/processing_qwen2_vl.py +215 -0
  549. nexaai/mlx_backend/vlm/modeling/prompt_utils.py +474 -0
  550. nexaai/mlx_backend/vlm/modeling/sample_utils.py +39 -0
  551. nexaai/mlx_backend/vlm/modeling/tokenizer_utils.py +344 -0
  552. nexaai/mlx_backend/vlm/modeling/trainer/__init__.py +9 -0
  553. nexaai/mlx_backend/vlm/modeling/trainer/lora.py +70 -0
  554. nexaai/mlx_backend/vlm/modeling/trainer/trainer.py +296 -0
  555. nexaai/mlx_backend/vlm/modeling/trainer/utils.py +160 -0
  556. nexaai/mlx_backend/vlm/modeling/utils.py +928 -0
  557. nexaai/rerank.py +57 -0
  558. nexaai/rerank_impl/__init__.py +0 -0
  559. nexaai/rerank_impl/mlx_rerank_impl.py +94 -0
  560. nexaai/rerank_impl/pybind_rerank_impl.py +136 -0
  561. nexaai/runtime.py +68 -0
  562. nexaai/runtime_error.py +24 -0
  563. nexaai/tts.py +75 -0
  564. nexaai/tts_impl/__init__.py +0 -0
  565. nexaai/tts_impl/mlx_tts_impl.py +94 -0
  566. nexaai/tts_impl/pybind_tts_impl.py +43 -0
  567. nexaai/utils/decode.py +18 -0
  568. nexaai/utils/manifest_utils.py +531 -0
  569. nexaai/utils/model_manager.py +1745 -0
  570. nexaai/utils/model_types.py +49 -0
  571. nexaai/utils/progress_tracker.py +389 -0
  572. nexaai/utils/quantization_utils.py +245 -0
  573. nexaai/vlm.py +130 -0
  574. nexaai/vlm_impl/__init__.py +0 -0
  575. nexaai/vlm_impl/mlx_vlm_impl.py +259 -0
  576. nexaai/vlm_impl/pybind_vlm_impl.py +275 -0
  577. nexaai-1.0.29.dist-info/METADATA +35 -0
  578. nexaai-1.0.29.dist-info/RECORD +580 -0
  579. nexaai-1.0.29.dist-info/WHEEL +5 -0
  580. nexaai-1.0.29.dist-info/top_level.txt +1 -0
nexaai/vlm.py ADDED
@@ -0,0 +1,130 @@
+ from typing import Generator, Optional, List, Dict, Any, Union
+ from abc import abstractmethod
+ import queue
+ import threading
+ import base64
+ from pathlib import Path
+
+ from nexaai.common import ModelConfig, GenerationConfig, MultiModalMessage, PluginID
+ from nexaai.base import BaseModel, ProfilingData
+
+
+ class VLM(BaseModel):
+     def __init__(self, m_cfg: ModelConfig = ModelConfig()):
+         """Initialize base VLM class."""
+         self._m_cfg = m_cfg
+         self._cancel_event = threading.Event()  # New attribute to control cancellation
+
+     @classmethod
+     def _load_from(cls,
+                    local_path: str,
+                    mmproj_path: str = None,
+                    model_name: Optional[str] = None,
+                    m_cfg: ModelConfig = ModelConfig(),
+                    plugin_id: Union[PluginID, str] = PluginID.LLAMA_CPP,
+                    device_id: Optional[str] = None,
+                    **kwargs
+                    ) -> 'VLM':
+         """Load VLM model from local path, routing to appropriate implementation.
+
+         Args:
+             local_path: Path to the main model file
+             mmproj_path: Path to the multimodal projection file
+             m_cfg: Model configuration
+             plugin_id: Plugin identifier
+             device_id: Optional device ID (not used in current binding)
+
+         Returns:
+             VLM instance
+         """
+         # Check plugin_id value for routing - handle both enum and string
+         plugin_value = plugin_id.value if isinstance(plugin_id, PluginID) else plugin_id
+
+         if plugin_value == "mlx":
+             from nexaai.vlm_impl.mlx_vlm_impl import MlxVlmImpl
+             return MlxVlmImpl._load_from(local_path, mmproj_path, model_name, m_cfg, plugin_id, device_id)
+         else:
+             from nexaai.vlm_impl.pybind_vlm_impl import PyBindVLMImpl
+             return PyBindVLMImpl._load_from(local_path, mmproj_path, model_name, m_cfg, plugin_id, device_id)
+
+     @abstractmethod
+     def eject(self):
+         """Release the model from memory."""
+         pass
+
+     def cancel_generation(self):
+         """Signal to cancel any ongoing stream generation."""
+         self._cancel_event.set()
+
+     def reset_cancel(self):
+         """Reset the cancel event. Call before starting a new generation if needed."""
+         self._cancel_event.clear()
+
+     @abstractmethod
+     def reset(self):
+         """
+         Reset the VLM model context and KV cache. If not reset, the model will skip the number of evaluated tokens and treat tokens after those as the new incremental tokens.
+         If your past chat history changed, or you are starting a new chat, you should always reset the model before running generate.
+         """
+         pass
+
+     def _process_image(self, image: Union[bytes, str, Path]) -> bytes:
+         """Process image input to bytes format.
+
+         Args:
+             image: Image data as bytes, base64 string, or file path
+
+         Returns:
+             Image data as bytes
+         """
+         if isinstance(image, bytes):
+             return image
+         elif isinstance(image, str):
+             # Check if it's a base64 string
+             if image.startswith('data:image'):
+                 # Extract base64 data from data URL
+                 base64_data = image.split(',')[1] if ',' in image else image
+                 return base64.b64decode(base64_data)
+             else:
+                 # Assume it's a file path
+                 with open(image, 'rb') as f:
+                     return f.read()
+         elif isinstance(image, Path):
+             with open(image, 'rb') as f:
+                 return f.read()
+         else:
+             raise ValueError(f"Unsupported image type: {type(image)}")
+
+
+     @abstractmethod
+     def apply_chat_template(
+         self,
+         messages: List[MultiModalMessage],
+         tools: Optional[List[Dict[str, Any]]] = None,
+         enable_thinking: bool = True
+     ) -> str:
+         """Apply the chat template to multimodal messages."""
+         pass
+
+     @abstractmethod
+     def generate_stream(self, prompt: str, g_cfg: GenerationConfig = GenerationConfig()) -> Generator[str, None, None]:
+         """Generate text with streaming."""
+         pass
+
+     @abstractmethod
+     def generate(self, prompt: str, g_cfg: GenerationConfig = GenerationConfig()) -> str:
+         """
+         Generate text without streaming.
+
+         Args:
+             prompt (str): The prompt to generate text from. For chat models, this is the chat messages after chat template is applied.
+             g_cfg (GenerationConfig): Generation configuration.
+
+         Returns:
+             str: The generated text.
+         """
+         pass
+
+     def get_profiling_data(self) -> Optional[ProfilingData]:
+         """Get profiling data from the last generation."""
+         pass
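As a quick orientation for the nexaai/vlm.py interface above, here is a minimal usage sketch. It is not shipped with the package: the model path, the "path" field on the image content block, and the direct use of the private _load_from classmethod are assumptions based only on the code shown in this diff.

from nexaai.common import ModelConfig, GenerationConfig
from nexaai.vlm import VLM

# Hypothetical model location; _load_from routes to MlxVlmImpl when plugin_id
# is "mlx" and to PyBindVLMImpl otherwise.
vlm = VLM._load_from(
    local_path="/path/to/model",   # placeholder
    mmproj_path=None,
    m_cfg=ModelConfig(),
    plugin_id="mlx",
)

messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image."},
            {"type": "image", "path": "/path/to/image.png"},  # field name assumed
        ],
    },
]

prompt = vlm.apply_chat_template(messages)
vlm.reset()  # clear context/KV cache before a fresh conversation, per the docstring
for token in vlm.generate_stream(prompt, GenerationConfig()):
    print(token, end="", flush=True)
vlm.eject()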
File without changes
nexaai/vlm_impl/mlx_vlm_impl.py ADDED
@@ -0,0 +1,259 @@
+ from typing import Generator, Optional, List, Dict, Any, Union
+
+ from nexaai.base import ProfilingData
+ from nexaai.common import ModelConfig, GenerationConfig, MultiModalMessage, PluginID
+ from nexaai.vlm import VLM
+ from nexaai.mlx_backend.vlm.interface import VLM as MLXVLMInterface
+ from nexaai.mlx_backend.ml import ModelConfig as MLXModelConfig, SamplerConfig as MLXSamplerConfig, GenerationConfig as MLXGenerationConfig, EmbeddingConfig
+
+
+ class MlxVlmImpl(VLM):
+     def __init__(self, m_cfg: ModelConfig = ModelConfig()):
+         """Initialize MLX VLM implementation."""
+         super().__init__(m_cfg)
+         self._mlx_vlm = None
+
+     @classmethod
+     def _load_from(cls,
+                    local_path: str,
+                    mmproj_path: str = None,
+                    model_name: Optional[str] = None,
+                    m_cfg: ModelConfig = ModelConfig(),
+                    plugin_id: Union[PluginID, str] = PluginID.MLX,
+                    device_id: Optional[str] = None
+                    ) -> 'MlxVlmImpl':
+         """Load VLM model from local path using MLX backend.
+
+         Args:
+             local_path: Path to the main model file
+             mmproj_path: Path to the multimodal projection file (not used in MLX VLM)
+             m_cfg: Model configuration
+             plugin_id: Plugin identifier
+             device_id: Optional device ID
+
+         Returns:
+             MlxVlmImpl instance
+         """
+         try:
+             # MLX interface is already imported
+
+             # Create instance and load MLX VLM
+             instance = cls(m_cfg)
+             instance._mlx_vlm = MLXVLMInterface(
+                 model_name=model_name,
+                 model_path=local_path,
+                 mmproj_path=mmproj_path,  # MLX VLM may not use this, but pass it anyway
+                 context_length=m_cfg.n_ctx,
+                 device=device_id
+             )
+
+             return instance
+         except Exception as e:
+             raise RuntimeError(f"Failed to load MLX VLM: {str(e)}")
+
+     def eject(self):
+         """Release the model from memory."""
+         if self._mlx_vlm:
+             self._mlx_vlm.destroy()
+             self._mlx_vlm = None
+
+     def reset(self):
+         """
+         Reset the VLM model context and KV cache.
+         """
+         if not self._mlx_vlm:
+             raise RuntimeError("MLX VLM not loaded")
+
+         try:
+             self._mlx_vlm.reset()
+         except Exception as e:
+             raise RuntimeError(f"Failed to reset MLX VLM: {str(e)}")
+
+     def apply_chat_template(
+         self,
+         messages: List[MultiModalMessage],
+         tools: Optional[List[Dict[str, Any]]] = None,
+         enable_thinking: bool = True
+     ) -> str:
+         """Apply the chat template to multimodal messages."""
+         if not self._mlx_vlm:
+             raise RuntimeError("MLX VLM not loaded")
+
+         try:
+             mlx_messages = []
+             total_images = 0
+             total_audios = 0
+
+             for msg in messages:
+                 # Create a simple object with role and content attributes
+                 class MLXChatMessage:
+                     def __init__(self, role, content):
+                         self.role = role
+                         self.content = content
+
+                 # Extract text content and count media files
+                 text_content = ""
+                 first_content = True
+
+                 for content_item in msg["content"]:
+                     content_type = content_item.get("type", "")
+
+                     if content_type == "text":
+                         if not first_content:
+                             text_content += " "
+                         text_content += content_item.get("text", "")
+                         first_content = False
+                     elif content_type == "image":
+                         total_images += 1
+                     elif content_type == "audio":
+                         total_audios += 1
+
+                 mlx_messages.append(MLXChatMessage(msg["role"], text_content))
+
+             if total_images > 0 or total_audios > 0:
+                 # Use apply_chat_template_with_media when media is present
+                 return self._mlx_vlm.apply_chat_template_with_media(
+                     mlx_messages,
+                     num_images=total_images,
+                     num_audios=total_audios,
+                     tools=tools,
+                     enable_thinking=enable_thinking
+                 )
+             else:
+                 # Use regular apply_chat_template for text-only messages
+                 return self._mlx_vlm.apply_chat_template(mlx_messages)
+
+         except Exception as e:
+             raise RuntimeError(f"Failed to apply chat template: {str(e)}")
+
+     def generate_stream(self, prompt: str, g_cfg: GenerationConfig = GenerationConfig()) -> Generator[str, None, None]:
+         """Generate text with streaming."""
+         if not self._mlx_vlm:
+             raise RuntimeError("MLX VLM not loaded")
+
+         try:
+             # Convert GenerationConfig to MLX format
+             mlx_gen_config = MLXGenerationConfig()
+             mlx_gen_config.max_tokens = g_cfg.max_tokens
+             mlx_gen_config.stop = g_cfg.stop_words
+             mlx_gen_config.image_paths = g_cfg.image_paths
+             mlx_gen_config.audio_paths = g_cfg.audio_paths
+
+             if g_cfg.sampler_config:
+                 mlx_sampler_config = MLXSamplerConfig()
+                 mlx_sampler_config.temperature = g_cfg.sampler_config.temperature
+                 mlx_sampler_config.top_p = g_cfg.sampler_config.top_p
+                 mlx_sampler_config.top_k = g_cfg.sampler_config.top_k
+                 mlx_sampler_config.repetition_penalty = g_cfg.sampler_config.repetition_penalty
+                 mlx_sampler_config.presence_penalty = g_cfg.sampler_config.presence_penalty
+                 mlx_sampler_config.frequency_penalty = g_cfg.sampler_config.frequency_penalty
+                 mlx_sampler_config.seed = g_cfg.sampler_config.seed
+                 mlx_sampler_config.grammar_path = g_cfg.sampler_config.grammar_path
+                 mlx_sampler_config.grammar_string = g_cfg.sampler_config.grammar_string
+                 mlx_gen_config.sampler_config = mlx_sampler_config
+
+             import queue
+             import threading
+
+             # Create a queue for streaming tokens
+             token_queue = queue.Queue()
+             exception_container = [None]
+             self.reset_cancel()  # Reset cancel flag before generation
+
+             def token_callback(token: str, user_data: Any = None) -> bool:
+                 if self._cancel_event.is_set():
+                     token_queue.put(('end', None))
+                     return False
+                 try:
+                     token_queue.put(('token', token))
+                     return True
+                 except Exception as e:
+                     exception_container[0] = e
+                     return False
+
+             # Run generation in a separate thread
+             def generate():
+                 try:
+                     self._mlx_vlm.generate_stream(prompt, mlx_gen_config, token_callback)
+                 except Exception as e:
+                     exception_container[0] = e
+                 finally:
+                     token_queue.put(('end', None))
+
+             thread = threading.Thread(target=generate)
+             thread.start()
+
+             # Yield tokens as they come from the queue
+             while True:
+                 if exception_container[0]:
+                     raise exception_container[0]
+
+                 try:
+                     msg_type, token = token_queue.get(timeout=0.1)
+                     if msg_type == 'end':
+                         break
+                     elif msg_type == 'token':
+                         yield token
+                 except queue.Empty:
+                     if not thread.is_alive():
+                         break
+                     continue
+
+             thread.join()
+
+             if exception_container[0]:
+                 raise exception_container[0]
+
+         except Exception as e:
+             raise RuntimeError(f"Failed to generate streaming text: {str(e)}")
+
+     def generate(self, prompt: str, g_cfg: GenerationConfig = GenerationConfig()) -> str:
+         """
+         Generate text without streaming.
+
+         Args:
+             prompt (str): The prompt to generate text from.
+             g_cfg (GenerationConfig): Generation configuration.
+
+         Returns:
+             str: The generated text.
+         """
+         if not self._mlx_vlm:
+             raise RuntimeError("MLX VLM not loaded")
+
+         try:
+             # Convert GenerationConfig to MLX format
+             mlx_gen_config = MLXGenerationConfig()
+             mlx_gen_config.max_tokens = g_cfg.max_tokens
+             mlx_gen_config.stop = g_cfg.stop_words
+             mlx_gen_config.image_paths = g_cfg.image_paths
+             mlx_gen_config.audio_paths = g_cfg.audio_paths
+
+             if g_cfg.sampler_config:
+                 mlx_sampler_config = MLXSamplerConfig()
+                 mlx_sampler_config.temperature = g_cfg.sampler_config.temperature
+                 mlx_sampler_config.top_p = g_cfg.sampler_config.top_p
+                 mlx_sampler_config.top_k = g_cfg.sampler_config.top_k
+                 mlx_sampler_config.repetition_penalty = g_cfg.sampler_config.repetition_penalty
+                 mlx_sampler_config.presence_penalty = g_cfg.sampler_config.presence_penalty
+                 mlx_sampler_config.frequency_penalty = g_cfg.sampler_config.frequency_penalty
+                 mlx_sampler_config.seed = g_cfg.sampler_config.seed
+                 mlx_sampler_config.grammar_path = g_cfg.sampler_config.grammar_path
+                 mlx_sampler_config.grammar_string = g_cfg.sampler_config.grammar_string
+                 mlx_gen_config.sampler_config = mlx_sampler_config
+
+             # Simple token callback that just continues
+             def token_callback(token: str, user_data: Any = None) -> bool:
+                 return not self._cancel_event.is_set()
+
+             # Use MLX streaming generation and return the full result
+             return self._mlx_vlm.generate_stream(prompt, mlx_gen_config, token_callback)
+
+         except Exception as e:
+             raise RuntimeError(f"Failed to generate text: {str(e)}")
+
+     def get_profiling_data(self) -> Optional[ProfilingData]:
+         """Get profiling data from the last generation."""
+         if not self._mlx_vlm:
+             raise RuntimeError("MLX VLM not loaded")
+         return self._mlx_vlm.get_profiling_data()
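The MLX implementation above bridges a callback-based backend (MLXVLMInterface.generate_stream) to a Python generator by pushing tokens through a queue from a worker thread. The following standalone sketch shows the same pattern with only the standard library; fake_backend_generate is an illustrative stand-in, not part of nexaai.

import queue
import threading
from typing import Callable, Generator

def fake_backend_generate(prompt: str, on_token: Callable[[str], bool]) -> None:
    # Stand-in for a callback-driven backend; a False return asks it to stop early.
    for word in prompt.split():
        if not on_token(word + " "):
            return

def stream(prompt: str) -> Generator[str, None, None]:
    token_queue = queue.Queue()  # carries ("token", text) and ("end", None) items
    error = [None]               # surfaces worker exceptions to the consumer

    def on_token(token: str) -> bool:
        token_queue.put(("token", token))
        return True

    def worker() -> None:
        try:
            fake_backend_generate(prompt, on_token)
        except Exception as exc:
            error[0] = exc
        finally:
            token_queue.put(("end", None))  # always unblock the consumer

    thread = threading.Thread(target=worker)
    thread.start()
    while True:
        kind, token = token_queue.get()
        if kind == "end":
            break
        yield token
    thread.join()
    if error[0] is not None:
        raise error[0]

if __name__ == "__main__":
    print("".join(stream("hello from the worker thread")))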
nexaai/vlm_impl/pybind_vlm_impl.py ADDED
@@ -0,0 +1,275 @@
+ from typing import Generator, Optional, List, Dict, Any, Union
+ import queue
+ import threading
+ from pathlib import Path
+
+ from nexaai.common import ModelConfig, GenerationConfig, MultiModalMessage, PluginID
+ from nexaai.binds import vlm_bind, common_bind
+ from nexaai.runtime import _ensure_runtime
+ from nexaai.vlm import VLM
+ from nexaai.base import ProfilingData
+ from nexaai.runtime_error import ContextLengthExceededError, GenerationError
+
+ # Error codes from ml.h
+ ML_SUCCESS = 0
+ ML_ERROR_LLM_TOKENIZATION_CONTEXT_LENGTH = -200004
+
+
+ class PyBindVLMImpl(VLM):
+     def __init__(self, handle: any, m_cfg: ModelConfig = ModelConfig()):
+         """Private constructor, should not be called directly."""
+         super().__init__(m_cfg)
+         self._handle = handle  # This is a py::capsule
+         self._profiling_data = None
+
+     @classmethod
+     def _load_from(
+         cls,
+         local_path: str,
+         mmproj_path: str = None,
+         model_name: Optional[str] = None,
+         m_cfg: ModelConfig = ModelConfig(),
+         plugin_id: Union[PluginID, str] = PluginID.LLAMA_CPP,
+         device_id: Optional[str] = None,
+     ) -> "PyBindVLMImpl":
+         """Load VLM model from local path.
+
+         Args:
+             local_path: Path to the main model file
+             mmproj_path: Path to the multimodal projection file
+             m_cfg: Model configuration
+             plugin_id: Plugin identifier
+             device_id: Optional device ID (not used in current binding)
+
+         Returns:
+             PyBindVLMImpl instance
+         """
+         _ensure_runtime()
+
+         config = common_bind.ModelConfig()
+
+         config.n_ctx = m_cfg.n_ctx
+         if m_cfg.n_threads is not None:
+             config.n_threads = m_cfg.n_threads
+         if m_cfg.n_threads_batch is not None:
+             config.n_threads_batch = m_cfg.n_threads_batch
+         if m_cfg.n_batch is not None:
+             config.n_batch = m_cfg.n_batch
+         if m_cfg.n_ubatch is not None:
+             config.n_ubatch = m_cfg.n_ubatch
+         if m_cfg.n_seq_max is not None:
+             config.n_seq_max = m_cfg.n_seq_max
+         config.n_gpu_layers = m_cfg.n_gpu_layers
+
+         # handle chat template strings
+         if m_cfg.chat_template_path:
+             config.chat_template_path = m_cfg.chat_template_path
+
+         if m_cfg.chat_template_content:
+             config.chat_template_content = m_cfg.chat_template_content
+
+         # handle system prompt (required for NPU plugin)
+         if m_cfg.system_prompt:
+             config.system_prompt = m_cfg.system_prompt
+
+         # Create handle : returns py::capsule with automatic cleanup
+         # Convert enum to string for C++ binding
+         plugin_id_str = (
+             plugin_id.value if isinstance(plugin_id, PluginID) else plugin_id
+         )
+         handle = vlm_bind.create_vlm(
+             model_path=local_path,
+             mmproj_path=mmproj_path,
+             model_name=model_name,
+             model_config=config,
+             plugin_id=plugin_id_str,
+             device_id=device_id,
+         )
+         return cls(handle, m_cfg)
+
+     def eject(self):
+         """Release the model from memory."""
+         # py::capsule handles cleanup automatically
+         del self._handle
+         self._handle = None
+
+     def reset(self):
+         """
+         Reset the VLM model context and KV cache. If not reset, the model will skip the number of evaluated tokens and treat tokens after those as the new incremental tokens.
+         If your past chat history changed, or you are starting a new chat, you should always reset the model before running generate.
+         """
+         vlm_bind.ml_vlm_reset(self._handle)
+
+     def apply_chat_template(
+         self,
+         messages: List[MultiModalMessage],
+         tools: Optional[List[Dict[str, Any]]] = None,
+         enable_thinking: bool = True,
+     ) -> str:
+         """Apply the chat template to multimodal messages."""
+         payload = []
+         for msg in messages:
+             role = msg["role"]
+             blocks = []
+
+             for c in msg["content"]:
+                 t = c["type"]
+                 if t == "text":
+                     blocks.append({"type": "text", "text": c.get("text", "") or ""})
+                 else:
+                     # Pass through the original structure for image, audio, and any other types
+                     # Let vlm-bind.cpp handle field extraction (text/url/path)
+                     blocks.append(c)
+
+             payload.append({"role": role, "content": blocks})
+
+         result = vlm_bind.ml_vlm_apply_chat_template(
+             self._handle, payload, tools, enable_thinking
+         )
+         return result
+
+     def generate_stream(
+         self, prompt: str, g_cfg: GenerationConfig = GenerationConfig()
+     ) -> Generator[str, None, None]:
+         """Generate text with streaming."""
+         token_queue = queue.Queue()
+         exception_container = [None]
+         self.reset_cancel()  # Reset cancel flag before generation
+
+         def on_token(token: str, user_data) -> bool:
+             if self._cancel_event.is_set():
+                 token_queue.put(("end", None))
+                 return False  # Stop generation
+             try:
+                 token_queue.put(("token", token))
+                 return True  # Continue generation
+             except Exception as e:
+                 exception_container[0] = e
+                 return False  # Stop generation
+
+         config = self._convert_generation_config(g_cfg)
+
+         # Run generation in thread
+         def generate():
+             try:
+                 result = vlm_bind.ml_vlm_generate(
+                     handle=self._handle,
+                     prompt=prompt,
+                     config=config,
+                     on_token=on_token,
+                     user_data=None,
+                 )
+
+                 # Check for errors in result
+                 error_code = result.get("error_code", ML_SUCCESS)
+                 if error_code != ML_SUCCESS:
+                     error_message = result.get("error_message", "Unknown error")
+                     if error_code == ML_ERROR_LLM_TOKENIZATION_CONTEXT_LENGTH:
+                         exception_container[0] = ContextLengthExceededError(
+                             error_message, error_code
+                         )
+                     else:
+                         exception_container[0] = GenerationError(
+                             error_message, error_code
+                         )
+                     token_queue.put(("end", None))
+                     return
+
+                 self._profiling_data = ProfilingData.from_dict(
+                     result.get("profile_data", {})
+                 )
+             except Exception as e:
+                 exception_container[0] = e
+             finally:
+                 token_queue.put(("end", None))
+
+         thread = threading.Thread(target=generate)
+         thread.start()
+
+         # Yield tokens as they come
+         try:
+             while True:
+                 msg_type, token = token_queue.get()
+                 if msg_type == "token":
+                     yield token
+                 elif msg_type in ("error", "end"):
+                     break
+         finally:
+             thread.join()
+
+         if exception_container[0]:
+             raise exception_container[0]
+
+     def generate(
+         self, prompt: str, g_cfg: GenerationConfig = GenerationConfig()
+     ) -> str:
+         """
+         Generate text without streaming.
+
+         Args:
+             prompt (str): The prompt to generate text from. For chat models, this is the chat messages after chat template is applied.
+             g_cfg (GenerationConfig): Generation configuration.
+
+         Returns:
+             str: The generated text.
+         """
+         config = self._convert_generation_config(g_cfg)
+         result = vlm_bind.ml_vlm_generate(
+             handle=self._handle,
+             prompt=prompt,
+             config=config,
+             on_token=None,  # No callback for non-streaming
+             user_data=None,
+         )
+
+         # Check for errors in result
+         error_code = result.get("error_code", ML_SUCCESS)
+         if error_code != ML_SUCCESS:
+             error_message = result.get("error_message", "Unknown error")
+             if error_code == ML_ERROR_LLM_TOKENIZATION_CONTEXT_LENGTH:
+                 raise ContextLengthExceededError(error_message, error_code)
+             else:
+                 raise GenerationError(error_message, error_code)
+
+         self._profiling_data = ProfilingData.from_dict(result.get("profile_data", {}))
+         return result.get("text", "")
+
+     def get_profiling_data(self) -> Optional[ProfilingData]:
+         """Get profiling data."""
+         return self._profiling_data
+
+     def _convert_generation_config(self, g_cfg: GenerationConfig):
+         """Convert GenerationConfig to binding format."""
+         config = common_bind.GenerationConfig()
+
+         # Set basic generation parameters
+         config.max_tokens = g_cfg.max_tokens
+
+         if g_cfg.stop_words:
+             config.stop = g_cfg.stop_words
+
+         if g_cfg.image_paths:
+             config.image_paths = g_cfg.image_paths
+
+         if g_cfg.audio_paths:
+             config.audio_paths = g_cfg.audio_paths
+
+         if g_cfg.sampler_config:
+             sampler = common_bind.SamplerConfig()
+             sampler.temperature = g_cfg.sampler_config.temperature
+             sampler.top_p = g_cfg.sampler_config.top_p
+             sampler.top_k = g_cfg.sampler_config.top_k
+             sampler.repetition_penalty = g_cfg.sampler_config.repetition_penalty
+             sampler.presence_penalty = g_cfg.sampler_config.presence_penalty
+             sampler.frequency_penalty = g_cfg.sampler_config.frequency_penalty
+             sampler.seed = g_cfg.sampler_config.seed
+
+             if g_cfg.sampler_config.grammar_path:
+                 sampler.grammar_path = g_cfg.sampler_config.grammar_path
+
+             if g_cfg.sampler_config.grammar_string:
+                 sampler.grammar_string = g_cfg.sampler_config.grammar_string
+
+             config.sampler_config = sampler
+
+         return config
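The PyBind implementation above maps non-zero error codes from ml_vlm_generate to ContextLengthExceededError (for ML_ERROR_LLM_TOKENIZATION_CONTEXT_LENGTH, -200004) or GenerationError. Below is a hedged sketch of handling those exceptions at the call site; it assumes an already loaded PyBindVLMImpl instance, and the reset-then-reraise policy is illustrative, not prescribed by the package.

from nexaai.common import GenerationConfig
from nexaai.runtime_error import ContextLengthExceededError, GenerationError

def generate_or_report(vlm, prompt: str) -> str:
    # vlm is assumed to be a loaded PyBindVLMImpl (or any VLM subclass) instance.
    try:
        return vlm.generate(prompt, GenerationConfig())
    except ContextLengthExceededError:
        # The prompt did not fit in the configured context window (n_ctx).
        vlm.reset()  # clear the KV cache before retrying with a shorter prompt
        raise
    except GenerationError as exc:
        # Any other non-zero error_code from the binding is wrapped here.
        raise RuntimeError(f"VLM generation failed: {exc}") from exc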