PyPI - nexaai - Versions diffs - 1.0.4rc10__py3-none-macosx_11_0_arm64.whl - Mend

nexaai 1.0.4rc10__py3-none-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of nexaai might be problematic. Click here for more details.

Files changed (519) hide show

nexaai/vlm_impl/mlx_vlm_impl.py ADDED Viewed

@@ -0,0 +1,205 @@
+from typing import Generator, Optional, List, Dict, Any
+from nexaai.common import ModelConfig, GenerationConfig, MultiModalMessage
+from nexaai.vlm import VLM
+from nexaai.mlx_backend.vlm.interface import VLM as MLXVLMInterface
+from nexaai.mlx_backend.ml import ModelConfig as MLXModelConfig, SamplerConfig as MLXSamplerConfig, GenerationConfig as MLXGenerationConfig, EmbeddingConfig
+class MlxVlmImpl(VLM):
+    def __init__(self, m_cfg: ModelConfig = ModelConfig()):
+        """Initialize MLX VLM implementation."""
+        super().__init__(m_cfg)
+        self._mlx_vlm = None
+    @classmethod
+    def _load_from(cls,
+                   local_path: str,
+                   mmproj_path: str,
+                   m_cfg: ModelConfig = ModelConfig(),
+                   plugin_id: str = "mlx",
+                   device_id: Optional[str] = None
+        ) -> 'MlxVlmImpl':
+        """Load VLM model from local path using MLX backend.
+        Args:
+            local_path: Path to the main model file
+            mmproj_path: Path to the multimodal projection file (not used in MLX VLM)
+            m_cfg: Model configuration
+            plugin_id: Plugin identifier
+            device_id: Optional device ID
+        Returns:
+            MlxVlmImpl instance
+        """
+        try:
+            # MLX interface is already imported
+            # Create instance and load MLX VLM
+            instance = cls(m_cfg)
+            instance._mlx_vlm = MLXVLMInterface(
+                model_path=local_path,
+                mmproj_path=mmproj_path,  # MLX VLM may not use this, but pass it anyway
+                context_length=m_cfg.n_ctx,
+                device=device_id
+            )
+            return instance
+        except Exception as e:
+            raise RuntimeError(f"Failed to load MLX VLM: {str(e)}")
+    def eject(self):
+        """Release the model from memory."""
+        if self._mlx_vlm:
+            self._mlx_vlm.destroy()
+            self._mlx_vlm = None
+    def reset(self):
+        """
+        Reset the VLM model context and KV cache.
+        """
+        if not self._mlx_vlm:
+            raise RuntimeError("MLX VLM not loaded")
+        try:
+            self._mlx_vlm.reset()
+        except Exception as e:
+            raise RuntimeError(f"Failed to reset MLX VLM: {str(e)}")
+    def apply_chat_template(
+        self,
+        messages: List[MultiModalMessage],
+        tools: Optional[List[Dict[str, Any]]] = None
+    ) -> str:
+        """Apply the chat template to multimodal messages."""
+        if not self._mlx_vlm:
+            raise RuntimeError("MLX VLM not loaded")
+        try:
+            # Convert MultiModalMessage to MLX format
+            mlx_messages = []
+            for msg in messages:
+                # Create a simple object with role and content attributes
+                class MLXChatMessage:
+                    def __init__(self, role, content):
+                        self.role = role
+                        self.content = content
+                # For MLX VLM, we need to extract text content from multimodal messages
+                # This is a simplified approach - the actual implementation may need
+                # more sophisticated handling of different content types
+                text_content = ""
+                for content_item in msg["content"]:
+                    if content_item["type"] == "text":
+                        text_content += content_item.get("text", "")
+                    # Note: image/audio/video content is typically handled separately
+                    # in the generation phase, not in the chat template
+                mlx_messages.append(MLXChatMessage(msg["role"], text_content))
+            return self._mlx_vlm.apply_chat_template(mlx_messages)
+        except Exception as e:
+            raise RuntimeError(f"Failed to apply chat template: {str(e)}")
+    def generate_stream(self, prompt: str, g_cfg: GenerationConfig = GenerationConfig()) -> Generator[str, None, None]:
+        """Generate text with streaming."""
+        if not self._mlx_vlm:
+            raise RuntimeError("MLX VLM not loaded")
+        try:
+            # Get MLX config classes
+            _, MLXSamplerConfig, MLXGenerationConfig, _ = get_mlx_configs()
+            # Convert GenerationConfig to MLX format
+            mlx_gen_config = MLXGenerationConfig()
+            mlx_gen_config.max_tokens = g_cfg.max_tokens
+            mlx_gen_config.stop = g_cfg.stop_words
+            mlx_gen_config.image_paths = g_cfg.image_paths
+            mlx_gen_config.audio_paths = g_cfg.audio_paths
+            if g_cfg.sampler_config:
+                mlx_sampler_config = MLXSamplerConfig()
+                mlx_sampler_config.temperature = g_cfg.sampler_config.temperature
+                mlx_sampler_config.top_p = g_cfg.sampler_config.top_p
+                mlx_sampler_config.top_k = g_cfg.sampler_config.top_k
+                mlx_sampler_config.repetition_penalty = g_cfg.sampler_config.repetition_penalty
+                mlx_sampler_config.presence_penalty = g_cfg.sampler_config.presence_penalty
+                mlx_sampler_config.frequency_penalty = g_cfg.sampler_config.frequency_penalty
+                mlx_sampler_config.seed = g_cfg.sampler_config.seed
+                mlx_sampler_config.grammar_path = g_cfg.sampler_config.grammar_path
+                mlx_sampler_config.grammar_string = g_cfg.sampler_config.grammar_string
+                mlx_gen_config.sampler_config = mlx_sampler_config
+            # Create a token callback for streaming
+            def token_callback(token: str) -> bool:
+                # Check if generation should be cancelled
+                return not self._cancel_event.is_set()
+            # Use MLX VLM streaming generation
+            result = self._mlx_vlm.generate_stream(prompt, mlx_gen_config, token_callback)
+            # MLX VLM interface returns a GenerationResult, extract the text
+            if hasattr(result, 'text') and result.text:
+                # Split the result into words and yield them
+                words = result.text.split()
+                for i, word in enumerate(words):
+                    if self._cancel_event.is_set():
+                        break
+                    if i == 0:
+                        yield word
+                    else:
+                        yield " " + word
+        except Exception as e:
+            raise RuntimeError(f"Failed to generate streaming text: {str(e)}")
+    def generate(self, prompt: str, g_cfg: GenerationConfig = GenerationConfig()) -> str:
+        """
+        Generate text without streaming.
+        Args:
+            prompt (str): The prompt to generate text from.
+            g_cfg (GenerationConfig): Generation configuration.
+        Returns:
+            str: The generated text.
+        """
+        if not self._mlx_vlm:
+            raise RuntimeError("MLX VLM not loaded")
+        try:
+            # Get MLX config classes
+            _, MLXSamplerConfig, MLXGenerationConfig, _ = get_mlx_configs()
+            # Convert GenerationConfig to MLX format
+            mlx_gen_config = MLXGenerationConfig()
+            mlx_gen_config.max_tokens = g_cfg.max_tokens
+            mlx_gen_config.stop = g_cfg.stop_words
+            mlx_gen_config.image_paths = g_cfg.image_paths
+            mlx_gen_config.audio_paths = g_cfg.audio_paths
+            if g_cfg.sampler_config:
+                mlx_sampler_config = MLXSamplerConfig()
+                mlx_sampler_config.temperature = g_cfg.sampler_config.temperature
+                mlx_sampler_config.top_p = g_cfg.sampler_config.top_p
+                mlx_sampler_config.top_k = g_cfg.sampler_config.top_k
+                mlx_sampler_config.repetition_penalty = g_cfg.sampler_config.repetition_penalty
+                mlx_sampler_config.presence_penalty = g_cfg.sampler_config.presence_penalty
+                mlx_sampler_config.frequency_penalty = g_cfg.sampler_config.frequency_penalty
+                mlx_sampler_config.seed = g_cfg.sampler_config.seed
+                mlx_sampler_config.grammar_path = g_cfg.sampler_config.grammar_path
+                mlx_sampler_config.grammar_string = g_cfg.sampler_config.grammar_string
+                mlx_gen_config.sampler_config = mlx_sampler_config
+            # Use MLX VLM generation
+            result = self._mlx_vlm.generate(prompt, mlx_gen_config)
+            # MLX VLM interface returns a GenerationResult, extract the text
+            if hasattr(result, 'text'):
+                return result.text
+            else:
+                # Fallback if result is just a string
+                return str(result)
+        except Exception as e:
+            raise RuntimeError(f"Failed to generate text: {str(e)}")

nexaai/vlm_impl/pybind_vlm_impl.py ADDED Viewed

@@ -0,0 +1,228 @@
+from typing import Generator, Optional, List, Dict, Any, Union
+import queue
+import threading
+import base64
+from pathlib import Path
+from nexaai.common import ModelConfig, GenerationConfig, MultiModalMessage
+from nexaai.binds import vlm_bind, common_bind
+from nexaai.runtime import _ensure_runtime
+from nexaai.vlm import VLM
+class PyBindVLMImpl(VLM):
+    def __init__(self, handle: any, m_cfg: ModelConfig = ModelConfig()):
+        """Private constructor, should not be called directly."""
+        super().__init__(m_cfg)
+        self._handle = handle  # This is a py::capsule
+    @classmethod
+    def _load_from(cls,
+                   local_path: str,
+                   mmproj_path: str,
+                   m_cfg: ModelConfig = ModelConfig(),
+                   plugin_id: str = "llama_cpp",
+                   device_id: Optional[str] = None
+        ) -> 'PyBindVLMImpl':
+        """Load VLM model from local path.
+        Args:
+            local_path: Path to the main model file
+            mmproj_path: Path to the multimodal projection file
+            m_cfg: Model configuration
+            plugin_id: Plugin identifier
+            device_id: Optional device ID (not used in current binding)
+        Returns:
+            PyBindVLMImpl instance
+        """
+        _ensure_runtime()
+        config = common_bind.ModelConfig()
+        config.n_ctx = m_cfg.n_ctx
+        if m_cfg.n_threads is not None:
+            config.n_threads = m_cfg.n_threads
+        if m_cfg.n_threads_batch is not None:
+            config.n_threads_batch = m_cfg.n_threads_batch
+        if m_cfg.n_batch is not None:
+            config.n_batch = m_cfg.n_batch
+        if m_cfg.n_ubatch is not None:
+            config.n_ubatch = m_cfg.n_ubatch
+        if m_cfg.n_seq_max is not None:
+            config.n_seq_max = m_cfg.n_seq_max
+        config.n_gpu_layers = m_cfg.n_gpu_layers
+        # handle chat template strings
+        if m_cfg.chat_template_path:
+            config.chat_template_path = m_cfg.chat_template_path
+        if m_cfg.chat_template_content:
+            config.chat_template_content = m_cfg.chat_template_content
+        # Create handle : returns py::capsule with automatic cleanup
+        handle = vlm_bind.create_vlm(
+            model_path=local_path,
+            mmproj_path=mmproj_path,
+            model_config=config,
+            plugin_id=plugin_id,
+            device_id=device_id
+        )
+        return cls(handle, m_cfg)
+    def eject(self):
+        """Release the model from memory."""
+        # py::capsule handles cleanup automatically
+        del self._handle
+        self._handle = None
+    def reset(self):
+        """
+        Reset the VLM model context and KV cache. If not reset, the model will skip the number of evaluated tokens and treat tokens after those as the new incremental tokens.
+        If your past chat history changed, or you are starting a new chat, you should always reset the model before running generate.
+        """
+        vlm_bind.ml_vlm_reset(self._handle)
+    def apply_chat_template(
+        self,
+        messages: List[MultiModalMessage],
+        tools: Optional[List[Dict[str, Any]]] = None
+    ) -> str:
+        """Apply the chat template to multimodal messages."""
+        payload = []
+        for msg in messages:
+            role   = msg["role"]
+            blocks = []
+            for c in msg["content"]:
+                t = c["type"]
+                if t == "text":
+                    blocks.append({"type": "text", "text": c.get("text","") or ""})
+                else:
+                    # image/audio/video
+                    src = c.get("url") or c.get("path")
+                    if not src:
+                        raise ValueError(f"No url/path for {t}")
+                    # read local file or strip data URI
+                    if Path(src).exists():
+                        raw = Path(src).read_bytes()
+                        b64 = base64.b64encode(raw).decode("ascii")
+                        blocks.append({"type": t, "text": b64})
+                    elif src.startswith("data:"):
+                        b64 = src.split(",",1)[1]
+                        blocks.append({"type": t, "text": b64})
+                    else:
+                        # remote URL
+                        blocks.append({"type": t, "text": src})
+            payload.append({"role": role, "content": blocks})
+        return vlm_bind.ml_vlm_apply_chat_template(self._handle, payload, tools)
+    def generate_stream(self, prompt: str, g_cfg: GenerationConfig = GenerationConfig()) -> Generator[str, None, None]:
+        """Generate text with streaming."""
+        token_queue = queue.Queue()
+        exception_container = [None]
+        self.reset_cancel()  # Reset cancel flag before generation
+        def on_token(token: str, user_data) -> bool:
+            if self._cancel_event.is_set():
+                token_queue.put(('end', None))
+                return False  # Stop generation
+            try:
+                token_queue.put(('token', token))
+                return True  # Continue generation
+            except Exception as e:
+                exception_container[0] = e
+                return False  # Stop generation
+        config = self._convert_generation_config(g_cfg)
+        # Run generation in thread
+        def generate():
+            try:
+                vlm_bind.ml_vlm_generate(
+                    handle=self._handle,
+                    prompt=prompt,
+                    config=config,
+                    on_token=on_token,
+                    user_data=None
+                )
+            except Exception as e:
+                exception_container[0] = e
+            finally:
+                token_queue.put(('end', None))
+        thread = threading.Thread(target=generate)
+        thread.start()
+        # Yield tokens as they come
+        try:
+            while True:
+                msg_type, token = token_queue.get()
+                if msg_type == 'token':
+                    yield token
+                elif msg_type in ('error', 'end'):
+                    break
+        finally:
+            thread.join()
+        if exception_container[0]:
+            raise exception_container[0]
+    def generate(self, prompt: str, g_cfg: GenerationConfig = GenerationConfig()) -> str:
+        """
+        Generate text without streaming.
+        Args:
+            prompt (str): The prompt to generate text from. For chat models, this is the chat messages after chat template is applied.
+            g_cfg (GenerationConfig): Generation configuration.
+        Returns:
+            str: The generated text.
+        """
+        config = self._convert_generation_config(g_cfg)
+        result = vlm_bind.ml_vlm_generate(
+            handle=self._handle,
+            prompt=prompt,
+            config=config,
+            on_token=None,  # No callback for non-streaming
+            user_data=None
+        )
+        return result.get("text", "")
+    def _convert_generation_config(self, g_cfg: GenerationConfig):
+        """Convert GenerationConfig to binding format."""
+        config = common_bind.GenerationConfig()
+        # Set basic generation parameters
+        config.max_tokens = g_cfg.max_tokens
+        if g_cfg.stop_words:
+            config.stop = g_cfg.stop_words
+        if g_cfg.image_paths:
+            config.image_paths = g_cfg.image_paths
+        if g_cfg.audio_paths:
+            config.audio_paths = g_cfg.audio_paths
+        if g_cfg.sampler_config:
+            sampler = common_bind.SamplerConfig()
+            sampler.temperature = g_cfg.sampler_config.temperature
+            sampler.top_p = g_cfg.sampler_config.top_p
+            sampler.top_k = g_cfg.sampler_config.top_k
+            sampler.repetition_penalty = g_cfg.sampler_config.repetition_penalty
+            sampler.presence_penalty = g_cfg.sampler_config.presence_penalty
+            sampler.frequency_penalty = g_cfg.sampler_config.frequency_penalty
+            sampler.seed = g_cfg.sampler_config.seed
+            if g_cfg.sampler_config.grammar_path:
+                sampler.grammar_path = g_cfg.sampler_config.grammar_path
+            if g_cfg.sampler_config.grammar_string:
+                sampler.grammar_string = g_cfg.sampler_config.grammar_string
+            config.sampler_config = sampler
+        return config

nexaai-1.0.4rc10.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,26 @@
+Metadata-Version: 2.4
+Name: nexaai
+Version: 1.0.4rc10
+Summary: Python bindings for NexaSDK C-lib backend
+Author-email: "Nexa AI, Inc." <dev@nexa.ai>
+Project-URL: Homepage, https://github.com/NexaAI/nexasdk-bridge
+Classifier: Programming Language :: Python :: 3
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Requires-Python: >=3.7
+Description-Content-Type: text/markdown
+Requires-Dist: huggingface_hub
+Requires-Dist: tqdm
+Requires-Dist: hf_xet
+Requires-Dist: numpy
+Requires-Dist: httpx
+Provides-Extra: mlx
+Requires-Dist: mlx; extra == "mlx"
+Requires-Dist: mlx-lm; extra == "mlx"
+Requires-Dist: mlx-vlm; extra == "mlx"
+Requires-Dist: tokenizers; extra == "mlx"
+Requires-Dist: safetensors; extra == "mlx"
+Requires-Dist: Pillow; extra == "mlx"