PyPI - nexaai - Versions diffs - 1.0.21rc5__cp313-cp313-win_arm64.whl → 1.0.21rc14__cp313-cp313-win_arm64.whl - Mend

nexaai 1.0.21rc5__cp313-cp313-win_arm64.whl → 1.0.21rc14__cp313-cp313-win_arm64.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of nexaai might be problematic. Click here for more details.

Files changed (105)

nexaai/__init__.py +95 -95
nexaai/_stub.cp313-win_arm64.pyd +0 -0
nexaai/_version.py +4 -1
nexaai/asr.py +68 -65
nexaai/asr_impl/mlx_asr_impl.py +92 -92
nexaai/asr_impl/pybind_asr_impl.py +127 -44
nexaai/base.py +39 -39
nexaai/binds/__init__.py +6 -5
nexaai/binds/asr_bind.cp313-win_arm64.pyd +0 -0
nexaai/binds/common_bind.cp313-win_arm64.pyd +0 -0
nexaai/binds/cpu_gpu/ggml-base.dll +0 -0
nexaai/binds/cpu_gpu/ggml-cpu.dll +0 -0
nexaai/binds/cpu_gpu/ggml-opencl.dll +0 -0
nexaai/binds/cpu_gpu/ggml.dll +0 -0
nexaai/binds/cpu_gpu/mtmd.dll +0 -0
nexaai/binds/cpu_gpu/nexa_cpu_gpu.dll +0 -0
nexaai/binds/cpu_gpu/nexa_plugin.dll +0 -0
nexaai/binds/embedder_bind.cp313-win_arm64.pyd +0 -0
nexaai/binds/libcrypto-3-arm64.dll +0 -0
nexaai/binds/libssl-3-arm64.dll +0 -0
nexaai/binds/llm_bind.cp313-win_arm64.pyd +0 -0
nexaai/binds/nexa_bridge.dll +0 -0
nexaai/binds/npu/convnext-sdk.dll +0 -0
nexaai/binds/npu/embed-gemma-sdk.dll +0 -0
nexaai/binds/npu/ggml-base.dll +0 -0
nexaai/binds/npu/ggml-cpu.dll +0 -0
nexaai/binds/npu/ggml-opencl.dll +0 -0
nexaai/binds/npu/ggml.dll +0 -0
nexaai/binds/npu/granite-nano-sdk.dll +0 -0
nexaai/binds/npu/granite4-sdk.dll +0 -0
nexaai/binds/npu/jina-rerank-sdk.dll +0 -0
nexaai/binds/npu/liquid-sdk.dll +0 -0
nexaai/binds/npu/llama3-3b-sdk.dll +0 -0
nexaai/binds/npu/nexa-mm-process.dll +0 -0
nexaai/binds/npu/nexa-sampling.dll +0 -0
nexaai/binds/npu/nexa_plugin.dll +0 -0
nexaai/binds/npu/omni-neural-sdk.dll +0 -0
nexaai/binds/npu/openblas.dll +0 -0
nexaai/binds/npu/paddleocr-sdk.dll +0 -0
nexaai/binds/npu/parakeet-sdk.dll +0 -0
nexaai/binds/npu/phi3-5-sdk.dll +0 -0
nexaai/binds/npu/phi4-sdk.dll +0 -0
nexaai/binds/npu/pyannote-sdk.dll +0 -0
nexaai/binds/npu/qwen3-4b-sdk.dll +0 -0
nexaai/binds/npu/qwen3vl-sdk.dll +0 -0
nexaai/binds/npu/qwen3vl-vision.dll +0 -0
nexaai/binds/npu/yolov12-sdk.dll +0 -0
nexaai/binds/npu/zlib1.dll +0 -0
nexaai/binds/rerank_bind.cp313-win_arm64.pyd +0 -0
nexaai/binds/vlm_bind.cp313-win_arm64.pyd +0 -0
nexaai/common.py +105 -105
nexaai/cv.py +93 -93
nexaai/cv_impl/mlx_cv_impl.py +89 -89
nexaai/cv_impl/pybind_cv_impl.py +32 -32
nexaai/embedder.py +73 -73
nexaai/embedder_impl/mlx_embedder_impl.py +118 -118
nexaai/embedder_impl/pybind_embedder_impl.py +96 -96
nexaai/image_gen.py +141 -141
nexaai/image_gen_impl/mlx_image_gen_impl.py +292 -292
nexaai/image_gen_impl/pybind_image_gen_impl.py +85 -85
nexaai/llm.py +98 -98
nexaai/llm_impl/mlx_llm_impl.py +271 -271
nexaai/llm_impl/pybind_llm_impl.py +220 -220
nexaai/log.py +92 -92
nexaai/rerank.py +57 -57
nexaai/rerank_impl/mlx_rerank_impl.py +94 -94
nexaai/rerank_impl/pybind_rerank_impl.py +136 -136
nexaai/runtime.py +68 -68
nexaai/runtime_error.py +24 -24
nexaai/tts.py +75 -75
nexaai/tts_impl/mlx_tts_impl.py +94 -94
nexaai/tts_impl/pybind_tts_impl.py +43 -43
nexaai/utils/decode.py +17 -17
nexaai/utils/manifest_utils.py +531 -531
nexaai/utils/model_manager.py +1562 -1562
nexaai/utils/model_types.py +49 -49
nexaai/utils/progress_tracker.py +384 -384
nexaai/utils/quantization_utils.py +245 -245
nexaai/vlm.py +129 -129
nexaai/vlm_impl/mlx_vlm_impl.py +258 -258
nexaai/vlm_impl/pybind_vlm_impl.py +256 -256
{nexaai-1.0.21rc5.dist-info → nexaai-1.0.21rc14.dist-info}/METADATA +1 -1
nexaai-1.0.21rc14.dist-info/RECORD +154 -0
nexaai/binds/nexaml/FLAC.dll +0 -0
nexaai/binds/nexaml/fftw3.dll +0 -0
nexaai/binds/nexaml/fftw3f.dll +0 -0
nexaai/binds/nexaml/ggml-base.dll +0 -0
nexaai/binds/nexaml/ggml-cpu.dll +0 -0
nexaai/binds/nexaml/ggml-opencl.dll +0 -0
nexaai/binds/nexaml/ggml.dll +0 -0
nexaai/binds/nexaml/libmp3lame.DLL +0 -0
nexaai/binds/nexaml/mpg123.dll +0 -0
nexaai/binds/nexaml/nexa-mm-process.dll +0 -0
nexaai/binds/nexaml/nexa-sampling.dll +0 -0
nexaai/binds/nexaml/nexa_plugin.dll +0 -0
nexaai/binds/nexaml/nexaproc.dll +0 -0
nexaai/binds/nexaml/ogg.dll +0 -0
nexaai/binds/nexaml/opus.dll +0 -0
nexaai/binds/nexaml/qwen3-vl.dll +0 -0
nexaai/binds/nexaml/qwen3vl-vision.dll +0 -0
nexaai/binds/nexaml/vorbis.dll +0 -0
nexaai/binds/nexaml/vorbisenc.dll +0 -0
nexaai-1.0.21rc5.dist-info/RECORD +0 -162
{nexaai-1.0.21rc5.dist-info → nexaai-1.0.21rc14.dist-info}/WHEEL +0 -0
{nexaai-1.0.21rc5.dist-info → nexaai-1.0.21rc14.dist-info}/top_level.txt +0 -0

nexaai/vlm_impl/mlx_vlm_impl.py (changed)

@@ -1,259 +1,259 @@
-from typing import Generator, Optional, List, Dict, Any, Union
-from nexaai.base import ProfilingData
-from nexaai.common import ModelConfig, GenerationConfig, MultiModalMessage, PluginID
-from nexaai.vlm import VLM
-from nexaai.mlx_backend.vlm.interface import VLM as MLXVLMInterface
-from nexaai.mlx_backend.ml import ModelConfig as MLXModelConfig, SamplerConfig as MLXSamplerConfig, GenerationConfig as MLXGenerationConfig, EmbeddingConfig
-class MlxVlmImpl(VLM):
-    def __init__(self, m_cfg: ModelConfig = ModelConfig()):
-        """Initialize MLX VLM implementation."""
-        super().__init__(m_cfg)
-        self._mlx_vlm = None
-    @classmethod
-    def _load_from(cls,
-                   local_path: str,
-                   mmproj_path: str = None,
-                   model_name: Optional[str] = None,
-                   m_cfg: ModelConfig = ModelConfig(),
-                   plugin_id: Union[PluginID, str] = PluginID.MLX,
-                   device_id: Optional[str] = None
-        ) -> 'MlxVlmImpl':
-        """Load VLM model from local path using MLX backend.
-        Args:
-            local_path: Path to the main model file
-            mmproj_path: Path to the multimodal projection file (not used in MLX VLM)
-            m_cfg: Model configuration
-            plugin_id: Plugin identifier
-            device_id: Optional device ID
-        Returns:
-            MlxVlmImpl instance
-        """
-        try:
-            # MLX interface is already imported
-            # Create instance and load MLX VLM
-            instance = cls(m_cfg)
-            instance._mlx_vlm = MLXVLMInterface(
-                model_name=model_name,
-                model_path=local_path,
-                mmproj_path=mmproj_path,  # MLX VLM may not use this, but pass it anyway
-                context_length=m_cfg.n_ctx,
-                device=device_id
-            )
-            return instance
-        except Exception as e:
-            raise RuntimeError(f"Failed to load MLX VLM: {str(e)}")
-    def eject(self):
-        """Release the model from memory."""
-        if self._mlx_vlm:
-            self._mlx_vlm.destroy()
-            self._mlx_vlm = None
-    def reset(self):
-        """
-        Reset the VLM model context and KV cache.
-        """
-        if not self._mlx_vlm:
-            raise RuntimeError("MLX VLM not loaded")
-        try:
-            self._mlx_vlm.reset()
-        except Exception as e:
-            raise RuntimeError(f"Failed to reset MLX VLM: {str(e)}")
-    def apply_chat_template(
-        self,
-        messages: List[MultiModalMessage],
-        tools: Optional[List[Dict[str, Any]]] = None,
-        enable_thinking: bool = True
-    ) -> str:
-        """Apply the chat template to multimodal messages."""
-        if not self._mlx_vlm:
-            raise RuntimeError("MLX VLM not loaded")
-        try:
-            mlx_messages = []
-            total_images = 0
-            total_audios = 0
-            for msg in messages:
-                # Create a simple object with role and content attributes
-                class MLXChatMessage:
-                    def __init__(self, role, content):
-                        self.role = role
-                        self.content = content
-                # Extract text content and count media files
-                text_content = ""
-                first_content = True
-                for content_item in msg["content"]:
-                    content_type = content_item.get("type", "")
-                    if content_type == "text":
-                        if not first_content:
-                            text_content += " "
-                        text_content += content_item.get("text", "")
-                        first_content = False
-                    elif content_type == "image":
-                        total_images += 1
-                    elif content_type == "audio":
-                        total_audios += 1
-                mlx_messages.append(MLXChatMessage(msg["role"], text_content))
-            if total_images > 0 or total_audios > 0:
-                # Use apply_chat_template_with_media when media is present
-                return self._mlx_vlm.apply_chat_template_with_media(
-                    mlx_messages,
-                    num_images=total_images,
-                    num_audios=total_audios,
-                    tools=tools,
-                    enable_thinking=enable_thinking
-                )
-            else:
-                # Use regular apply_chat_template for text-only messages
-                return self._mlx_vlm.apply_chat_template(mlx_messages)
-        except Exception as e:
-            raise RuntimeError(f"Failed to apply chat template: {str(e)}")
-    def generate_stream(self, prompt: str, g_cfg: GenerationConfig = GenerationConfig()) -> Generator[str, None, None]:
-        """Generate text with streaming."""
-        if not self._mlx_vlm:
-            raise RuntimeError("MLX VLM not loaded")
-        try:
-            # Convert GenerationConfig to MLX format
-            mlx_gen_config = MLXGenerationConfig()
-            mlx_gen_config.max_tokens = g_cfg.max_tokens
-            mlx_gen_config.stop = g_cfg.stop_words
-            mlx_gen_config.image_paths = g_cfg.image_paths
-            mlx_gen_config.audio_paths = g_cfg.audio_paths
-            if g_cfg.sampler_config:
-                mlx_sampler_config = MLXSamplerConfig()
-                mlx_sampler_config.temperature = g_cfg.sampler_config.temperature
-                mlx_sampler_config.top_p = g_cfg.sampler_config.top_p
-                mlx_sampler_config.top_k = g_cfg.sampler_config.top_k
-                mlx_sampler_config.repetition_penalty = g_cfg.sampler_config.repetition_penalty
-                mlx_sampler_config.presence_penalty = g_cfg.sampler_config.presence_penalty
-                mlx_sampler_config.frequency_penalty = g_cfg.sampler_config.frequency_penalty
-                mlx_sampler_config.seed = g_cfg.sampler_config.seed
-                mlx_sampler_config.grammar_path = g_cfg.sampler_config.grammar_path
-                mlx_sampler_config.grammar_string = g_cfg.sampler_config.grammar_string
-                mlx_gen_config.sampler_config = mlx_sampler_config
-            import queue
-            import threading
-            # Create a queue for streaming tokens
-            token_queue = queue.Queue()
-            exception_container = [None]
-            self.reset_cancel()  # Reset cancel flag before generation
-            def token_callback(token: str, user_data: Any = None) -> bool:
-                if self._cancel_event.is_set():
-                    token_queue.put(('end', None))
-                    return False
-                try:
-                    token_queue.put(('token', token))
-                    return True
-                except Exception as e:
-                    exception_container[0] = e
-                    return False
-            # Run generation in a separate thread
-            def generate():
-                try:
-                    self._mlx_vlm.generate_stream(prompt, mlx_gen_config, token_callback)
-                except Exception as e:
-                    exception_container[0] = e
-                finally:
-                    token_queue.put(('end', None))
-            thread = threading.Thread(target=generate)
-            thread.start()
-            # Yield tokens as they come from the queue
-            while True:
-                if exception_container[0]:
-                    raise exception_container[0]
-                try:
-                    msg_type, token = token_queue.get(timeout=0.1)
-                    if msg_type == 'end':
-                        break
-                    elif msg_type == 'token':
-                        yield token
-                except queue.Empty:
-                    if not thread.is_alive():
-                        break
-                    continue
-            thread.join()
-            if exception_container[0]:
-                raise exception_container[0]
-        except Exception as e:
-            raise RuntimeError(f"Failed to generate streaming text: {str(e)}")
-    def generate(self, prompt: str, g_cfg: GenerationConfig = GenerationConfig()) -> str:
-        """
-        Generate text without streaming.
-        Args:
-            prompt (str): The prompt to generate text from.
-            g_cfg (GenerationConfig): Generation configuration.
-        Returns:
-            str: The generated text.
-        """
-        if not self._mlx_vlm:
-            raise RuntimeError("MLX VLM not loaded")
-        try:
-            # Convert GenerationConfig to MLX format
-            mlx_gen_config = MLXGenerationConfig()
-            mlx_gen_config.max_tokens = g_cfg.max_tokens
-            mlx_gen_config.stop = g_cfg.stop_words
-            mlx_gen_config.image_paths = g_cfg.image_paths
-            mlx_gen_config.audio_paths = g_cfg.audio_paths
-            if g_cfg.sampler_config:
-                mlx_sampler_config = MLXSamplerConfig()
-                mlx_sampler_config.temperature = g_cfg.sampler_config.temperature
-                mlx_sampler_config.top_p = g_cfg.sampler_config.top_p
-                mlx_sampler_config.top_k = g_cfg.sampler_config.top_k
-                mlx_sampler_config.repetition_penalty = g_cfg.sampler_config.repetition_penalty
-                mlx_sampler_config.presence_penalty = g_cfg.sampler_config.presence_penalty
-                mlx_sampler_config.frequency_penalty = g_cfg.sampler_config.frequency_penalty
-                mlx_sampler_config.seed = g_cfg.sampler_config.seed
-                mlx_sampler_config.grammar_path = g_cfg.sampler_config.grammar_path
-                mlx_sampler_config.grammar_string = g_cfg.sampler_config.grammar_string
-                mlx_gen_config.sampler_config = mlx_sampler_config
-            # Simple token callback that just continues
-            def token_callback(token: str, user_data: Any = None) -> bool:
-                return not self._cancel_event.is_set()
-            # Use MLX streaming generation and return the full result
-            return self._mlx_vlm.generate_stream(prompt, mlx_gen_config, token_callback)
-        except Exception as e:
-            raise RuntimeError(f"Failed to generate text: {str(e)}")
-    def get_profiling_data(self) -> Optional[ProfilingData]:
-        """Get profiling data from the last generation."""
-        if not self._mlx_vlm:
-            raise RuntimeError("MLX VLM not loaded")
+from typing import Generator, Optional, List, Dict, Any, Union
+from nexaai.base import ProfilingData
+from nexaai.common import ModelConfig, GenerationConfig, MultiModalMessage, PluginID
+from nexaai.vlm import VLM
+from nexaai.mlx_backend.vlm.interface import VLM as MLXVLMInterface
+from nexaai.mlx_backend.ml import ModelConfig as MLXModelConfig, SamplerConfig as MLXSamplerConfig, GenerationConfig as MLXGenerationConfig, EmbeddingConfig
+class MlxVlmImpl(VLM):
+    def __init__(self, m_cfg: ModelConfig = ModelConfig()):
+        """Initialize MLX VLM implementation."""
+        super().__init__(m_cfg)
+        self._mlx_vlm = None
+    @classmethod
+    def _load_from(cls,
+                   local_path: str,
+                   mmproj_path: str = None,
+                   model_name: Optional[str] = None,
+                   m_cfg: ModelConfig = ModelConfig(),
+                   plugin_id: Union[PluginID, str] = PluginID.MLX,
+                   device_id: Optional[str] = None
+        ) -> 'MlxVlmImpl':
+        """Load VLM model from local path using MLX backend.
+        Args:
+            local_path: Path to the main model file
+            mmproj_path: Path to the multimodal projection file (not used in MLX VLM)
+            m_cfg: Model configuration
+            plugin_id: Plugin identifier
+            device_id: Optional device ID
+        Returns:
+            MlxVlmImpl instance
+        """
+        try:
+            # MLX interface is already imported
+            # Create instance and load MLX VLM
+            instance = cls(m_cfg)
+            instance._mlx_vlm = MLXVLMInterface(
+                model_name=model_name,
+                model_path=local_path,
+                mmproj_path=mmproj_path,  # MLX VLM may not use this, but pass it anyway
+                context_length=m_cfg.n_ctx,
+                device=device_id
+            )
+            return instance
+        except Exception as e:
+            raise RuntimeError(f"Failed to load MLX VLM: {str(e)}")
+    def eject(self):
+        """Release the model from memory."""
+        if self._mlx_vlm:
+            self._mlx_vlm.destroy()
+            self._mlx_vlm = None
+    def reset(self):
+        """
+        Reset the VLM model context and KV cache.
+        """
+        if not self._mlx_vlm:
+            raise RuntimeError("MLX VLM not loaded")
+        try:
+            self._mlx_vlm.reset()
+        except Exception as e:
+            raise RuntimeError(f"Failed to reset MLX VLM: {str(e)}")
+    def apply_chat_template(
+        self,
+        messages: List[MultiModalMessage],
+        tools: Optional[List[Dict[str, Any]]] = None,
+        enable_thinking: bool = True
+    ) -> str:
+        """Apply the chat template to multimodal messages."""
+        if not self._mlx_vlm:
+            raise RuntimeError("MLX VLM not loaded")
+        try:
+            mlx_messages = []
+            total_images = 0
+            total_audios = 0
+            for msg in messages:
+                # Create a simple object with role and content attributes
+                class MLXChatMessage:
+                    def __init__(self, role, content):
+                        self.role = role
+                        self.content = content
+                # Extract text content and count media files
+                text_content = ""
+                first_content = True
+                for content_item in msg["content"]:
+                    content_type = content_item.get("type", "")
+                    if content_type == "text":
+                        if not first_content:
+                            text_content += " "
+                        text_content += content_item.get("text", "")
+                        first_content = False
+                    elif content_type == "image":
+                        total_images += 1
+                    elif content_type == "audio":
+                        total_audios += 1
+                mlx_messages.append(MLXChatMessage(msg["role"], text_content))
+            if total_images > 0 or total_audios > 0:
+                # Use apply_chat_template_with_media when media is present
+                return self._mlx_vlm.apply_chat_template_with_media(
+                    mlx_messages,
+                    num_images=total_images,
+                    num_audios=total_audios,
+                    tools=tools,
+                    enable_thinking=enable_thinking
+                )
+            else:
+                # Use regular apply_chat_template for text-only messages
+                return self._mlx_vlm.apply_chat_template(mlx_messages)
+        except Exception as e:
+            raise RuntimeError(f"Failed to apply chat template: {str(e)}")
+    def generate_stream(self, prompt: str, g_cfg: GenerationConfig = GenerationConfig()) -> Generator[str, None, None]:
+        """Generate text with streaming."""
+        if not self._mlx_vlm:
+            raise RuntimeError("MLX VLM not loaded")
+        try:
+            # Convert GenerationConfig to MLX format
+            mlx_gen_config = MLXGenerationConfig()
+            mlx_gen_config.max_tokens = g_cfg.max_tokens
+            mlx_gen_config.stop = g_cfg.stop_words
+            mlx_gen_config.image_paths = g_cfg.image_paths
+            mlx_gen_config.audio_paths = g_cfg.audio_paths
+            if g_cfg.sampler_config:
+                mlx_sampler_config = MLXSamplerConfig()
+                mlx_sampler_config.temperature = g_cfg.sampler_config.temperature
+                mlx_sampler_config.top_p = g_cfg.sampler_config.top_p
+                mlx_sampler_config.top_k = g_cfg.sampler_config.top_k
+                mlx_sampler_config.repetition_penalty = g_cfg.sampler_config.repetition_penalty
+                mlx_sampler_config.presence_penalty = g_cfg.sampler_config.presence_penalty
+                mlx_sampler_config.frequency_penalty = g_cfg.sampler_config.frequency_penalty
+                mlx_sampler_config.seed = g_cfg.sampler_config.seed
+                mlx_sampler_config.grammar_path = g_cfg.sampler_config.grammar_path
+                mlx_sampler_config.grammar_string = g_cfg.sampler_config.grammar_string
+                mlx_gen_config.sampler_config = mlx_sampler_config
+            import queue
+            import threading
+            # Create a queue for streaming tokens
+            token_queue = queue.Queue()
+            exception_container = [None]
+            self.reset_cancel()  # Reset cancel flag before generation
+            def token_callback(token: str, user_data: Any = None) -> bool:
+                if self._cancel_event.is_set():
+                    token_queue.put(('end', None))
+                    return False
+                try:
+                    token_queue.put(('token', token))
+                    return True
+                except Exception as e:
+                    exception_container[0] = e
+                    return False
+            # Run generation in a separate thread
+            def generate():
+                try:
+                    self._mlx_vlm.generate_stream(prompt, mlx_gen_config, token_callback)
+                except Exception as e:
+                    exception_container[0] = e
+                finally:
+                    token_queue.put(('end', None))
+            thread = threading.Thread(target=generate)
+            thread.start()
+            # Yield tokens as they come from the queue
+            while True:
+                if exception_container[0]:
+                    raise exception_container[0]
+                try:
+                    msg_type, token = token_queue.get(timeout=0.1)
+                    if msg_type == 'end':
+                        break
+                    elif msg_type == 'token':
+                        yield token
+                except queue.Empty:
+                    if not thread.is_alive():
+                        break
+                    continue
+            thread.join()
+            if exception_container[0]:
+                raise exception_container[0]
+        except Exception as e:
+            raise RuntimeError(f"Failed to generate streaming text: {str(e)}")
+    def generate(self, prompt: str, g_cfg: GenerationConfig = GenerationConfig()) -> str:
+        """
+        Generate text without streaming.
+        Args:
+            prompt (str): The prompt to generate text from.
+            g_cfg (GenerationConfig): Generation configuration.
+        Returns:
+            str: The generated text.
+        """
+        if not self._mlx_vlm:
+            raise RuntimeError("MLX VLM not loaded")
+        try:
+            # Convert GenerationConfig to MLX format
+            mlx_gen_config = MLXGenerationConfig()
+            mlx_gen_config.max_tokens = g_cfg.max_tokens
+            mlx_gen_config.stop = g_cfg.stop_words
+            mlx_gen_config.image_paths = g_cfg.image_paths
+            mlx_gen_config.audio_paths = g_cfg.audio_paths
+            if g_cfg.sampler_config:
+                mlx_sampler_config = MLXSamplerConfig()
+                mlx_sampler_config.temperature = g_cfg.sampler_config.temperature
+                mlx_sampler_config.top_p = g_cfg.sampler_config.top_p
+                mlx_sampler_config.top_k = g_cfg.sampler_config.top_k
+                mlx_sampler_config.repetition_penalty = g_cfg.sampler_config.repetition_penalty
+                mlx_sampler_config.presence_penalty = g_cfg.sampler_config.presence_penalty
+                mlx_sampler_config.frequency_penalty = g_cfg.sampler_config.frequency_penalty
+                mlx_sampler_config.seed = g_cfg.sampler_config.seed
+                mlx_sampler_config.grammar_path = g_cfg.sampler_config.grammar_path
+                mlx_sampler_config.grammar_string = g_cfg.sampler_config.grammar_string
+                mlx_gen_config.sampler_config = mlx_sampler_config
+            # Simple token callback that just continues
+            def token_callback(token: str, user_data: Any = None) -> bool:
+                return not self._cancel_event.is_set()
+            # Use MLX streaming generation and return the full result
+            return self._mlx_vlm.generate_stream(prompt, mlx_gen_config, token_callback)
+        except Exception as e:
+            raise RuntimeError(f"Failed to generate text: {str(e)}")
+    def get_profiling_data(self) -> Optional[ProfilingData]:
+        """Get profiling data from the last generation."""
+        if not self._mlx_vlm:
+            raise RuntimeError("MLX VLM not loaded")
         return self._mlx_vlm.get_profiling_data()