nexaai-1.0.18rc1-cp310-cp310-macosx_13_0_x86_64.whl → nexaai-1.0.19-cp310-cp310-macosx_13_0_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nexaai might be problematic.
- nexaai/_stub.cpython-310-darwin.so +0 -0
- nexaai/_version.py +1 -1
- nexaai/asr.py +2 -1
- nexaai/binds/{nexa_llama_cpp → cpu_gpu}/libggml-base.dylib +0 -0
- nexaai/binds/{nexa_llama_cpp → cpu_gpu}/libmtmd.dylib +0 -0
- nexaai/binds/{nexa_llama_cpp/libllama.dylib → cpu_gpu/libnexa_cpu_gpu.dylib} +0 -0
- nexaai/binds/{nexa_llama_cpp → cpu_gpu}/libnexa_plugin.dylib +0 -0
- nexaai/binds/libnexa_bridge.dylib +0 -0
- nexaai/binds/llm_bind.cpython-310-darwin.so +0 -0
- nexaai/cv.py +2 -1
- nexaai/embedder.py +1 -1
- nexaai/image_gen.py +2 -1
- nexaai/llm.py +5 -3
- nexaai/llm_impl/mlx_llm_impl.py +2 -0
- nexaai/llm_impl/pybind_llm_impl.py +2 -0
- nexaai/mlx_backend/vlm/generate_qwen3_vl.py +176 -96
- nexaai/mlx_backend/vlm/generate_qwen3_vl_moe.py +259 -0
- nexaai/mlx_backend/vlm/interface.py +99 -30
- nexaai/mlx_backend/vlm/main.py +58 -9
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/qwen3vl.py +338 -299
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/__init__.py +0 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/base.py +117 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/cache.py +531 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/generate.py +701 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/rope_utils.py +255 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/sample_utils.py +303 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/tokenizer_utils.py +407 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/processor.py +476 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/qwen3vl_moe.py +1308 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/switch_layers.py +210 -0
- nexaai/rerank.py +2 -1
- nexaai/tts.py +2 -1
- nexaai/utils/manifest_utils.py +222 -15
- nexaai/utils/model_manager.py +120 -14
- nexaai/utils/model_types.py +2 -0
- nexaai/vlm.py +2 -1
- {nexaai-1.0.18rc1.dist-info → nexaai-1.0.19.dist-info}/METADATA +1 -2
- {nexaai-1.0.18rc1.dist-info → nexaai-1.0.19.dist-info}/RECORD +43 -32
- /nexaai/binds/{nexa_llama_cpp → cpu_gpu}/libggml-cpu.so +0 -0
- /nexaai/binds/{nexa_llama_cpp → cpu_gpu}/libggml-metal.so +0 -0
- /nexaai/binds/{nexa_llama_cpp → cpu_gpu}/libggml.dylib +0 -0
- {nexaai-1.0.18rc1.dist-info → nexaai-1.0.19.dist-info}/WHEEL +0 -0
- {nexaai-1.0.18rc1.dist-info → nexaai-1.0.19.dist-info}/top_level.txt +0 -0
nexaai/_stub.cpython-310-darwin.so
CHANGED
Binary file (no textual diff)
nexaai/_version.py
CHANGED
nexaai/asr.py
CHANGED
@@ -35,7 +35,8 @@ class ASR(BaseModel):
                    tokenizer_path: Optional[str] = None,
                    language: Optional[str] = None,
                    plugin_id: Union[PluginID, str] = PluginID.LLAMA_CPP,
-                   device_id: Optional[str] = None
+                   device_id: Optional[str] = None,
+                   **kwargs
                    ) -> 'ASR':
         """Load ASR model from local path, routing to appropriate implementation."""
         # Check plugin_id value for routing - handle both enum and string
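This is the first of several identical signature changes: cv.py, embedder.py and image_gen.py below gain the same trailing **kwargs, and the file list suggests rerank.py, tts.py and vlm.py do too. A minimal sketch of the pattern, with illustrative names only (not the actual nexaai classes):

from typing import Optional

class ExampleLoader:
    @classmethod
    def _load_from(cls,
                   local_path: str,
                   device_id: Optional[str] = None,
                   **kwargs):  # newer options (e.g. model_name) are accepted and ignored here
        return cls()

# Both calls work against the same implementation; the second simply carries
# an option that this particular loader does not yet consume.
ExampleLoader._load_from("/models/example")
ExampleLoader._load_from("/models/example", model_name="example-v2")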
Binary files changed: the nexaai/binds libraries listed above (libggml-base.dylib, libmtmd.dylib, libnexa_cpu_gpu.dylib and libnexa_plugin.dylib, now under cpu_gpu/ instead of nexa_llama_cpp/), plus libnexa_bridge.dylib and llm_bind.cpython-310-darwin.so. No textual diff.
nexaai/cv.py
CHANGED
@@ -73,7 +73,8 @@ class CVModel(BaseModel):
                    _: str,  # TODO: remove this argument, this is a hack to make api design happy
                    config: CVModelConfig,
                    plugin_id: Union[PluginID, str] = PluginID.LLAMA_CPP,
-                   device_id: Optional[str] = None
+                   device_id: Optional[str] = None,
+                   **kwargs
                    ) -> 'CVModel':
         """Load CV model from configuration, routing to appropriate implementation."""
         # Check plugin_id value for routing - handle both enum and string
nexaai/embedder.py
CHANGED
@@ -22,7 +22,7 @@ class Embedder(BaseModel):
         pass

     @classmethod
-    def _load_from(cls, model_path: str, tokenizer_file: str = "tokenizer.json", plugin_id: Union[PluginID, str] = PluginID.LLAMA_CPP):
+    def _load_from(cls, model_path: str, tokenizer_file: str = "tokenizer.json", plugin_id: Union[PluginID, str] = PluginID.LLAMA_CPP, **kwargs):
         """
         Load an embedder from model files, routing to appropriate implementation.

nexaai/image_gen.py
CHANGED
@@ -71,7 +71,8 @@ class ImageGen(BaseModel):
                    plugin_id: Union[PluginID, str] = PluginID.LLAMA_CPP,
                    device_id: Optional[str] = None,
                    float16: bool = True,
-                   quantize: bool = False
+                   quantize: bool = False,
+                   **kwargs
                    ) -> 'ImageGen':
         """Load image generation model from local path, routing to appropriate implementation."""
         # Check plugin_id value for routing - handle both enum and string
nexaai/llm.py
CHANGED
@@ -15,10 +15,12 @@ class LLM(BaseModel):
     @classmethod
     def _load_from(cls,
                    local_path: str,
+                   model_name: Optional[str] = None,
                    tokenizer_path: Optional[str] = None,
                    m_cfg: ModelConfig = ModelConfig(),
                    plugin_id: Union[PluginID, str] = PluginID.LLAMA_CPP,
-                   device_id: Optional[str] = None
+                   device_id: Optional[str] = None,
+                   **kwargs
                    ) -> 'LLM':
         """Load model from local path, routing to appropriate implementation."""
         # Check plugin_id value for routing - handle both enum and string

@@ -26,10 +28,10 @@ class LLM(BaseModel):

         if plugin_value == "mlx":
             from nexaai.llm_impl.mlx_llm_impl import MLXLLMImpl
-            return MLXLLMImpl._load_from(local_path, tokenizer_path, m_cfg, plugin_id, device_id)
+            return MLXLLMImpl._load_from(local_path, model_name, tokenizer_path, m_cfg, plugin_id, device_id)
         else:
             from nexaai.llm_impl.pybind_llm_impl import PyBindLLMImpl
-            return PyBindLLMImpl._load_from(local_path, tokenizer_path, m_cfg, plugin_id, device_id)
+            return PyBindLLMImpl._load_from(local_path, model_name, tokenizer_path, m_cfg, plugin_id, device_id)

     def cancel_generation(self):
         """Signal to cancel any ongoing stream generation."""
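A sketch of what this routing now does end to end: plugin_id (enum or string) selects the backend and the new model_name is threaded through to it. Only the "mlx" string comparison is taken from the diff; the PluginID values and paths below are assumptions:

from enum import Enum
from typing import Optional, Union

class PluginID(Enum):   # stand-in for nexaai's PluginID; only "mlx" is confirmed by the diff
    LLAMA_CPP = "llama_cpp"
    MLX = "mlx"

def load_llm(local_path: str,
             model_name: Optional[str] = None,
             plugin_id: Union[PluginID, str] = PluginID.LLAMA_CPP,
             **kwargs):
    # Normalize enum-or-string, exactly as the diffed code does
    plugin_value = plugin_id.value if isinstance(plugin_id, PluginID) else plugin_id
    if plugin_value == "mlx":
        return ("MLXLLMImpl", local_path, model_name)   # MLXLLMImpl._load_from(...) in the real code
    return ("PyBindLLMImpl", local_path, model_name)    # PyBindLLMImpl._load_from(...) otherwise

print(load_llm("/models/qwen3-4b", model_name="qwen3-4b", plugin_id="mlx"))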
nexaai/llm_impl/mlx_llm_impl.py
CHANGED
@@ -16,6 +16,7 @@ class MLXLLMImpl(LLM):
     @classmethod
     def _load_from(cls,
                    local_path: str,
+                   model_name: Optional[str] = None,
                    tokenizer_path: Optional[str] = None,
                    m_cfg: ModelConfig = ModelConfig(),
                    plugin_id: Union[PluginID, str] = PluginID.MLX,

@@ -40,6 +41,7 @@ class MLXLLMImpl(LLM):
         instance = cls(m_cfg)
         instance._mlx_llm = MLXLLMInterface(
             model_path=local_path,
+            # model_name=model_name,  # FIXME: For MLX LLM, model_name is not used
             tokenizer_path=tokenizer_path or local_path,
             config=mlx_config,
             device=device_id
nexaai/llm_impl/pybind_llm_impl.py
CHANGED
@@ -19,6 +19,7 @@ class PyBindLLMImpl(LLM):
     @classmethod
     def _load_from(cls,
                    local_path: str,
+                   model_name: Optional[str] = None,
                    tokenizer_path: Optional[str] = None,
                    m_cfg: ModelConfig = ModelConfig(),
                    plugin_id: Union[PluginID, str] = PluginID.LLAMA_CPP,

@@ -55,6 +56,7 @@ class PyBindLLMImpl(LLM):
         plugin_id_str = plugin_id.value if isinstance(plugin_id, PluginID) else plugin_id
         handle = llm_bind.ml_llm_create(
             model_path=local_path,
+            model_name=model_name,
             tokenizer_path=tokenizer_path,
             model_config=config,
             plugin_id=plugin_id_str,
nexaai/mlx_backend/vlm/generate_qwen3_vl.py
CHANGED
@@ -1,6 +1,5 @@
 import argparse
 import json
-import sys
 import os
 import mlx.core as mx
 import mlx.nn as nn

@@ -10,38 +9,21 @@ import requests
 import numpy as np
 from pathlib import Path
 from huggingface_hub import snapshot_download
-
-
-curr_dir = os.path.dirname(os.path.abspath(__file__))
-sys.path.append(curr_dir)
-sys.path.append(os.path.dirname(curr_dir))
-
-# Add the qwen3vl model directory to path
-qwen3vl_dir = os.path.join(curr_dir, "modeling", "models", "qwen3_vl")
-sys.path.append(qwen3vl_dir)
+from dataclasses import dataclass
+from typing import Any, Generator, List, Optional, Sequence, Tuple, Union

 # Import required modules for quantized loading
 from transformers import AutoTokenizer

-#
-
-
-
-
-
-
-    from .modeling.models.qwen3_vl.processor import Qwen3VLProcessor
-except ImportError:
-    # Fallback for Nuitka compiled environment - use sys.path approach
-    from llm_common.generate import nexa_generate_step
-    from llm_common.cache import make_prompt_cache
-    from qwen3vl import VEGModel, LLMModel, ModelArgs, VisionConfig, TextConfig, handle_multimodal_embeds
-    from processor import Qwen3VLProcessor
-
-    from ml import ChatMessage
-    from dataclasses import dataclass
-    from typing import Any, Generator, List, Optional, Sequence, Tuple, Union
+# Import from the nested modeling structure
+from .modeling.models.qwen3_vl.llm_common.generate import nexa_generate_step
+from .modeling.models.qwen3_vl.llm_common.cache import make_prompt_cache
+from .modeling.models.qwen3_vl.qwen3vl import (
+    VEGModel, LLMModel, ModelArgs, VisionConfig, TextConfig, handle_multimodal_embeds
+)
+from .modeling.models.qwen3_vl.processor import Qwen3VLProcessor
 from .generate import GenerationResult
+from ml import ChatMessage

 # Custom exception for context length exceeded
 class ContextLengthExceededError(Exception):
|
|
|
61
43
|
return x if isinstance(x, list) else [x]
|
|
62
44
|
|
|
63
45
|
|
|
46
|
+
def get_model_configs(model_name: str):
|
|
47
|
+
"""Get model configurations based on model name"""
|
|
48
|
+
|
|
49
|
+
# 4B model configs (default)
|
|
50
|
+
if model_name in ["qwen3vl", "qwen3vl-4b", "qwen3vl-4b-thinking"]:
|
|
51
|
+
vision_config = VisionConfig(
|
|
52
|
+
hidden_size=1024,
|
|
53
|
+
intermediate_size=4096,
|
|
54
|
+
num_heads=16,
|
|
55
|
+
num_hidden_layers=24,
|
|
56
|
+
patch_size=16,
|
|
57
|
+
temporal_patch_size=2,
|
|
58
|
+
in_channels=3,
|
|
59
|
+
hidden_act="gelu",
|
|
60
|
+
spatial_merge_size=2,
|
|
61
|
+
out_hidden_size=2560,
|
|
62
|
+
num_position_embeddings=2304,
|
|
63
|
+
deepstack_visual_indexes=[5, 11, 17],
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
text_config = TextConfig(
|
|
67
|
+
model_type="qwen3vl",
|
|
68
|
+
hidden_size=2560,
|
|
69
|
+
num_hidden_layers=36,
|
|
70
|
+
intermediate_size=9728,
|
|
71
|
+
num_attention_heads=32,
|
|
72
|
+
num_key_value_heads=8,
|
|
73
|
+
rms_norm_eps=1e-6,
|
|
74
|
+
vocab_size=151936,
|
|
75
|
+
max_position_embeddings=32768,
|
|
76
|
+
rope_theta=5000000.0,
|
|
77
|
+
head_dim=128,
|
|
78
|
+
tie_word_embeddings=True,
|
|
79
|
+
attention_bias=False,
|
|
80
|
+
attention_dropout=0.0,
|
|
81
|
+
rope_scaling={"mrope_section": [24, 20, 20],
|
|
82
|
+
"rope_type": "default", "type": "default"},
|
|
83
|
+
)
|
|
84
|
+
|
|
85
|
+
# 8B model configs
|
|
86
|
+
elif model_name in ["qwen3vl-8b", "qwen3vl-8b-thinking"]:
|
|
87
|
+
vision_config = VisionConfig(
|
|
88
|
+
hidden_size=1152,
|
|
89
|
+
intermediate_size=4304,
|
|
90
|
+
num_heads=16,
|
|
91
|
+
num_hidden_layers=27,
|
|
92
|
+
patch_size=16,
|
|
93
|
+
temporal_patch_size=2,
|
|
94
|
+
in_channels=3,
|
|
95
|
+
hidden_act="gelu",
|
|
96
|
+
spatial_merge_size=2,
|
|
97
|
+
out_hidden_size=4096,
|
|
98
|
+
num_position_embeddings=2304,
|
|
99
|
+
deepstack_visual_indexes=[8, 16, 24],
|
|
100
|
+
)
|
|
101
|
+
|
|
102
|
+
text_config = TextConfig(
|
|
103
|
+
model_type="qwen3vl",
|
|
104
|
+
hidden_size=4096,
|
|
105
|
+
num_hidden_layers=36,
|
|
106
|
+
intermediate_size=12288,
|
|
107
|
+
num_attention_heads=32,
|
|
108
|
+
num_key_value_heads=8,
|
|
109
|
+
rms_norm_eps=1e-6,
|
|
110
|
+
vocab_size=151936,
|
|
111
|
+
max_position_embeddings=262144,
|
|
112
|
+
rope_theta=5000000,
|
|
113
|
+
head_dim=128,
|
|
114
|
+
tie_word_embeddings=False,
|
|
115
|
+
attention_bias=False,
|
|
116
|
+
attention_dropout=0.0,
|
|
117
|
+
rope_scaling={"mrope_section": [24, 20, 20], "rope_type": "default", "mrope_interleaved": True},
|
|
118
|
+
)
|
|
119
|
+
else:
|
|
120
|
+
# Fallback to 4B config
|
|
121
|
+
return get_model_configs("qwen3vl-4b")
|
|
122
|
+
|
|
123
|
+
return vision_config, text_config
|
|
124
|
+
|
|
125
|
+
def get_weight_filenames(model_name: str, model_path: Path):
|
|
126
|
+
"""Get appropriate weight filenames based on model name and available files"""
|
|
127
|
+
|
|
128
|
+
# Determine model size and type based on the actual file structure
|
|
129
|
+
if "4b" in model_name:
|
|
130
|
+
size_prefix = "4b"
|
|
131
|
+
elif "8b" in model_name:
|
|
132
|
+
size_prefix = "8b"
|
|
133
|
+
else:
|
|
134
|
+
size_prefix = "4b"
|
|
135
|
+
|
|
136
|
+
# Determine model type
|
|
137
|
+
if "thinking" in model_name:
|
|
138
|
+
model_type = f"{size_prefix}_thinking"
|
|
139
|
+
else:
|
|
140
|
+
model_type = f"{size_prefix}_instruct"
|
|
141
|
+
|
|
142
|
+
# Try different weight file patterns matching the actual file structure
|
|
143
|
+
llm_patterns = [
|
|
144
|
+
# New naming convention matching actual files
|
|
145
|
+
f"qwen3vl-llm-{model_type}-q4_0.safetensors",
|
|
146
|
+
f"qwen3vl-llm-{model_type}-q8_0.safetensors",
|
|
147
|
+
f"qwen3vl-llm-{model_type}-f16.safetensors",
|
|
148
|
+
# Legacy naming convention
|
|
149
|
+
f"qwen3vl-llm-{size_prefix.upper()}-q4_0.safetensors",
|
|
150
|
+
f"qwen3vl-llm-{size_prefix.upper()}-q8_0.safetensors",
|
|
151
|
+
f"qwen3vl-llm-{size_prefix.upper()}-f16.safetensors",
|
|
152
|
+
f"qwen3vl-llm-{size_prefix.upper()}-f32.safetensors",
|
|
153
|
+
]
|
|
154
|
+
|
|
155
|
+
vision_patterns = [
|
|
156
|
+
f"qwen3vl-vision-{model_type}-f16.safetensors",
|
|
157
|
+
f"qwen3vl-vision-{size_prefix.upper()}-f16.safetensors",
|
|
158
|
+
]
|
|
159
|
+
|
|
160
|
+
# Find LLM weights
|
|
161
|
+
llm_weights_path = None
|
|
162
|
+
quantization_bits = None
|
|
163
|
+
|
|
164
|
+
for pattern in llm_patterns:
|
|
165
|
+
candidate_path = model_path / pattern
|
|
166
|
+
if candidate_path.exists():
|
|
167
|
+
llm_weights_path = candidate_path
|
|
168
|
+
if "q4_0" in pattern:
|
|
169
|
+
quantization_bits = 4
|
|
170
|
+
elif "q8_0" in pattern:
|
|
171
|
+
quantization_bits = 8
|
|
172
|
+
else:
|
|
173
|
+
quantization_bits = 16
|
|
174
|
+
break
|
|
175
|
+
|
|
176
|
+
# Find vision weights
|
|
177
|
+
vision_weights_path = None
|
|
178
|
+
for pattern in vision_patterns:
|
|
179
|
+
candidate_path = model_path / pattern
|
|
180
|
+
if candidate_path.exists():
|
|
181
|
+
vision_weights_path = candidate_path
|
|
182
|
+
break
|
|
183
|
+
|
|
184
|
+
return llm_weights_path, vision_weights_path, quantization_bits
|
|
185
|
+
|
|
186
|
+
# Update the load_qwen3_vl function signature and implementation:
|
|
64
187
|
def load_qwen3_vl(
|
|
65
188
|
path_or_repo: str,
|
|
66
189
|
adapter_path: Optional[str] = None,
|
|
67
190
|
lazy: bool = False,
|
|
68
191
|
revision: Optional[str] = None,
|
|
192
|
+
model_name: Optional[str] = None,
|
|
69
193
|
**kwargs,
|
|
70
194
|
) -> Tuple[Qwen3VLBundledModel, Qwen3VLProcessor]:
|
|
71
|
-
"""Load Qwen3-VL quantized models and processor.
|
|
72
|
-
|
|
73
|
-
Parameters are aligned with .generate.load for compatibility.
|
|
74
|
-
"""
|
|
195
|
+
"""Load Qwen3-VL quantized models and processor with support for different model sizes."""
|
|
75
196
|
|
|
76
197
|
model_path = Path(path_or_repo)
|
|
77
198
|
if not model_path.exists():
|
|
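A sketch of how the two new helpers fit together; the import path follows the file list above and the model directory is hypothetical:

from pathlib import Path

from nexaai.mlx_backend.vlm.generate_qwen3_vl import (
    get_model_configs,
    get_weight_filenames,
)

# Pick the 8B "thinking" variant; unknown names silently fall back to the 4B configs.
vision_cfg, text_cfg = get_model_configs("qwen3vl-8b-thinking")

# Probe a local directory for matching safetensors; quantization_bits is 4, 8 or 16
# depending on which LLM file is found first, and the paths are None if nothing matches.
llm_weights, vision_weights, quantization_bits = get_weight_filenames(
    "qwen3vl-8b-thinking", Path("/models/qwen3vl-8b-thinking")   # hypothetical path
)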
@@ -79,75 +200,28 @@ def load_qwen3_vl(
         model_path = Path(snapshot_download(
             repo_id=path_or_repo, repo_type="model", revision=revision))
     else:
-        # Fallback to local modelfiles directory
-
+        # Fallback to local modelfiles directory relative to this file
+        curr_dir = Path(__file__).parent
+        model_path = curr_dir / "modeling" / "models" / "qwen3_vl" / "modelfiles"
         if not model_path.exists():
-            model_path =
-
-    # Model configs (kept identical to main)
-    vision_config = VisionConfig(
-        hidden_size=1024,
-        intermediate_size=4096,
-        num_heads=16,
-        num_hidden_layers=24,
-        patch_size=16,
-        temporal_patch_size=2,
-        in_channels=3,
-        hidden_act="gelu",
-        spatial_merge_size=2,
-        out_hidden_size=2560,
-        num_position_embeddings=2304,
-        deepstack_visual_indexes=[5, 11, 17],
-    )
+            model_path = curr_dir / "modelfiles"

-
-
-
-
-
-
-        num_key_value_heads=8,
-        rms_norm_eps=1e-6,
-        vocab_size=151936,
-        max_position_embeddings=32768,
-        rope_theta=5000000.0,
-        head_dim=128,
-        tie_word_embeddings=True,
-        attention_bias=False,
-        attention_dropout=0.0,
-        rope_scaling={"mrope_section": [24, 20, 20],
-                      "rope_type": "default", "type": "default"},
-    )
+    # Get model configurations based on model name
+    if model_name:
+        vision_config, text_config = get_model_configs(model_name)
+    else:
+        # Default to 4B config
+        vision_config, text_config = get_model_configs("qwen3vl-4b")

     vision_model = VEGModel(vision_config)
     llm_model = LLMModel(text_config)

-    #
-
-
-
-        ("qwen3vl-llm-4B-f32.safetensors", 32)
-    ]
-
-    llm_weights_path = None
-    quantization_bits = None
-
-    # Try loading in order of preference
-    for filename, bits in preferred_order:
-        candidate_path = model_path / filename
-        if candidate_path.exists():
-            llm_weights_path = candidate_path
-            quantization_bits = bits
-            break
-
-    if llm_weights_path is None:
-        # Fallback to original hardcoded path for backward compatibility
-        llm_weights_path = model_path / "qwen3vl-llm-4B-q4_0.safetensors"
-        quantization_bits = 4
-
-    vision_weights_path = model_path / "qwen3vl-vision-4B-f16.safetensors"
+    # Get appropriate weight filenames
+    llm_weights_path, vision_weights_path, quantization_bits = get_weight_filenames(
+        model_name or "qwen3vl-4b", model_path
+    )

-    if not vision_weights_path
+    if not vision_weights_path or not llm_weights_path:
         raise FileNotFoundError(
             f"Missing safetensors. Vision: {vision_weights_path}, LLM: {llm_weights_path}"
         )
@@ -163,8 +237,14 @@ def load_qwen3_vl(

     llm_model.load_weights(str(llm_weights_path), strict=True)

-
-
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(str(model_path))
+    except Exception:
+        try:
+            tokenizer = AutoTokenizer.from_pretrained(path_or_repo)
+        except Exception:
+            raise Exception("Failed to load tokenizer from the same path where model weights are loaded and original path_or_repo.")
+
     processor = Qwen3VLProcessor(tokenizer=tokenizer)

     return Qwen3VLBundledModel(vision_model=vision_model, llm_model=llm_model), processor
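Putting the load_qwen3_vl changes together, a caller can now select the model size explicitly. A sketch only; the directory is hypothetical and must already contain the matching safetensors and tokenizer files:

from nexaai.mlx_backend.vlm.generate_qwen3_vl import load_qwen3_vl

model, processor = load_qwen3_vl(
    "/models/Qwen3-VL-8B-Thinking",      # hypothetical local path (or a HF repo id)
    model_name="qwen3vl-8b-thinking",    # chooses the 8B configs and weight filenames
)
# If AutoTokenizer cannot load from the resolved model_path, the loader retries with
# the original path_or_repo and only then raises.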