nexaai-1.0.18rc1-cp310-cp310-macosx_13_0_x86_64.whl → nexaai-1.0.19-cp310-cp310-macosx_13_0_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nexaai might be problematic.
- nexaai/_stub.cpython-310-darwin.so +0 -0
- nexaai/_version.py +1 -1
- nexaai/asr.py +2 -1
- nexaai/binds/{nexa_llama_cpp → cpu_gpu}/libggml-base.dylib +0 -0
- nexaai/binds/{nexa_llama_cpp → cpu_gpu}/libmtmd.dylib +0 -0
- nexaai/binds/{nexa_llama_cpp/libllama.dylib → cpu_gpu/libnexa_cpu_gpu.dylib} +0 -0
- nexaai/binds/{nexa_llama_cpp → cpu_gpu}/libnexa_plugin.dylib +0 -0
- nexaai/binds/libnexa_bridge.dylib +0 -0
- nexaai/binds/llm_bind.cpython-310-darwin.so +0 -0
- nexaai/cv.py +2 -1
- nexaai/embedder.py +1 -1
- nexaai/image_gen.py +2 -1
- nexaai/llm.py +5 -3
- nexaai/llm_impl/mlx_llm_impl.py +2 -0
- nexaai/llm_impl/pybind_llm_impl.py +2 -0
- nexaai/mlx_backend/vlm/generate_qwen3_vl.py +176 -96
- nexaai/mlx_backend/vlm/generate_qwen3_vl_moe.py +259 -0
- nexaai/mlx_backend/vlm/interface.py +99 -30
- nexaai/mlx_backend/vlm/main.py +58 -9
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/qwen3vl.py +338 -299
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/__init__.py +0 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/base.py +117 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/cache.py +531 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/generate.py +701 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/rope_utils.py +255 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/sample_utils.py +303 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/tokenizer_utils.py +407 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/processor.py +476 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/qwen3vl_moe.py +1308 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/switch_layers.py +210 -0
- nexaai/rerank.py +2 -1
- nexaai/tts.py +2 -1
- nexaai/utils/manifest_utils.py +222 -15
- nexaai/utils/model_manager.py +120 -14
- nexaai/utils/model_types.py +2 -0
- nexaai/vlm.py +2 -1
- {nexaai-1.0.18rc1.dist-info → nexaai-1.0.19.dist-info}/METADATA +1 -2
- {nexaai-1.0.18rc1.dist-info → nexaai-1.0.19.dist-info}/RECORD +43 -32
- /nexaai/binds/{nexa_llama_cpp → cpu_gpu}/libggml-cpu.so +0 -0
- /nexaai/binds/{nexa_llama_cpp → cpu_gpu}/libggml-metal.so +0 -0
- /nexaai/binds/{nexa_llama_cpp → cpu_gpu}/libggml.dylib +0 -0
- {nexaai-1.0.18rc1.dist-info → nexaai-1.0.19.dist-info}/WHEEL +0 -0
- {nexaai-1.0.18rc1.dist-info → nexaai-1.0.19.dist-info}/top_level.txt +0 -0
nexaai/_stub.cpython-310-darwin.so
CHANGED
Binary file (no textual diff)
nexaai/_version.py
CHANGED
nexaai/asr.py
CHANGED
@@ -35,7 +35,8 @@ class ASR(BaseModel):
                    tokenizer_path: Optional[str] = None,
                    language: Optional[str] = None,
                    plugin_id: Union[PluginID, str] = PluginID.LLAMA_CPP,
-                   device_id: Optional[str] = None
+                   device_id: Optional[str] = None,
+                   **kwargs
                    ) -> 'ASR':
         """Load ASR model from local path, routing to appropriate implementation."""
         # Check plugin_id value for routing - handle both enum and string
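This is the first of several identical signature changes: cv.py, embedder.py and image_gen.py below gain the same trailing **kwargs, and the file list suggests rerank.py, tts.py and vlm.py do too. A minimal sketch of the pattern, with illustrative names only (not the actual nexaai classes):

from typing import Optional

class ExampleLoader:
    @classmethod
    def _load_from(cls,
                   local_path: str,
                   device_id: Optional[str] = None,
                   **kwargs):  # newer options (e.g. model_name) are accepted and ignored here
        return cls()

# Both calls work against the same implementation; the second simply carries
# an option that this particular loader does not yet consume.
ExampleLoader._load_from("/models/example")
ExampleLoader._load_from("/models/example", model_name="example-v2")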
Binary files changed: the nexaai/binds libraries listed above (libggml-base.dylib, libmtmd.dylib, libnexa_cpu_gpu.dylib and libnexa_plugin.dylib, now under cpu_gpu/ instead of nexa_llama_cpp/), plus libnexa_bridge.dylib and llm_bind.cpython-310-darwin.so. No textual diff.
nexaai/cv.py
CHANGED
@@ -73,7 +73,8 @@ class CVModel(BaseModel):
                    _: str,  # TODO: remove this argument, this is a hack to make api design happy
                    config: CVModelConfig,
                    plugin_id: Union[PluginID, str] = PluginID.LLAMA_CPP,
-                   device_id: Optional[str] = None
+                   device_id: Optional[str] = None,
+                   **kwargs
                    ) -> 'CVModel':
         """Load CV model from configuration, routing to appropriate implementation."""
         # Check plugin_id value for routing - handle both enum and string
nexaai/embedder.py
CHANGED
@@ -22,7 +22,7 @@ class Embedder(BaseModel):
         pass

     @classmethod
-    def _load_from(cls, model_path: str, tokenizer_file: str = "tokenizer.json", plugin_id: Union[PluginID, str] = PluginID.LLAMA_CPP):
+    def _load_from(cls, model_path: str, tokenizer_file: str = "tokenizer.json", plugin_id: Union[PluginID, str] = PluginID.LLAMA_CPP, **kwargs):
         """
         Load an embedder from model files, routing to appropriate implementation.

nexaai/image_gen.py
CHANGED
@@ -71,7 +71,8 @@ class ImageGen(BaseModel):
                    plugin_id: Union[PluginID, str] = PluginID.LLAMA_CPP,
                    device_id: Optional[str] = None,
                    float16: bool = True,
-                   quantize: bool = False
+                   quantize: bool = False,
+                   **kwargs
                    ) -> 'ImageGen':
         """Load image generation model from local path, routing to appropriate implementation."""
         # Check plugin_id value for routing - handle both enum and string
nexaai/llm.py
CHANGED
@@ -15,10 +15,12 @@ class LLM(BaseModel):
     @classmethod
     def _load_from(cls,
                    local_path: str,
+                   model_name: Optional[str] = None,
                    tokenizer_path: Optional[str] = None,
                    m_cfg: ModelConfig = ModelConfig(),
                    plugin_id: Union[PluginID, str] = PluginID.LLAMA_CPP,
-                   device_id: Optional[str] = None
+                   device_id: Optional[str] = None,
+                   **kwargs
                    ) -> 'LLM':
         """Load model from local path, routing to appropriate implementation."""
         # Check plugin_id value for routing - handle both enum and string

@@ -26,10 +28,10 @@ class LLM(BaseModel):

         if plugin_value == "mlx":
             from nexaai.llm_impl.mlx_llm_impl import MLXLLMImpl
-            return MLXLLMImpl._load_from(local_path, tokenizer_path, m_cfg, plugin_id, device_id)
+            return MLXLLMImpl._load_from(local_path, model_name, tokenizer_path, m_cfg, plugin_id, device_id)
         else:
             from nexaai.llm_impl.pybind_llm_impl import PyBindLLMImpl
-            return PyBindLLMImpl._load_from(local_path, tokenizer_path, m_cfg, plugin_id, device_id)
+            return PyBindLLMImpl._load_from(local_path, model_name, tokenizer_path, m_cfg, plugin_id, device_id)

     def cancel_generation(self):
         """Signal to cancel any ongoing stream generation."""
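A sketch of what this routing now does end to end: plugin_id (enum or string) selects the backend and the new model_name is threaded through to it. Only the "mlx" string comparison is taken from the diff; the PluginID values and paths below are assumptions:

from enum import Enum
from typing import Optional, Union

class PluginID(Enum):   # stand-in for nexaai's PluginID; only "mlx" is confirmed by the diff
    LLAMA_CPP = "llama_cpp"
    MLX = "mlx"

def load_llm(local_path: str,
             model_name: Optional[str] = None,
             plugin_id: Union[PluginID, str] = PluginID.LLAMA_CPP,
             **kwargs):
    # Normalize enum-or-string, exactly as the diffed code does
    plugin_value = plugin_id.value if isinstance(plugin_id, PluginID) else plugin_id
    if plugin_value == "mlx":
        return ("MLXLLMImpl", local_path, model_name)   # MLXLLMImpl._load_from(...) in the real code
    return ("PyBindLLMImpl", local_path, model_name)    # PyBindLLMImpl._load_from(...) otherwise

print(load_llm("/models/qwen3-4b", model_name="qwen3-4b", plugin_id="mlx"))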
nexaai/llm_impl/mlx_llm_impl.py
CHANGED
@@ -16,6 +16,7 @@ class MLXLLMImpl(LLM):
     @classmethod
     def _load_from(cls,
                    local_path: str,
+                   model_name: Optional[str] = None,
                    tokenizer_path: Optional[str] = None,
                    m_cfg: ModelConfig = ModelConfig(),
                    plugin_id: Union[PluginID, str] = PluginID.MLX,

@@ -40,6 +41,7 @@ class MLXLLMImpl(LLM):
         instance = cls(m_cfg)
         instance._mlx_llm = MLXLLMInterface(
             model_path=local_path,
+            # model_name=model_name,  # FIXME: For MLX LLM, model_name is not used
             tokenizer_path=tokenizer_path or local_path,
             config=mlx_config,
             device=device_id
nexaai/llm_impl/pybind_llm_impl.py
CHANGED
@@ -19,6 +19,7 @@ class PyBindLLMImpl(LLM):
     @classmethod
     def _load_from(cls,
                    local_path: str,
+                   model_name: Optional[str] = None,
                    tokenizer_path: Optional[str] = None,
                    m_cfg: ModelConfig = ModelConfig(),
                    plugin_id: Union[PluginID, str] = PluginID.LLAMA_CPP,

@@ -55,6 +56,7 @@ class PyBindLLMImpl(LLM):
         plugin_id_str = plugin_id.value if isinstance(plugin_id, PluginID) else plugin_id
         handle = llm_bind.ml_llm_create(
             model_path=local_path,
+            model_name=model_name,
             tokenizer_path=tokenizer_path,
             model_config=config,
             plugin_id=plugin_id_str,
nexaai/mlx_backend/vlm/generate_qwen3_vl.py
CHANGED
@@ -1,6 +1,5 @@
 import argparse
 import json
-import sys
 import os
 import mlx.core as mx
 import mlx.nn as nn

@@ -10,38 +9,21 @@ import requests
 import numpy as np
 from pathlib import Path
 from huggingface_hub import snapshot_download
-
-
-curr_dir = os.path.dirname(os.path.abspath(__file__))
-sys.path.append(curr_dir)
-sys.path.append(os.path.dirname(curr_dir))
-
-# Add the qwen3vl model directory to path
-qwen3vl_dir = os.path.join(curr_dir, "modeling", "models", "qwen3_vl")
-sys.path.append(qwen3vl_dir)
+from dataclasses import dataclass
+from typing import Any, Generator, List, Optional, Sequence, Tuple, Union

 # Import required modules for quantized loading
 from transformers import AutoTokenizer

-#
-
-
-
-
-
-
-    from .modeling.models.qwen3_vl.processor import Qwen3VLProcessor
-except ImportError:
-    # Fallback for Nuitka compiled environment - use sys.path approach
-    from llm_common.generate import nexa_generate_step
-    from llm_common.cache import make_prompt_cache
-    from qwen3vl import VEGModel, LLMModel, ModelArgs, VisionConfig, TextConfig, handle_multimodal_embeds
-    from processor import Qwen3VLProcessor
-
-    from ml import ChatMessage
-    from dataclasses import dataclass
-    from typing import Any, Generator, List, Optional, Sequence, Tuple, Union
+# Import from the nested modeling structure
+from .modeling.models.qwen3_vl.llm_common.generate import nexa_generate_step
+from .modeling.models.qwen3_vl.llm_common.cache import make_prompt_cache
+from .modeling.models.qwen3_vl.qwen3vl import (
+    VEGModel, LLMModel, ModelArgs, VisionConfig, TextConfig, handle_multimodal_embeds
+)
+from .modeling.models.qwen3_vl.processor import Qwen3VLProcessor
 from .generate import GenerationResult
+from ml import ChatMessage

 # Custom exception for context length exceeded
 class ContextLengthExceededError(Exception):
|
|
|
61
43
|
return x if isinstance(x, list) else [x]
|
|
62
44
|
|
|
63
45
|
|
|
46
|
+
def get_model_configs(model_name: str):
|
|
47
|
+
"""Get model configurations based on model name"""
|
|
48
|
+
|
|
49
|
+
# 4B model configs (default)
|
|
50
|
+
if model_name in ["qwen3vl", "qwen3vl-4b", "qwen3vl-4b-thinking"]:
|
|
51
|
+
vision_config = VisionConfig(
|
|
52
|
+
hidden_size=1024,
|
|
53
|
+
intermediate_size=4096,
|
|
54
|
+
num_heads=16,
|
|
55
|
+
num_hidden_layers=24,
|
|
56
|
+
patch_size=16,
|
|
57
|
+
temporal_patch_size=2,
|
|
58
|
+
in_channels=3,
|
|
59
|
+
hidden_act="gelu",
|
|
60
|
+
spatial_merge_size=2,
|
|
61
|
+
out_hidden_size=2560,
|
|
62
|
+
num_position_embeddings=2304,
|
|
63
|
+
deepstack_visual_indexes=[5, 11, 17],
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
text_config = TextConfig(
|
|
67
|
+
model_type="qwen3vl",
|
|
68
|
+
hidden_size=2560,
|
|
69
|
+
num_hidden_layers=36,
|
|
70
|
+
intermediate_size=9728,
|
|
71
|
+
num_attention_heads=32,
|
|
72
|
+
num_key_value_heads=8,
|
|
73
|
+
rms_norm_eps=1e-6,
|
|
74
|
+
vocab_size=151936,
|
|
75
|
+
max_position_embeddings=32768,
|
|
76
|
+
rope_theta=5000000.0,
|
|
77
|
+
head_dim=128,
|
|
78
|
+
tie_word_embeddings=True,
|
|
79
|
+
attention_bias=False,
|
|
80
|
+
attention_dropout=0.0,
|
|
81
|
+
rope_scaling={"mrope_section": [24, 20, 20],
|
|
82
|
+
"rope_type": "default", "type": "default"},
|
|
83
|
+
)
|
|
84
|
+
|
|
85
|
+
# 8B model configs
|
|
86
|
+
elif model_name in ["qwen3vl-8b", "qwen3vl-8b-thinking"]:
|
|
87
|
+
vision_config = VisionConfig(
|
|
88
|
+
hidden_size=1152,
|
|
89
|
+
intermediate_size=4304,
|
|
90
|
+
num_heads=16,
|
|
91
|
+
num_hidden_layers=27,
|
|
92
|
+
patch_size=16,
|
|
93
|
+
temporal_patch_size=2,
|
|
94
|
+
in_channels=3,
|
|
95
|
+
hidden_act="gelu",
|
|
96
|
+
spatial_merge_size=2,
|
|
97
|
+
out_hidden_size=4096,
|
|
98
|
+
num_position_embeddings=2304,
|
|
99
|
+
deepstack_visual_indexes=[8, 16, 24],
|
|
100
|
+
)
|
|
101
|
+
|
|
102
|
+
text_config = TextConfig(
|
|
103
|
+
model_type="qwen3vl",
|
|
104
|
+
hidden_size=4096,
|
|
105
|
+
num_hidden_layers=36,
|
|
106
|
+
intermediate_size=12288,
|
|
107
|
+
num_attention_heads=32,
|
|
108
|
+
num_key_value_heads=8,
|
|
109
|
+
rms_norm_eps=1e-6,
|
|
110
|
+
vocab_size=151936,
|
|
111
|
+
max_position_embeddings=262144,
|
|
112
|
+
rope_theta=5000000,
|
|
113
|
+
head_dim=128,
|
|
114
|
+
tie_word_embeddings=False,
|
|
115
|
+
attention_bias=False,
|
|
116
|
+
attention_dropout=0.0,
|
|
117
|
+
rope_scaling={"mrope_section": [24, 20, 20], "rope_type": "default", "mrope_interleaved": True},
|
|
118
|
+
)
|
|
119
|
+
else:
|
|
120
|
+
# Fallback to 4B config
|
|
121
|
+
return get_model_configs("qwen3vl-4b")
|
|
122
|
+
|
|
123
|
+
return vision_config, text_config
|
|
124
|
+
|
|
125
|
+
def get_weight_filenames(model_name: str, model_path: Path):
|
|
126
|
+
"""Get appropriate weight filenames based on model name and available files"""
|
|
127
|
+
|
|
128
|
+
# Determine model size and type based on the actual file structure
|
|
129
|
+
if "4b" in model_name:
|
|
130
|
+
size_prefix = "4b"
|
|
131
|
+
elif "8b" in model_name:
|
|
132
|
+
size_prefix = "8b"
|
|
133
|
+
else:
|
|
134
|
+
size_prefix = "4b"
|
|
135
|
+
|
|
136
|
+
# Determine model type
|
|
137
|
+
if "thinking" in model_name:
|
|
138
|
+
model_type = f"{size_prefix}_thinking"
|
|
139
|
+
else:
|
|
140
|
+
model_type = f"{size_prefix}_instruct"
|
|
141
|
+
|
|
142
|
+
# Try different weight file patterns matching the actual file structure
|
|
143
|
+
llm_patterns = [
|
|
144
|
+
# New naming convention matching actual files
|
|
145
|
+
f"qwen3vl-llm-{model_type}-q4_0.safetensors",
|
|
146
|
+
f"qwen3vl-llm-{model_type}-q8_0.safetensors",
|
|
147
|
+
f"qwen3vl-llm-{model_type}-f16.safetensors",
|
|
148
|
+
# Legacy naming convention
|
|
149
|
+
f"qwen3vl-llm-{size_prefix.upper()}-q4_0.safetensors",
|
|
150
|
+
f"qwen3vl-llm-{size_prefix.upper()}-q8_0.safetensors",
|
|
151
|
+
f"qwen3vl-llm-{size_prefix.upper()}-f16.safetensors",
|
|
152
|
+
f"qwen3vl-llm-{size_prefix.upper()}-f32.safetensors",
|
|
153
|
+
]
|
|
154
|
+
|
|
155
|
+
vision_patterns = [
|
|
156
|
+
f"qwen3vl-vision-{model_type}-f16.safetensors",
|
|
157
|
+
f"qwen3vl-vision-{size_prefix.upper()}-f16.safetensors",
|
|
158
|
+
]
|
|
159
|
+
|
|
160
|
+
# Find LLM weights
|
|
161
|
+
llm_weights_path = None
|
|
162
|
+
quantization_bits = None
|
|
163
|
+
|
|
164
|
+
for pattern in llm_patterns:
|
|
165
|
+
candidate_path = model_path / pattern
|
|
166
|
+
if candidate_path.exists():
|
|
167
|
+
llm_weights_path = candidate_path
|
|
168
|
+
if "q4_0" in pattern:
|
|
169
|
+
quantization_bits = 4
|
|
170
|
+
elif "q8_0" in pattern:
|
|
171
|
+
quantization_bits = 8
|
|
172
|
+
else:
|
|
173
|
+
quantization_bits = 16
|
|
174
|
+
break
|
|
175
|
+
|
|
176
|
+
# Find vision weights
|
|
177
|
+
vision_weights_path = None
|
|
178
|
+
for pattern in vision_patterns:
|
|
179
|
+
candidate_path = model_path / pattern
|
|
180
|
+
if candidate_path.exists():
|
|
181
|
+
vision_weights_path = candidate_path
|
|
182
|
+
break
|
|
183
|
+
|
|
184
|
+
return llm_weights_path, vision_weights_path, quantization_bits
|
|
185
|
+
|
|
186
|
+
# Update the load_qwen3_vl function signature and implementation:
|
|
64
187
|
def load_qwen3_vl(
|
|
65
188
|
path_or_repo: str,
|
|
66
189
|
adapter_path: Optional[str] = None,
|
|
67
190
|
lazy: bool = False,
|
|
68
191
|
revision: Optional[str] = None,
|
|
192
|
+
model_name: Optional[str] = None,
|
|
69
193
|
**kwargs,
|
|
70
194
|
) -> Tuple[Qwen3VLBundledModel, Qwen3VLProcessor]:
|
|
71
|
-
"""Load Qwen3-VL quantized models and processor.
|
|
72
|
-
|
|
73
|
-
Parameters are aligned with .generate.load for compatibility.
|
|
74
|
-
"""
|
|
195
|
+
"""Load Qwen3-VL quantized models and processor with support for different model sizes."""
|
|
75
196
|
|
|
76
197
|
model_path = Path(path_or_repo)
|
|
77
198
|
if not model_path.exists():
|
|
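A sketch of how the two new helpers fit together; the import path follows the file list above and the model directory is hypothetical:

from pathlib import Path

from nexaai.mlx_backend.vlm.generate_qwen3_vl import (
    get_model_configs,
    get_weight_filenames,
)

# Pick the 8B "thinking" variant; unknown names silently fall back to the 4B configs.
vision_cfg, text_cfg = get_model_configs("qwen3vl-8b-thinking")

# Probe a local directory for matching safetensors; quantization_bits is 4, 8 or 16
# depending on which LLM file is found first, and the paths are None if nothing matches.
llm_weights, vision_weights, quantization_bits = get_weight_filenames(
    "qwen3vl-8b-thinking", Path("/models/qwen3vl-8b-thinking")   # hypothetical path
)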
@@ -79,75 +200,28 @@ def load_qwen3_vl(
         model_path = Path(snapshot_download(
             repo_id=path_or_repo, repo_type="model", revision=revision))
     else:
-        # Fallback to local modelfiles directory
-
+        # Fallback to local modelfiles directory relative to this file
+        curr_dir = Path(__file__).parent
+        model_path = curr_dir / "modeling" / "models" / "qwen3_vl" / "modelfiles"
         if not model_path.exists():
-            model_path =
-
-    # Model configs (kept identical to main)
-    vision_config = VisionConfig(
-        hidden_size=1024,
-        intermediate_size=4096,
-        num_heads=16,
-        num_hidden_layers=24,
-        patch_size=16,
-        temporal_patch_size=2,
-        in_channels=3,
-        hidden_act="gelu",
-        spatial_merge_size=2,
-        out_hidden_size=2560,
-        num_position_embeddings=2304,
-        deepstack_visual_indexes=[5, 11, 17],
-    )
+            model_path = curr_dir / "modelfiles"

-
-
-
-
-
-
-        num_key_value_heads=8,
-        rms_norm_eps=1e-6,
-        vocab_size=151936,
-        max_position_embeddings=32768,
-        rope_theta=5000000.0,
-        head_dim=128,
-        tie_word_embeddings=True,
-        attention_bias=False,
-        attention_dropout=0.0,
-        rope_scaling={"mrope_section": [24, 20, 20],
-                      "rope_type": "default", "type": "default"},
-    )
+    # Get model configurations based on model name
+    if model_name:
+        vision_config, text_config = get_model_configs(model_name)
+    else:
+        # Default to 4B config
+        vision_config, text_config = get_model_configs("qwen3vl-4b")

     vision_model = VEGModel(vision_config)
     llm_model = LLMModel(text_config)

-    #
-
-
-
-        ("qwen3vl-llm-4B-f32.safetensors", 32)
-    ]
-
-    llm_weights_path = None
-    quantization_bits = None
-
-    # Try loading in order of preference
-    for filename, bits in preferred_order:
-        candidate_path = model_path / filename
-        if candidate_path.exists():
-            llm_weights_path = candidate_path
-            quantization_bits = bits
-            break
-
-    if llm_weights_path is None:
-        # Fallback to original hardcoded path for backward compatibility
-        llm_weights_path = model_path / "qwen3vl-llm-4B-q4_0.safetensors"
-        quantization_bits = 4
-
-    vision_weights_path = model_path / "qwen3vl-vision-4B-f16.safetensors"
+    # Get appropriate weight filenames
+    llm_weights_path, vision_weights_path, quantization_bits = get_weight_filenames(
+        model_name or "qwen3vl-4b", model_path
+    )

-    if not vision_weights_path
+    if not vision_weights_path or not llm_weights_path:
         raise FileNotFoundError(
             f"Missing safetensors. Vision: {vision_weights_path}, LLM: {llm_weights_path}"
         )
@@ -163,8 +237,14 @@ def load_qwen3_vl(

     llm_model.load_weights(str(llm_weights_path), strict=True)

-
-
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(str(model_path))
+    except Exception:
+        try:
+            tokenizer = AutoTokenizer.from_pretrained(path_or_repo)
+        except Exception:
+            raise Exception("Failed to load tokenizer from the same path where model weights are loaded and original path_or_repo.")
+
     processor = Qwen3VLProcessor(tokenizer=tokenizer)

     return Qwen3VLBundledModel(vision_model=vision_model, llm_model=llm_model), processor
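Putting the load_qwen3_vl changes together, a caller can now select the model size explicitly. A sketch only; the directory is hypothetical and must already contain the matching safetensors and tokenizer files:

from nexaai.mlx_backend.vlm.generate_qwen3_vl import load_qwen3_vl

model, processor = load_qwen3_vl(
    "/models/Qwen3-VL-8B-Thinking",      # hypothetical local path (or a HF repo id)
    model_name="qwen3vl-8b-thinking",    # chooses the 8B configs and weight filenames
)
# If AutoTokenizer cannot load from the resolved model_path, the loader retries with
# the original path_or_repo and only then raises.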