nexaai 1.0.21rc5-cp313-cp313-win_arm64.whl → 1.0.21rc16-cp313-cp313-win_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nexaai might be problematic.
- nexaai/__init__.py +95 -95
- nexaai/_stub.cp313-win_arm64.pyd +0 -0
- nexaai/_version.py +4 -1
- nexaai/asr.py +68 -65
- nexaai/asr_impl/mlx_asr_impl.py +92 -92
- nexaai/asr_impl/pybind_asr_impl.py +127 -44
- nexaai/base.py +39 -39
- nexaai/binds/__init__.py +6 -5
- nexaai/binds/asr_bind.cp313-win_arm64.pyd +0 -0
- nexaai/binds/common_bind.cp313-win_arm64.pyd +0 -0
- nexaai/binds/cpu_gpu/ggml-base.dll +0 -0
- nexaai/binds/cpu_gpu/ggml-cpu.dll +0 -0
- nexaai/binds/cpu_gpu/ggml-opencl.dll +0 -0
- nexaai/binds/cpu_gpu/ggml.dll +0 -0
- nexaai/binds/cpu_gpu/mtmd.dll +0 -0
- nexaai/binds/cpu_gpu/nexa_cpu_gpu.dll +0 -0
- nexaai/binds/cpu_gpu/nexa_plugin.dll +0 -0
- nexaai/binds/embedder_bind.cp313-win_arm64.pyd +0 -0
- nexaai/binds/libcrypto-3-arm64.dll +0 -0
- nexaai/binds/libssl-3-arm64.dll +0 -0
- nexaai/binds/llm_bind.cp313-win_arm64.pyd +0 -0
- nexaai/binds/nexa_bridge.dll +0 -0
- nexaai/binds/npu/convnext-sdk.dll +0 -0
- nexaai/binds/npu/embed-gemma-sdk.dll +0 -0
- nexaai/binds/npu/ggml-base.dll +0 -0
- nexaai/binds/npu/ggml-cpu.dll +0 -0
- nexaai/binds/{nexaml → npu}/ggml-opencl.dll +0 -0
- nexaai/binds/npu/ggml.dll +0 -0
- nexaai/binds/npu/granite-nano-sdk.dll +0 -0
- nexaai/binds/npu/granite4-sdk.dll +0 -0
- nexaai/binds/npu/jina-rerank-sdk.dll +0 -0
- nexaai/binds/npu/liquid-sdk.dll +0 -0
- nexaai/binds/npu/llama3-3b-sdk.dll +0 -0
- nexaai/binds/npu/nexa-mm-process.dll +0 -0
- nexaai/binds/npu/nexa-sampling.dll +0 -0
- nexaai/binds/npu/nexa_plugin.dll +0 -0
- nexaai/binds/npu/omni-neural-sdk.dll +0 -0
- nexaai/binds/npu/openblas.dll +0 -0
- nexaai/binds/npu/paddleocr-sdk.dll +0 -0
- nexaai/binds/npu/parakeet-sdk.dll +0 -0
- nexaai/binds/npu/phi3-5-sdk.dll +0 -0
- nexaai/binds/npu/phi4-sdk.dll +0 -0
- nexaai/binds/npu/pyannote-sdk.dll +0 -0
- nexaai/binds/npu/qwen3-4b-sdk.dll +0 -0
- nexaai/binds/npu/qwen3vl-sdk.dll +0 -0
- nexaai/binds/npu/qwen3vl-vision.dll +0 -0
- nexaai/binds/npu/yolov12-sdk.dll +0 -0
- nexaai/binds/npu/zlib1.dll +0 -0
- nexaai/binds/rerank_bind.cp313-win_arm64.pyd +0 -0
- nexaai/binds/vlm_bind.cp313-win_arm64.pyd +0 -0
- nexaai/common.py +105 -105
- nexaai/cv.py +93 -93
- nexaai/cv_impl/mlx_cv_impl.py +89 -89
- nexaai/cv_impl/pybind_cv_impl.py +32 -32
- nexaai/embedder.py +73 -73
- nexaai/embedder_impl/mlx_embedder_impl.py +118 -118
- nexaai/embedder_impl/pybind_embedder_impl.py +96 -96
- nexaai/image_gen.py +141 -141
- nexaai/image_gen_impl/mlx_image_gen_impl.py +292 -292
- nexaai/image_gen_impl/pybind_image_gen_impl.py +85 -85
- nexaai/llm.py +98 -98
- nexaai/llm_impl/mlx_llm_impl.py +271 -271
- nexaai/llm_impl/pybind_llm_impl.py +220 -220
- nexaai/log.py +92 -92
- nexaai/rerank.py +57 -57
- nexaai/rerank_impl/mlx_rerank_impl.py +94 -94
- nexaai/rerank_impl/pybind_rerank_impl.py +136 -136
- nexaai/runtime.py +68 -68
- nexaai/runtime_error.py +24 -24
- nexaai/tts.py +75 -75
- nexaai/tts_impl/mlx_tts_impl.py +94 -94
- nexaai/tts_impl/pybind_tts_impl.py +43 -43
- nexaai/utils/decode.py +17 -17
- nexaai/utils/manifest_utils.py +531 -531
- nexaai/utils/model_manager.py +1562 -1562
- nexaai/utils/model_types.py +49 -49
- nexaai/utils/progress_tracker.py +384 -384
- nexaai/utils/quantization_utils.py +245 -245
- nexaai/vlm.py +129 -129
- nexaai/vlm_impl/mlx_vlm_impl.py +258 -258
- nexaai/vlm_impl/pybind_vlm_impl.py +256 -256
- {nexaai-1.0.21rc5.dist-info → nexaai-1.0.21rc16.dist-info}/METADATA +1 -1
- nexaai-1.0.21rc16.dist-info/RECORD +154 -0
- nexaai/binds/nexaml/FLAC.dll +0 -0
- nexaai/binds/nexaml/fftw3.dll +0 -0
- nexaai/binds/nexaml/fftw3f.dll +0 -0
- nexaai/binds/nexaml/ggml-base.dll +0 -0
- nexaai/binds/nexaml/ggml-cpu.dll +0 -0
- nexaai/binds/nexaml/ggml.dll +0 -0
- nexaai/binds/nexaml/libmp3lame.DLL +0 -0
- nexaai/binds/nexaml/mpg123.dll +0 -0
- nexaai/binds/nexaml/nexa-mm-process.dll +0 -0
- nexaai/binds/nexaml/nexa-sampling.dll +0 -0
- nexaai/binds/nexaml/nexa_plugin.dll +0 -0
- nexaai/binds/nexaml/nexaproc.dll +0 -0
- nexaai/binds/nexaml/ogg.dll +0 -0
- nexaai/binds/nexaml/opus.dll +0 -0
- nexaai/binds/nexaml/qwen3-vl.dll +0 -0
- nexaai/binds/nexaml/qwen3vl-vision.dll +0 -0
- nexaai/binds/nexaml/vorbis.dll +0 -0
- nexaai/binds/nexaml/vorbisenc.dll +0 -0
- nexaai-1.0.21rc5.dist-info/RECORD +0 -162
- {nexaai-1.0.21rc5.dist-info → nexaai-1.0.21rc16.dist-info}/WHEEL +0 -0
- {nexaai-1.0.21rc5.dist-info → nexaai-1.0.21rc16.dist-info}/top_level.txt +0 -0
@@ -1,256 +1,256 @@
(The hunk replaces the entire 256-line file; the removed and re-added lines are textually identical, so the file content is shown once below.)

from typing import Generator, Optional, List, Dict, Any, Union
import queue
import threading
from pathlib import Path

from nexaai.common import ModelConfig, GenerationConfig, MultiModalMessage, PluginID
from nexaai.binds import vlm_bind, common_bind
from nexaai.runtime import _ensure_runtime
from nexaai.vlm import VLM
from nexaai.base import ProfilingData
from nexaai.runtime_error import ContextLengthExceededError, GenerationError

# Error codes from ml.h
ML_SUCCESS = 0
ML_ERROR_LLM_TOKENIZATION_CONTEXT_LENGTH = -200004


class PyBindVLMImpl(VLM):
    def __init__(self, handle: any, m_cfg: ModelConfig = ModelConfig()):
        """Private constructor, should not be called directly."""
        super().__init__(m_cfg)
        self._handle = handle  # This is a py::capsule
        self._profiling_data = None

    @classmethod
    def _load_from(cls,
                   local_path: str,
                   mmproj_path: str = None,
                   model_name: Optional[str] = None,
                   m_cfg: ModelConfig = ModelConfig(),
                   plugin_id: Union[PluginID, str] = PluginID.LLAMA_CPP,
                   device_id: Optional[str] = None
                   ) -> 'PyBindVLMImpl':
        """Load VLM model from local path.

        Args:
            local_path: Path to the main model file
            mmproj_path: Path to the multimodal projection file
            m_cfg: Model configuration
            plugin_id: Plugin identifier
            device_id: Optional device ID (not used in current binding)

        Returns:
            PyBindVLMImpl instance
        """
        _ensure_runtime()

        config = common_bind.ModelConfig()

        config.n_ctx = m_cfg.n_ctx
        if m_cfg.n_threads is not None:
            config.n_threads = m_cfg.n_threads
        if m_cfg.n_threads_batch is not None:
            config.n_threads_batch = m_cfg.n_threads_batch
        if m_cfg.n_batch is not None:
            config.n_batch = m_cfg.n_batch
        if m_cfg.n_ubatch is not None:
            config.n_ubatch = m_cfg.n_ubatch
        if m_cfg.n_seq_max is not None:
            config.n_seq_max = m_cfg.n_seq_max
        config.n_gpu_layers = m_cfg.n_gpu_layers

        # handle chat template strings
        if m_cfg.chat_template_path:
            config.chat_template_path = m_cfg.chat_template_path

        if m_cfg.chat_template_content:
            config.chat_template_content = m_cfg.chat_template_content

        # Create handle: returns py::capsule with automatic cleanup
        # Convert enum to string for C++ binding
        plugin_id_str = plugin_id.value if isinstance(plugin_id, PluginID) else plugin_id
        handle = vlm_bind.create_vlm(
            model_path=local_path,
            mmproj_path=mmproj_path,
            model_name=model_name,
            model_config=config,
            plugin_id=plugin_id_str,
            device_id=device_id
        )
        return cls(handle, m_cfg)

    def eject(self):
        """Release the model from memory."""
        # py::capsule handles cleanup automatically
        del self._handle
        self._handle = None

    def reset(self):
        """
        Reset the VLM model context and KV cache. If not reset, the model will skip the number of evaluated tokens and treat tokens after those as the new incremental tokens.
        If your past chat history changed, or you are starting a new chat, you should always reset the model before running generate.
        """
        vlm_bind.ml_vlm_reset(self._handle)

    def apply_chat_template(
        self,
        messages: List[MultiModalMessage],
        tools: Optional[List[Dict[str, Any]]] = None,
        enable_thinking: bool = True
    ) -> str:
        """Apply the chat template to multimodal messages."""
        payload = []
        for msg in messages:
            role = msg["role"]
            blocks = []

            for c in msg["content"]:
                t = c["type"]
                if t == "text":
                    blocks.append({"type": "text", "text": c.get("text", "") or ""})
                else:
                    # Pass through the original structure for image, audio, and any other types
                    # Let vlm-bind.cpp handle field extraction (text/url/path)
                    blocks.append(c)

            payload.append({"role": role, "content": blocks})

        result = vlm_bind.ml_vlm_apply_chat_template(self._handle, payload, tools, enable_thinking)
        return result

    def generate_stream(self, prompt: str, g_cfg: GenerationConfig = GenerationConfig()) -> Generator[str, None, None]:
        """Generate text with streaming."""
        token_queue = queue.Queue()
        exception_container = [None]
        self.reset_cancel()  # Reset cancel flag before generation

        def on_token(token: str, user_data) -> bool:
            if self._cancel_event.is_set():
                token_queue.put(('end', None))
                return False  # Stop generation
            try:
                token_queue.put(('token', token))
                return True  # Continue generation
            except Exception as e:
                exception_container[0] = e
                return False  # Stop generation

        config = self._convert_generation_config(g_cfg)

        # Run generation in thread
        def generate():
            try:
                result = vlm_bind.ml_vlm_generate(
                    handle=self._handle,
                    prompt=prompt,
                    config=config,
                    on_token=on_token,
                    user_data=None
                )

                # Check for errors in result
                error_code = result.get("error_code", ML_SUCCESS)
                if error_code != ML_SUCCESS:
                    error_message = result.get("error_message", "Unknown error")
                    if error_code == ML_ERROR_LLM_TOKENIZATION_CONTEXT_LENGTH:
                        exception_container[0] = ContextLengthExceededError(error_message, error_code)
                    else:
                        exception_container[0] = GenerationError(error_message, error_code)
                    token_queue.put(('end', None))
                    return

                self._profiling_data = ProfilingData.from_dict(result.get("profile_data", {}))
            except Exception as e:
                exception_container[0] = e
            finally:
                token_queue.put(('end', None))

        thread = threading.Thread(target=generate)
        thread.start()

        # Yield tokens as they come
        try:
            while True:
                msg_type, token = token_queue.get()
                if msg_type == 'token':
                    yield token
                elif msg_type in ('error', 'end'):
                    break
        finally:
            thread.join()

        if exception_container[0]:
            raise exception_container[0]

    def generate(self, prompt: str, g_cfg: GenerationConfig = GenerationConfig()) -> str:
        """
        Generate text without streaming.

        Args:
            prompt (str): The prompt to generate text from. For chat models, this is the chat messages after chat template is applied.
            g_cfg (GenerationConfig): Generation configuration.

        Returns:
            str: The generated text.
        """
        config = self._convert_generation_config(g_cfg)
        result = vlm_bind.ml_vlm_generate(
            handle=self._handle,
            prompt=prompt,
            config=config,
            on_token=None,  # No callback for non-streaming
            user_data=None
        )

        # Check for errors in result
        error_code = result.get("error_code", ML_SUCCESS)
        if error_code != ML_SUCCESS:
            error_message = result.get("error_message", "Unknown error")
            if error_code == ML_ERROR_LLM_TOKENIZATION_CONTEXT_LENGTH:
                raise ContextLengthExceededError(error_message, error_code)
            else:
                raise GenerationError(error_message, error_code)

        self._profiling_data = ProfilingData.from_dict(result.get("profile_data", {}))
        return result.get("text", "")

    def get_profiling_data(self) -> Optional[ProfilingData]:
        """Get profiling data."""
        return self._profiling_data

    def _convert_generation_config(self, g_cfg: GenerationConfig):
        """Convert GenerationConfig to binding format."""
        config = common_bind.GenerationConfig()

        # Set basic generation parameters
        config.max_tokens = g_cfg.max_tokens

        if g_cfg.stop_words:
            config.stop = g_cfg.stop_words

        if g_cfg.image_paths:
            config.image_paths = g_cfg.image_paths

        if g_cfg.audio_paths:
            config.audio_paths = g_cfg.audio_paths

        if g_cfg.sampler_config:
            sampler = common_bind.SamplerConfig()
            sampler.temperature = g_cfg.sampler_config.temperature
            sampler.top_p = g_cfg.sampler_config.top_p
            sampler.top_k = g_cfg.sampler_config.top_k
            sampler.repetition_penalty = g_cfg.sampler_config.repetition_penalty
            sampler.presence_penalty = g_cfg.sampler_config.presence_penalty
            sampler.frequency_penalty = g_cfg.sampler_config.frequency_penalty
            sampler.seed = g_cfg.sampler_config.seed

            if g_cfg.sampler_config.grammar_path:
                sampler.grammar_path = g_cfg.sampler_config.grammar_path

            if g_cfg.sampler_config.grammar_string:
                sampler.grammar_string = g_cfg.sampler_config.grammar_string

            config.sampler_config = sampler

        return config
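
For orientation, here is a minimal usage sketch of the class shown in this diff. It only calls methods visible in the file above (_load_from, reset, apply_chat_template, generate_stream, get_profiling_data, eject); the model and image paths and the exact shape of the image content block are illustrative assumptions, and applications would normally load models through the public VLM API rather than the private _load_from constructor.

from nexaai.common import ModelConfig, GenerationConfig, PluginID
from nexaai.runtime_error import ContextLengthExceededError
from nexaai.vlm_impl.pybind_vlm_impl import PyBindVLMImpl

# Hypothetical paths; substitute a real GGUF model and mmproj file.
vlm = PyBindVLMImpl._load_from(
    local_path="models/model.gguf",
    mmproj_path="models/mmproj.gguf",
    m_cfg=ModelConfig(),
    plugin_id=PluginID.LLAMA_CPP,
)

# Non-text content blocks are passed through to the binding, which extracts
# fields such as text/url/path, so the "path" key below is an assumption.
messages = [
    {"role": "user", "content": [
        {"type": "text", "text": "Describe this image."},
        {"type": "image", "path": "images/example.png"},
    ]},
]

vlm.reset()  # clear context/KV cache before starting a new chat
prompt = vlm.apply_chat_template(messages)

try:
    for token in vlm.generate_stream(prompt, GenerationConfig()):
        print(token, end="", flush=True)
except ContextLengthExceededError as e:
    print(f"\nPrompt exceeded the configured context length: {e}")

print(vlm.get_profiling_data())
vlm.eject()

As _convert_generation_config shows, media can alternatively be supplied through GenerationConfig.image_paths and GenerationConfig.audio_paths rather than inside the chat messages.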