nexaai 1.0.21rc5-cp313-cp313-win_arm64.whl → 1.0.21rc14-cp313-cp313-win_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nexaai might be problematic.
- nexaai/__init__.py +95 -95
- nexaai/_stub.cp313-win_arm64.pyd +0 -0
- nexaai/_version.py +4 -1
- nexaai/asr.py +68 -65
- nexaai/asr_impl/mlx_asr_impl.py +92 -92
- nexaai/asr_impl/pybind_asr_impl.py +127 -44
- nexaai/base.py +39 -39
- nexaai/binds/__init__.py +6 -5
- nexaai/binds/asr_bind.cp313-win_arm64.pyd +0 -0
- nexaai/binds/common_bind.cp313-win_arm64.pyd +0 -0
- nexaai/binds/cpu_gpu/ggml-base.dll +0 -0
- nexaai/binds/cpu_gpu/ggml-cpu.dll +0 -0
- nexaai/binds/cpu_gpu/ggml-opencl.dll +0 -0
- nexaai/binds/cpu_gpu/ggml.dll +0 -0
- nexaai/binds/cpu_gpu/mtmd.dll +0 -0
- nexaai/binds/cpu_gpu/nexa_cpu_gpu.dll +0 -0
- nexaai/binds/cpu_gpu/nexa_plugin.dll +0 -0
- nexaai/binds/embedder_bind.cp313-win_arm64.pyd +0 -0
- nexaai/binds/libcrypto-3-arm64.dll +0 -0
- nexaai/binds/libssl-3-arm64.dll +0 -0
- nexaai/binds/llm_bind.cp313-win_arm64.pyd +0 -0
- nexaai/binds/nexa_bridge.dll +0 -0
- nexaai/binds/npu/convnext-sdk.dll +0 -0
- nexaai/binds/npu/embed-gemma-sdk.dll +0 -0
- nexaai/binds/npu/ggml-base.dll +0 -0
- nexaai/binds/npu/ggml-cpu.dll +0 -0
- nexaai/binds/npu/ggml-opencl.dll +0 -0
- nexaai/binds/npu/ggml.dll +0 -0
- nexaai/binds/npu/granite-nano-sdk.dll +0 -0
- nexaai/binds/npu/granite4-sdk.dll +0 -0
- nexaai/binds/npu/jina-rerank-sdk.dll +0 -0
- nexaai/binds/npu/liquid-sdk.dll +0 -0
- nexaai/binds/npu/llama3-3b-sdk.dll +0 -0
- nexaai/binds/npu/nexa-mm-process.dll +0 -0
- nexaai/binds/npu/nexa-sampling.dll +0 -0
- nexaai/binds/npu/nexa_plugin.dll +0 -0
- nexaai/binds/npu/omni-neural-sdk.dll +0 -0
- nexaai/binds/npu/openblas.dll +0 -0
- nexaai/binds/npu/paddleocr-sdk.dll +0 -0
- nexaai/binds/npu/parakeet-sdk.dll +0 -0
- nexaai/binds/npu/phi3-5-sdk.dll +0 -0
- nexaai/binds/npu/phi4-sdk.dll +0 -0
- nexaai/binds/npu/pyannote-sdk.dll +0 -0
- nexaai/binds/npu/qwen3-4b-sdk.dll +0 -0
- nexaai/binds/npu/qwen3vl-sdk.dll +0 -0
- nexaai/binds/npu/qwen3vl-vision.dll +0 -0
- nexaai/binds/npu/yolov12-sdk.dll +0 -0
- nexaai/binds/npu/zlib1.dll +0 -0
- nexaai/binds/rerank_bind.cp313-win_arm64.pyd +0 -0
- nexaai/binds/vlm_bind.cp313-win_arm64.pyd +0 -0
- nexaai/common.py +105 -105
- nexaai/cv.py +93 -93
- nexaai/cv_impl/mlx_cv_impl.py +89 -89
- nexaai/cv_impl/pybind_cv_impl.py +32 -32
- nexaai/embedder.py +73 -73
- nexaai/embedder_impl/mlx_embedder_impl.py +118 -118
- nexaai/embedder_impl/pybind_embedder_impl.py +96 -96
- nexaai/image_gen.py +141 -141
- nexaai/image_gen_impl/mlx_image_gen_impl.py +292 -292
- nexaai/image_gen_impl/pybind_image_gen_impl.py +85 -85
- nexaai/llm.py +98 -98
- nexaai/llm_impl/mlx_llm_impl.py +271 -271
- nexaai/llm_impl/pybind_llm_impl.py +220 -220
- nexaai/log.py +92 -92
- nexaai/rerank.py +57 -57
- nexaai/rerank_impl/mlx_rerank_impl.py +94 -94
- nexaai/rerank_impl/pybind_rerank_impl.py +136 -136
- nexaai/runtime.py +68 -68
- nexaai/runtime_error.py +24 -24
- nexaai/tts.py +75 -75
- nexaai/tts_impl/mlx_tts_impl.py +94 -94
- nexaai/tts_impl/pybind_tts_impl.py +43 -43
- nexaai/utils/decode.py +17 -17
- nexaai/utils/manifest_utils.py +531 -531
- nexaai/utils/model_manager.py +1562 -1562
- nexaai/utils/model_types.py +49 -49
- nexaai/utils/progress_tracker.py +384 -384
- nexaai/utils/quantization_utils.py +245 -245
- nexaai/vlm.py +129 -129
- nexaai/vlm_impl/mlx_vlm_impl.py +258 -258
- nexaai/vlm_impl/pybind_vlm_impl.py +256 -256
- {nexaai-1.0.21rc5.dist-info → nexaai-1.0.21rc14.dist-info}/METADATA +1 -1
- nexaai-1.0.21rc14.dist-info/RECORD +154 -0
- nexaai/binds/nexaml/FLAC.dll +0 -0
- nexaai/binds/nexaml/fftw3.dll +0 -0
- nexaai/binds/nexaml/fftw3f.dll +0 -0
- nexaai/binds/nexaml/ggml-base.dll +0 -0
- nexaai/binds/nexaml/ggml-cpu.dll +0 -0
- nexaai/binds/nexaml/ggml-opencl.dll +0 -0
- nexaai/binds/nexaml/ggml.dll +0 -0
- nexaai/binds/nexaml/libmp3lame.DLL +0 -0
- nexaai/binds/nexaml/mpg123.dll +0 -0
- nexaai/binds/nexaml/nexa-mm-process.dll +0 -0
- nexaai/binds/nexaml/nexa-sampling.dll +0 -0
- nexaai/binds/nexaml/nexa_plugin.dll +0 -0
- nexaai/binds/nexaml/nexaproc.dll +0 -0
- nexaai/binds/nexaml/ogg.dll +0 -0
- nexaai/binds/nexaml/opus.dll +0 -0
- nexaai/binds/nexaml/qwen3-vl.dll +0 -0
- nexaai/binds/nexaml/qwen3vl-vision.dll +0 -0
- nexaai/binds/nexaml/vorbis.dll +0 -0
- nexaai/binds/nexaml/vorbisenc.dll +0 -0
- nexaai-1.0.21rc5.dist-info/RECORD +0 -162
- {nexaai-1.0.21rc5.dist-info → nexaai-1.0.21rc14.dist-info}/WHEEL +0 -0
- {nexaai-1.0.21rc5.dist-info → nexaai-1.0.21rc14.dist-info}/top_level.txt +0 -0
nexaai/llm_impl/pybind_llm_impl.py
CHANGED

@@ -1,220 +1,220 @@
from typing import Generator, Optional, Union
import queue
import threading

from nexaai.base import ProfilingData
from nexaai.common import ModelConfig, GenerationConfig, ChatMessage, PluginID
from nexaai.binds import llm_bind, common_bind
from nexaai.runtime import _ensure_runtime
from nexaai.llm import LLM


class PyBindLLMImpl(LLM):
    def __init__(self, handle: any, m_cfg: ModelConfig = ModelConfig()):
        """Private constructor, should not be called directly."""
        super().__init__(m_cfg)
        self._handle = handle  # This is a py::capsule
        self._profiling_data = None

    @classmethod
    def _load_from(cls,
                   local_path: str,
                   model_name: Optional[str] = None,
                   tokenizer_path: Optional[str] = None,
                   m_cfg: ModelConfig = ModelConfig(),
                   plugin_id: Union[PluginID, str] = PluginID.LLAMA_CPP,
                   device_id: Optional[str] = None
                   ) -> 'PyBindLLMImpl':
        """Load model from local path."""
        _ensure_runtime()

        config = common_bind.ModelConfig()

        config.n_ctx = m_cfg.n_ctx
        if m_cfg.n_threads is not None:
            config.n_threads = m_cfg.n_threads
        if m_cfg.n_threads_batch is not None:
            config.n_threads_batch = m_cfg.n_threads_batch
        if m_cfg.n_batch is not None:
            config.n_batch = m_cfg.n_batch
        if m_cfg.n_ubatch is not None:
            config.n_ubatch = m_cfg.n_ubatch
        if m_cfg.n_seq_max is not None:
            config.n_seq_max = m_cfg.n_seq_max
        if m_cfg.n_gpu_layers is not None:
            config.n_gpu_layers = m_cfg.n_gpu_layers

        # handle chat template strings
        if m_cfg.chat_template_path:
            config.chat_template_path = m_cfg.chat_template_path

        if m_cfg.chat_template_content:
            config.chat_template_content = m_cfg.chat_template_content

        # Create handle: returns py::capsule with automatic cleanup
        # Convert enum to string for C++ binding
        plugin_id_str = plugin_id.value if isinstance(plugin_id, PluginID) else plugin_id
        handle = llm_bind.ml_llm_create(
            model_path=local_path,
            model_name=model_name,
            tokenizer_path=tokenizer_path,
            model_config=config,
            plugin_id=plugin_id_str,
            device_id=device_id
        )
        return cls(handle, m_cfg)

    def eject(self):
        """Release the model from memory."""
        # py::capsule handles cleanup automatically
        del self._handle
        self._handle = None

    def apply_chat_template(self, messages: list[ChatMessage], tools: Optional[str] = None, enable_thinking: bool = True, add_generation_prompt: bool = True) -> str:
        """Apply the chat template to messages."""
        # Convert TypedDict to list of dicts for binding
        message_dicts = [
            {"role": m["role"], "content": m["content"]}
            for m in messages
        ]
        return llm_bind.ml_llm_apply_chat_template(self._handle, message_dicts)

    def generate_stream(self, prompt: str, g_cfg: GenerationConfig = GenerationConfig()) -> Generator[str, None, None]:
        """Generate text with streaming."""
        token_queue = queue.Queue()
        exception_container = [None]
        self.reset_cancel()  # Reset cancel flag before generation

        def on_token(token: str, user_data) -> bool:
            if self._cancel_event.is_set():
                token_queue.put(('end', None))
                return False  # Stop generation
            try:
                token_queue.put(('token', token))
                return True  # Continue generation
            except Exception as e:
                exception_container[0] = e
                return False  # Stop generation

        config = self._convert_generation_config(g_cfg)

        # Run generation in thread
        def generate():
            try:
                result = llm_bind.ml_llm_generate(
                    handle=self._handle,
                    prompt=prompt,
                    config=config,
                    on_token=on_token,
                    user_data=None
                )
                self._profiling_data = ProfilingData.from_dict(result.get("profile_data", {}))
            except Exception as e:
                exception_container[0] = e
            finally:
                token_queue.put(('end', None))

        thread = threading.Thread(target=generate)
        thread.start()

        # Yield tokens as they come
        try:
            while True:
                msg_type, token = token_queue.get()
                if msg_type == 'token':
                    yield token
                elif msg_type in ('error', 'end'):
                    break
        finally:
            thread.join()

        if exception_container[0]:
            raise exception_container[0]

    def generate(self, prompt: str, g_cfg: GenerationConfig = GenerationConfig()) -> str:
        """
        Generate text without streaming.

        Args:
            prompt (str): The prompt to generate text from. For chat models, this is the chat messages after chat template is applied.
            g_cfg (GenerationConfig): Generation configuration.

        Returns:
            str: The generated text.
        """
        config = self._convert_generation_config(g_cfg)
        result = llm_bind.ml_llm_generate(
            handle=self._handle,
            prompt=prompt,
            config=config,
            on_token=None,  # No callback for non-streaming
            user_data=None
        )

        self._profiling_data = ProfilingData.from_dict(result.get("profile_data", {}))
        return result.get("text", "")

    def get_profiling_data(self) -> Optional[ProfilingData]:
        """Get profiling data."""
        return self._profiling_data

    def save_kv_cache(self, path: str):
        """
        Save the key-value cache to the file.

        Args:
            path (str): The path to the file.
        """
        llm_bind.ml_llm_save_kv_cache(self._handle, path)

    def load_kv_cache(self, path: str):
        """
        Load the key-value cache from the file.

        Args:
            path (str): The path to the file.
        """
        llm_bind.ml_llm_load_kv_cache(self._handle, path)

    def reset(self):
        """
        Reset the LLM model context and KV cache. If not reset, the model will skip the number of evaluated tokens and treat tokens after those as the new incremental tokens.
        If your past chat history changed, or you are starting a new chat, you should always reset the model before running generate.
        """
        llm_bind.ml_llm_reset(self._handle)

    def _convert_generation_config(self, g_cfg: GenerationConfig):
        """Convert GenerationConfig to binding format."""
        config = common_bind.GenerationConfig()

        # Set basic generation parameters
        config.max_tokens = g_cfg.max_tokens

        if g_cfg.stop_words:
            config.stop = g_cfg.stop_words

        if g_cfg.image_paths:
            config.image_paths = g_cfg.image_paths

        if g_cfg.audio_paths:
            config.audio_paths = g_cfg.audio_paths

        if g_cfg.sampler_config:
            sampler = common_bind.SamplerConfig()
            sampler.temperature = g_cfg.sampler_config.temperature
            sampler.top_p = g_cfg.sampler_config.top_p
            sampler.top_k = g_cfg.sampler_config.top_k
            sampler.repetition_penalty = g_cfg.sampler_config.repetition_penalty
            sampler.presence_penalty = g_cfg.sampler_config.presence_penalty
            sampler.frequency_penalty = g_cfg.sampler_config.frequency_penalty
            sampler.seed = g_cfg.sampler_config.seed

            if g_cfg.sampler_config.grammar_path:
                sampler.grammar_path = g_cfg.sampler_config.grammar_path

            if g_cfg.sampler_config.grammar_string:
                sampler.grammar_string = g_cfg.sampler_config.grammar_string

            config.sampler_config = sampler

        return config
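To make the role of this changed module concrete, here is a minimal usage sketch assembled from the class and docstrings shown above; the model path, the sampler defaults, and the direct call to the private _load_from() classmethod are illustrative assumptions, not documented package usage.

# Hypothetical usage sketch of PyBindLLMImpl; the path and values are placeholders.
from nexaai.common import ModelConfig, GenerationConfig, ChatMessage, PluginID
from nexaai.llm_impl.pybind_llm_impl import PyBindLLMImpl

# _load_from() is private in the package; it is called directly here only for illustration.
llm = PyBindLLMImpl._load_from(
    local_path="/path/to/model.gguf",  # placeholder, not from the package
    m_cfg=ModelConfig(),
    plugin_id=PluginID.LLAMA_CPP,
)

messages: list[ChatMessage] = [{"role": "user", "content": "Hello"}]
prompt = llm.apply_chat_template(messages)

llm.reset()  # per the reset() docstring: reset before generating on a new chat history
for token in llm.generate_stream(prompt, GenerationConfig()):
    print(token, end="", flush=True)

print(llm.get_profiling_data())
llm.eject()  # release the model from memory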
nexaai/log.py
CHANGED
@@ -1,92 +1,92 @@
"""
Logging configuration for NexaAI bridge.

This module provides a minimal API to configure bridge-wide logging
to route into Python's logging system.
"""

import logging
import threading
from enum import IntEnum
from typing import Optional

from nexaai.binds import common_bind
from nexaai.runtime import is_initialized


class LogLevel(IntEnum):
    """Log levels matching ml_LogLevel from ml.h"""
    TRACE = 0
    DEBUG = 1
    INFO = 2
    WARN = 3
    ERROR = 4


# Module-level state
_config_lock = threading.Lock()
_current_logger: Optional[logging.Logger] = None


def set_logger(logger: Optional[logging.Logger] = None, *, strict: bool = True) -> None:
    """
    Set the process-wide bridge logger.

    Args:
        logger: Python logger to receive bridge logs. If None, uses "nexaai.ml" logger.
        strict: If True, raises if called after runtime initialization.
            If False, attempts to set anyway (best-effort).

    Raises:
        RuntimeError: If strict=True and runtime is already initialized.
    """
    global _current_logger

    with _config_lock:
        # Check initialization state if strict mode
        if strict and is_initialized():
            raise RuntimeError(
                "Cannot configure logging after runtime initialization. "
                "Call set_logger() before creating any models, or use strict=False for best-effort."
            )

        # Use default logger if none provided
        if logger is None:
            logger = logging.getLogger("nexaai.ml")

        _current_logger = logger

        # Set the C callback
        common_bind.ml_set_log(_log_callback)


def _log_callback(level: int, message: str) -> None:
    """Internal callback that forwards bridge logs to Python logger."""
    if _current_logger is None:
        return

    # Map bridge log levels to Python logging levels
    if level == LogLevel.TRACE or level == LogLevel.DEBUG:
        _current_logger.debug(message)
    elif level == LogLevel.INFO:
        _current_logger.info(message)
    elif level == LogLevel.WARN:
        _current_logger.warning(message)
    elif level == LogLevel.ERROR:
        _current_logger.error(message)
    else:
        # Fallback for unknown levels
        _current_logger.info(f"[Level {level}] {message}")


def get_error_message(error_code: int) -> str:
    """
    Get error message string for error code.

    Args:
        error_code: ML error code (typically negative)

    Returns:
        Human-readable error message
    """
    return common_bind.ml_get_error_message(error_code)
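For context, a minimal sketch of how the logging hooks above might be wired up, based on the set_logger() docstring; the logger name and the error code passed to get_error_message() are arbitrary illustrative choices.

import logging
from nexaai.log import set_logger, get_error_message

logging.basicConfig(level=logging.DEBUG)

# Per the docstring, this must run before the runtime is initialized
# (i.e. before any model is created), otherwise strict=True raises RuntimeError.
set_logger(logging.getLogger("my_app.nexa"))

# Translate a bridge error code into a human-readable message (the code here is illustrative).
print(get_error_message(-1))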