nexaai 1.0.21rc5-cp313-cp313-win_arm64.whl → 1.0.21rc14-cp313-cp313-win_arm64.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.

Potentially problematic release. This version of nexaai might be problematic.

Files changed (105)
  1. nexaai/__init__.py +95 -95
  2. nexaai/_stub.cp313-win_arm64.pyd +0 -0
  3. nexaai/_version.py +4 -1
  4. nexaai/asr.py +68 -65
  5. nexaai/asr_impl/mlx_asr_impl.py +92 -92
  6. nexaai/asr_impl/pybind_asr_impl.py +127 -44
  7. nexaai/base.py +39 -39
  8. nexaai/binds/__init__.py +6 -5
  9. nexaai/binds/asr_bind.cp313-win_arm64.pyd +0 -0
  10. nexaai/binds/common_bind.cp313-win_arm64.pyd +0 -0
  11. nexaai/binds/cpu_gpu/ggml-base.dll +0 -0
  12. nexaai/binds/cpu_gpu/ggml-cpu.dll +0 -0
  13. nexaai/binds/cpu_gpu/ggml-opencl.dll +0 -0
  14. nexaai/binds/cpu_gpu/ggml.dll +0 -0
  15. nexaai/binds/cpu_gpu/mtmd.dll +0 -0
  16. nexaai/binds/cpu_gpu/nexa_cpu_gpu.dll +0 -0
  17. nexaai/binds/cpu_gpu/nexa_plugin.dll +0 -0
  18. nexaai/binds/embedder_bind.cp313-win_arm64.pyd +0 -0
  19. nexaai/binds/libcrypto-3-arm64.dll +0 -0
  20. nexaai/binds/libssl-3-arm64.dll +0 -0
  21. nexaai/binds/llm_bind.cp313-win_arm64.pyd +0 -0
  22. nexaai/binds/nexa_bridge.dll +0 -0
  23. nexaai/binds/npu/convnext-sdk.dll +0 -0
  24. nexaai/binds/npu/embed-gemma-sdk.dll +0 -0
  25. nexaai/binds/npu/ggml-base.dll +0 -0
  26. nexaai/binds/npu/ggml-cpu.dll +0 -0
  27. nexaai/binds/npu/ggml-opencl.dll +0 -0
  28. nexaai/binds/npu/ggml.dll +0 -0
  29. nexaai/binds/npu/granite-nano-sdk.dll +0 -0
  30. nexaai/binds/npu/granite4-sdk.dll +0 -0
  31. nexaai/binds/npu/jina-rerank-sdk.dll +0 -0
  32. nexaai/binds/npu/liquid-sdk.dll +0 -0
  33. nexaai/binds/npu/llama3-3b-sdk.dll +0 -0
  34. nexaai/binds/npu/nexa-mm-process.dll +0 -0
  35. nexaai/binds/npu/nexa-sampling.dll +0 -0
  36. nexaai/binds/npu/nexa_plugin.dll +0 -0
  37. nexaai/binds/npu/omni-neural-sdk.dll +0 -0
  38. nexaai/binds/npu/openblas.dll +0 -0
  39. nexaai/binds/npu/paddleocr-sdk.dll +0 -0
  40. nexaai/binds/npu/parakeet-sdk.dll +0 -0
  41. nexaai/binds/npu/phi3-5-sdk.dll +0 -0
  42. nexaai/binds/npu/phi4-sdk.dll +0 -0
  43. nexaai/binds/npu/pyannote-sdk.dll +0 -0
  44. nexaai/binds/npu/qwen3-4b-sdk.dll +0 -0
  45. nexaai/binds/npu/qwen3vl-sdk.dll +0 -0
  46. nexaai/binds/npu/qwen3vl-vision.dll +0 -0
  47. nexaai/binds/npu/yolov12-sdk.dll +0 -0
  48. nexaai/binds/npu/zlib1.dll +0 -0
  49. nexaai/binds/rerank_bind.cp313-win_arm64.pyd +0 -0
  50. nexaai/binds/vlm_bind.cp313-win_arm64.pyd +0 -0
  51. nexaai/common.py +105 -105
  52. nexaai/cv.py +93 -93
  53. nexaai/cv_impl/mlx_cv_impl.py +89 -89
  54. nexaai/cv_impl/pybind_cv_impl.py +32 -32
  55. nexaai/embedder.py +73 -73
  56. nexaai/embedder_impl/mlx_embedder_impl.py +118 -118
  57. nexaai/embedder_impl/pybind_embedder_impl.py +96 -96
  58. nexaai/image_gen.py +141 -141
  59. nexaai/image_gen_impl/mlx_image_gen_impl.py +292 -292
  60. nexaai/image_gen_impl/pybind_image_gen_impl.py +85 -85
  61. nexaai/llm.py +98 -98
  62. nexaai/llm_impl/mlx_llm_impl.py +271 -271
  63. nexaai/llm_impl/pybind_llm_impl.py +220 -220
  64. nexaai/log.py +92 -92
  65. nexaai/rerank.py +57 -57
  66. nexaai/rerank_impl/mlx_rerank_impl.py +94 -94
  67. nexaai/rerank_impl/pybind_rerank_impl.py +136 -136
  68. nexaai/runtime.py +68 -68
  69. nexaai/runtime_error.py +24 -24
  70. nexaai/tts.py +75 -75
  71. nexaai/tts_impl/mlx_tts_impl.py +94 -94
  72. nexaai/tts_impl/pybind_tts_impl.py +43 -43
  73. nexaai/utils/decode.py +17 -17
  74. nexaai/utils/manifest_utils.py +531 -531
  75. nexaai/utils/model_manager.py +1562 -1562
  76. nexaai/utils/model_types.py +49 -49
  77. nexaai/utils/progress_tracker.py +384 -384
  78. nexaai/utils/quantization_utils.py +245 -245
  79. nexaai/vlm.py +129 -129
  80. nexaai/vlm_impl/mlx_vlm_impl.py +258 -258
  81. nexaai/vlm_impl/pybind_vlm_impl.py +256 -256
  82. {nexaai-1.0.21rc5.dist-info → nexaai-1.0.21rc14.dist-info}/METADATA +1 -1
  83. nexaai-1.0.21rc14.dist-info/RECORD +154 -0
  84. nexaai/binds/nexaml/FLAC.dll +0 -0
  85. nexaai/binds/nexaml/fftw3.dll +0 -0
  86. nexaai/binds/nexaml/fftw3f.dll +0 -0
  87. nexaai/binds/nexaml/ggml-base.dll +0 -0
  88. nexaai/binds/nexaml/ggml-cpu.dll +0 -0
  89. nexaai/binds/nexaml/ggml-opencl.dll +0 -0
  90. nexaai/binds/nexaml/ggml.dll +0 -0
  91. nexaai/binds/nexaml/libmp3lame.DLL +0 -0
  92. nexaai/binds/nexaml/mpg123.dll +0 -0
  93. nexaai/binds/nexaml/nexa-mm-process.dll +0 -0
  94. nexaai/binds/nexaml/nexa-sampling.dll +0 -0
  95. nexaai/binds/nexaml/nexa_plugin.dll +0 -0
  96. nexaai/binds/nexaml/nexaproc.dll +0 -0
  97. nexaai/binds/nexaml/ogg.dll +0 -0
  98. nexaai/binds/nexaml/opus.dll +0 -0
  99. nexaai/binds/nexaml/qwen3-vl.dll +0 -0
  100. nexaai/binds/nexaml/qwen3vl-vision.dll +0 -0
  101. nexaai/binds/nexaml/vorbis.dll +0 -0
  102. nexaai/binds/nexaml/vorbisenc.dll +0 -0
  103. nexaai-1.0.21rc5.dist-info/RECORD +0 -162
  104. {nexaai-1.0.21rc5.dist-info → nexaai-1.0.21rc14.dist-info}/WHEEL +0 -0
  105. {nexaai-1.0.21rc5.dist-info → nexaai-1.0.21rc14.dist-info}/top_level.txt +0 -0
nexaai/vlm_impl/pybind_vlm_impl.py (all 256 lines are removed and re-added; the old and new contents are textually identical, so the file is shown once below)
@@ -1,256 +1,256 @@
from typing import Generator, Optional, List, Dict, Any, Union
import queue
import threading
from pathlib import Path

from nexaai.common import ModelConfig, GenerationConfig, MultiModalMessage, PluginID
from nexaai.binds import vlm_bind, common_bind
from nexaai.runtime import _ensure_runtime
from nexaai.vlm import VLM
from nexaai.base import ProfilingData
from nexaai.runtime_error import ContextLengthExceededError, GenerationError

# Error codes from ml.h
ML_SUCCESS = 0
ML_ERROR_LLM_TOKENIZATION_CONTEXT_LENGTH = -200004


class PyBindVLMImpl(VLM):
    def __init__(self, handle: any, m_cfg: ModelConfig = ModelConfig()):
        """Private constructor, should not be called directly."""
        super().__init__(m_cfg)
        self._handle = handle  # This is a py::capsule
        self._profiling_data = None

    @classmethod
    def _load_from(cls,
                   local_path: str,
                   mmproj_path: str = None,
                   model_name: Optional[str] = None,
                   m_cfg: ModelConfig = ModelConfig(),
                   plugin_id: Union[PluginID, str] = PluginID.LLAMA_CPP,
                   device_id: Optional[str] = None
                   ) -> 'PyBindVLMImpl':
        """Load VLM model from local path.

        Args:
            local_path: Path to the main model file
            mmproj_path: Path to the multimodal projection file
            m_cfg: Model configuration
            plugin_id: Plugin identifier
            device_id: Optional device ID (not used in current binding)

        Returns:
            PyBindVLMImpl instance
        """
        _ensure_runtime()

        config = common_bind.ModelConfig()

        config.n_ctx = m_cfg.n_ctx
        if m_cfg.n_threads is not None:
            config.n_threads = m_cfg.n_threads
        if m_cfg.n_threads_batch is not None:
            config.n_threads_batch = m_cfg.n_threads_batch
        if m_cfg.n_batch is not None:
            config.n_batch = m_cfg.n_batch
        if m_cfg.n_ubatch is not None:
            config.n_ubatch = m_cfg.n_ubatch
        if m_cfg.n_seq_max is not None:
            config.n_seq_max = m_cfg.n_seq_max
        config.n_gpu_layers = m_cfg.n_gpu_layers

        # handle chat template strings
        if m_cfg.chat_template_path:
            config.chat_template_path = m_cfg.chat_template_path

        if m_cfg.chat_template_content:
            config.chat_template_content = m_cfg.chat_template_content

        # Create handle : returns py::capsule with automatic cleanup
        # Convert enum to string for C++ binding
        plugin_id_str = plugin_id.value if isinstance(plugin_id, PluginID) else plugin_id
        handle = vlm_bind.create_vlm(
            model_path=local_path,
            mmproj_path=mmproj_path,
            model_name=model_name,
            model_config=config,
            plugin_id=plugin_id_str,
            device_id=device_id
        )
        return cls(handle, m_cfg)

    def eject(self):
        """Release the model from memory."""
        # py::capsule handles cleanup automatically
        del self._handle
        self._handle = None

    def reset(self):
        """
        Reset the VLM model context and KV cache. If not reset, the model will skip the number of evaluated tokens and treat tokens after those as the new incremental tokens.
        If your past chat history changed, or you are starting a new chat, you should always reset the model before running generate.
        """
        vlm_bind.ml_vlm_reset(self._handle)

    def apply_chat_template(
        self,
        messages: List[MultiModalMessage],
        tools: Optional[List[Dict[str, Any]]] = None,
        enable_thinking: bool = True
    ) -> str:
        """Apply the chat template to multimodal messages."""
        payload = []
        for msg in messages:
            role = msg["role"]
            blocks = []

            for c in msg["content"]:
                t = c["type"]
                if t == "text":
                    blocks.append({"type": "text", "text": c.get("text","") or ""})
                else:
                    # Pass through the original structure for image, audio, and any other types
                    # Let vlm-bind.cpp handle field extraction (text/url/path)
                    blocks.append(c)

            payload.append({"role": role, "content": blocks})

        result = vlm_bind.ml_vlm_apply_chat_template(self._handle, payload, tools, enable_thinking)
        return result

    def generate_stream(self, prompt: str, g_cfg: GenerationConfig = GenerationConfig()) -> Generator[str, None, None]:
        """Generate text with streaming."""
        token_queue = queue.Queue()
        exception_container = [None]
        self.reset_cancel()  # Reset cancel flag before generation

        def on_token(token: str, user_data) -> bool:
            if self._cancel_event.is_set():
                token_queue.put(('end', None))
                return False  # Stop generation
            try:
                token_queue.put(('token', token))
                return True  # Continue generation
            except Exception as e:
                exception_container[0] = e
                return False  # Stop generation

        config = self._convert_generation_config(g_cfg)

        # Run generation in thread
        def generate():
            try:
                result = vlm_bind.ml_vlm_generate(
                    handle=self._handle,
                    prompt=prompt,
                    config=config,
                    on_token=on_token,
                    user_data=None
                )

                # Check for errors in result
                error_code = result.get("error_code", ML_SUCCESS)
                if error_code != ML_SUCCESS:
                    error_message = result.get("error_message", "Unknown error")
                    if error_code == ML_ERROR_LLM_TOKENIZATION_CONTEXT_LENGTH:
                        exception_container[0] = ContextLengthExceededError(error_message, error_code)
                    else:
                        exception_container[0] = GenerationError(error_message, error_code)
                    token_queue.put(('end', None))
                    return

                self._profiling_data = ProfilingData.from_dict(result.get("profile_data", {}))
            except Exception as e:
                exception_container[0] = e
            finally:
                token_queue.put(('end', None))

        thread = threading.Thread(target=generate)
        thread.start()

        # Yield tokens as they come
        try:
            while True:
                msg_type, token = token_queue.get()
                if msg_type == 'token':
                    yield token
                elif msg_type in ('error', 'end'):
                    break
        finally:
            thread.join()

        if exception_container[0]:
            raise exception_container[0]

    def generate(self, prompt: str, g_cfg: GenerationConfig = GenerationConfig()) -> str:
        """
        Generate text without streaming.

        Args:
            prompt (str): The prompt to generate text from. For chat models, this is the chat messages after chat template is applied.
            g_cfg (GenerationConfig): Generation configuration.

        Returns:
            str: The generated text.
        """
        config = self._convert_generation_config(g_cfg)
        result = vlm_bind.ml_vlm_generate(
            handle=self._handle,
            prompt=prompt,
            config=config,
            on_token=None,  # No callback for non-streaming
            user_data=None
        )

        # Check for errors in result
        error_code = result.get("error_code", ML_SUCCESS)
        if error_code != ML_SUCCESS:
            error_message = result.get("error_message", "Unknown error")
            if error_code == ML_ERROR_LLM_TOKENIZATION_CONTEXT_LENGTH:
                raise ContextLengthExceededError(error_message, error_code)
            else:
                raise GenerationError(error_message, error_code)

        self._profiling_data = ProfilingData.from_dict(result.get("profile_data", {}))
        return result.get("text", "")

    def get_profiling_data(self) -> Optional[ProfilingData]:
        """Get profiling data."""
        return self._profiling_data

    def _convert_generation_config(self, g_cfg: GenerationConfig):
        """Convert GenerationConfig to binding format."""
        config = common_bind.GenerationConfig()

        # Set basic generation parameters
        config.max_tokens = g_cfg.max_tokens

        if g_cfg.stop_words:
            config.stop = g_cfg.stop_words

        if g_cfg.image_paths:
            config.image_paths = g_cfg.image_paths

        if g_cfg.audio_paths:
            config.audio_paths = g_cfg.audio_paths

        if g_cfg.sampler_config:
            sampler = common_bind.SamplerConfig()
            sampler.temperature = g_cfg.sampler_config.temperature
            sampler.top_p = g_cfg.sampler_config.top_p
            sampler.top_k = g_cfg.sampler_config.top_k
            sampler.repetition_penalty = g_cfg.sampler_config.repetition_penalty
            sampler.presence_penalty = g_cfg.sampler_config.presence_penalty
            sampler.frequency_penalty = g_cfg.sampler_config.frequency_penalty
            sampler.seed = g_cfg.sampler_config.seed

            if g_cfg.sampler_config.grammar_path:
                sampler.grammar_path = g_cfg.sampler_config.grammar_path

            if g_cfg.sampler_config.grammar_string:
                sampler.grammar_string = g_cfg.sampler_config.grammar_string

            config.sampler_config = sampler

        return config
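For context, a minimal usage sketch of the class in this hunk follows. It is illustrative only: the model, mmproj, and image paths are placeholders; it calls the private _load_from constructor directly because the public loading entry point is not part of this diff; and it assumes GenerationConfig accepts image_paths as a constructor argument, which the hunk only shows being read as an attribute.

from nexaai.common import GenerationConfig, ModelConfig
from nexaai.vlm_impl.pybind_vlm_impl import PyBindVLMImpl

# Placeholder paths; a real caller would point these at downloaded model files.
vlm = PyBindVLMImpl._load_from(
    local_path="models/example-vlm.gguf",
    mmproj_path="models/example-mmproj.gguf",
    m_cfg=ModelConfig(),
)

messages = [{
    "role": "user",
    "content": [
        {"type": "text", "text": "Describe this picture."},
        # Non-text blocks are passed through untouched; vlm-bind.cpp extracts
        # the text/url/path field, so "path" here is an assumed field name.
        {"type": "image", "path": "cat.png"},
    ],
}]

vlm.reset()  # start a fresh context so no stale KV cache is reused
prompt = vlm.apply_chat_template(messages)

# generate_stream runs the native call on a worker thread and yields tokens
# from the internal queue as they arrive.
g_cfg = GenerationConfig(image_paths=["cat.png"])  # assumed constructor kwarg
for token in vlm.generate_stream(prompt, g_cfg):
    print(token, end="", flush=True)

print()
print(vlm.get_profiling_data())
vlm.eject()  # drop the py::capsule handle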
{nexaai-1.0.21rc5.dist-info → nexaai-1.0.21rc14.dist-info}/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: nexaai
- Version: 1.0.21rc5
+ Version: 1.0.21rc14
  Summary: Python bindings for NexaSDK C-lib backend
  Author-email: "Nexa AI, Inc." <dev@nexa.ai>
  Project-URL: Homepage, https://github.com/NexaAI/nexasdk-bridge
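Since the only METADATA change is the version bump, a quick way to confirm which of the two releases is installed in a given environment is to read the installed distribution's version. This small check uses only the standard library and the version strings from this diff.

from importlib.metadata import version

installed = version("nexaai")
print(installed)  # expected to be "1.0.21rc14" after upgrading from "1.0.21rc5"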