nexaai-1.0.17rc10-cp310-cp310-macosx_13_0_x86_64.whl → nexaai-1.0.18-cp310-cp310-macosx_13_0_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of nexaai might be problematic.

nexaai/_stub.cpython-310-darwin.so CHANGED
Binary file
nexaai/_version.py CHANGED
@@ -1,4 +1,4 @@
 # This file is generated by CMake from _version.py.in
 # Do not modify this file manually - it will be overwritten
 
-__version__ = "1.0.17-rc10"
+__version__ = "1.0.18"
nexaai/binds/libnexa_bridge.dylib CHANGED
Binary file
nexaai/mlx_backend/vlm/generate_qwen3_vl.py CHANGED
@@ -41,7 +41,12 @@ except ImportError:
 from ml import ChatMessage
 from dataclasses import dataclass
 from typing import Any, Generator, List, Optional, Sequence, Tuple, Union
-from .generate import GenerationResult
+from .generate import GenerationResult
+
+# Custom exception for context length exceeded
+class ContextLengthExceededError(Exception):
+    """Raised when input context length exceeds model's maximum context size"""
+    pass
 
 @dataclass
 class Qwen3VLBundledModel:
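
As an illustration only (not part of the package), a minimal caller-side sketch of how code that drives generation might branch on the new exception; the class is redefined locally here just to keep the snippet self-contained:

    class ContextLengthExceededError(Exception):
        """Raised when input context length exceeds the model's maximum context size."""

    def run_safely(generate_fn, prompt: str) -> str:
        try:
            return generate_fn(prompt)
        except ContextLengthExceededError as exc:
            # Surface an actionable message instead of a generic failure.
            return f"Prompt rejected: {exc}"

    print(run_safely(lambda p: p.upper(), "hello"))  # -> HELLO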
@@ -67,6 +72,7 @@ def load_qwen3_vl(
 
     Parameters are aligned with .generate.load for compatibility.
     """
+
     model_path = Path(path_or_repo)
     if not model_path.exists():
         if "/" in path_or_repo:
@@ -154,7 +160,6 @@ def load_qwen3_vl(
     if quantization_bits in [4, 8]:
         nn.quantize(llm_model, bits=quantization_bits, group_size=64,
                     class_predicate=quant_predicate)
-    # For f32 (32-bit), no quantization needed
 
     llm_model.load_weights(str(llm_weights_path), strict=True)
 
@@ -166,11 +171,15 @@
 
 def apply_chat_template_qwen3_vl(messages: Sequence[ChatMessage], num_images: int = 0, num_audios: int = 0, tools: Optional[str] = None, enable_thinking: bool = False) -> str:
     """Apply chat template: serialize messages with content as a list of typed items."""
+
     messages_dict = []
-    for msg in messages:
+    for i, msg in enumerate(messages):
         content_items = [{"type": "text", "text": msg.content}]
         messages_dict.append({"role": msg.role, "content": content_items})
-    return json.dumps(messages_dict)
+
+    result = json.dumps(messages_dict)
+
+    return result
 
 
 def stream_generate_qwen3_vl(
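
For illustration, the serialization this template performs can be reproduced standalone; ChatMessage is stubbed as a plain dataclass here (the real one comes from the ml module imported in the diff):

    import json
    from dataclasses import dataclass

    @dataclass
    class ChatMessage:
        role: str
        content: str

    def serialize(messages):
        # Each message becomes {"role": ..., "content": [{"type": "text", "text": ...}]}
        messages_dict = []
        for msg in messages:
            content_items = [{"type": "text", "text": msg.content}]
            messages_dict.append({"role": msg.role, "content": content_items})
        return json.dumps(messages_dict)

    print(serialize([ChatMessage("user", "Describe this image.")]))
    # [{"role": "user", "content": [{"type": "text", "text": "Describe this image."}]}]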
@@ -184,15 +193,22 @@ def stream_generate_qwen3_vl(
 
 ) -> Generator[Any, None, None]:
     """Stream generation yielding .generate.GenerationResult-compatible chunks."""
-    messages = json.loads(prompt)
+
+    try:
+        messages = json.loads(prompt)
+    except json.JSONDecodeError as e:
+        raise
+
     if image is not None:
         image_list = image if isinstance(image, list) else [image]
         pil_images = []
-        for p in image_list:
+        for i, p in enumerate(image_list):
             try:
-                pil_images.append(Image.open(p))
-            except Exception:
+                img = Image.open(p)
+                pil_images.append(img)
+            except Exception as e:
                 continue
+
         contents = [{"type": "image", "image": img} for img in pil_images]
         if messages:
             if "content" not in messages[-1] or not isinstance(messages[-1]["content"], list):
@@ -201,6 +217,7 @@ def stream_generate_qwen3_vl(
 
     raw_text, processed_images = processor.messages_to_text(
         messages, add_generation_prompt=True)
+
 
     inputs = processor.text_to_input_ids(
         raw_text, images=processed_images, return_tensors="mlx")
@@ -208,10 +225,18 @@
     input_ids = inputs["input_ids"]
     pixel_values = inputs.get("pixel_values")
     image_grid_thw = inputs.get("image_grid_thw")
+
+
+    # Check if input context exceeds KV cache size and raise error
+    max_kv_size = 4096  # This should match the max_kv_size used in make_prompt_cache and nexa_generate_step
+    if input_ids.size > max_kv_size:
+        error_msg = f"Input context length ({input_ids.size} tokens) exceeds maximum supported context size ({max_kv_size} tokens). Please reduce the input length."
+        raise ContextLengthExceededError(error_msg)
 
     inputs_embeds, deepstack_visual_embeds, visual_pos_masks, cos, sin, rope_deltas = handle_multimodal_embeds(
         model.vision_model, model.llm_model, input_ids, pixel_values, image_grid_thw
     )
+
 
     prompt_cache = make_prompt_cache(model.llm_model, max_kv_size=4096)
     tokenizer = processor.tokenizer
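
For illustration, the guard added above amounts to comparing the prompt's token count against the 4096-token KV-cache capacity before generation starts; a standalone sketch with illustrative names:

    MAX_KV_SIZE = 4096  # mirrors the max_kv_size value used in the diff

    class ContextLengthExceededError(Exception):
        pass

    def check_context(num_prompt_tokens: int, max_kv_size: int = MAX_KV_SIZE) -> None:
        # Refuse generation up front instead of failing mid-stream.
        if num_prompt_tokens > max_kv_size:
            raise ContextLengthExceededError(
                f"Input context length ({num_prompt_tokens} tokens) exceeds maximum "
                f"supported context size ({max_kv_size} tokens)."
            )

    check_context(1024)    # passes silently
    # check_context(5000)  # would raise ContextLengthExceededError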
@@ -222,37 +247,45 @@ def stream_generate_qwen3_vl(
 
     gen_count = 0
     tic = time.perf_counter()
+
 
-    for token, logprobs in nexa_generate_step(
-        model=model.llm_model,
-        prompt=None,
-        input_embeddings=inputs_embeds,
-        max_tokens=max_tokens,
-        max_kv_size=4096,
-        prompt_cache=prompt_cache,
-        visual_pos_masks=visual_pos_masks,
-        deepstack_visual_embeds=deepstack_visual_embeds,
-        cos=cos,
-        sin=sin,
-        rope_deltas=rope_deltas,
-    ):
-        if token == tokenizer.eos_token_id:
-            break
-
-        text_piece = tokenizer.decode([token])
-        gen_count += 1
-
-        yield GenerationResult(
-            text=text_piece,
-            token=token,
-            logprobs=logprobs,
-            prompt_tokens=int(input_ids.size),
-            generation_tokens=gen_count,
-            prompt_tps=float(prompt_tps),
-            generation_tps=float(
-                gen_count / max(1e-6, (time.perf_counter() - tic))),
-            peak_memory=float(mx.get_peak_memory() / 1e9),
-        )
+    try:
+        for token, logprobs in nexa_generate_step(
+            model=model.llm_model,
+            prompt=None,
+            input_embeddings=inputs_embeds,
+            max_tokens=max_tokens,
+            max_kv_size=4096,
+            prompt_cache=prompt_cache,
+            visual_pos_masks=visual_pos_masks,
+            deepstack_visual_embeds=deepstack_visual_embeds,
+            cos=cos,
+            sin=sin,
+            rope_deltas=rope_deltas,
+        ):
+            if token == tokenizer.eos_token_id:
+                break
+
+            text_piece = tokenizer.decode([token])
+            gen_count += 1
+
+            current_tps = gen_count / max(1e-6, (time.perf_counter() - tic))
+
+            yield GenerationResult(
+                text=text_piece,
+                token=token,
+                logprobs=logprobs,
+                prompt_tokens=int(input_ids.size),
+                generation_tokens=gen_count,
+                prompt_tps=float(prompt_tps),
+                generation_tps=float(current_tps),
+                peak_memory=float(mx.get_peak_memory() / 1e9),
+            )
+    except Exception as e:
+        import traceback
+        traceback.print_exc()
+        raise
+
 
 def quant_predicate(path: str, mod: nn.Module) -> bool:
     """Quantization predicate to exclude certain layers from quantization."""
nexaai/mlx_backend/vlm/interface.py CHANGED
@@ -25,7 +25,7 @@ from profiling import ProfilingMixin, ProfilingData, StopReason
 
 # Import from the actual mlx_vlm structure
 from .generate import generate, stream_generate, load
-from .generate_qwen3_vl import apply_chat_template_qwen3_vl, stream_generate_qwen3_vl, load_qwen3_vl
+from .generate_qwen3_vl import apply_chat_template_qwen3_vl, stream_generate_qwen3_vl, load_qwen3_vl, ContextLengthExceededError
 
 from .modeling.prompt_utils import apply_chat_template
 
@@ -124,7 +124,7 @@ class VLM(ProfilingMixin):
         prompt: str,
         config: Optional[GenerationConfig] = None,
     ) -> GenerationResult:
-        """Generate text from prompt."""
+        """Generate text from prompt."""
         # Start profiling
         self._start_profiling()
 
@@ -148,12 +148,15 @@ class VLM(ProfilingMixin):
         # Extract incremental portion of the prompt (similar to llama.cpp VLM)
         full_prompt_len = len(prompt)
         incremental_prompt = prompt
-
-        if self.global_n_past_chars < full_prompt_len:
-            incremental_prompt = prompt[self.global_n_past_chars:]
-        else:
-            # No new text to process
-            incremental_prompt = ""
+
+        # Apply incremental processing only for non-qwen3vl models
+        # qwen3vl requires complete JSON conversation structure
+        if self.model_name != "qwen3vl":
+            if self.global_n_past_chars < full_prompt_len:
+                incremental_prompt = prompt[self.global_n_past_chars:]
+            else:
+                # No new text to process
+                incremental_prompt = ""
 
         # End prompt processing, start decode
         self._prompt_end()
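
For illustration, the incremental-prompt branch above reduces to the following standalone function; model_name and global_n_past_chars mirror the attributes used in the diff, while the function itself is only a sketch:

    def select_prompt(prompt: str, global_n_past_chars: int, model_name: str) -> str:
        full_prompt_len = len(prompt)
        incremental_prompt = prompt
        # qwen3vl receives the complete JSON conversation every call; other models
        # only receive the characters that were not processed previously.
        if model_name != "qwen3vl":
            if global_n_past_chars < full_prompt_len:
                incremental_prompt = prompt[global_n_past_chars:]
            else:
                incremental_prompt = ""
        return incremental_prompt

    assert select_prompt("hello world", 6, "other-model") == "world"
    assert select_prompt('[{"role": "user"}]', 6, "qwen3vl") == '[{"role": "user"}]'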
@@ -196,13 +199,15 @@ class VLM(ProfilingMixin):
             self._update_generated_tokens(generated_tokens)
             self._set_stop_reason(StopReason.ML_STOP_REASON_COMPLETED)
 
-            # Update global character position
-            self.global_n_past_chars = full_prompt_len + len(text)
+            # Update global character position (not needed for qwen3vl JSON processing)
+            if self.model_name != "qwen3vl":
+                old_pos = self.global_n_past_chars
+                self.global_n_past_chars = full_prompt_len + len(text)
 
             self._decode_end()
             self._end_profiling()
 
-            return GenerationResult(
+            result = GenerationResult(
                 text=text,
                 prompt_tokens=prompt_tokens,
                 generation_tokens=generated_tokens,
@@ -211,7 +216,18 @@ class VLM(ProfilingMixin):
                 generation_tps=stats.get("generation_tps", 0.0),
                 peak_memory=stats.get("peak_memory", 0.0),
             )
+
+            return result
+
+        except ContextLengthExceededError as e:
+            self._set_stop_reason(StopReason.ML_STOP_REASON_UNKNOWN)
+            self._decode_end()
+            self._end_profiling()
+            # Re-raise the original exception without wrapping it
+            raise e
         except Exception as e:
+            import traceback
+            traceback.print_exc()
             self._set_stop_reason(StopReason.ML_STOP_REASON_UNKNOWN)
             self._decode_end()
             self._end_profiling()
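
For illustration, because ContextLengthExceededError is re-raised without being wrapped, application code can treat "prompt too long" separately from other failures; a hypothetical wrapper (the import path follows the file layout in RECORD, and vlm stands for an already constructed VLM instance, whose construction is not shown in this diff):

    from nexaai.mlx_backend.vlm.generate_qwen3_vl import ContextLengthExceededError

    def generate_with_fallback(vlm, prompt: str, keep_last_chars: int = 4000):
        try:
            return vlm.generate(prompt)
        except ContextLengthExceededError:
            # Retry once with a shortened prompt; a real application would trim
            # whole conversation turns rather than raw characters.
            return vlm.generate(prompt[-keep_last_chars:])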
@@ -224,6 +240,7 @@ class VLM(ProfilingMixin):
         on_token: Optional[TokenCallback],
     ) -> GenerationResult:
         """Generate text with streaming callback. Unified method for both text and multimodal generation."""
+
         # Start profiling
         self._start_profiling()
 
@@ -236,6 +253,7 @@ class VLM(ProfilingMixin):
         if self.sampler_config is not None:
             gen_kwargs.update(self.sampler_config.__dict__)
 
+
         # Get image and audio paths from config
         image_paths = config.image_paths if config else None
         audio_paths = config.audio_paths if config else None
@@ -244,15 +262,20 @@ class VLM(ProfilingMixin):
         image_list = [str(path) for path in image_paths] if image_paths else None
         audio_list = [str(path) for path in audio_paths] if audio_paths else None
 
+
         # Extract incremental portion of the prompt (similar to llama.cpp VLM)
         full_prompt_len = len(prompt)
         incremental_prompt = prompt
 
-        if self.global_n_past_chars < full_prompt_len:
-            incremental_prompt = prompt[self.global_n_past_chars:]
-        else:
-            # No new text to process
-            incremental_prompt = ""
+
+        # Apply incremental processing only for non-qwen3vl models
+        # qwen3vl requires complete JSON conversation structure
+        if self.model_name != "qwen3vl":
+            if self.global_n_past_chars < full_prompt_len:
+                incremental_prompt = prompt[self.global_n_past_chars:]
+            else:
+                # No new text to process
+                incremental_prompt = ""
 
         # End prompt processing, start decode
         self._prompt_end()
@@ -264,6 +287,8 @@ class VLM(ProfilingMixin):
         stream_generate_impl = stream_generate_qwen3_vl if self.model_name == "qwen3vl" else stream_generate
 
         try:
+            token_count = 0
+
             for result in stream_generate_impl(
                 self.model,
                 self.processor,
@@ -272,7 +297,9 @@ class VLM(ProfilingMixin):
                 audio=audio_list,
                 **gen_kwargs,
             ):
-                # Record TTFT on first token
+                token_count += 1
+
+                # Record TTFT on first token
                 if first_token:
                     self._record_ttft()
                     first_token = False
@@ -285,6 +312,7 @@ class VLM(ProfilingMixin):
                 text += result.text
                 last_result = result
 
+
             # Set stop reason if not user stop
             if self._profiling_context.stop_reason != StopReason.ML_STOP_REASON_USER:
                 self._set_stop_reason(StopReason.ML_STOP_REASON_EOS)
@@ -294,13 +322,15 @@ class VLM(ProfilingMixin):
             self._update_prompt_tokens(last_result.prompt_tokens)
             self._update_generated_tokens(last_result.generation_tokens)
 
-            # Update global character position
-            self.global_n_past_chars = full_prompt_len + len(text)
+            # Update global character position (not needed for qwen3vl JSON processing)
+            if self.model_name != "qwen3vl":
+                old_pos = self.global_n_past_chars
+                self.global_n_past_chars = full_prompt_len + len(text)
 
             self._decode_end()
             self._end_profiling()
 
-            return GenerationResult(
+            result = GenerationResult(
                 text=text,
                 token=last_result.token if last_result else None,
                 logprobs=last_result.logprobs if last_result else None,
@@ -311,7 +341,18 @@ class VLM(ProfilingMixin):
                 generation_tps=last_result.generation_tps if last_result else 0.0,
                 peak_memory=last_result.peak_memory if last_result else 0.0,
             )
+
+            return result
+
+        except ContextLengthExceededError as e:
+            self._set_stop_reason(StopReason.ML_STOP_REASON_UNKNOWN)
+            self._decode_end()
+            self._end_profiling()
+            # Re-raise the original exception without wrapping it
+            raise e
         except Exception as e:
+            import traceback
+            traceback.print_exc()
             self._set_stop_reason(StopReason.ML_STOP_REASON_UNKNOWN)
             self._decode_end()
             self._end_profiling()
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nexaai
-Version: 1.0.17rc10
+Version: 1.0.18
 Summary: Python bindings for NexaSDK C-lib backend
 Author-email: "Nexa AI, Inc." <dev@nexa.ai>
 Project-URL: Homepage, https://github.com/NexaAI/nexasdk-bridge
@@ -1,6 +1,6 @@
 nexaai/__init__.py,sha256=L8oB7GFZZMGnUpCg0PecDbI_ycKuQak-ZEJ4Y12_QIw,2184
-nexaai/_stub.cpython-310-darwin.so,sha256=9tKb2YBVS2quTKD-OUHqxI2blqCaKef4HLZI6DeZwS4,49832
-nexaai/_version.py,sha256=nKUy6Z6ytRT-zOfWuQqpmzK7vKqdC96nmMqwtCJBwMM,144
+nexaai/_stub.cpython-310-darwin.so,sha256=rme8AeSXOZBPbhUbP9GKQTvYon2BSiwm3T1rVEmo0nA,49832
+nexaai/_version.py,sha256=u4x0epv_LKPUfQvNf5zaekZDwaMD-RVDvPvcAmx-b40,139
 nexaai/asr.py,sha256=NljMXDErwPNMOPaRkJZMEDka9Nk8xyur7L8i924TStY,2054
 nexaai/base.py,sha256=N8PRgDFA-XPku2vWnQIofQ7ipz3pPlO6f8YZGnuhquE,982
 nexaai/common.py,sha256=Y0NJNLTi4Nq4x1WL6PQsSvGUto0eGmWhjpsC6jcekfA,3444
@@ -19,7 +19,7 @@ nexaai/asr_impl/pybind_asr_impl.py,sha256=pE9Hb_hMi5yAc4MF83bLVOb8zDtreCkB3_u7XE
 nexaai/binds/__init__.py,sha256=eYuay_8DDXeOUWz2_R9HFSabohxs6hvZn391t2L0Po0,104
 nexaai/binds/common_bind.cpython-310-darwin.so,sha256=BoXByRlNGDaNS1YyZyCF-s7h0vXP9NLPlJMQQ5pqusU,235488
 nexaai/binds/embedder_bind.cpython-310-darwin.so,sha256=b2NoXFAJvPLi_P1X7lXLKmAUU0v2HJI3Zwa10gfqHdw,202032
-nexaai/binds/libnexa_bridge.dylib,sha256=Yopwbcp5VQ9NF6o9un48Kb5FoqnyIS3QxHNRh8ak_hU,250408
+nexaai/binds/libnexa_bridge.dylib,sha256=59iLj-0ieCv-tU5pcJc7Tj-84pseGPAXL7JOi19bdhc,250408
 nexaai/binds/llm_bind.cpython-310-darwin.so,sha256=p1ZTGMolEkWywkmwzOUjTr3RpSEH21BHZAggVzo89Ks,183088
 nexaai/binds/vlm_bind.cpython-310-darwin.so,sha256=LGd-tykePnQFfGca25HnPIBfXsfrMzbwyx6d5Ld3xps,183000
 nexaai/binds/nexa_llama_cpp/libggml-base.dylib,sha256=GyOkHOM-5uHp7NUZ4Sr9BWak6BYpcc9aqI9A-zPnQp4,629528
@@ -246,8 +246,8 @@ nexaai/mlx_backend/tts/__init__.py,sha256=fuT_9_xpYJ28m4yjly5L2jChUrzlSQz-b_S7nu
 nexaai/mlx_backend/tts/interface.py,sha256=0FvZbIyOvg8jERZEQ6bygbv7v02O9xHO4-TPUlar0b4,9568
 nexaai/mlx_backend/vlm/__init__.py,sha256=_25kvMEviX16Hg3bro8Ws70V0eeIEqYKV8ZDXqYzKew,73
 nexaai/mlx_backend/vlm/generate.py,sha256=DqHFEAuqk-nko8ho6U9GAXTDAWz4d8GTe_hCt-XFyCw,19071
-nexaai/mlx_backend/vlm/generate_qwen3_vl.py,sha256=undjso1mfxqpd6FMTksSA5qagRttxAGbOBj1x7cqI1s,9211
-nexaai/mlx_backend/vlm/interface.py,sha256=0BLfodbYOU71jFvAvv01FuLBE_KBtyB-8Cd7LqzzRHY,17450
+nexaai/mlx_backend/vlm/generate_qwen3_vl.py,sha256=eeizW18u6dHPZOOnJtQUJkiqMAIIpOSS-IOjacXGsz4,10240
+nexaai/mlx_backend/vlm/interface.py,sha256=HOPzWNMs6QaHO6x0Z83kW1xkRRmb8_xo6xQLKsOWqAo,19013
 nexaai/mlx_backend/vlm/main.py,sha256=nPcg25jupeDD74uvRoxpWp3Dsulw7WddI7vll6zejak,10664
 nexaai/mlx_backend/vlm/modeling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nexaai/mlx_backend/vlm/modeling/convert.py,sha256=ia5i9cgTufFGmKyhkYUaW0nfNqT_bMo8i-Hg_zy5JC4,1863
@@ -387,7 +387,7 @@ nexaai/utils/quantization_utils.py,sha256=FYcNSAKGlBqFDUTx3jSKOr2lnq4nyiyC0ZG8oS
 nexaai/vlm_impl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nexaai/vlm_impl/mlx_vlm_impl.py,sha256=pLtWm_ckz8a0U-AtAOMVseFDO4OVPvHyYO2KlfBaGYk,10833
 nexaai/vlm_impl/pybind_vlm_impl.py,sha256=FAbhpRJzHgI78r0mUvKybO97R1szvNhH0aTn_I52oT4,8597
-nexaai-1.0.17rc10.dist-info/METADATA,sha256=CBp42bC2oj1pRu7t-v7qqiH6ZlQ1QLFHSpI3QL1JypU,1202
-nexaai-1.0.17rc10.dist-info/WHEEL,sha256=0KYp5feZ1CMUhsfFXKpSQTbSmQbXy4mv6yPPVBXg2EM,110
-nexaai-1.0.17rc10.dist-info/top_level.txt,sha256=LRE2YERlrZk2vfuygnSzsEeqSknnZbz3Z1MHyNmBU4w,7
-nexaai-1.0.17rc10.dist-info/RECORD,,
+nexaai-1.0.18.dist-info/METADATA,sha256=yh4CFZmHv1dg0aN41La3qaRlXU5XC1_7erEOspSE95s,1198
+nexaai-1.0.18.dist-info/WHEEL,sha256=0KYp5feZ1CMUhsfFXKpSQTbSmQbXy4mv6yPPVBXg2EM,110
+nexaai-1.0.18.dist-info/top_level.txt,sha256=LRE2YERlrZk2vfuygnSzsEeqSknnZbz3Z1MHyNmBU4w,7
+nexaai-1.0.18.dist-info/RECORD,,